{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fix MSC Embassy CSV\n",
    "\n",
    "Extends `embassies_on_flickr.csv` with profile fields looked up by `nsid` in `embassy_meta_nsid.csv`, then restores the `country` column from the original file.\n",
    "\n",
    "Run top to bottom (Restart Kernel, then Run All): the second section reads the file written by the first."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "%load_ext dotenv\n",
    "#%dotenv /work/megapixels_dev/env/flickr.env\n",
    "\n",
    "import sys, os\n",
    "from os.path import join\n",
    "from glob import glob, iglob\n",
    "from pathlib import Path\n",
    "from random import randint\n",
    "import urllib\n",
    "\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Add profile metadata\n",
    "\n",
    "Copy the profile fields of the metadata record with the same `nsid` onto each embassy record and write the extended CSV."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fp_in = '/data_store/datasets/msc/embassies/embassies_on_flickr.csv'\n",
    "fp_in_metadata = '/data_store/datasets/msc/embassies/embassy_meta_nsid.csv'\n",
    "fp_out = '/data_store/datasets/msc/embassies/embassies_on_flickr_ext.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Missing values become empty strings so the truthiness checks below work\n",
    "df_embassies = pd.read_csv(fp_in).fillna('')\n",
    "embassy_records = df_embassies.to_dict('records')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_meta = pd.read_csv(fp_in_metadata).fillna('')\n",
    "meta_records = df_meta.to_dict('records')\n",
    "\n",
    "# Metadata records keyed by nsid; a later row with the same nsid replaces an earlier one\n",
    "meta_records_nsid = {meta_record['nsid']: meta_record for meta_record in meta_records}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fields copied from the metadata record, in the order they are added to each embassy record\n",
    "META_FIELDS = [\n",
    "    'first_name',\n",
    "    'last_name',\n",
    "    'occupation',\n",
    "    'city',\n",
    "    'country',\n",
    "    'email',\n",
    "    'facebook',\n",
    "    'instagram',\n",
    "    'join_date',\n",
    "    'twitter',\n",
    "    'profile_description',\n",
    "    'website',\n",
    "]\n",
    "\n",
    "for embassy_record in embassy_records:\n",
    "    nsid = embassy_record.get('nsid')\n",
    "    # Records without an nsid, or without a metadata match, are left untouched\n",
    "    meta = meta_records_nsid.get(nsid) if nsid else None\n",
    "    if meta:\n",
    "        for field in META_FIELDS:\n",
    "            embassy_record[field] = meta[field]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_embassies_ext = pd.DataFrame.from_dict(embassy_records)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_embassies_ext.to_csv(fp_out, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fix country\n",
    "\n",
    "The merge above replaces `country` with the value from the metadata file for every matched record. This section restores the value from the original file wherever the original has one, and writes the result to a new CSV."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fp_in = '/data_store/datasets/msc/embassies/embassies_on_flickr.csv'\n",
    "fp_in_ext = '/data_store/datasets/msc/embassies/embassies_on_flickr_ext.csv'\n",
    "fp_out = '/data_store/datasets/msc/embassies/embassies_on_flickr_ext_02.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_embassies = pd.read_csv(fp_in).fillna('')\n",
    "embassy_records = df_embassies.to_dict('records')\n",
    "\n",
    "df_embassies_ext = pd.read_csv(fp_in_ext).fillna('')\n",
    "embassy_records_ext = df_embassies_ext.to_dict('records')\n",
    "\n",
    "# Original records keyed by nsid. Rows without an nsid are left out: after fillna('')\n",
    "# they would all share the key '' and only the last one would survive.\n",
    "embassy_records_nsid = {\n",
    "    embassy_record['nsid']: embassy_record\n",
    "    for embassy_record in embassy_records\n",
    "    if embassy_record['nsid']\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_embassies_copy.loc[(df_embassies['nsid'] == '124109311@N07').idxmax(),'country']\n",
    "# df_embassies_copy.at[df_embassies_copy['nsid'] == '124109311@N07'] = 'Test'\n",
    "# df_embassies_copy.loc[(df_embassies['nsid'] == '124109311@N07').idxmax(),'country']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get country from the original unextended file\n",
    "n_restored = 0\n",
    "for embassy_record_ext in embassy_records_ext:\n",
    "    nsid = embassy_record_ext['nsid']\n",
    "    if not nsid:\n",
    "        # No nsid means no reliable match in the original file\n",
    "        continue\n",
    "    embassy_record_orig = embassy_records_nsid.get(nsid)\n",
    "    if not embassy_record_orig:\n",
    "        continue\n",
    "    country = embassy_record_orig.get('country')\n",
    "    if not country:\n",
    "        # The original has nothing to restore; keep the metadata value\n",
    "        continue\n",
    "    country_ext = embassy_record_ext['country']\n",
    "    if country_ext != country:\n",
    "        print(f'set ext: {country_ext} to {country}')\n",
    "        embassy_record_ext['country'] = country\n",
    "        n_restored += 1\n",
    "\n",
    "n_restored"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the corrected records to the second extended file\n",
    "df_embassies_ext_02 = pd.DataFrame.from_dict(embassy_records_ext)\n",
    "df_embassies_ext_02.to_csv(fp_out, index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "megapixels",
   "language": "python",
   "name": "megapixels"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}