{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Images Per Country-Embassy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import os\n",
    "from os.path import join\n",
    "from glob import glob, iglob\n",
    "from pathlib import Path\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "# list of embassy flickr image counts\n",
    "fp_in = '/data_store/datasets/msc/embassies/embassy_counts.csv'\n",
    "fp_country_codes = '/data_store/datasets/msc/embassies/countries-20140629.csv'\n",
    "\n",
    "# summary file\n",
    "fp_out_location = '/data_store/datasets/msc/embassies/embassy_counts_summary.csv'\n",
    "fp_out_dataset = '/data_store/datasets/msc/embassies/embassy_counts_summary_dataset.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_counts = pd.read_csv(fp_in)\n",
    "records_counts = df_counts.to_dict('records')\n",
    "\n",
    "df_country_codes = pd.read_csv(fp_country_codes, encoding = \"ISO-8859-1\")\n",
    "records_country_codes = df_country_codes.to_dict('records')\n",
    "# convert to easy dict lookup\n",
    "cc_lookup = {}\n",
    "for record_country_codes in records_country_codes:\n",
    "  cc_lookup[record_country_codes['Code']] = record_country_codes['English Name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "75\n"
     ]
    }
   ],
   "source": [
    "print(len(records_counts))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "country_groups = df_counts.groupby('guest')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8\n"
     ]
    }
   ],
   "source": [
    "print(len(country_groups))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "EC, 2\n",
      "FI, 2\n",
      "FR, 52\n",
      "GB, 995\n",
      "IT, 521\n",
      "NO, 2\n",
      "SE, 1\n",
      "US, 6866\n"
     ]
    }
   ],
   "source": [
    "country_summaries = []\n",
    "for cc, df in country_groups:\n",
    "  print(f'{cc}, {df[\"count\"].sum()}')\n",
    "  country = cc_lookup.get(cc)\n",
    "  country_summaries.append({'cc': cc, 'country': country, 'images': df['count'].sum()})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_summaries = pd.DataFrame.from_dict(country_summaries)\n",
    "df_summaries.to_csv(fp_out_location, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get CSV Dataset group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_groups = df_counts.groupby('dataset_key')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ibm_dif, 389\n",
      "megaface, 5679\n",
      "vgg_face, 1\n",
      "who_goes_there, 2372\n"
     ]
    }
   ],
   "source": [
    "summary = []\n",
    "for dataset_name, df in dataset_groups:\n",
    "  print(f'{dataset_name}, {df[\"count\"].sum()}')\n",
    "  summary.append({'dataset': dataset_name, 'images': df['count'].sum()})\n",
    "        \n",
    "df = pd.DataFrame.from_dict(summary)\n",
    "df.to_csv(fp_out_dataset, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dataset</th>\n",
       "      <th>images</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ibm_dif</td>\n",
       "      <td>389</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>megaface</td>\n",
       "      <td>5679</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>vgg_face</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>who_goes_there</td>\n",
       "      <td>2372</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          dataset  images\n",
       "0         ibm_dif     389\n",
       "1        megaface    5679\n",
       "2        vgg_face       1\n",
       "3  who_goes_there    2372"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/data_store/datasets/msc/embassies/embassy_counts_summary_dataset.csv\n"
     ]
    }
   ],
   "source": [
    "print(fp_out_dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "megapixels",
   "language": "python",
   "name": "megapixels"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}