{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Images Per Country-Embassy" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "%reload_ext autoreload\n", "%autoreload 2\n", "\n", "import os\n", "from os.path import join\n", "from glob import glob, iglob\n", "from pathlib import Path\n", "from tqdm import tqdm_notebook as tqdm\n", "\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "# directory holding the input CSVs and the summaries written by this notebook\n", "dir_embassies = '/data_store/datasets/msc/embassies'\n", "\n", "# list of embassy flickr image counts\n", "fp_in = join(dir_embassies, 'embassy_counts.csv')\n", "fp_country_codes = join(dir_embassies, 'countries-20140629.csv')\n", "\n", "# summary files\n", "fp_out_location = join(dir_embassies, 'embassy_counts_summary.csv')\n", "fp_out_dataset = join(dir_embassies, 'embassy_counts_summary_dataset.csv')" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "# pandas' default NA markers would parse a country code of 'NA' as NaN, which\n", "# groupby would then silently drop; treat only empty fields as missing instead.\n", "df_counts = pd.read_csv(fp_in, keep_default_na=False, na_values=[''])\n", "records_counts = df_counts.to_dict('records')\n", "\n", "df_country_codes = pd.read_csv(fp_country_codes, encoding='ISO-8859-1', keep_default_na=False, na_values=[''])\n", "records_country_codes = df_country_codes.to_dict('records')\n", "# convert to easy dict lookup: country code -> English name\n", "cc_lookup = {record['Code']: record['English Name'] for record in records_country_codes}" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "75\n" ] } ], "source": [ "print(len(records_counts))" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [], "source": [ "country_groups = df_counts.groupby('guest')" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "8\n" ] } ], "source": [ "print(len(country_groups))" ] }, { "cell_type": "code", 
"execution_count": 58, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "EC, 2\n", "FI, 2\n", "FR, 52\n", "GB, 995\n", "IT, 521\n", "NO, 2\n", "SE, 1\n", "US, 6866\n" ] } ], "source": [ "# total image count per guest country code\n", "country_summaries = []\n", "for cc, df_country in country_groups:\n", "    n_images = df_country['count'].sum()\n", "    print(f'{cc}, {n_images}')\n", "    # codes missing from the lookup yield None, written as an empty CSV field\n", "    country_summaries.append({'cc': cc, 'country': cc_lookup.get(cc), 'images': n_images})" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "# explicit columns keep the CSV header stable, even when there are no groups\n", "df_summaries = pd.DataFrame(country_summaries, columns=['cc', 'country', 'images'])\n", "df_summaries.to_csv(fp_out_location, index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Image counts per dataset" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "dataset_groups = df_counts.groupby('dataset_key')" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ibm_dif, 389\n", "megaface, 5679\n", "vgg_face, 1\n", "who_goes_there, 2372\n" ] } ], "source": [ "# total image count per dataset key\n", "summary = []\n", "for dataset_name, df_dataset in dataset_groups:\n", "    n_images = df_dataset['count'].sum()\n", "    print(f'{dataset_name}, {n_images}')\n", "    summary.append({'dataset': dataset_name, 'images': n_images})\n", "\n", "# `df` is the name displayed by the next cell\n", "df = pd.DataFrame(summary, columns=['dataset', 'images'])\n", "df.to_csv(fp_out_dataset, index=False)" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datasetimages
0ibm_dif389
1megaface5679
2vgg_face1
3who_goes_there2372
\n", "
" ], "text/plain": [ " dataset images\n", "0 ibm_dif 389\n", "1 megaface 5679\n", "2 vgg_face 1\n", "3 who_goes_there 2372" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/data_store/datasets/msc/embassies/embassy_counts_summary_dataset.csv\n" ] } ], "source": [ "print(fp_out_dataset)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "megapixels", "language": "python", "name": "megapixels" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }