1 files changed, 117 insertions, 362 deletions
diff --git a/megapixels/notebooks/datasets/ibm_dif/images_per_embassy.ipynb b/megapixels/notebooks/datasets/ibm_dif/images_per_embassy.ipynb
index 4cd3a4fb..08b5afb6 100644
--- a/megapixels/notebooks/datasets/ibm_dif/images_per_embassy.ipynb
+++ b/megapixels/notebooks/datasets/ibm_dif/images_per_embassy.ipynb
@@ -9,7 +9,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 39,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -27,30 +27,39 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 53,
    "metadata": {},
    "outputs": [],
    "source": [
     "# list of embassy flickr image counts\n",
     "fp_in = '/data_store/datasets/msc/embassies/embassy_counts.csv'\n",
+    "fp_country_codes = '/data_store/datasets/msc/embassies/countries-20140629.csv'\n",
     "\n",
     "# summary file\n",
-    "fp_out = '/data_store/datasets/msc/embassies/embassy_counts_summary.csv'"
+    "fp_out_location = '/data_store/datasets/msc/embassies/embassy_counts_summary.csv'\n",
+    "fp_out_dataset = '/data_store/datasets/msc/embassies/embassy_counts_summary_dataset.csv'"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 54,
    "metadata": {},
    "outputs": [],
    "source": [
     "df_counts = pd.read_csv(fp_in)\n",
-    "records_counts = df_counts.to_dict('records')"
+    "records_counts = df_counts.to_dict('records')\n",
+    "\n",
+    "# Country-code table: read as Latin-1. keep_default_na=False stops pandas from\n",
+    "# parsing the literal code 'NA' (Namibia in ISO 3166-1) as a missing value.\n",
+    "df_country_codes = pd.read_csv(fp_country_codes, encoding='ISO-8859-1', keep_default_na=False)\n",
+    "records_country_codes = df_country_codes.to_dict('records')\n",
+    "# country code -> English name, for direct lookup when summarizing\n",
+    "cc_lookup = {rec['Code']: rec['English Name'] for rec in records_country_codes}"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 55,
    "metadata": {},
    "outputs": [
     {
@@ -67,7 +76,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 56,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -76,7 +85,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 57,
    "metadata": {},
    "outputs": [
     {
@@ -93,60 +102,6 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 59,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# drop epmty NSIDs\n",
-    "df_meta_filepaths.drop_duplicates(subset='nsid', inplace=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 61,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_meta_filepaths.to_csv(fp_meta_filepaths_adj, index=False)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 55,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "nsid_filepaths = {}\n",
-    "dupes = []\n",
-    "for meta_filepath in meta_filepaths:\n",
-    "  nsid = meta_filepath['nsid']\n",
-    "  if nsid not in nsid_filepaths.keys():\n",
-    "    nsid_filepaths[nsid] = meta_filepath\n",
-    "  else:\n",
-    "    dupes.append(meta_filepath)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 56,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "98154\n",
-      "2284\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(len(nsid_filepaths))\n",
-    "print(len(dupes))"
-   ]
-  },
-  {
-   "cell_type": "code",
    "execution_count": 58,
    "metadata": {},
    "outputs": [
@@ -154,371 +109,171 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "{'filepath': '12537662393_247b2187ee.jpg', 'nsid': nan, 'photo_id': 12537662393, 'url': 'http://farm6.staticflickr.com/5476/12537662393_247b2187ee.jpg'}\n",
-      "{'filepath': '5837222502_29aaf5bb53.jpg', 'nsid': nan, 'photo_id': 5837222502, 'url': 'http://farm4.staticflickr.com/3089/5837222502_29aaf5bb53.jpg'}\n",
-      "{'filepath': '10859466623_4ceb1564dc.jpg', 'nsid': nan, 'photo_id': 10859466623, 'url': 'http://farm6.staticflickr.com/5530/10859466623_4ceb1564dc.jpg'}\n",
-      "{'filepath': '13719567455_fb96dc7ac6.jpg', 'nsid': nan, 'photo_id': 13719567455, 'url': 'http://farm4.staticflickr.com/3718/13719567455_fb96dc7ac6.jpg'}\n",
-      "{'filepath': '3486554266_ca1fc7d99c.jpg', 'nsid': nan, 'photo_id': 3486554266, 'url': 'http://farm4.staticflickr.com/3327/3486554266_ca1fc7d99c.jpg'}\n",
-      "{'filepath': '6168324261_d2fb7bbb60.jpg', 'nsid': nan, 'photo_id': 6168324261, 'url': 'http://farm7.staticflickr.com/6166/6168324261_d2fb7bbb60.jpg'}\n",
-      "{'filepath': '13938295982_0d950feba5.jpg', 'nsid': nan, 'photo_id': 13938295982, 'url': 'http://farm8.staticflickr.com/7162/13938295982_0d950feba5.jpg'}\n",
-      "{'filepath': '8881073633_546b6dbfe5.jpg', 'nsid': nan, 'photo_id': 8881073633, 'url': 'http://farm6.staticflickr.com/5459/8881073633_546b6dbfe5.jpg'}\n",
-      "{'filepath': '10918515734_404eb29879.jpg', 'nsid': nan, 'photo_id': 10918515734, 'url': 'http://farm6.staticflickr.com/5502/10918515734_404eb29879.jpg'}\n",
-      "{'filepath': '3236533532_05cacef8e9.jpg', 'nsid': nan, 'photo_id': 3236533532, 'url': 'http://farm4.staticflickr.com/3425/3236533532_05cacef8e9.jpg'}\n"
+      "EC, 2\n",
+      "FI, 2\n",
+      "FR, 52\n",
+      "GB, 995\n",
+      "IT, 521\n",
+      "NO, 2\n",
+      "SE, 1\n",
+      "US, 6866\n"
      ]
     }
    ],
    "source": [
-    "for dupe in dupes[:10]:\n",
-    "  print(dupe)"
+    "country_summaries = []\n",
+    "for cc, df_country in country_groups:\n",
+    "  n_images = df_country['count'].sum()  # aggregate once, reuse for print and record\n",
+    "  print(f'{cc}, {n_images}')\n",
+    "  country_summaries.append({'cc': cc, 'country': cc_lookup.get(cc), 'images': n_images})"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 51,
+   "execution_count": 59,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "100438\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "print(len(dupes))"
+    "df_summaries = pd.DataFrame(country_summaries)\n",
+    "df_summaries.to_csv(fp_out_location, index=False)"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 8,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "98153\n"
-     ]
-    }
-   ],
    "source": [
-    "print(len(nsid_groups))"
+    "## Summarize image counts per dataset"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 60,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "100436\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "fp_ims = glob('/data_store_hdd/datasets/people/ibm_dif/downloads/images/*.jpg')\n",
-    "print(len(fp_ims))"
+    "dataset_groups = df_counts.groupby('dataset_key')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 61,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "9314013316\n"
-     ]
-    }
-   ],
-   "source": [
-    "photo_ids = [Path(x).stem.split('_')[0] for x in fp_ims]\n",
-    "print(photo_ids[0])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 45,
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "KeyError",
-     "evalue": "'photo_id'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m--------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mKeyError\u001b[0m                           Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-45-fd2de6074950>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfilepath_photo_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'photo_id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmeta_flickr\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[0;32m<ipython-input-45-fd2de6074950>\u001b[0m in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfilepath_photo_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'photo_id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmeta_flickr\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[0;31mKeyError\u001b[0m: 'photo_id'"
+      "ibm_dif, 389\n",
+      "megaface, 5679\n",
+      "vgg_face, 1\n",
+      "who_goes_there, 2372\n"
      ]
     }
    ],
    "source": [
-    "filepath_photo_ids = [int(x['nsid']) for x in meta_flickr]"
+    "summary = []\n",
+    "for dataset_name, df_dataset in dataset_groups:\n",
+    "  n_images = df_dataset['count'].sum()  # aggregate once per dataset\n",
+    "  print(f'{dataset_name}, {n_images}')\n",
+    "  summary.append({'dataset': dataset_name, 'images': n_images})\n",
+    "df = pd.DataFrame(summary)  # name kept: the following cell displays df.head()\n",
+    "df.to_csv(fp_out_dataset, index=False)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 62,
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "d7a9a78bf0e442a5b8445906bc85da99",
-       "version_major": 2,
-       "version_minor": 0
-      },
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>dataset</th>\n",
+       "      <th>images</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>ibm_dif</td>\n",
+       "      <td>389</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>megaface</td>\n",
+       "      <td>5679</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>vgg_face</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>who_goes_there</td>\n",
+       "      <td>2372</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
       "text/plain": [
-       "HBox(children=(IntProgress(value=0, max=100436), HTML(value='')))"
+       "          dataset  images\n",
+       "0         ibm_dif     389\n",
+       "1        megaface    5679\n",
+       "2        vgg_face       1\n",
+       "3  who_goes_there    2372"
       ]
      },
+     "execution_count": 62,
      "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n"
-     ]
+     "output_type": "execute_result"
     }
    ],
    "source": [
-    "# find which photo IDs are no longer accessible\n",
-    "missing_photo_ids = []\n",
-    "for photo_id in tqdm(photo_ids):\n",
-    "  photo_id = int(photo_id)\n",
-    "  if photo_id not in filepath_photo_ids:\n",
-    "    missing_photo_ids.append(photo_id)"
+    "df.head()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "0\n",
-      "[]\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(len(missing_photo_ids))\n",
-    "print(missing_photo_ids[0:10])"
-   ]
+   "outputs": [],
+   "source": []
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 63,
    "metadata": {},
    "outputs": [
     {
-     "ename": "NameError",
-     "evalue": "name 'df_flickr_meta' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m--------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mNameError\u001b[0m                          Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-30-75e9fdbbbfbb>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtotal\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_flickr_meta\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'count'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtotal\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;31mNameError\u001b[0m: name 'df_flickr_meta' is not defined"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/data_store/datasets/msc/embassies/embassy_counts_summary_dataset.csv\n"
      ]
     }
    ],
    "source": [
-    "total = df_flickr_meta['count'].sum()\n",
-    "print(total)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# load ibm data and create count lookup with photoid\n",
-    "df_ibm_meta = pd.read_csv(fp_in_ibm_meta)\n",
-    "ibm_meta_records = df_ibm_meta.to_dict('records')\n",
-    "count_lookup = {}\n",
-    "for ibm_meta_record in ibm_meta_records:\n",
-    "  photo_id = int(Path(ibm_meta_record['url']).stem.split('_')[0])\n",
-    "  count_lookup[photo_id] = ibm_meta_record['count']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "len(count_lookup)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "results = []"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_flickr_meta = pd.read_csv(fp_in_flickr_meta, dtype={'count': int, 'username': str, 'sha256': str}).fillna('')\n",
-    "flickr_meta_records = df_flickr_meta.to_dict('records')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# load flickr data\n",
-    "for flickr_meta_record in flickr_meta_records:\n",
-    "  try:\n",
-    "    nsid = flickr_meta_record['nsid']\n",
-    "    photo_id = int(flickr_meta_record['photo_id'])\n",
-    "    count = count_lookup[photo_id]\n",
-    "  except Exception as e:\n",
-    "    print(f'Error: {e}, {flickr_meta_record}')\n",
-    "    continue\n",
-    "  obj = {\n",
-    "    'photo_id': photo_id,\n",
-    "    'nsid': nsid,\n",
-    "    'count': count \n",
-    "  }\n",
-    "  results.append(obj)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_out = pd.DataFrame.from_dict(results)\n",
-    "df_out.to_csv(fp_out, index=False)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Create meta count file"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# photo ids and nsids\n",
-    "fp_flickr_api_dump = '/data_store_hdd/datasets/people/ibm_dif/research/flickr_api_query_dump.csv'\n",
-    "\n",
-    "# file urls\n",
-    "fp_ibm_urls = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_urls.csv'\n",
-    "\n",
-    "# flickr meta\n",
-    "fp_out_filepaths = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_filepaths.csv'"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_flickr_meta = pd.read_csv(fp_flickr_api_dump)\n",
-    "df_flickr_meta.fillna('', inplace=True)\n",
-    "flickr_metas = df_flickr_meta.to_dict('records')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "```\n",
-    "|filepath|nsid|photo_id|url|\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "photo_id_to_nsid = {}\n",
-    "for flickr_meta in flickr_metas:\n",
-    "  photo_id = flickr_meta.get('photo_id')\n",
-    "  if photo_id:\n",
-    "    photo_id = str(int(photo_id))\n",
-    "    photo_id_to_nsid[photo_id] = flickr_meta['nsid']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(list(photo_id_to_nsid.keys())[0:10])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_ibm_urls = pd.read_csv(fp_ibm_urls)\n",
-    "ibm_urls = df_ibm_urls.to_dict('records')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "photo_id_to_url = {}\n",
-    "missed = []\n",
-    "for ibm_url in ibm_urls:\n",
-    "  photo_id = str(ibm_url['filepath'].split('_')[0])\n",
-    "  try:\n",
-    "    ibm_url['photo_id'] = photo_id\n",
-    "    ibm_url['nsid'] = photo_id_to_nsid[photo_id]\n",
-    "  except Exception as e:\n",
-    "#     print(e, photo_id)\n",
-    "    missed.append(photo_id)\n",
-    "print(f'missed: {len(missed)}')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "pd.DataFrame.from_dict(ibm_urls).to_csv(fp_out_filepaths, index=False)"
+    "print(fp_out_dataset)"
    ]
   },
   {