From d61bb411957f31302a0a969de74baa68af126a07 Mon Sep 17 00:00:00 2001 From: adamhrv Date: Thu, 27 Jun 2019 23:57:17 +0200 Subject: working MSC nbs --- .../datasets/ibm_dif/images_per_embassy.ipynb | 553 +++++++++++++++++++++ 1 file changed, 553 insertions(+) create mode 100644 megapixels/notebooks/datasets/ibm_dif/images_per_embassy.ipynb (limited to 'megapixels/notebooks/datasets/ibm_dif') diff --git a/megapixels/notebooks/datasets/ibm_dif/images_per_embassy.ipynb b/megapixels/notebooks/datasets/ibm_dif/images_per_embassy.ipynb new file mode 100644 index 00000000..4cd3a4fb --- /dev/null +++ b/megapixels/notebooks/datasets/ibm_dif/images_per_embassy.ipynb @@ -0,0 +1,553 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Images Per Country-Embassy" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import os\n", + "from os.path import join\n", + "from glob import glob, iglob\n", + "from pathlib import Path\n", + "from tqdm import tqdm_notebook as tqdm\n", + "\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# list of embassy flickr image counts\n", + "fp_in = '/data_store/datasets/msc/embassies/embassy_counts.csv'\n", + "\n", + "# summary file\n", + "fp_out = '/data_store/datasets/msc/embassies/embassy_counts_summary.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "df_counts = pd.read_csv(fp_in)\n", + "records_counts = df_counts.to_dict('records')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "75\n" + ] + } + ], + "source": [ + "print(len(records_counts))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + 
"outputs": [], + "source": [ + "country_groups = df_counts.groupby('guest')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8\n" + ] + } + ], + "source": [ + "print(len(country_groups))" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "# drop empty NSIDs\n", + "df_meta_filepaths.drop_duplicates(subset='nsid', inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "df_meta_filepaths.to_csv(fp_meta_filepaths_adj, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "nsid_filepaths = {}\n", + "dupes = []\n", + "for meta_filepath in meta_filepaths:\n", + " nsid = meta_filepath['nsid']\n", + " if nsid not in nsid_filepaths.keys():\n", + " nsid_filepaths[nsid] = meta_filepath\n", + " else:\n", + " dupes.append(meta_filepath)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "98154\n", + "2284\n" + ] + } + ], + "source": [ + "print(len(nsid_filepaths))\n", + "print(len(dupes))" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'filepath': '12537662393_247b2187ee.jpg', 'nsid': nan, 'photo_id': 12537662393, 'url': 'http://farm6.staticflickr.com/5476/12537662393_247b2187ee.jpg'}\n", + "{'filepath': '5837222502_29aaf5bb53.jpg', 'nsid': nan, 'photo_id': 5837222502, 'url': 'http://farm4.staticflickr.com/3089/5837222502_29aaf5bb53.jpg'}\n", + "{'filepath': '10859466623_4ceb1564dc.jpg', 'nsid': nan, 'photo_id': 10859466623, 'url': 'http://farm6.staticflickr.com/5530/10859466623_4ceb1564dc.jpg'}\n", + "{'filepath': '13719567455_fb96dc7ac6.jpg', 
'nsid': nan, 'photo_id': 13719567455, 'url': 'http://farm4.staticflickr.com/3718/13719567455_fb96dc7ac6.jpg'}\n", + "{'filepath': '3486554266_ca1fc7d99c.jpg', 'nsid': nan, 'photo_id': 3486554266, 'url': 'http://farm4.staticflickr.com/3327/3486554266_ca1fc7d99c.jpg'}\n", + "{'filepath': '6168324261_d2fb7bbb60.jpg', 'nsid': nan, 'photo_id': 6168324261, 'url': 'http://farm7.staticflickr.com/6166/6168324261_d2fb7bbb60.jpg'}\n", + "{'filepath': '13938295982_0d950feba5.jpg', 'nsid': nan, 'photo_id': 13938295982, 'url': 'http://farm8.staticflickr.com/7162/13938295982_0d950feba5.jpg'}\n", + "{'filepath': '8881073633_546b6dbfe5.jpg', 'nsid': nan, 'photo_id': 8881073633, 'url': 'http://farm6.staticflickr.com/5459/8881073633_546b6dbfe5.jpg'}\n", + "{'filepath': '10918515734_404eb29879.jpg', 'nsid': nan, 'photo_id': 10918515734, 'url': 'http://farm6.staticflickr.com/5502/10918515734_404eb29879.jpg'}\n", + "{'filepath': '3236533532_05cacef8e9.jpg', 'nsid': nan, 'photo_id': 3236533532, 'url': 'http://farm4.staticflickr.com/3425/3236533532_05cacef8e9.jpg'}\n" + ] + } + ], + "source": [ + "for dupe in dupes[:10]:\n", + " print(dupe)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100438\n" + ] + } + ], + "source": [ + "print(len(dupes))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "98153\n" + ] + } + ], + "source": [ + "print(len(nsid_groups))" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100436\n" + ] + } + ], + "source": [ + "fp_ims = glob('/data_store_hdd/datasets/people/ibm_dif/downloads/images/*.jpg')\n", + "print(len(fp_ims))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "9314013316\n" + ] + } + ], + "source": [ + "photo_ids = [Path(x).stem.split('_')[0] for x in fp_ims]\n", + "print(photo_ids[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'photo_id'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m--------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfilepath_photo_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'photo_id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmeta_flickr\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfilepath_photo_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'photo_id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmeta_flickr\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m: 'photo_id'" + ] + } + ], + "source": [ + "filepath_photo_ids = [int(x['nsid']) for x in meta_flickr]" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d7a9a78bf0e442a5b8445906bc85da99", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + 
"HBox(children=(IntProgress(value=0, max=100436), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# find which photo IDs are no longer accessible\n", + "missing_photo_ids = []\n", + "for photo_id in tqdm(photo_ids):\n", + " photo_id = int(photo_id)\n", + " if photo_id not in filepath_photo_ids:\n", + " missing_photo_ids.append(photo_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n", + "[]\n" + ] + } + ], + "source": [ + "print(len(missing_photo_ids))\n", + "print(missing_photo_ids[0:10])" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'df_flickr_meta' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m--------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtotal\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_flickr_meta\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'count'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtotal\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'df_flickr_meta' is not defined" + ] + } + ], + "source": [ + "total = df_flickr_meta['count'].sum()\n", + "print(total)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load ibm data and create 
count lookup with photoid\n", + "df_ibm_meta = pd.read_csv(fp_in_ibm_meta)\n", + "ibm_meta_records = df_ibm_meta.to_dict('records')\n", + "count_lookup = {}\n", + "for ibm_meta_record in ibm_meta_records:\n", + " photo_id = int(Path(ibm_meta_record['url']).stem.split('_')[0])\n", + " count_lookup[photo_id] = ibm_meta_record['count']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(count_lookup)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results = []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_flickr_meta = pd.read_csv(fp_in_flickr_meta, dtype={'count': int, 'username': str, 'sha256': str}).fillna('')\n", + "flickr_meta_records = df_flickr_meta.to_dict('records')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load flickr data\n", + "for flickr_meta_record in flickr_meta_records:\n", + " try:\n", + " nsid = flickr_meta_record['nsid']\n", + " photo_id = int(flickr_meta_record['photo_id'])\n", + " count = count_lookup[photo_id]\n", + " except Exception as e:\n", + " print(f'Error: {e}, {flickr_meta_record}')\n", + " continue\n", + " obj = {\n", + " 'photo_id': photo_id,\n", + " 'nsid': nsid,\n", + " 'count': count \n", + " }\n", + " results.append(obj)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_out = pd.DataFrame.from_dict(results)\n", + "df_out.to_csv(fp_out, index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create meta count file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# photo ids and nsids\n", + "fp_flickr_api_dump = 
'/data_store_hdd/datasets/people/ibm_dif/research/flickr_api_query_dump.csv'\n", + "\n", + "# file urls\n", + "fp_ibm_urls = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_urls.csv'\n", + "\n", + "# flickr meta\n", + "fp_out_filepaths = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_filepaths.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_flickr_meta = pd.read_csv(fp_flickr_api_dump)\n", + "df_flickr_meta.fillna('', inplace=True)\n", + "flickr_metas = df_flickr_meta.to_dict('records')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```\n", + "|filepath|nsid|photo_id|url|\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "photo_id_to_nsid = {}\n", + "for flickr_meta in flickr_metas:\n", + " photo_id = flickr_meta.get('photo_id')\n", + " if photo_id:\n", + " photo_id = str(int(photo_id))\n", + " photo_id_to_nsid[photo_id] = flickr_meta['nsid']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(list(photo_id_to_nsid.keys())[0:10])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_ibm_urls = pd.read_csv(fp_ibm_urls)\n", + "ibm_urls = df_ibm_urls.to_dict('records')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "photo_id_to_url = {}\n", + "missed = []\n", + "for ibm_url in ibm_urls:\n", + " photo_id = str(ibm_url['filepath'].split('_')[0])\n", + " try:\n", + " ibm_url['photo_id'] = photo_id\n", + " ibm_url['nsid'] = photo_id_to_nsid[photo_id]\n", + " except Exception as e:\n", + "# print(e, photo_id)\n", + " missed.append(photo_id)\n", + "print(f'missed: {len(missed)}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], 
+ "source": [ + "pd.DataFrame.from_dict(ibm_urls).to_csv(fp_out_filepaths, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "megapixels", + "language": "python", + "name": "megapixels" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} -- cgit v1.2.3-70-g09d2