diff options
Diffstat (limited to 'megapixels/notebooks/datasets/ibm_dif/images_per_embassy.ipynb')
| -rw-r--r-- | megapixels/notebooks/datasets/ibm_dif/images_per_embassy.ipynb | 479 |
1 files changed, 117 insertions, 362 deletions
diff --git a/megapixels/notebooks/datasets/ibm_dif/images_per_embassy.ipynb b/megapixels/notebooks/datasets/ibm_dif/images_per_embassy.ipynb index 4cd3a4fb..08b5afb6 100644 --- a/megapixels/notebooks/datasets/ibm_dif/images_per_embassy.ipynb +++ b/megapixels/notebooks/datasets/ibm_dif/images_per_embassy.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -27,30 +27,39 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "# list of embassy flickr image counts\n", "fp_in = '/data_store/datasets/msc/embassies/embassy_counts.csv'\n", + "fp_country_codes = '/data_store/datasets/msc/embassies/countries-20140629.csv'\n", "\n", "# summary file\n", - "fp_out = '/data_store/datasets/msc/embassies/embassy_counts_summary.csv'" + "fp_out_location = '/data_store/datasets/msc/embassies/embassy_counts_summary.csv'\n", + "fp_out_dataset = '/data_store/datasets/msc/embassies/embassy_counts_summary_dataset.csv'" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "df_counts = pd.read_csv(fp_in)\n", - "records_counts = df_counts.to_dict('records')" + "records_counts = df_counts.to_dict('records')\n", + "\n", + "df_country_codes = pd.read_csv(fp_country_codes, encoding = \"ISO-8859-1\")\n", + "records_country_codes = df_country_codes.to_dict('records')\n", + "# convert to easy dict lookup\n", + "cc_lookup = {}\n", + "for record_country_codes in records_country_codes:\n", + " cc_lookup[record_country_codes['Code']] = record_country_codes['English Name']" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 55, "metadata": {}, "outputs": [ { @@ -67,7 +76,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ @@ -76,7 +85,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 57, "metadata": {}, "outputs": [ { @@ -93,60 +102,6 @@ }, { "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [], - "source": [ - "# drop epmty NSIDs\n", - "df_meta_filepaths.drop_duplicates(subset='nsid', inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": {}, - "outputs": [], - "source": [ - "df_meta_filepaths.to_csv(fp_meta_filepaths_adj, index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [], - "source": [ - "nsid_filepaths = {}\n", - "dupes = []\n", - "for meta_filepath in meta_filepaths:\n", - " nsid = meta_filepath['nsid']\n", - " if nsid not in nsid_filepaths.keys():\n", - " nsid_filepaths[nsid] = meta_filepath\n", - " else:\n", - " dupes.append(meta_filepath)" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "98154\n", - "2284\n" - ] - } - ], - "source": [ - "print(len(nsid_filepaths))\n", - "print(len(dupes))" - ] - }, - { - "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ @@ -154,371 +109,171 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'filepath': '12537662393_247b2187ee.jpg', 'nsid': nan, 'photo_id': 12537662393, 'url': 'http://farm6.staticflickr.com/5476/12537662393_247b2187ee.jpg'}\n", - "{'filepath': '5837222502_29aaf5bb53.jpg', 'nsid': nan, 'photo_id': 5837222502, 'url': 'http://farm4.staticflickr.com/3089/5837222502_29aaf5bb53.jpg'}\n", - "{'filepath': '10859466623_4ceb1564dc.jpg', 'nsid': nan, 'photo_id': 10859466623, 'url': 'http://farm6.staticflickr.com/5530/10859466623_4ceb1564dc.jpg'}\n", - "{'filepath': '13719567455_fb96dc7ac6.jpg', 'nsid': nan, 'photo_id': 13719567455, 'url': 'http://farm4.staticflickr.com/3718/13719567455_fb96dc7ac6.jpg'}\n", - "{'filepath': '3486554266_ca1fc7d99c.jpg', 'nsid': nan, 'photo_id': 3486554266, 'url': 'http://farm4.staticflickr.com/3327/3486554266_ca1fc7d99c.jpg'}\n", - "{'filepath': '6168324261_d2fb7bbb60.jpg', 'nsid': nan, 'photo_id': 6168324261, 'url': 'http://farm7.staticflickr.com/6166/6168324261_d2fb7bbb60.jpg'}\n", - "{'filepath': '13938295982_0d950feba5.jpg', 'nsid': nan, 'photo_id': 13938295982, 'url': 'http://farm8.staticflickr.com/7162/13938295982_0d950feba5.jpg'}\n", - "{'filepath': '8881073633_546b6dbfe5.jpg', 'nsid': nan, 'photo_id': 8881073633, 'url': 'http://farm6.staticflickr.com/5459/8881073633_546b6dbfe5.jpg'}\n", - "{'filepath': '10918515734_404eb29879.jpg', 'nsid': nan, 'photo_id': 10918515734, 'url': 'http://farm6.staticflickr.com/5502/10918515734_404eb29879.jpg'}\n", - "{'filepath': '3236533532_05cacef8e9.jpg', 'nsid': nan, 'photo_id': 3236533532, 'url': 'http://farm4.staticflickr.com/3425/3236533532_05cacef8e9.jpg'}\n" + "EC, 2\n", + "FI, 2\n", + "FR, 52\n", + "GB, 995\n", + "IT, 521\n", + "NO, 2\n", + "SE, 1\n", + "US, 6866\n" ] } ], "source": [ - "for dupe in dupes[:10]:\n", - " print(dupe)" + "country_summaries = []\n", + "for cc, df in country_groups:\n", + " print(f'{cc}, {df[\"count\"].sum()}')\n", + " country = cc_lookup.get(cc)\n", + " country_summaries.append({'cc': cc, 'country': country, 'images': df['count'].sum()})" ] }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 59, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "100438\n" - ] - } - ], + "outputs": [], "source": [ - "print(len(dupes))" + "df_summaries = pd.DataFrame.from_dict(country_summaries)\n", + "df_summaries.to_csv(fp_out_location, index=False)" ] }, { - "cell_type": "code", - "execution_count": 8, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "98153\n" - ] - } - ], "source": [ - "print(len(nsid_groups))" + "## Get CSV Dataset group" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 60, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "100436\n" - ] - } - ], + "outputs": [], "source": [ - "fp_ims = glob('/data_store_hdd/datasets/people/ibm_dif/downloads/images/*.jpg')\n", - "print(len(fp_ims))" + "dataset_groups = df_counts.groupby('dataset_key')" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 61, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "9314013316\n" - ] - } - ], - "source": [ - "photo_ids = [Path(x).stem.split('_')[0] for x in fp_ims]\n", - "print(photo_ids[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "'photo_id'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m--------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m<ipython-input-45-fd2de6074950>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfilepath_photo_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'photo_id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmeta_flickr\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m<ipython-input-45-fd2de6074950>\u001b[0m in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfilepath_photo_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'photo_id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmeta_flickr\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m: 'photo_id'" + "ibm_dif, 389\n", + "megaface, 5679\n", + "vgg_face, 1\n", + "who_goes_there, 2372\n" ] } ], "source": [ - "filepath_photo_ids = [int(x['nsid']) for x in meta_flickr]" + "summary = []\n", + "for dataset_name, df in dataset_groups:\n", + " print(f'{dataset_name}, {df[\"count\"].sum()}')\n", + " summary.append({'dataset': dataset_name, 'images': df['count'].sum()})\n", + " \n", + "df = pd.DataFrame.from_dict(summary)\n", + "df.to_csv(fp_out_dataset, index=False)" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 62, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d7a9a78bf0e442a5b8445906bc85da99", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>dataset</th>\n", + " <th>images</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>ibm_dif</td>\n", + " <td>389</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>megaface</td>\n", + " <td>5679</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>vgg_face</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>who_goes_there</td>\n", + " <td>2372</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], "text/plain": [ - "HBox(children=(IntProgress(value=0, max=100436), HTML(value='')))" + " dataset images\n", + "0 ibm_dif 389\n", + "1 megaface 5679\n", + "2 vgg_face 1\n", + "3 who_goes_there 2372" ] }, + "execution_count": 62, "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] + "output_type": "execute_result" } ], "source": [ - "# find which photo IDs are no longer accessible\n", - "missing_photo_ids = []\n", - "for photo_id in tqdm(photo_ids):\n", - " photo_id = int(photo_id)\n", - " if photo_id not in filepath_photo_ids:\n", - " missing_photo_ids.append(photo_id)" + "df.head()" ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0\n", - "[]\n" - ] - } - ], - "source": [ - "print(len(missing_photo_ids))\n", - "print(missing_photo_ids[0:10])" - ] + "outputs": [], + "source": [] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 63, "metadata": {}, "outputs": [ { - "ename": "NameError", - "evalue": "name 'df_flickr_meta' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m--------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m<ipython-input-30-75e9fdbbbfbb>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtotal\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_flickr_meta\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'count'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtotal\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mNameError\u001b[0m: name 'df_flickr_meta' is not defined" + "name": "stdout", + "output_type": "stream", + "text": [ + "/data_store/datasets/msc/embassies/embassy_counts_summary_dataset.csv\n" ] } ], "source": [ - "total = df_flickr_meta['count'].sum()\n", - "print(total)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# load ibm data and create count lookup with photoid\n", - "df_ibm_meta = pd.read_csv(fp_in_ibm_meta)\n", - "ibm_meta_records = df_ibm_meta.to_dict('records')\n", - "count_lookup = {}\n", - "for ibm_meta_record in ibm_meta_records:\n", - " photo_id = int(Path(ibm_meta_record['url']).stem.split('_')[0])\n", - " count_lookup[photo_id] = ibm_meta_record['count']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "len(count_lookup)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results = []" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_flickr_meta = pd.read_csv(fp_in_flickr_meta, dtype={'count': int, 'username': str, 'sha256': str}).fillna('')\n", - "flickr_meta_records = df_flickr_meta.to_dict('records')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# load flickr data\n", - "for flickr_meta_record in flickr_meta_records:\n", - " try:\n", - " nsid = flickr_meta_record['nsid']\n", - " photo_id = int(flickr_meta_record['photo_id'])\n", - " count = count_lookup[photo_id]\n", - " except Exception as e:\n", - " print(f'Error: {e}, {flickr_meta_record}')\n", - " continue\n", - " obj = {\n", - " 'photo_id': photo_id,\n", - " 'nsid': nsid,\n", - " 'count': count \n", - " }\n", - " results.append(obj)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_out = pd.DataFrame.from_dict(results)\n", - "df_out.to_csv(fp_out, index=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create meta count file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# photo ids and nsids\n", - "fp_flickr_api_dump = '/data_store_hdd/datasets/people/ibm_dif/research/flickr_api_query_dump.csv'\n", - "\n", - "# file urls\n", - "fp_ibm_urls = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_urls.csv'\n", - "\n", - "# flickr meta\n", - "fp_out_filepaths = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_filepaths.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_flickr_meta = pd.read_csv(fp_flickr_api_dump)\n", - "df_flickr_meta.fillna('', inplace=True)\n", - "flickr_metas = df_flickr_meta.to_dict('records')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```\n", - "|filepath|nsid|photo_id|url|\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "photo_id_to_nsid = {}\n", - "for flickr_meta in flickr_metas:\n", - " photo_id = flickr_meta.get('photo_id')\n", - " if photo_id:\n", - " photo_id = str(int(photo_id))\n", - " photo_id_to_nsid[photo_id] = flickr_meta['nsid']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(list(photo_id_to_nsid.keys())[0:10])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_ibm_urls = pd.read_csv(fp_ibm_urls)\n", - "ibm_urls = df_ibm_urls.to_dict('records')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "photo_id_to_url = {}\n", - "missed = []\n", - "for ibm_url in ibm_urls:\n", - " photo_id = str(ibm_url['filepath'].split('_')[0])\n", - " try:\n", - " ibm_url['photo_id'] = photo_id\n", - " ibm_url['nsid'] = photo_id_to_nsid[photo_id]\n", - " except Exception as e:\n", - "# print(e, photo_id)\n", - " missed.append(photo_id)\n", - "print(f'missed: {len(missed)}')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pd.DataFrame.from_dict(ibm_urls).to_csv(fp_out_filepaths, index=False)" + "print(fp_out_dataset)" ] }, { |
