summaryrefslogtreecommitdiff
path: root/megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb')
-rw-r--r--megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb444
1 files changed, 381 insertions, 63 deletions
diff --git a/megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb b/megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb
index ff41e799..6d2b768a 100644
--- a/megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb
+++ b/megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb
@@ -29,70 +29,353 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Create CSV for API"
+ "## IBM DiF clean CSVs\n",
+ "\n",
+ "- 2283 files could not be downloaded or accessed in the API\n",
+ "- these images were downloaded, but possibly no longer exist"
]
},
{
- "cell_type": "markdown",
+ "cell_type": "code",
+ "execution_count": 60,
"metadata": {},
+ "outputs": [],
+ "source": [
+ "# flickr api data\n",
+ "fp_in_meta_flickr = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_meta_flickr.csv'\n",
+ "\n",
+ "# api query dump\n",
+ "fp_in_flickr_api = '/data_store_hdd/datasets/people/ibm_dif/research/flickr_api_query_dump.csv'\n",
+ "\n",
+ "# ibm count data\n",
+ "fp_in_meta_filepaths = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_meta_filepaths.csv'\n",
+ "fp_meta_filepaths_adj = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_meta_filepaths_adj.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/adam/anaconda3/envs/megapixels/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3020: DtypeWarning: Columns (2,3,10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+ " interactivity=interactivity, compiler=compiler, result=result)\n"
+ ]
+ }
+ ],
"source": [
- "| photo_id |\n",
- "|:---|\n",
- "| 12234 |"
+ "# load the metadata CSVs; the record lists are kept for per-row lookups\n",
+ "df_meta_filepaths = pd.read_csv(fp_in_meta_filepaths)\n",
+ "meta_filepaths = df_meta_filepaths.to_dict('records')\n",
+ "df_meta_flickr = pd.read_csv(fp_in_meta_flickr)\n",
+ "meta_flickr = df_meta_flickr.to_dict('records')\n",
+ "# low_memory=False parses the file in one pass, which avoids the mixed-type\n",
+ "# DtypeWarning that reading the API dump in chunks produces\n",
+ "df_flickr_api_dump = pd.read_csv(fp_in_flickr_api, low_memory=False)"
]
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "98155\n",
+ "98155\n",
+ "98153\n",
+ "100438\n",
+ "98154\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(df_flickr_api_dump))\n",
+ "print(len(df_flickr_api_dump.drop_duplicates(subset='nsid')))\n",
+ "print(len(df_meta_flickr))\n",
+ "print(len(df_meta_filepaths))\n",
+ "print(len(df_meta_filepaths.drop_duplicates(subset='nsid')))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
- "# flickr api data\n",
- "fp_in_flickr_meta = '/data_store_hdd/datasets/people/ibm_dif/research/flickr_api_query_dump.csv'\n",
- "# ibm count data\n",
- "fp_in_ibm_meta = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_metadata.csv'\n",
- "# output\n",
- "fp_out = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_metadata.csv'"
+ "# drop rows with duplicate NSIDs, keeping the first of each (this also collapses the empty/NaN NSIDs to one row)\n",
+ "df_meta_filepaths.drop_duplicates(subset='nsid', inplace=True)"
]
},
{
"cell_type": "code",
- "execution_count": 68,
+ "execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
- "# load ibm data and create count lookup with photoid\n",
- "df_ibm_meta = pd.read_csv(fp_in_ibm_meta)\n",
- "ibm_meta_records = df_ibm_meta.to_dict('records')\n",
- "count_lookup = {}\n",
- "for ibm_meta_record in ibm_meta_records:\n",
- " photo_id = int(Path(ibm_meta_record['url']).stem.split('_')[0])\n",
- " count_lookup[photo_id] = ibm_meta_record['count']"
+ "df_meta_filepaths.to_csv(fp_meta_filepaths_adj, index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# keep the first record seen for each NSID; collect every later repeat in dupes\n",
+ "nsid_filepaths = {}\n",
+ "dupes = []\n",
+ "for meta_filepath in meta_filepaths:\n",
+ "    nsid = meta_filepath['nsid']\n",
+ "    if nsid in nsid_filepaths:\n",
+ "        dupes.append(meta_filepath)\n",
+ "    else:\n",
+ "        nsid_filepaths[nsid] = meta_filepath"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "98154\n",
+ "2284\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(nsid_filepaths))\n",
+ "print(len(dupes))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'filepath': '12537662393_247b2187ee.jpg', 'nsid': nan, 'photo_id': 12537662393, 'url': 'http://farm6.staticflickr.com/5476/12537662393_247b2187ee.jpg'}\n",
+ "{'filepath': '5837222502_29aaf5bb53.jpg', 'nsid': nan, 'photo_id': 5837222502, 'url': 'http://farm4.staticflickr.com/3089/5837222502_29aaf5bb53.jpg'}\n",
+ "{'filepath': '10859466623_4ceb1564dc.jpg', 'nsid': nan, 'photo_id': 10859466623, 'url': 'http://farm6.staticflickr.com/5530/10859466623_4ceb1564dc.jpg'}\n",
+ "{'filepath': '13719567455_fb96dc7ac6.jpg', 'nsid': nan, 'photo_id': 13719567455, 'url': 'http://farm4.staticflickr.com/3718/13719567455_fb96dc7ac6.jpg'}\n",
+ "{'filepath': '3486554266_ca1fc7d99c.jpg', 'nsid': nan, 'photo_id': 3486554266, 'url': 'http://farm4.staticflickr.com/3327/3486554266_ca1fc7d99c.jpg'}\n",
+ "{'filepath': '6168324261_d2fb7bbb60.jpg', 'nsid': nan, 'photo_id': 6168324261, 'url': 'http://farm7.staticflickr.com/6166/6168324261_d2fb7bbb60.jpg'}\n",
+ "{'filepath': '13938295982_0d950feba5.jpg', 'nsid': nan, 'photo_id': 13938295982, 'url': 'http://farm8.staticflickr.com/7162/13938295982_0d950feba5.jpg'}\n",
+ "{'filepath': '8881073633_546b6dbfe5.jpg', 'nsid': nan, 'photo_id': 8881073633, 'url': 'http://farm6.staticflickr.com/5459/8881073633_546b6dbfe5.jpg'}\n",
+ "{'filepath': '10918515734_404eb29879.jpg', 'nsid': nan, 'photo_id': 10918515734, 'url': 'http://farm6.staticflickr.com/5502/10918515734_404eb29879.jpg'}\n",
+ "{'filepath': '3236533532_05cacef8e9.jpg', 'nsid': nan, 'photo_id': 3236533532, 'url': 'http://farm4.staticflickr.com/3425/3236533532_05cacef8e9.jpg'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "for dupe in dupes[:10]:\n",
+ " print(dupe)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "100438\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(dupes))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "98153\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(nsid_groups))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "100436\n"
+ ]
+ }
+ ],
+ "source": [
+ "fp_ims = glob('/data_store_hdd/datasets/people/ibm_dif/downloads/images/*.jpg')\n",
+ "print(len(fp_ims))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "9314013316\n"
+ ]
+ }
+ ],
+ "source": [
+ "photo_ids = [Path(x).stem.split('_')[0] for x in fp_ims]\n",
+ "print(photo_ids[0])"
]
},
{
"cell_type": "code",
- "execution_count": 69,
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "KeyError",
+ "evalue": "'photo_id'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m--------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m<ipython-input-45-fd2de6074950>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfilepath_photo_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'photo_id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmeta_flickr\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;32m<ipython-input-45-fd2de6074950>\u001b[0m in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfilepath_photo_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'photo_id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmeta_flickr\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;31mKeyError\u001b[0m: 'photo_id'"
+ ]
+ }
+ ],
+ "source": [
+ "# photo IDs listed in the filepaths CSV, whose records carry a 'photo_id' field.\n",
+ "# int() on 'nsid' cannot work (NaN / non-numeric IDs), and meta_flickr has no 'photo_id' key.\n",
+ "# NOTE(review): source frame inferred from the variable name -- confirm.\n",
+ "# Built as a set so later membership tests are constant time.\n",
+ "filepath_photo_ids = {int(x['photo_id']) for x in meta_filepaths}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d7a9a78bf0e442a5b8445906bc85da99",
+ "version_major": 2,
+ "version_minor": 0
+ },
"text/plain": [
- "100438"
+ "HBox(children=(IntProgress(value=0, max=100436), HTML(value='')))"
]
},
- "execution_count": 69,
"metadata": {},
- "output_type": "execute_result"
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# find which photo IDs are no longer accessible\n",
+ "# convert once to a set: testing membership against a list rescans it for every image\n",
+ "known_photo_ids = set(filepath_photo_ids)\n",
+ "missing_photo_ids = []\n",
+ "for photo_id in tqdm(photo_ids):\n",
+ "    # photo_ids holds the filename stems as strings; compare as ints\n",
+ "    photo_id = int(photo_id)\n",
+ "    if photo_id not in known_photo_ids:\n",
+ "        missing_photo_ids.append(photo_id)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0\n",
+ "[]\n"
+ ]
}
],
"source": [
+ "print(len(missing_photo_ids))\n",
+ "print(missing_photo_ids[0:10])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "NameError",
+ "evalue": "name 'df_flickr_meta' is not defined",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m--------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m<ipython-input-30-75e9fdbbbfbb>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtotal\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_flickr_meta\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'count'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtotal\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mNameError\u001b[0m: name 'df_flickr_meta' is not defined"
+ ]
+ }
+ ],
+ "source": [
+ "# use df_meta_flickr (loaded from fp_in_meta_flickr); 'df_flickr_meta' is not defined until a later cell\n",
+ "# NOTE(review): assumes ibm_dif_meta_flickr.csv has a 'count' column -- confirm\n",
+ "total = df_meta_flickr['count'].sum()\n",
+ "print(total)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# load ibm data and create count lookup with photoid\n",
+ "df_ibm_meta = pd.read_csv(fp_in_ibm_meta)\n",
+ "ibm_meta_records = df_ibm_meta.to_dict('records')\n",
+ "count_lookup = {}\n",
+ "for ibm_meta_record in ibm_meta_records:\n",
+ " photo_id = int(Path(ibm_meta_record['url']).stem.split('_')[0])\n",
+ " count_lookup[photo_id] = ibm_meta_record['count']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
"len(count_lookup)"
]
},
{
"cell_type": "code",
- "execution_count": 70,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -101,7 +384,7 @@
},
{
"cell_type": "code",
- "execution_count": 71,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -111,18 +394,9 @@
},
{
"cell_type": "code",
- "execution_count": 74,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Error: invalid literal for int() with base 10: '', {'country': '', 'description': 'Haircut Next...', 'lat': '', 'lon': '', 'nsid': '', 'owner_location': '', 'path_alias': '', 'photo_id': '', 'place': '', 'place_id': '', 'posted': '', 'realname': '', 'taken': '', 'username': '', 'woeid': ''}\n",
- "Error: invalid literal for int() with base 10: '', {'country': '', 'description': '', 'lat': '86085317@N00', 'lon': 'New York', 'nsid': 'anonymousthomas', 'owner_location': '4975598', 'path_alias': '', 'photo_id': '', 'place': '1108685469', 'place_id': 'Thomas', 'posted': '2005-02-18 00:11:09', 'realname': 'anonymousthomas', 'taken': '', 'username': '', 'woeid': ''}\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# load flickr data\n",
"for flickr_meta_record in flickr_meta_records:\n",
@@ -143,7 +417,7 @@
},
{
"cell_type": "code",
- "execution_count": 75,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -160,55 +434,99 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
+ "# photo ids and nsids\n",
"fp_flickr_api_dump = '/data_store_hdd/datasets/people/ibm_dif/research/flickr_api_query_dump.csv'\n",
- "fp_out_meta = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_flickr_meta.csv'"
+ "\n",
+ "# file urls\n",
+ "fp_ibm_urls = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_urls.csv'\n",
+ "\n",
+ "# output: IBM URL records annotated with photo_id and nsid\n",
+ "fp_out_filepaths = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_filepaths.csv'"
]
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/home/adam/anaconda3/envs/megapixels/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3020: DtypeWarning: Columns (2,3,10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
- " interactivity=interactivity, compiler=compiler, result=result)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "df = pd.read_csv(fp_flickr_api_dump)\n",
- "groups = df.groupby('nsid')"
+ "df_flickr_meta = pd.read_csv(fp_flickr_api_dump)\n",
+ "df_flickr_meta.fillna('', inplace=True)\n",
+ "flickr_metas = df_flickr_meta.to_dict('records')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "```\n",
+ "|filepath|nsid|photo_id|url|\n",
+ "```"
]
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "results = []\n",
- "for nsid, group in groups:\n",
- " obj = {\n",
- " 'nsid': nsid,\n",
- " 'count': len(group)\n",
- " }\n",
- " results.append(obj)"
+ "photo_id_to_nsid = {}\n",
+ "for flickr_meta in flickr_metas:\n",
+ " photo_id = flickr_meta.get('photo_id')\n",
+ " if photo_id:\n",
+ " photo_id = str(int(photo_id))\n",
+ " photo_id_to_nsid[photo_id] = flickr_meta['nsid']"
]
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(list(photo_id_to_nsid.keys())[0:10])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_ibm_urls = pd.read_csv(fp_ibm_urls)\n",
+ "ibm_urls = df_ibm_urls.to_dict('records')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# annotate every IBM URL record with its photo ID and, when known, the owner's NSID\n",
+ "photo_id_to_url = {}\n",
+ "missed = []\n",
+ "for ibm_url in ibm_urls:\n",
+ "    # the photo ID is the part of the filename before the first underscore\n",
+ "    photo_id = ibm_url['filepath'].split('_')[0]\n",
+ "    ibm_url['photo_id'] = photo_id\n",
+ "    if photo_id in photo_id_to_nsid:\n",
+ "        ibm_url['nsid'] = photo_id_to_nsid[photo_id]\n",
+ "    else:\n",
+ "        # no NSID known for this photo ID: leave 'nsid' unset and count it as missed\n",
+ "        missed.append(photo_id)\n",
+ "print(f'missed: {len(missed)}')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "pd.DataFrame.from_dict(results).to_csv(fp_out_meta, index=False)"
+ "pd.DataFrame.from_dict(ibm_urls).to_csv(fp_out_filepaths, index=False)"
]
},
{