diff options
Diffstat (limited to 'megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb')
| -rw-r--r-- | megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb | 243 |
1 files changed, 243 insertions, 0 deletions
# %% [markdown]
# # Prepare Flickr API Batch CSV

# %%
# %reload_ext autoreload
# %autoreload 2

import os
from os.path import join
from glob import glob, iglob
from pathlib import Path
from typing import Any, Dict, Iterable, List

import pandas as pd

try:
    from tqdm import tqdm_notebook as tqdm
except ImportError:
    # Progress bars are a convenience only; fall back to a pass-through so the
    # helpers below stay importable where tqdm is not installed.
    def tqdm(iterable=None, *args, **kwargs):
        return iterable

# %% [markdown]
# ## Create CSV for API

# %% [markdown]
# | photo_id |
# |:---|
# | 12234 |

# %%
# flickr api data
fp_in_flickr_meta = '/data_store_hdd/datasets/people/ibm_dif/research/flickr_api_query_dump.csv'
# ibm count data
fp_in_ibm_meta = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_metadata.csv'
# output
# NOTE(review): this is the same path as `fp_in_ibm_meta`, so writing the
# output replaces the input (columns `url`/`count`) with a different schema
# (`photo_id`/`nsid`/`count`). A second run then stops in `build_count_lookup`
# with an explicit error. Confirm the intended output path before re-running.
fp_out = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_metadata.csv'

# Column order is pinned so the CSV layout does not depend on the pandas
# version (older releases sorted dict keys alphabetically) and so an empty
# result still gets a header row.
API_CSV_COLUMNS = ['photo_id', 'nsid', 'count']


# %%
def build_count_lookup(df_ibm_meta: pd.DataFrame) -> Dict[int, Any]:
    """Build a ``photo_id -> count`` lookup from the IBM metadata frame.

    The photo id is parsed from each row's ``url``: the file-name stem is split
    on ``'_'`` and the first piece is converted with ``int``.

    Args:
        df_ibm_meta: frame with at least the columns ``url`` and ``count``.

    Returns:
        Dict keyed by integer photo id. If several rows share a photo id, the
        last row wins.

    Raises:
        ValueError: if ``url`` or ``count`` is missing from the frame, or if a
            url's leading file-name token is not an integer.
    """
    missing = {'url', 'count'} - set(df_ibm_meta.columns)
    if missing:
        raise ValueError(
            f'IBM metadata is missing column(s) {sorted(missing)}; '
            f'found {list(df_ibm_meta.columns)}. '
            'Was the input file overwritten by a previous run?'
        )
    urls = df_ibm_meta['url'].tolist()
    counts = df_ibm_meta['count'].tolist()
    return {int(Path(url).stem.split('_')[0]): count for url, count in zip(urls, counts)}


def join_flickr_counts(flickr_meta_records: Iterable[Dict[str, Any]],
                       count_lookup: Dict[int, Any]) -> List[Dict[str, Any]]:
    """Attach the IBM count to every Flickr API record that can be matched.

    A record is skipped, and reported on stdout, when it has no ``nsid`` or
    ``photo_id`` key, when ``photo_id`` cannot be converted to ``int`` (for
    example the empty string left by ``fillna('')``), or when the photo id is
    not in ``count_lookup``.

    Args:
        flickr_meta_records: dict records, e.g. ``DataFrame.to_dict('records')``.
        count_lookup: mapping produced by ``build_count_lookup``.

    Returns:
        A new list of ``{'photo_id', 'nsid', 'count'}`` dicts. Nothing is kept
        between calls, so re-running never duplicates rows.
    """
    rows = []
    for record in flickr_meta_records:
        try:
            nsid = record['nsid']
            photo_id = int(record['photo_id'])
            count = count_lookup[photo_id]
        except (KeyError, TypeError, ValueError) as e:
            # Only the expected "bad row" failures are skipped; anything else
            # should surface instead of being silently swallowed.
            print(f'Error: {e}, {record}')
            continue
        rows.append({'photo_id': photo_id, 'nsid': nsid, 'count': count})
    return rows


def count_photos_per_nsid(df_flickr: pd.DataFrame) -> pd.DataFrame:
    """Count the rows of ``df_flickr`` per ``nsid``.

    Rows whose ``nsid`` is missing are excluded (``groupby`` drops NaN keys by
    default).

    Returns:
        Frame with columns ``nsid`` and ``count``, ordered by ``nsid``.
    """
    return df_flickr.groupby('nsid').size().reset_index(name='count')


# %%
def write_api_batch_csv(fp_flickr: str, fp_ibm: str, fp_dst: str) -> pd.DataFrame:
    """Read both inputs, join them on photo id and write the batch CSV to ``fp_dst``."""
    count_lookup = build_count_lookup(pd.read_csv(fp_ibm))
    print(len(count_lookup))
    # The previous `count` / `sha256` dtype entries are dropped: the dump
    # records printed by earlier runs contain no such columns.
    # low_memory=False makes pandas infer each column's dtype from the whole
    # file, which avoids the mixed-type DtypeWarning this dump triggers.
    df_flickr_meta = pd.read_csv(fp_flickr, dtype={'username': str}, low_memory=False).fillna('')
    rows = join_flickr_counts(df_flickr_meta.to_dict('records'), count_lookup)
    df_out = pd.DataFrame(rows, columns=API_CSV_COLUMNS)
    df_out.to_csv(fp_dst, index=False)
    return df_out


# %% [markdown]
# ## Create meta count file

# %%
fp_flickr_api_dump = '/data_store_hdd/datasets/people/ibm_dif/research/flickr_api_query_dump.csv'
fp_out_meta = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_flickr_meta.csv'


# %%
def write_meta_counts(fp_dump: str, fp_dst: str) -> pd.DataFrame:
    """Write one row per ``nsid`` with its number of records in the dump."""
    # Only `nsid` is needed, so the other columns are not loaded at all.
    df_dump = pd.read_csv(fp_dump, usecols=['nsid'], dtype={'nsid': str})
    df_counts = count_photos_per_nsid(df_dump)
    df_counts.to_csv(fp_dst, index=False)
    return df_counts


# %%
# Guarded so that importing this file (for example from a test) does not touch
# the data store; inside a notebook kernel `__name__` is '__main__'.
if __name__ == '__main__':
    write_api_batch_csv(fp_in_flickr_meta, fp_in_ibm_meta, fp_out)
    write_meta_counts(fp_flickr_api_dump, fp_out_meta)
