summaryrefslogtreecommitdiff
path: root/megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb')
-rw-r--r--megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb243
1 files changed, 243 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb b/megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb
new file mode 100644
index 00000000..ff41e799
--- /dev/null
+++ b/megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb
@@ -0,0 +1,243 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Prepare Flickr API Batch CSV"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%reload_ext autoreload\n",
+ "%autoreload 2\n",
+ "\n",
+ "import os\n",
+ "from os.path import join\n",
+ "from glob import glob, iglob\n",
+ "from pathlib import Path\n",
+ "from tqdm import tqdm_notebook as tqdm\n",
+ "\n",
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Create CSV for API"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "| photo_id |\n",
+ "|:---|\n",
+ "| 12234 |"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# flickr api data\n",
+ "fp_in_flickr_meta = '/data_store_hdd/datasets/people/ibm_dif/research/flickr_api_query_dump.csv'\n",
+ "# ibm count data\n",
+ "fp_in_ibm_meta = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_metadata.csv'\n",
+ "# output\n",
+ "fp_out = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_metadata.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# read the ibm metadata and map each record's integer photo id to its count;\n",
+ "# the id is the part of the url's file stem that precedes the first underscore\n",
+ "df_ibm_meta = pd.read_csv(fp_in_ibm_meta)\n",
+ "ibm_meta_records = df_ibm_meta.to_dict('records')\n",
+ "count_lookup = {\n",
+ "    int(Path(rec['url']).stem.split('_')[0]): rec['count']\n",
+ "    for rec in ibm_meta_records\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "100438"
+ ]
+ },
+ "execution_count": 69,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(count_lookup)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "results = []"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# read the flickr api dump with the requested column dtypes, replace missing\n",
+ "# values with empty strings, then convert the rows to a list of dicts\n",
+ "flickr_dtypes = {'count': int, 'username': str, 'sha256': str}\n",
+ "df_flickr_meta = (\n",
+ "    pd.read_csv(fp_in_flickr_meta, dtype=flickr_dtypes)\n",
+ "    .fillna('')\n",
+ ")\n",
+ "flickr_meta_records = df_flickr_meta.to_dict(orient='records')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Error: invalid literal for int() with base 10: '', {'country': '', 'description': 'Haircut Next...', 'lat': '', 'lon': '', 'nsid': '', 'owner_location': '', 'path_alias': '', 'photo_id': '', 'place': '', 'place_id': '', 'posted': '', 'realname': '', 'taken': '', 'username': '', 'woeid': ''}\n",
+ "Error: invalid literal for int() with base 10: '', {'country': '', 'description': '', 'lat': '86085317@N00', 'lon': 'New York', 'nsid': 'anonymousthomas', 'owner_location': '4975598', 'path_alias': '', 'photo_id': '', 'place': '1108685469', 'place_id': 'Thomas', 'posted': '2005-02-18 00:11:09', 'realname': 'anonymousthomas', 'taken': '', 'username': '', 'woeid': ''}\n"
+ ]
+ }
+ ],
+ "source": [
+ "# join each flickr record with its ibm count via the integer photo id\n",
+ "# start from an empty list so re-running this cell does not append duplicate rows\n",
+ "results = []\n",
+ "for flickr_meta_record in flickr_meta_records:\n",
+ "    try:\n",
+ "        nsid = flickr_meta_record['nsid']\n",
+ "        photo_id = int(flickr_meta_record['photo_id'])\n",
+ "        count = count_lookup[photo_id]\n",
+ "    except (KeyError, TypeError, ValueError) as e:\n",
+ "        # blank or non-numeric photo_id, or an id missing from the ibm lookup:\n",
+ "        # report the record and skip it; other errors (e.g. an undefined name\n",
+ "        # on a fresh kernel) are left to surface instead of being swallowed\n",
+ "        print(f'Error: {e}, {flickr_meta_record}')\n",
+ "        continue\n",
+ "    results.append({'photo_id': photo_id, 'nsid': nsid, 'count': count})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 75,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# build a frame from the collected rows and save it as csv without the index column\n",
+ "df_out = pd.DataFrame(results)\n",
+ "df_out.to_csv(path_or_buf=fp_out, index=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Create meta count file (number of rows per nsid)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fp_flickr_api_dump = '/data_store_hdd/datasets/people/ibm_dif/research/flickr_api_query_dump.csv'\n",
+ "fp_out_meta = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_flickr_meta.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/adam/anaconda3/envs/megapixels/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3020: DtypeWarning: Columns (2,3,10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+ " interactivity=interactivity, compiler=compiler, result=result)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# parse the whole dump in one pass (low_memory=False) so each column gets a single\n",
+ "# inferred dtype instead of triggering a DtypeWarning for mixed-type columns\n",
+ "df = pd.read_csv(fp_flickr_api_dump, low_memory=False)\n",
+ "# group the rows by their 'nsid' value\n",
+ "groups = df.groupby('nsid')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# one output row per nsid, holding the number of rows in that nsid's group\n",
+ "results = [\n",
+ "    {'nsid': nsid, 'count': len(group)}\n",
+ "    for nsid, group in groups\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# save the per-nsid counts as csv without the index column\n",
+ "df_meta_counts = pd.DataFrame(results)\n",
+ "df_meta_counts.to_csv(fp_out_meta, index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "megapixels",
+ "language": "python",
+ "name": "megapixels"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}