{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Who Goes There Prepare Flickr API" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "%reload_ext autoreload\n", "%autoreload 2\n", "\n", "import os\n", "from os.path import join\n", "from glob import glob, iglob\n", "from pathlib import Path\n", "from tqdm import tqdm_notebook as tqdm\n", "\n", "import h5py\n", "from scipy import misc\n", "from io import BytesIO\n", "from base64 import b64decode\n", "\n", "from PIL import Image, ImageDraw\n", "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "import scipy.io as sio\n", "import h5py\n", "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create filepaths CSV for individual lookup" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [], "source": [ "fp_flickr_meta = '/data_store/datasets/people/who_goes_there/research/wgt_flickr_queries.csv'\n", "fp_filepaths = '/data_store/datasets/people/who_goes_there/research/who_goes_there_filepaths.csv'" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [], "source": [ "df_flickr_meta = pd.read_csv(fp_flickr_meta)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Help on function drop in module pandas.core.frame:\n", "\n", "drop(self, labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')\n", " Drop specified labels from rows or columns.\n", " \n", " Remove rows or columns by specifying label names and corresponding\n", " axis, or by specifying directly index or column names. 
When using a\n", " multi-index, labels on different levels can be removed by specifying\n", " the level.\n", " \n", " Parameters\n", " ----------\n", " labels : single label or list-like\n", " Index or column labels to drop.\n", " axis : {0 or 'index', 1 or 'columns'}, default 0\n", " Whether to drop labels from the index (0 or 'index') or\n", " columns (1 or 'columns').\n", " index, columns : single label or list-like\n", " Alternative to specifying axis (``labels, axis=1``\n", " is equivalent to ``columns=labels``).\n", " \n", " .. versionadded:: 0.21.0\n", " level : int or level name, optional\n", " For MultiIndex, level from which the labels will be removed.\n", " inplace : bool, default False\n", " If True, do operation inplace and return None.\n", " errors : {'ignore', 'raise'}, default 'raise'\n", " If 'ignore', suppress error and only existing labels are\n", " dropped.\n", " \n", " Returns\n", " -------\n", " dropped : pandas.DataFrame\n", " \n", " Raises\n", " ------\n", " KeyError\n", " If none of the labels are found in the selected axis\n", " \n", " See Also\n", " --------\n", " DataFrame.loc : Label-location based indexer for selection by label.\n", " DataFrame.dropna : Return DataFrame with labels on given axis omitted\n", " where (all or any) data are missing.\n", " DataFrame.drop_duplicates : Return DataFrame with duplicate rows\n", " removed, optionally only considering certain columns.\n", " Series.drop : Return Series with specified index labels removed.\n", " \n", " Examples\n", " --------\n", " >>> df = pd.DataFrame(np.arange(12).reshape(3,4),\n", " ... 
columns=['A', 'B', 'C', 'D'])\n", " >>> df\n", " A B C D\n", " 0 0 1 2 3\n", " 1 4 5 6 7\n", " 2 8 9 10 11\n", " \n", " Drop columns\n", " \n", " >>> df.drop(['B', 'C'], axis=1)\n", " A D\n", " 0 0 3\n", " 1 4 7\n", " 2 8 11\n", " \n", " >>> df.drop(columns=['B', 'C'])\n", " A D\n", " 0 0 3\n", " 1 4 7\n", " 2 8 11\n", " \n", " Drop a row by index\n", " \n", " >>> df.drop([0, 1])\n", " A B C D\n", " 2 8 9 10 11\n", " \n", " Drop columns and/or rows of MultiIndex DataFrame\n", " \n", " >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],\n", " ... ['speed', 'weight', 'length']],\n", " ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],\n", " ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])\n", " >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],\n", " ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20],\n", " ... [250, 150], [1.5, 0.8], [320, 250],\n", " ... [1, 0.8], [0.3,0.2]])\n", " >>> df\n", " big small\n", " lama speed 45.0 30.0\n", " weight 200.0 100.0\n", " length 1.5 1.0\n", " cow speed 30.0 20.0\n", " weight 250.0 150.0\n", " length 1.5 0.8\n", " falcon speed 320.0 250.0\n", " weight 1.0 0.8\n", " length 0.3 0.2\n", " \n", " >>> df.drop(index='cow', columns='small')\n", " big\n", " lama speed 45.0\n", " weight 200.0\n", " length 1.5\n", " falcon speed 320.0\n", " weight 1.0\n", " length 0.3\n", " \n", " >>> df.drop(index='length', level=1)\n", " big small\n", " lama speed 45.0 30.0\n", " weight 200.0 100.0\n", " cow speed 30.0 20.0\n", " weight 250.0 150.0\n", " falcon speed 320.0 250.0\n", " weight 1.0 0.8\n", "\n" ] } ], "source": [ "help(pd.DataFrame.drop)" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['nickname', 'nsid', 'photo_id', 'url'], dtype='object')" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['nsid', 
'photo_id', 'url'], dtype='object')\n" ] } ], "source": [ "# Non-in-place drop with errors='ignore' keeps this cell re-runnable:\n", "# running it twice (or on a frame without 'subdir') no longer raises KeyError.\n", "df_flickr_meta = df_flickr_meta.drop(columns=['subdir'], errors='ignore')\n", "print(df_flickr_meta.keys())" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "# Add an empty 'filepath' column before the frame is written to fp_filepaths.\n", "df_flickr_meta['filepath'] = ''" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "# df_flickr_meta is already a DataFrame, so write it directly (no from_dict round-trip).\n", "df_flickr_meta.to_csv(fp_filepaths, index=False)" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nsidphoto_idurlfilepath
051576145@N024762068863http://farm5.staticflickr.com/4117/4762068863_...
129689383@N025711730606http://farm3.staticflickr.com/2800/5711730606_...
229689383@N025711730606http://farm3.staticflickr.com/2800/5711730606_...
327982139@N002439203939http://farm3.staticflickr.com/2105/2439203939_...
427982139@N002464402099http://farm4.staticflickr.com/3030/2464402099_...
\n", "
" ], "text/plain": [ " nsid photo_id \\\n", "0 51576145@N02 4762068863 \n", "1 29689383@N02 5711730606 \n", "2 29689383@N02 5711730606 \n", "3 27982139@N00 2439203939 \n", "4 27982139@N00 2464402099 \n", "\n", " url filepath \n", "0 http://farm5.staticflickr.com/4117/4762068863_... \n", "1 http://farm3.staticflickr.com/2800/5711730606_... \n", "2 http://farm3.staticflickr.com/2800/5711730606_... \n", "3 http://farm3.staticflickr.com/2105/2439203939_... \n", "4 http://farm4.staticflickr.com/3030/2464402099_... " ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_flickr_meta.head()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "fp_in = '/data_store/datasets/people/who_goes_there/downloads/whogoesthere_dataset.hdf5'\n", "fp_out = '/data_store/datasets/people/who_goes_there/research/wgt_flickr_meta_base.csv'" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "keys_all = ['accuracy', 'admin1', 'admin2', 'age', 'capture_device', 'city', \n", " 'content_length', 'country_code', 'date_taken', 'date_uploaded', \n", " 'description', 'face', 'face_bounds', 'face_key', 'face_landmarks_f', \n", " 'face_landmarks_o', 'gender', 'im_download_url', 'im_extension_original', \n", " 'im_farm_id', 'im_id', 'im_page_url', 'im_secret', 'im_secret_original', \n", " 'im_server_id', 'index', 'latitude', 'license_name', 'license_url', 'longitude', \n", " 'machine_tags', 'title', 'user_nickname', 'user_nsid', 'user_tags']\n", "\n", "keys_keep = ['accuracy', 'admin1', 'admin2', 'age', 'capture_device', 'city', \n", " 'content_length', 'country_code', 'date_taken', 'date_uploaded', \n", " 'description', 'face', 'face_bounds', 'face_key', 'face_landmarks_f', \n", " 'face_landmarks_o', 'gender', 'im_download_url', 'im_extension_original', \n", " 'im_farm_id', 'im_id', 'im_page_url', 'im_secret', 'im_secret_original', \n", " 'im_server_id', 'index', 'latitude', 
'license_name', 'license_url', 'longitude', \n", " 'machine_tags', 'title', 'user_nickname', 'user_nsid', 'user_tags']" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "35deca016b57430bbfc84a2f3eefde51", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=2106478), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Build one record per row of the 'user_nsid' dataset in the HDF5 file at fp_in.\n", "results = []\n", "with h5py.File(fp_in, 'r') as fp:\n", "    print(fp.keys())\n", "    print('')\n", "    # Look each dataset up once instead of once per row.\n", "    nsids = fp.get('user_nsid')\n", "    photo_ids = fp.get('im_id')\n", "    urls = fp.get('im_download_url')\n", "    nicknames = fp.get('user_nickname')\n", "    for i, nsid in tqdm(enumerate(nsids), total=len(nsids)):\n", "        obj = {\n", "            'nsid': nsid.decode(),\n", "            'photo_id': photo_ids[i],\n", "            'url': urls[i].decode(),\n", "            # Row i, not row 0: indexing with 0 gave every record the first row's nickname.\n", "            'nickname': nicknames[i].decode()\n", "        }\n", "        results.append(obj)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame.from_dict(results)\n", "df.to_csv(fp_out, index=False)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "df_flickr_meta_ext = '/data_store/datasets/people/who_goes_there/research/wgt_flickr_queries.csv'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Convert Flickr Queries data to final meta file" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "fp_flickr_meta = '/data_store/datasets/people/who_goes_there/research/wgt_flickr_queries.csv'\n", "fp_meta_out = '/data_store/datasets/people/who_goes_there/research/who_goes_there_flickr_meta.csv'" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "df_flickr_meta = pd.read_csv(fp_flickr_meta)" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ 
"groups = df_flickr_meta.groupby('nsid')" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "# One summary record per 'nsid' group: the group key and the number of rows in it.\n", "results = [\n", "    {'nsid': group_key, 'count': len(group_rows)}\n", "    for group_key, group_rows in groups\n", "]" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [], "source": [ "pd.DataFrame.from_dict(results).to_csv(fp_meta_out, index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "megapixels", "language": "python", "name": "megapixels" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }