{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prepare Flickr API Batch CSV\n",
    "\n",
    "This notebook has two parts:\n",
    "\n",
    "1. **Cleanup filepaths CSV**: keeps the photo ids that have an entry in the Flickr API photo-id dump and writes one `filepath, nsid, photo_id, url` row per match.\n",
    "2. **Create Photo ID list**: collects `user_id` / `original_image` from tab-separated `*.txt` files, derives a `photo_id` per image and writes the result to CSV."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import os\n",
    "from os.path import join\n",
    "from glob import glob, iglob\n",
    "from pathlib import Path\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cleanup filepaths CSV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All inputs and outputs of this section live in one directory; change it here only.\n",
    "fp_dir_research = '/data_store_hdd/datasets/people/adience/research'\n",
    "\n",
    "fp_in_photo_ids = join(fp_dir_research, 'adience_photo_ids.csv')\n",
    "fp_in_flickr_api_dump = join(fp_dir_research, 'adience_flickr_api_dump.csv')\n",
    "fp_in_flickr_api_dump_photo_ids = join(fp_dir_research, 'flickr_api_dump_photo_id.csv')\n",
    "fp_out_filepaths = join(fp_dir_research, 'adience_filepaths.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# photo id list\n",
    "df_photo_ids = pd.read_csv(fp_in_photo_ids)\n",
    "records = df_photo_ids.to_dict('records')\n",
    "\n",
    "# photo id --> url list\n",
    "df_api_urls = pd.read_csv(fp_in_flickr_api_dump_photo_ids)\n",
    "api_urls = df_api_urls.to_dict('records')\n",
    "\n",
    "df_flickr_api_dump = pd.read_csv(fp_in_flickr_api_dump)\n",
    "flickr_api_dump = df_flickr_api_dump.to_dict('records')\n",
    "\n",
    "# lookup table: nsid --> row of the Flickr API dump\n",
    "# NOTE(review): nothing in this notebook reads this table; kept in case it is needed later.\n",
    "flickr_api_lookup = {api_item['nsid']: api_item for api_item in flickr_api_dump}\n",
    "\n",
    "# lookup table: photo_id --> row holding that photo's url and nsid\n",
    "# (if a photo_id occurs more than once, the last row wins)\n",
    "api_url_lookup = {api_url_item['photo_id']: api_url_item for api_url_item in api_urls}\n",
    "\n",
    "# keep only the photo ids that have an entry in the url lookup\n",
    "results = []\n",
    "for record in records:\n",
    "    photo_id = record['photo_id']\n",
    "    api_item = api_url_lookup.get(photo_id)\n",
    "    if api_item is None:\n",
    "        continue\n",
    "    results.append({\n",
    "        'filepath': f'{photo_id}.jpg',\n",
    "        'nsid': api_item.get('nsid'),\n",
    "        'photo_id': photo_id,\n",
    "        'url': api_item.get('url'),\n",
    "    })\n",
    "\n",
    "# Explicit columns keep the CSV header stable, even when nothing matched.\n",
    "filepath_columns = ['filepath', 'nsid', 'photo_id', 'url']\n",
    "df_filepaths = pd.DataFrame(results, columns=filepath_columns)\n",
    "df_filepaths.to_csv(fp_out_filepaths, index=False)\n",
    "print(f'matched {len(df_filepaths)} of {len(records)} photo ids')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create the file meta csv\n",
    "# NOTE(review): `flickr_meta_records`, `fp_dir_flickr_meta` and `file_utils` are not\n",
    "# defined or imported anywhere in this notebook; this cell fails on a fresh kernel\n",
    "# until they are provided. It also rebinds `results` from the cell above and does\n",
    "# not write any output itself.\n",
    "results = []\n",
    "results_download = []\n",
    "for flickr_meta_record in flickr_meta_records:\n",
    "    # farm, server, photo id, secret\n",
    "    photo_id = str(flickr_meta_record['photo_id'])\n",
    "    nsid = flickr_meta_record.get('nsid')\n",
    "    fp_json = join(fp_dir_flickr_meta, f'{photo_id}.json')\n",
    "    json_data = file_utils.load_json(fp_json)\n",
    "    photo_meta = json_data.get('photo')\n",
    "    farm = photo_meta.get('farm')\n",
    "    server = photo_meta.get('server')\n",
    "    secret = photo_meta.get('secret')\n",
    "    # https://farm4.staticflickr.com/3214/3036412907_65deee68e2.jpg\n",
    "    url = f'https://farm{farm}.staticflickr.com/{server}/{photo_id}_{secret}.jpg'\n",
    "    obj = {\n",
    "        'nsid': nsid,\n",
    "        'photo_id': photo_id,\n",
    "        'url': url,\n",
    "        'filepath': f'{photo_id}.jpg'\n",
    "    }\n",
    "    results.append(obj)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create Photo ID list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): `fp_in_dir` is not defined anywhere in this notebook; set it to the\n",
    "# directory holding the tab-separated *.txt files before running this section.\n",
    "# Sorted so the row order (and which duplicate is kept later) is reproducible.\n",
    "fp_files = sorted(glob(join(fp_in_dir, '*.txt')))\n",
    "print(len(fp_files))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the per-file frames and concatenate once, instead of growing a DataFrame\n",
    "# inside the loop (DataFrame.append was removed in pandas 2.0).\n",
    "image_columns = ['user_id', 'original_image']\n",
    "image_frames = []\n",
    "for fp_file in fp_files:\n",
    "    df = pd.read_csv(fp_file, delimiter='\\t')\n",
    "    # files without a user_id column are skipped\n",
    "    if 'user_id' in df.columns:\n",
    "        image_frames.append(df[image_columns])\n",
    "\n",
    "# pd.concat raises on an empty list, so fall back to an empty frame with the same columns\n",
    "if image_frames:\n",
    "    df_images = pd.concat(image_frames, ignore_index=True)\n",
    "else:\n",
    "    df_images = pd.DataFrame(columns=image_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_images = df_images.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "images = df_images.to_dict('records')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# photo_id is the part of original_image before the first underscore\n",
    "for image in images:\n",
    "    image['photo_id'] = image['original_image'].split('_')[0]\n",
    "    image['filename'] = f'{image[\"photo_id\"]}.json'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rebuilt from `images` each time, so re-running this cell is safe.\n",
    "df_images = pd.DataFrame(images).drop(columns=['original_image'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(len(df_images))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): `fp_out_queries` is not defined anywhere in this notebook; set the\n",
    "# output CSV path before running this cell.\n",
    "df_images.to_csv(fp_out_queries, index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "megapixels",
   "language": "python",
   "name": "megapixels"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}