{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prepare Flickr API Batch CSV\n",
    "\n",
    "Collects the `user_id` and `original_image` columns from the tab-separated `.txt` files in the Adience dataset directory, derives a `photo_id` and a `<photo_id>.json` filename for each unique row, and writes the result to a CSV of Flickr API queries."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import os\n",
    "from os.path import join\n",
    "from glob import glob, iglob\n",
    "from pathlib import Path\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create CSV for API"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "| filepath | query | count |\n",
    "|:---|:---|:---|\n",
    "| 12234 | 12234@123| 10 |\n",
    "\n",
    "**NOTE(review):** the table above does not match the columns this notebook writes (`user_id`, `photo_id`, `filename`). Confirm which schema the API batch tool expects."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input directory holding the tab-separated .txt listings, and the CSV to write\n",
    "fp_in_dir = '/data_store/datasets/people/adience/dataset/'\n",
    "fp_out_queries = '/data_store/datasets/people/adience/research/adience_flickr_api_queries.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sort so the row order of the output CSV does not depend on filesystem order\n",
    "fp_files = sorted(glob(join(fp_in_dir, '*.txt')))\n",
    "print(len(fp_files))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "image_cols = ['user_id', 'original_image']\n",
    "\n",
    "# Collect the per-file frames in a list and concatenate once at the end,\n",
    "# instead of growing a DataFrame inside the loop.\n",
    "frames = []\n",
    "for fp_file in fp_files:\n",
    "    df = pd.read_csv(fp_file, delimiter='\\t')\n",
    "    # Files without a user_id column are skipped\n",
    "    if 'user_id' in df.columns:\n",
    "        frames.append(df[image_cols])\n",
    "\n",
    "# pd.concat raises an unhelpful error on an empty list, so fail early with context\n",
    "if not frames:\n",
    "    raise ValueError(f'No .txt file with a user_id column found in {fp_in_dir}')\n",
    "\n",
    "df_images_raw = pd.concat(frames, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep one row per (user_id, original_image) pair; returns a new frame so the cell is re-runnable\n",
    "df_images_unique = df_images_raw.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# photo_id is the part of original_image before the first underscore;\n",
    "# filename is that id with a .json extension.\n",
    "photo_id = df_images_unique['original_image'].str.split('_').str[0]\n",
    "\n",
    "# original_image itself is not written to the output CSV\n",
    "df_images = pd.DataFrame(\n",
    "    {\n",
    "        'user_id': df_images_unique['user_id'],\n",
    "        'photo_id': photo_id,\n",
    "        'filename': photo_id + '.json',\n",
    "    },\n",
    "    columns=['user_id', 'photo_id', 'filename'],\n",
    ").reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(len(df_images))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_images.to_csv(fp_out_queries, index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "megapixels",
   "language": "python",
   "name": "megapixels"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}