{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Adience Flickr Metadata Cleanup\n",
    "\n",
    "Counts the rows per Flickr `nsid` in the API query CSV, looks up the path alias parsed from each `nsid`'s URL, and writes the result to a metadata CSV."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import os\n",
    "from os.path import join\n",
    "from glob import glob\n",
    "from pathlib import Path\n",
    "\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "import pandas as pd\n",
    "\n",
    "import sys\n",
    "sys.path.append('/work/megapixels_dev/megapixels')\n",
    "from app.utils import file_utils"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load CSV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# all inputs and the output live in the same directory; change it in one place\n",
    "dir_research = '/data_store_hdd/datasets/people/adience/research'\n",
    "\n",
    "fp_in_files = join(dir_research, 'adience_flickr_api_queries.csv')\n",
    "fp_in_nsid_urls = join(dir_research, 'adience_flickr_nsid_url.csv')\n",
    "fp_in_nsid_profiles = join(dir_research, 'adience_flickr_nsid_profile.csv')\n",
    "fp_out = join(dir_research, 'adience_flickr_meta.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create metadata csv output\n",
    "\n",
    "|nsid|path_alias|count|\n",
    "|---|---|---|"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_csv(fp):\n",
    "    \"\"\"Read the CSV at `fp` into a DataFrame with missing values replaced by ''.\"\"\"\n",
    "    # fillna returns a new frame, so re-running this cell is idempotent\n",
    "    return pd.read_csv(fp).fillna('')\n",
    "\n",
    "\n",
    "# files (grouped by 'nsid' further down)\n",
    "df_files = load_csv(fp_in_files)\n",
    "\n",
    "# nsid urls (the 'nsid' and 'url' columns feed the alias lookup)\n",
    "df_urls = load_csv(fp_in_nsid_urls)\n",
    "urls = df_urls.to_dict('records')\n",
    "\n",
    "# nsid profiles (loaded for inspection; not used for the output csv)\n",
    "df_profiles = load_csv(fp_in_nsid_profiles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create nsid -> path alias lookup table\n",
    "# The alias is the last component of the url. Use .name, not .stem:\n",
    "# .stem drops everything after the last '.', which would truncate an\n",
    "# alias that itself contains a dot. If an nsid appears more than once,\n",
    "# the last url wins.\n",
    "alias_lookup = {\n",
    "    url_meta['nsid']: Path(url_meta['url']).name\n",
    "    for url_meta in urls\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "nsid_groups = df_files.groupby('nsid')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Skipping: 10693681@N00\n",
      "Skipping: 10743505@N04\n",
      "Skipping: 113728563@N05\n",
      "Skipping: 7648211@N03\n"
     ]
    }
   ],
   "source": [
    "# one output record per nsid that has a known alias; others are reported and skipped\n",
    "items = []\n",
    "# .size() yields the row count per nsid without materializing each group\n",
    "for nsid, count in nsid_groups.size().items():\n",
    "    if nsid not in alias_lookup:\n",
    "        print(f'Skipping: {nsid}')\n",
    "        continue\n",
    "    items.append({\n",
    "        'nsid': nsid,\n",
    "        'path_alias': alias_lookup[nsid],\n",
    "        'count': int(count),\n",
    "    })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# explicit columns pin the documented column order and keep the header\n",
    "# row in the csv even when no nsid matched\n",
    "df_out = pd.DataFrame(items, columns=['nsid', 'path_alias', 'count'])\n",
    "df_out.to_csv(fp_out, index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "megapixels",
   "language": "python",
   "name": "megapixels"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}