{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# PIPA Flickr Metadata Cleanup\n",
    "\n",
    "1. Reads `flickr_api_photo_id.csv` and writes `pipa_filepaths.csv`, adding a `filepath` column (`{photo_id}_{secret}.jpg`).\n",
    "2. Reads a metadata CSV and writes one row per `nsid` with its `path_alias` and row count."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import os\n",
    "from os.path import join\n",
    "from glob import glob\n",
    "from pathlib import Path\n",
    "\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "import pandas as pd\n",
    "\n",
    "import sys\n",
    "sys.path.append('/work/megapixels_dev/megapixels')\n",
    "from app.utils import file_utils"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load CSV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-photo input CSV and the per-photo filepath CSV written from it\n",
    "fp_in_api_photo_id = '/data_store_hdd/datasets/people/pipa/research/flickr_api_photo_id.csv'\n",
    "fp_out_filepaths = '/data_store_hdd/datasets/people/pipa/research/pipa_filepaths.csv'\n",
    "\n",
    "# TODO(review): the metadata section below reads `fp_in` and writes `fp_out`, but neither\n",
    "# was defined anywhere in this notebook. Fill in the real paths before running that section.\n",
    "fp_in = None  # CSV that has at least the `nsid` and `path_alias` columns\n",
    "fp_out = None  # destination of the per-nsid summary CSV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(fp_in_api_photo_id)\n",
    "records = df.to_dict('records')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One output row per input record. The filename is built from the photo_id and\n",
    "# secret of the same record, so both are read here before formatting the f-string.\n",
    "results = []\n",
    "for record in records:\n",
    "    photo_id = record.get('photo_id')\n",
    "    secret = record.get('secret')\n",
    "    results.append({\n",
    "        'photo_id': photo_id,\n",
    "        'nsid': record.get('nsid'),\n",
    "        'url': record.get('url'),\n",
    "        'secret': secret,\n",
    "        'filepath': f'{photo_id}_{secret}.jpg',\n",
    "    })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.DataFrame(results).to_csv(fp_out_filepaths, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create metadata csv output\n",
    "\n",
    "One row per `nsid`, with these columns:\n",
    "\n",
    "|nsid|path_alias|count|\n",
    "|---|---|---|"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stop early with a clear message: `to_csv(None)` would return a string and write nothing.\n",
    "if fp_in is None or fp_out is None:\n",
    "    raise ValueError('Set fp_in and fp_out in the path cell under Load CSV before running this section')\n",
    "\n",
    "# Missing values become empty strings in every column\n",
    "df_flickr_meta = pd.read_csv(fp_in).fillna('')\n",
    "flickr_metas = df_flickr_meta.to_dict('records')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create nsid lookup tables: nsid -> full record, nsid -> path_alias\n",
    "# (when an nsid appears in several rows, the last row wins)\n",
    "nsid_lookup = {}\n",
    "alias_lookup = {}\n",
    "for flickr_meta in flickr_metas:\n",
    "    nsid = flickr_meta['nsid']\n",
    "    nsid_lookup[nsid] = flickr_meta\n",
    "    alias_lookup[nsid] = flickr_meta['path_alias']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nsid_groups = df_flickr_meta.groupby('nsid')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One summary row per nsid: its path_alias and the number of rows in its group\n",
    "items = []\n",
    "for nsid, nsid_group in nsid_groups:\n",
    "    path_alias = alias_lookup[nsid]\n",
    "    obj = {'nsid': nsid, 'path_alias': path_alias, 'count': len(nsid_group)}\n",
    "    items.append(obj)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_out = pd.DataFrame(items)\n",
    "df_out.to_csv(fp_out, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "megapixels",
   "language": "python",
   "name": "megapixels"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}