summaryrefslogtreecommitdiff
path: root/megapixels/notebooks/datasets/pipa/pipa_flickr_metadata_cleanup.ipynb
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/notebooks/datasets/pipa/pipa_flickr_metadata_cleanup.ipynb')
-rw-r--r--megapixels/notebooks/datasets/pipa/pipa_flickr_metadata_cleanup.ipynb145
1 files changed, 0 insertions, 145 deletions
diff --git a/megapixels/notebooks/datasets/pipa/pipa_flickr_metadata_cleanup.ipynb b/megapixels/notebooks/datasets/pipa/pipa_flickr_metadata_cleanup.ipynb
deleted file mode 100644
index 8746a740..00000000
--- a/megapixels/notebooks/datasets/pipa/pipa_flickr_metadata_cleanup.ipynb
+++ /dev/null
@@ -1,145 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# PIPA Flickr Metadata Cleanup"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "%reload_ext autoreload\n",
- "%autoreload 2\n",
- "\n",
- "import os\n",
- "from os.path import join\n",
- "from glob import glob\n",
- "from pathlib import Path\n",
- "\n",
- "from tqdm import tqdm_notebook as tqdm\n",
- "import pandas as pd\n",
- "\n",
- "import sys\n",
- "sys.path.append('/work/megapixels_dev/megapixels')\n",
- "from app.utils import file_utils"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Load CSV"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {},
- "outputs": [],
- "source": [
- "fp_in = '/data_store/datasets/people/pipa/research/pipa_flickr_metadata_ext.csv'\n",
- "fp_out = '/data_store/datasets/people/pipa/research/pipa_flickr_metadata_test.csv'"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Create metadata csv output\n",
- "\n",
- "|nsid|path_alias|count|"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {},
- "outputs": [],
- "source": [
- "df_flickr_meta = pd.read_csv(fp_in)\n",
- "df_flickr_meta.fillna('', inplace=True)\n",
- "flickr_metas = df_flickr_meta.to_dict('records')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "metadata": {},
- "outputs": [],
- "source": [
- "# create nsid lookup table\n",
- "nsid_lookup = {}\n",
- "alias_lookup = {}\n",
- "for flickr_meta in flickr_metas:\n",
- " nsid = flickr_meta['nsid']\n",
- " nsid_lookup[nsid] = flickr_meta\n",
- " alias_lookup[nsid] = flickr_meta['path_alias']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "metadata": {},
- "outputs": [],
- "source": [
- "nsid_groups = df_flickr_meta.groupby('nsid')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "metadata": {},
- "outputs": [],
- "source": [
- "items = []\n",
- "for nsid, nsid_group in nsid_groups:\n",
- " path_alias = alias_lookup[nsid]\n",
- " obj = {'nsid': nsid, 'path_alias': path_alias, 'count': len(nsid_group)}\n",
- " items.append(obj)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "metadata": {},
- "outputs": [],
- "source": [
- "df_out = pd.DataFrame.from_dict(items)\n",
- "df_out.to_csv(fp_out, index=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "megapixels",
- "language": "python",
- "name": "megapixels"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.8"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}