diff options
| author | adamhrv <adam@ahprojects.com> | 2019-06-03 03:33:06 +0200 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2019-06-03 03:33:06 +0200 |
| commit | 1d8162a47bda87b38feef95cf3d5903831b6f4d6 (patch) | |
| tree | 86c37309ff5bcb62716638562489ddb747c16159 /megapixels/notebooks/datasets/pipa/pipa_flickr_metadata_cleanup.ipynb | |
| parent | e5773e7fffc11265c86bf1dcfa05df236193f4a1 (diff) | |
add msc working utils
Diffstat (limited to 'megapixels/notebooks/datasets/pipa/pipa_flickr_metadata_cleanup.ipynb')
| -rw-r--r-- | megapixels/notebooks/datasets/pipa/pipa_flickr_metadata_cleanup.ipynb | 145 |
1 files changed, 0 insertions, 145 deletions
diff --git a/megapixels/notebooks/datasets/pipa/pipa_flickr_metadata_cleanup.ipynb b/megapixels/notebooks/datasets/pipa/pipa_flickr_metadata_cleanup.ipynb deleted file mode 100644 index 8746a740..00000000 --- a/megapixels/notebooks/datasets/pipa/pipa_flickr_metadata_cleanup.ipynb +++ /dev/null @@ -1,145 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# PIPA Flickr Metadata Cleanup" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "%reload_ext autoreload\n", - "%autoreload 2\n", - "\n", - "import os\n", - "from os.path import join\n", - "from glob import glob\n", - "from pathlib import Path\n", - "\n", - "from tqdm import tqdm_notebook as tqdm\n", - "import pandas as pd\n", - "\n", - "import sys\n", - "sys.path.append('/work/megapixels_dev/megapixels')\n", - "from app.utils import file_utils" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load CSV" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "fp_in = '/data_store/datasets/people/pipa/research/pipa_flickr_metadata_ext.csv'\n", - "fp_out = '/data_store/datasets/people/pipa/research/pipa_flickr_metadata_test.csv'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create metadata csv output\n", - "\n", - "|nsid|path_alias|count|" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "df_flickr_meta = pd.read_csv(fp_in)\n", - "df_flickr_meta.fillna('', inplace=True)\n", - "flickr_metas = df_flickr_meta.to_dict('records')" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "# create nsid lookup table\n", - "nsid_lookup = {}\n", - "alias_lookup = {}\n", - "for flickr_meta in flickr_metas:\n", - " nsid = flickr_meta['nsid']\n", - " nsid_lookup[nsid] = flickr_meta\n", - " alias_lookup[nsid] = flickr_meta['path_alias']" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [], - "source": [ - "nsid_groups = df_flickr_meta.groupby('nsid')" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [], - "source": [ - "items = []\n", - "for nsid, nsid_group in nsid_groups:\n", - " path_alias = alias_lookup[nsid]\n", - " obj = {'nsid': nsid, 'path_alias': path_alias, 'count': len(nsid_group)}\n", - " items.append(obj)" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [], - "source": [ - "df_out = pd.DataFrame.from_dict(items)\n", - "df_out.to_csv(fp_out, index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "megapixels", - "language": "python", - "name": "megapixels" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} |
