summaryrefslogtreecommitdiff
path: root/megapixels/notebooks
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/notebooks')
-rw-r--r--megapixels/notebooks/datasets/adience/prepare_flickr_api.ipynb166
-rw-r--r--megapixels/notebooks/datasets/ffhq/prepare_flickr_meta.ipynb165
-rw-r--r--megapixels/notebooks/datasets/helen/prepare_flickr_api.ipynb104
-rw-r--r--megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb444
-rw-r--r--megapixels/notebooks/datasets/ijb_count/ibj_count.ipynb206
-rw-r--r--megapixels/notebooks/datasets/megaface/prepare_flickr_api.ipynb (renamed from megapixels/notebooks/datasets/megaface/megaface_prepare_flickr_api.ipynb)113
-rw-r--r--megapixels/notebooks/datasets/pipa/flickr_cleanup.ipynb (renamed from megapixels/notebooks/datasets/pipa/pipa_flickr_metadata_cleanup.ipynb)44
-rw-r--r--megapixels/notebooks/datasets/vgg_face1/flickr_meta.ipynb287
-rw-r--r--megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb312
9 files changed, 1738 insertions, 103 deletions
diff --git a/megapixels/notebooks/datasets/adience/prepare_flickr_api.ipynb b/megapixels/notebooks/datasets/adience/prepare_flickr_api.ipynb
index a35c3b24..8d3b4251 100644
--- a/megapixels/notebooks/datasets/adience/prepare_flickr_api.ipynb
+++ b/megapixels/notebooks/datasets/adience/prepare_flickr_api.ipynb
@@ -29,41 +29,145 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Create CSV for API"
+ "## Cleanup filepaths CSV"
]
},
{
- "cell_type": "markdown",
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fp_in_photo_ids = '/data_store_hdd/datasets/people/adience/research/adience_photo_ids.csv'\n",
+ "fp_in_flickr_api_dump = '/data_store_hdd/datasets/people/adience/research/adience_flickr_api_dump.csv'\n",
+ "fp_in_flickr_api_dump_photo_ids = '/data_store_hdd/datasets/people/adience/research/flickr_api_dump_photo_id.csv'\n",
+ "fp_out_filepaths = '/data_store_hdd/datasets/people/adience/research/adience_filepaths.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
"metadata": {},
+ "outputs": [],
"source": [
- "| filepath | query | count |\n",
- "|:---|:---|:---|\n",
- "| 12234 | 12234@123| 10 |"
+ "# photo id list\n",
+ "df = pd.read_csv(fp_in_photo_ids)\n",
+ "records = df.to_dict('records')\n",
+ "\n",
+ "# photo id --> url list\n",
+ "df_api_urls = pd.read_csv(fp_in_flickr_api_dump_photo_ids)\n",
+ "api_urls = df_api_urls.to_dict('records')\n",
+ "\n",
+ "df_flickr_api_dump = pd.read_csv(fp_in_flickr_api_dump)\n",
+ "flickr_api_dump = df_flickr_api_dump.to_dict('records')\n",
+ "\n",
+ "# create lookup table for user info?\n",
+ "flickr_api_lookup = {}\n",
+ "for api_item in flickr_api_dump:\n",
+ " nsid = api_item['nsid']\n",
+ " flickr_api_lookup[nsid] = api_item\n",
+ " \n",
+ "# create lookup table: photo id --> api url record\n",
+ "api_url_lookup = {}\n",
+ "for api_url_item in api_urls:\n",
+ " photo_id = api_url_item['photo_id']\n",
+ " api_url_lookup[photo_id] = api_url_item\n",
+ " \n",
+ "results = []\n",
+ "for record in records:\n",
+ " photo_id = record['photo_id']\n",
+ " if photo_id in api_url_lookup.keys():\n",
+ " api_item = api_url_lookup.get(photo_id)\n",
+ " url = api_item.get('url')\n",
+ " nsid = api_item.get('nsid')\n",
+ " obj = {\n",
+ " 'filepath': f'{photo_id}.jpg',\n",
+ " 'nsid': nsid,\n",
+ " 'photo_id': photo_id,\n",
+ " 'url': url\n",
+ " }\n",
+ " results.append(obj)\n",
+ " \n",
+ "pd.DataFrame.from_dict(results).to_csv(fp_out_filepaths, index=False)"
]
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
"metadata": {},
"outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create the file meta csv\n",
+ "results = []\n",
+ "results_download = []\n",
+ "for flickr_meta_record in flickr_meta_records:\n",
+ " # farm, server, photo id, secret\n",
+ " photo_id = str(flickr_meta_record['photo_id'])\n",
+ " nsid = flickr_meta_record.get('nsid')\n",
+ " fp_json = join(fp_dir_flickr_meta, f'{photo_id}.json')\n",
+ " json_data = file_utils.load_json(fp_json)\n",
+ " photo_meta = json_data.get('photo')\n",
+ " farm = photo_meta.get('farm')\n",
+ " server = photo_meta.get('server')\n",
+ " secret = photo_meta.get('secret')\n",
+ " # https://farm4.staticflickr.com/3214/3036412907_65deee68e2.jpg\n",
+ " url = f'https://farm{farm}.staticflickr.com/{server}/{photo_id}_{secret}.jpg'\n",
+ " obj = {\n",
+ " 'nsid': nsid,\n",
+ " 'photo_id': photo_id,\n",
+ " 'url': url,\n",
+ " 'filepath': f'{photo_id}.jpg'\n",
+ " }\n",
+ " results.append(obj)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
"source": [
- "fp_in_dir = '/data_store/datasets/people/adience/dataset/'\n",
- "fp_out_queries = '/data_store/datasets/people/adience/research/adience_flickr_api_queries.csv'"
+ "## Create Photo ID list"
]
},
{
"cell_type": "code",
- "execution_count": 31,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "9\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"fp_files = glob(join(fp_in_dir, '*.txt'))\n",
"print(len(fp_files))"
@@ -71,7 +175,7 @@
},
{
"cell_type": "code",
- "execution_count": 32,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -85,7 +189,7 @@
},
{
"cell_type": "code",
- "execution_count": 33,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -94,7 +198,7 @@
},
{
"cell_type": "code",
- "execution_count": 34,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -103,7 +207,7 @@
},
{
"cell_type": "code",
- "execution_count": 35,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -114,7 +218,7 @@
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -123,7 +227,7 @@
},
{
"cell_type": "code",
- "execution_count": 37,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -132,24 +236,16 @@
},
{
"cell_type": "code",
- "execution_count": 38,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "10804\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"print(len(df_images))"
]
},
{
"cell_type": "code",
- "execution_count": 39,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
diff --git a/megapixels/notebooks/datasets/ffhq/prepare_flickr_meta.ipynb b/megapixels/notebooks/datasets/ffhq/prepare_flickr_meta.ipynb
new file mode 100644
index 00000000..3d571aff
--- /dev/null
+++ b/megapixels/notebooks/datasets/ffhq/prepare_flickr_meta.ipynb
@@ -0,0 +1,165 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Prepare FFHQ Flickr photo IDs\n",
+ "\n",
+ "- https://github.com/NVlabs/ffhq-dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%reload_ext autoreload\n",
+ "%autoreload 2\n",
+ "\n",
+ "import os\n",
+ "from os.path import join\n",
+ "from glob import glob\n",
+ "from pathlib import Path\n",
+ "import requests\n",
+ "import json\n",
+ "\n",
+ "from tqdm import tqdm_notebook as tqdm\n",
+ "import pandas as pd\n",
+ "%matplotlib inline\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "import sys\n",
+ "sys.path.append('/work/megapixels_dev/megapixels')\n",
+ "from app.utils import file_utils"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# input JSON and output CSV paths\n",
+ "fp_in = '/data_store/datasets/people/ffhq/ffhq-dataset-v1.json'\n",
+ "fp_out = '/data_store/datasets/people/ffhq/research/flickr_api_urls.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(fp_in, 'r') as fp:\n",
+ " ffhq_items = json.load(fp)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "217c694742e8408d871c3b41183676fb",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=70000), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "results = []\n",
+ "# get photos urls\n",
+ "for idx, ffhq_item in tqdm(ffhq_items.items()):\n",
+ " url = ffhq_item.get('metadata').get('photo_url')\n",
+ " photo_id = Path(url).stem\n",
+ " obj = {'photo_id': photo_id}\n",
+ " results.append(obj)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'photo_id': '1133484654'}"
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "results[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = pd.DataFrame.from_dict(results)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.drop_duplicates(inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.to_csv(fp_out, index=False)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "megapixels",
+ "language": "python",
+ "name": "megapixels"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/megapixels/notebooks/datasets/helen/prepare_flickr_api.ipynb b/megapixels/notebooks/datasets/helen/prepare_flickr_api.ipynb
index 311d3462..140b6361 100644
--- a/megapixels/notebooks/datasets/helen/prepare_flickr_api.ipynb
+++ b/megapixels/notebooks/datasets/helen/prepare_flickr_api.ipynb
@@ -40,6 +40,110 @@
]
},
{
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Create filepaths CSV"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fp_flickr_meta = '/data_store_hdd/datasets/people/helen/research/helen_flickr_api_dump.csv'\n",
+ "fp_photo_ids = '/data_store_hdd/datasets/people/helen/research/helen_flickr_photo_ids.csv'\n",
+ "fp_filepaths = '/data_store_hdd/datasets/people/helen/research/helen_file_meta.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_photo_ids = pd.read_csv(fp_photo_ids)\n",
+ "photo_ids = df_photo_ids.to_dict('records')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_flickr_meta = pd.read_csv(fp_flickr_meta, dtype={'photo_id': str})\n",
+ "flickr_meta_records = df_flickr_meta.to_dict('records')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1854\n",
+ "2122\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(flickr_meta_records))\n",
+ "print(len(df_photo_ids))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create the file meta csv\n",
+ "results = []\n",
+ "results_download = []\n",
+ "for flickr_meta_record in flickr_meta_records:\n",
+ " # farm, server, photo id, secret\n",
+ " photo_id = str(flickr_meta_record['photo_id'])\n",
+ " nsid = flickr_meta_record.get('nsid')\n",
+ " fp_json = join(fp_dir_flickr_meta, f'{photo_id}.json')\n",
+ " json_data = file_utils.load_json(fp_json)\n",
+ " photo_meta = json_data.get('photo')\n",
+ " farm = photo_meta.get('farm')\n",
+ " server = photo_meta.get('server')\n",
+ " secret = photo_meta.get('secret')\n",
+ " # https://farm4.staticflickr.com/3214/3036412907_65deee68e2.jpg\n",
+ " url = f'https://farm{farm}.staticflickr.com/{server}/{photo_id}_{secret}.jpg'\n",
+ " obj = {\n",
+ " 'nsid': nsid,\n",
+ " 'photo_id': photo_id,\n",
+ " 'url': url,\n",
+ " 'filepath': f'{photo_id}.jpg'\n",
+ " }\n",
+ " results.append(obj)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_out = pd.DataFrame.from_dict(results)\n",
+ "df_out.to_csv(fp_filepaths, index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
"cell_type": "code",
"execution_count": 16,
"metadata": {},
diff --git a/megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb b/megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb
index ff41e799..6d2b768a 100644
--- a/megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb
+++ b/megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb
@@ -29,70 +29,353 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Create CSV for API"
+ "## IBM DiF clean CSVs\n",
+ "\n",
+ "- 2283 files could not be downloaded or accessed in the API\n",
+ "- these images were downloaded, but possibly no longer exist"
]
},
{
- "cell_type": "markdown",
+ "cell_type": "code",
+ "execution_count": 60,
"metadata": {},
+ "outputs": [],
+ "source": [
+ "# flickr api data\n",
+ "fp_in_meta_flickr = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_meta_flickr.csv'\n",
+ "\n",
+ "# api query dump\n",
+ "fp_in_flickr_api = '/data_store_hdd/datasets/people/ibm_dif/research/flickr_api_query_dump.csv'\n",
+ "\n",
+ "# ibm count data\n",
+ "fp_in_meta_filepaths = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_meta_filepaths.csv'\n",
+ "fp_meta_filepaths_adj = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_meta_filepaths_adj.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/adam/anaconda3/envs/megapixels/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3020: DtypeWarning: Columns (2,3,10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+ " interactivity=interactivity, compiler=compiler, result=result)\n"
+ ]
+ }
+ ],
"source": [
- "| photo_id |\n",
- "|:---|\n",
- "| 12234 |"
+ "df_meta_filepaths = pd.read_csv(fp_in_meta_filepaths)\n",
+ "meta_filepaths = df_meta_filepaths.to_dict('records')\n",
+ "df_meta_flickr = pd.read_csv(fp_in_meta_flickr)\n",
+ "meta_flickr = df_meta_flickr.to_dict('records')\n",
+ "df_flickr_api_dump = pd.read_csv(fp_in_flickr_api)"
]
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "98155\n",
+ "98155\n",
+ "98153\n",
+ "100438\n",
+ "98154\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(df_flickr_api_dump))\n",
+ "print(len(df_flickr_api_dump.drop_duplicates(subset='nsid')))\n",
+ "print(len(df_meta_flickr))\n",
+ "print(len(df_meta_filepaths))\n",
+ "print(len(df_meta_filepaths.drop_duplicates(subset='nsid')))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
- "# flickr api data\n",
- "fp_in_flickr_meta = '/data_store_hdd/datasets/people/ibm_dif/research/flickr_api_query_dump.csv'\n",
- "# ibm count data\n",
- "fp_in_ibm_meta = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_metadata.csv'\n",
- "# output\n",
- "fp_out = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_metadata.csv'"
+ "# drop duplicate NSIDs (rows with an empty NSID collapse to a single row)\n",
+ "df_meta_filepaths.drop_duplicates(subset='nsid', inplace=True)"
]
},
{
"cell_type": "code",
- "execution_count": 68,
+ "execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
- "# load ibm data and create count lookup with photoid\n",
- "df_ibm_meta = pd.read_csv(fp_in_ibm_meta)\n",
- "ibm_meta_records = df_ibm_meta.to_dict('records')\n",
- "count_lookup = {}\n",
- "for ibm_meta_record in ibm_meta_records:\n",
- " photo_id = int(Path(ibm_meta_record['url']).stem.split('_')[0])\n",
- " count_lookup[photo_id] = ibm_meta_record['count']"
+ "df_meta_filepaths.to_csv(fp_meta_filepaths_adj, index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nsid_filepaths = {}\n",
+ "dupes = []\n",
+ "for meta_filepath in meta_filepaths:\n",
+ " nsid = meta_filepath['nsid']\n",
+ " if nsid not in nsid_filepaths.keys():\n",
+ " nsid_filepaths[nsid] = meta_filepath\n",
+ " else:\n",
+ " dupes.append(meta_filepath)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "98154\n",
+ "2284\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(nsid_filepaths))\n",
+ "print(len(dupes))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'filepath': '12537662393_247b2187ee.jpg', 'nsid': nan, 'photo_id': 12537662393, 'url': 'http://farm6.staticflickr.com/5476/12537662393_247b2187ee.jpg'}\n",
+ "{'filepath': '5837222502_29aaf5bb53.jpg', 'nsid': nan, 'photo_id': 5837222502, 'url': 'http://farm4.staticflickr.com/3089/5837222502_29aaf5bb53.jpg'}\n",
+ "{'filepath': '10859466623_4ceb1564dc.jpg', 'nsid': nan, 'photo_id': 10859466623, 'url': 'http://farm6.staticflickr.com/5530/10859466623_4ceb1564dc.jpg'}\n",
+ "{'filepath': '13719567455_fb96dc7ac6.jpg', 'nsid': nan, 'photo_id': 13719567455, 'url': 'http://farm4.staticflickr.com/3718/13719567455_fb96dc7ac6.jpg'}\n",
+ "{'filepath': '3486554266_ca1fc7d99c.jpg', 'nsid': nan, 'photo_id': 3486554266, 'url': 'http://farm4.staticflickr.com/3327/3486554266_ca1fc7d99c.jpg'}\n",
+ "{'filepath': '6168324261_d2fb7bbb60.jpg', 'nsid': nan, 'photo_id': 6168324261, 'url': 'http://farm7.staticflickr.com/6166/6168324261_d2fb7bbb60.jpg'}\n",
+ "{'filepath': '13938295982_0d950feba5.jpg', 'nsid': nan, 'photo_id': 13938295982, 'url': 'http://farm8.staticflickr.com/7162/13938295982_0d950feba5.jpg'}\n",
+ "{'filepath': '8881073633_546b6dbfe5.jpg', 'nsid': nan, 'photo_id': 8881073633, 'url': 'http://farm6.staticflickr.com/5459/8881073633_546b6dbfe5.jpg'}\n",
+ "{'filepath': '10918515734_404eb29879.jpg', 'nsid': nan, 'photo_id': 10918515734, 'url': 'http://farm6.staticflickr.com/5502/10918515734_404eb29879.jpg'}\n",
+ "{'filepath': '3236533532_05cacef8e9.jpg', 'nsid': nan, 'photo_id': 3236533532, 'url': 'http://farm4.staticflickr.com/3425/3236533532_05cacef8e9.jpg'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "for dupe in dupes[:10]:\n",
+ " print(dupe)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "100438\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(dupes))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "98153\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(nsid_groups))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "100436\n"
+ ]
+ }
+ ],
+ "source": [
+ "fp_ims = glob('/data_store_hdd/datasets/people/ibm_dif/downloads/images/*.jpg')\n",
+ "print(len(fp_ims))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "9314013316\n"
+ ]
+ }
+ ],
+ "source": [
+ "photo_ids = [Path(x).stem.split('_')[0] for x in fp_ims]\n",
+ "print(photo_ids[0])"
]
},
{
"cell_type": "code",
- "execution_count": 69,
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "KeyError",
+ "evalue": "'photo_id'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m--------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m<ipython-input-45-fd2de6074950>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfilepath_photo_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'photo_id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmeta_flickr\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;32m<ipython-input-45-fd2de6074950>\u001b[0m in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfilepath_photo_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'photo_id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmeta_flickr\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;31mKeyError\u001b[0m: 'photo_id'"
+ ]
+ }
+ ],
+ "source": [
+ "filepath_photo_ids = [int(x['nsid']) for x in meta_flickr]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d7a9a78bf0e442a5b8445906bc85da99",
+ "version_major": 2,
+ "version_minor": 0
+ },
"text/plain": [
- "100438"
+ "HBox(children=(IntProgress(value=0, max=100436), HTML(value='')))"
]
},
- "execution_count": 69,
"metadata": {},
- "output_type": "execute_result"
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# find which photo IDs are no longer accessible\n",
+ "missing_photo_ids = []\n",
+ "for photo_id in tqdm(photo_ids):\n",
+ " photo_id = int(photo_id)\n",
+ " if photo_id not in filepath_photo_ids:\n",
+ " missing_photo_ids.append(photo_id)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0\n",
+ "[]\n"
+ ]
}
],
"source": [
+ "print(len(missing_photo_ids))\n",
+ "print(missing_photo_ids[0:10])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "NameError",
+ "evalue": "name 'df_flickr_meta' is not defined",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m--------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m<ipython-input-30-75e9fdbbbfbb>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtotal\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_flickr_meta\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'count'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtotal\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mNameError\u001b[0m: name 'df_flickr_meta' is not defined"
+ ]
+ }
+ ],
+ "source": [
+ "total = df_flickr_meta['count'].sum()\n",
+ "print(total)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# load ibm data and create count lookup with photoid\n",
+ "df_ibm_meta = pd.read_csv(fp_in_ibm_meta)\n",
+ "ibm_meta_records = df_ibm_meta.to_dict('records')\n",
+ "count_lookup = {}\n",
+ "for ibm_meta_record in ibm_meta_records:\n",
+ " photo_id = int(Path(ibm_meta_record['url']).stem.split('_')[0])\n",
+ " count_lookup[photo_id] = ibm_meta_record['count']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
"len(count_lookup)"
]
},
{
"cell_type": "code",
- "execution_count": 70,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -101,7 +384,7 @@
},
{
"cell_type": "code",
- "execution_count": 71,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -111,18 +394,9 @@
},
{
"cell_type": "code",
- "execution_count": 74,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Error: invalid literal for int() with base 10: '', {'country': '', 'description': 'Haircut Next...', 'lat': '', 'lon': '', 'nsid': '', 'owner_location': '', 'path_alias': '', 'photo_id': '', 'place': '', 'place_id': '', 'posted': '', 'realname': '', 'taken': '', 'username': '', 'woeid': ''}\n",
- "Error: invalid literal for int() with base 10: '', {'country': '', 'description': '', 'lat': '86085317@N00', 'lon': 'New York', 'nsid': 'anonymousthomas', 'owner_location': '4975598', 'path_alias': '', 'photo_id': '', 'place': '1108685469', 'place_id': 'Thomas', 'posted': '2005-02-18 00:11:09', 'realname': 'anonymousthomas', 'taken': '', 'username': '', 'woeid': ''}\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# load flickr data\n",
"for flickr_meta_record in flickr_meta_records:\n",
@@ -143,7 +417,7 @@
},
{
"cell_type": "code",
- "execution_count": 75,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -160,55 +434,99 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
+ "# photo ids and nsids\n",
"fp_flickr_api_dump = '/data_store_hdd/datasets/people/ibm_dif/research/flickr_api_query_dump.csv'\n",
- "fp_out_meta = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_flickr_meta.csv'"
+ "\n",
+ "# file urls\n",
+ "fp_ibm_urls = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_urls.csv'\n",
+ "\n",
+ "# output: filepaths CSV\n",
+ "fp_out_filepaths = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_filepaths.csv'"
]
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/home/adam/anaconda3/envs/megapixels/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3020: DtypeWarning: Columns (2,3,10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
- " interactivity=interactivity, compiler=compiler, result=result)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "df = pd.read_csv(fp_flickr_api_dump)\n",
- "groups = df.groupby('nsid')"
+ "df_flickr_meta = pd.read_csv(fp_flickr_api_dump)\n",
+ "df_flickr_meta.fillna('', inplace=True)\n",
+ "flickr_metas = df_flickr_meta.to_dict('records')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "```\n",
+ "|filepath|nsid|photo_id|url|\n",
+ "```"
]
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "results = []\n",
- "for nsid, group in groups:\n",
- " obj = {\n",
- " 'nsid': nsid,\n",
- " 'count': len(group)\n",
- " }\n",
- " results.append(obj)"
+ "photo_id_to_nsid = {}\n",
+ "for flickr_meta in flickr_metas:\n",
+ " photo_id = flickr_meta.get('photo_id')\n",
+ " if photo_id:\n",
+ " photo_id = str(int(photo_id))\n",
+ " photo_id_to_nsid[photo_id] = flickr_meta['nsid']"
]
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(list(photo_id_to_nsid.keys())[0:10])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_ibm_urls = pd.read_csv(fp_ibm_urls)\n",
+ "ibm_urls = df_ibm_urls.to_dict('records')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "photo_id_to_url = {}\n",
+ "missed = []\n",
+ "for ibm_url in ibm_urls:\n",
+ " photo_id = str(ibm_url['filepath'].split('_')[0])\n",
+ " try:\n",
+ " ibm_url['photo_id'] = photo_id\n",
+ " ibm_url['nsid'] = photo_id_to_nsid[photo_id]\n",
+ " except Exception as e:\n",
+ "# print(e, photo_id)\n",
+ " missed.append(photo_id)\n",
+ "print(f'missed: {len(missed)}')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "pd.DataFrame.from_dict(results).to_csv(fp_out_meta, index=False)"
+ "pd.DataFrame.from_dict(ibm_urls).to_csv(fp_out_filepaths, index=False)"
]
},
{
diff --git a/megapixels/notebooks/datasets/ijb_count/ibj_count.ipynb b/megapixels/notebooks/datasets/ijb_count/ibj_count.ipynb
new file mode 100644
index 00000000..b4a29243
--- /dev/null
+++ b/megapixels/notebooks/datasets/ijb_count/ibj_count.ipynb
@@ -0,0 +1,206 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Count IJB sources"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%reload_ext autoreload\n",
+ "%autoreload 2\n",
+ "\n",
+ "import os\n",
+ "from os.path import join\n",
+ "from glob import glob\n",
+ "from pathlib import Path\n",
+ "import requests\n",
+ "import json\n",
+ "\n",
+ "from tqdm import tqdm_notebook as tqdm\n",
+ "import pandas as pd\n",
+ "%matplotlib inline\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "import sys\n",
+ "sys.path.append('/work/megapixels_dev/megapixels')\n",
+ "from app.utils import file_utils"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# input and output CSV paths\n",
+ "fp_in_cs3 = '/data_store_hdd/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs3_media.csv'\n",
+ "fp_in_cs4 = '/data_store_hdd/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs4_media.csv'\n",
+ "fp_in_ijb_b = '/data_store_hdd/datasets/people/ijb_c/downloads/tars/IJB/IJB-B/ijbb_licenses_and_sources.csv'\n",
+ "fp_in_ijb_a = '/data_store_hdd/datasets/people/ijb_c/downloads/tars/IJB/IJB-A/SOURCES.csv'\n",
+ "fp_out = '/data_store_hdd/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/summary.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_cs3 = pd.read_csv(fp_in_cs3)\n",
+ "df_cs4 = pd.read_csv(fp_in_cs4)\n",
+ "# DataFrame.append is deprecated (removed in pandas 2.0); pd.concat is the supported equivalent\n",
+ "df_sources = pd.concat([df_cs3, df_cs4]).fillna('')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ijb b\n",
+ "#df_sources = pd.read_csv(fp_in_ijb_b).fillna('')\n",
+ "# ijb a\n",
+ "df_sources = pd.read_csv(fp_in_ijb_a).fillna('')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sources = df_sources.to_dict('records')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "results = {}\n",
+ "others = []\n",
+ "keys = ['flickr.com', 'youtube.com', 'wikipedia.org', 'wikimedia.org']\n",
+ "for k in keys:\n",
+ " results[k] = []\n",
+ "for source in sources:\n",
+ " url = str(source['Media URL'])\n",
+ " media_id = source['Media ID']\n",
+ " if 'nonfaces' in media_id:\n",
+ " continue\n",
+ " found = False\n",
+ " for k in keys:\n",
+ " if k in url:\n",
+ " results[k].append(url)\n",
+ " found = True\n",
+ " if not found:\n",
+ " if url:\n",
+ " others.append(url)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "flickr.com 0\n",
+ "youtube.com 1388\n",
+ "wikipedia.org 0\n",
+ "wikimedia.org 4298\n"
+ ]
+ }
+ ],
+ "source": [
+ "for k,v in results.items():\n",
+ " print(k, len(set(v)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "siliconangle.com/files/2011/06/kaz-hirai.jpg\n",
+ "etnosi.files.wordpress.com/2012/05/sofi-marinova-baku.jpg\n",
+ "images.coveralia.com/audio/p/Pia_Zadora-When_The_Lights_Go_Out-Interior_Frontal.jpg\n",
+ "4.bp.blogspot.com/-TFHOJVIW3a8/T_1mD6MdOxI/AAAAAAAADAg/PhKDPx0Aqu0/s1600/ivan_pavlov.jpg\n",
+ "863793661388437597-a-1802744773732722657-s-sites.googlegroups.com/site/virginmarysite/Home/jackneosex.jpg\n",
+ "amckiereads.files.wordpress.com/2010/12/darwish.jpg?w=600\n",
+ "img.interia.pl/komputery/nimg/5/7/Kazuo_Hirai_plan_odbudowe_5726348.jpg\n",
+ "2.bp.blogspot.com/-JAYvKsHcQPI/T4f3wbCIMDI/AAAAAAAAFDM/lTs3uKlb3A0/s1600/deeksha_seth_launches_chandana_brothers_showroom_Yellow+Saree+smiling+pics+%25285%2529.jpg\n",
+ "1.bp.blogspot.com/-D3SI27GS7-g/U-iD5fPcFDI/AAAAAAAABOs/VaB_BRRa6OU/s320/news8.jpg\n",
+ "1.bp.blogspot.com/_ilOjS7A_kk4/SVGCtcyAAmI/AAAAAAAAAH4/9-KKBqYeDBA/s400/playstation-3-grill_12.jpg\n"
+ ]
+ }
+ ],
+ "source": [
+ "for other in others[:10]:\n",
+ " print(other)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "21319"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(results['flickr.com']) +len(results['wikimedia.org']) + len(others)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "megapixels",
+ "language": "python",
+ "name": "megapixels"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/megapixels/notebooks/datasets/megaface/megaface_prepare_flickr_api.ipynb b/megapixels/notebooks/datasets/megaface/prepare_flickr_api.ipynb
index 48133228..3c0dd631 100644
--- a/megapixels/notebooks/datasets/megaface/megaface_prepare_flickr_api.ipynb
+++ b/megapixels/notebooks/datasets/megaface/prepare_flickr_api.ipynb
@@ -4,12 +4,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Prepare Flickr API Batch CSV"
+ "# MegaFace: Prepare Flickr API Batch CSV"
]
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -29,6 +29,115 @@
"cell_type": "markdown",
"metadata": {},
"source": [
+ "## Create the file meta CSV"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fp_in_meta_files = '/data_store_hdd/datasets/people/megaface/research/megaface_meta_file.csv'\n",
+ "fp_out_meta_files = '/data_store_hdd/datasets/people/megaface/research/megaface_meta_file_ext.csv'\n",
+ "fp_out_meta_flickr = '/data_store_hdd/datasets/people/megaface/research/megaface_meta_flickr_02.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_files = pd.read_csv(fp_in_meta_files)\n",
+ "df_files.rename(columns={'subdir': 'filepath'}, inplace=True)\n",
+ "file_records = df_files.to_dict('records')\n",
+ "photo_ids = [x['photo_id'] for x in file_records]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d91329c27b8b4fc4ae68eb817ea82e19",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=4753520), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "for file_record in tqdm(file_records):\n",
+ " photo_id = Path(file_record['url']).stem.split('_')[0]\n",
+ " filepath = f'{photo_id}.jpg'\n",
+ " file_record['filepath'] = filepath\n",
+ "\n",
+ "df_meta_file = pd.DataFrame.from_dict(file_records)\n",
+ "df_meta_file.drop_duplicates(inplace=True)\n",
+ "df_meta_file.to_csv(fp_out_meta_files, index=False)\n",
+ "print(f'Wrote {len(df_meta_file)} lines')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Create the NSID/count CSV"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total users: 48,382\n",
+ "Total images: 3,311,471\n"
+ ]
+ }
+ ],
+ "source": [
+ "nsid_groups = df_meta_file.groupby('nsid')\n",
+ "results = []\n",
+ "for nsid, group in nsid_groups:\n",
+ " results.append({'nsid': nsid, 'count': len(group)})\n",
+ "df_meta_flickr = pd.DataFrame.from_dict(results)\n",
+ "df_meta_flickr.to_csv(fp_out_meta_flickr, index=False)\n",
+ "\n",
+ "print(f'Total users: {len(results):,}')\n",
+ "print(f'Total images: {df_meta_flickr[\"count\"].sum():,}')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
"## Create CSV for API"
]
},
diff --git a/megapixels/notebooks/datasets/pipa/pipa_flickr_metadata_cleanup.ipynb b/megapixels/notebooks/datasets/pipa/flickr_cleanup.ipynb
index 8746a740..57c32bec 100644
--- a/megapixels/notebooks/datasets/pipa/pipa_flickr_metadata_cleanup.ipynb
+++ b/megapixels/notebooks/datasets/pipa/flickr_cleanup.ipynb
@@ -38,12 +38,50 @@
},
{
"cell_type": "code",
- "execution_count": 31,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
- "fp_in = '/data_store/datasets/people/pipa/research/pipa_flickr_metadata_ext.csv'\n",
- "fp_out = '/data_store/datasets/people/pipa/research/pipa_flickr_metadata_test.csv'"
+ "fp_in_api_photo_id = '/data_store_hdd/datasets/people/pipa/research/flickr_api_photo_id.csv'\n",
+ "fp_out_filepaths = '/data_store_hdd/datasets/people/pipa/research/pipa_filepaths.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = pd.read_csv(fp_in_api_photo_id)\n",
+ "records = df.to_dict('records')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "results = []\n",
+ "for record in records:\n",
+ "    # bind the IDs from this record; the filepath must not rely on leftover kernel variables\n",
+ "    photo_id = record.get('photo_id')\n",
+ "    secret = record.get('secret')\n",
+ "    results.append({\n",
+ "        'photo_id': photo_id,\n",
+ "        'nsid': record.get('nsid'),\n",
+ "        'url': record.get('url'),\n",
+ "        'secret': secret,\n",
+ "        'filepath': f'{photo_id}_{secret}.jpg'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pd.DataFrame.from_dict(results).to_csv(fp_out_filepaths, index=False)"
]
},
{
diff --git a/megapixels/notebooks/datasets/vgg_face1/flickr_meta.ipynb b/megapixels/notebooks/datasets/vgg_face1/flickr_meta.ipynb
new file mode 100644
index 00000000..99bbe32e
--- /dev/null
+++ b/megapixels/notebooks/datasets/vgg_face1/flickr_meta.ipynb
@@ -0,0 +1,287 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# VGG Face (V1) Prepare Flickr API"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%reload_ext autoreload\n",
+ "%autoreload 2\n",
+ "\n",
+ "import os\n",
+ "from os.path import join\n",
+ "from glob import glob, iglob\n",
+ "from pathlib import Path\n",
+ "from tqdm import tqdm_notebook as tqdm\n",
+ "\n",
+ "import pandas as pd\n",
+ "import sys\n",
+ "sys.path.append('/work/megapixels_dev/megapixels/')\n",
+ "from app.utils import file_utils"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Convert annotation files to list of photo IDs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fp_dir_annos = '/data_store/datasets/people/vgg_face/downloads/vgg_face_dataset/files/'\n",
+ "fp_photo_ids = '/data_store/datasets/people/vgg_face/research/photo_ids.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "b92b24eac4c84f2f96e32f6eba8d2dc0",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=2622), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "photo_ids = []\n",
+ "all_photos = []\n",
+ "fp_annos = glob(join(fp_dir_annos, '*.txt'))\n",
+ "for fp_anno in tqdm(fp_annos):\n",
+ " df_annos = pd.read_csv(fp_anno, delimiter=' ', names=['url', 'a', 'b', 'c', 'd', 'e', 'f', 'g'])\n",
+ " records = df_annos.to_dict('records')\n",
+ " for record in records:\n",
+ " url = record['url']\n",
+ " all_photos.append(url)\n",
+ " if 'flickr.com' in url:\n",
+ " photo_id = Path(url).stem.split('_')[0]\n",
+ " photo_ids.append({'photo_id': photo_id})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2604849\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(all_photos))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "PosixPath('/data_store/datasets/people/vgg_face/research/photo_ids.csv')"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "file_utils.ensure_posixpath(fp_photo_ids)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pd.DataFrame.from_dict(photo_ids).to_csv(fp_photo_ids, index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## Convert Flickr API data to filepaths and counts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fp_in_flickr_api = '/data_store_hdd/datasets/people/vgg_face/research/vgg_flickr_api_photo_ids.csv'\n",
+ "fp_out_filepaths = '/data_store_hdd/datasets/people/vgg_face/research/vgg_filepaths.csv'\n",
+ "fp_out_counts = '/data_store_hdd/datasets/people/vgg_face/research/vgg_counts.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = pd.read_csv(fp_in_flickr_api)\n",
+ "records = df.to_dict('records')\n",
+ "\n",
+ "# write filepaths\n",
+ "results = []\n",
+ "for record in records:\n",
+ " photo_id = record['photo_id']\n",
+ " obj = {\n",
+ " 'filepath': f'{photo_id}.jpg',\n",
+ " 'nsid': record['nsid'],\n",
+ " 'photo_id': photo_id,\n",
+ " 'url': record['url']\n",
+ " }\n",
+ " results.append(obj)\n",
+ "\n",
+ "pd.DataFrame.from_dict(results).to_csv(fp_out_filepaths, index=False)\n",
+ "\n",
+ "# write counts\n",
+ "results = []\n",
+ "nsid_groups = df.groupby('nsid')\n",
+ "for nsid, group in nsid_groups:\n",
+ " results.append({'nsid': nsid, 'count': len(group)})\n",
+ "\n",
+ "pd.DataFrame.from_dict(results).to_csv(fp_out_counts, index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fp = '/data_store/datasets/msc/embassies/embassies_on_flickr.csv'\n",
+ "df = pd.read_csv(fp)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_match = df[df['nsid'] == '50747072@N03']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " bureau country nsid path_alias type \\\n",
+ "0 EUR Russia 50747072@N03 otkroyameriku Consulate \n",
+ "\n",
+ " url username \\\n",
+ "0 http://www.flickr.com/photos/otkroyameriku Генконсульство США в СПб \n",
+ "\n",
+ " verified notes \n",
+ "0 NaN NaN 1\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(df_match, len(df_match))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'50747072@N03'"
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_match['nsid'].iloc[0]  # `match` is never defined in this notebook; read the value from df_match"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "megapixels",
+ "language": "python",
+ "name": "megapixels"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb b/megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb
index c2ec5c84..66f803a4 100644
--- a/megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb
+++ b/megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb
@@ -37,6 +37,318 @@
]
},
{
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Create filepaths CSV for individual lookup"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fp_flickr_meta = '/data_store/datasets/people/who_goes_there/research/wgt_flickr_queries.csv'\n",
+ "fp_filepaths = '/data_store/datasets/people/who_goes_there/research/who_goes_there_filepaths.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_flickr_meta = pd.read_csv(fp_flickr_meta)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Help on function drop in module pandas.core.frame:\n",
+ "\n",
+ "drop(self, labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')\n",
+ " Drop specified labels from rows or columns.\n",
+ " \n",
+ " Remove rows or columns by specifying label names and corresponding\n",
+ " axis, or by specifying directly index or column names. When using a\n",
+ " multi-index, labels on different levels can be removed by specifying\n",
+ " the level.\n",
+ " \n",
+ " Parameters\n",
+ " ----------\n",
+ " labels : single label or list-like\n",
+ " Index or column labels to drop.\n",
+ " axis : {0 or 'index', 1 or 'columns'}, default 0\n",
+ " Whether to drop labels from the index (0 or 'index') or\n",
+ " columns (1 or 'columns').\n",
+ " index, columns : single label or list-like\n",
+ " Alternative to specifying axis (``labels, axis=1``\n",
+ " is equivalent to ``columns=labels``).\n",
+ " \n",
+ " .. versionadded:: 0.21.0\n",
+ " level : int or level name, optional\n",
+ " For MultiIndex, level from which the labels will be removed.\n",
+ " inplace : bool, default False\n",
+ " If True, do operation inplace and return None.\n",
+ " errors : {'ignore', 'raise'}, default 'raise'\n",
+ " If 'ignore', suppress error and only existing labels are\n",
+ " dropped.\n",
+ " \n",
+ " Returns\n",
+ " -------\n",
+ " dropped : pandas.DataFrame\n",
+ " \n",
+ " Raises\n",
+ " ------\n",
+ " KeyError\n",
+ " If none of the labels are found in the selected axis\n",
+ " \n",
+ " See Also\n",
+ " --------\n",
+ " DataFrame.loc : Label-location based indexer for selection by label.\n",
+ " DataFrame.dropna : Return DataFrame with labels on given axis omitted\n",
+ " where (all or any) data are missing.\n",
+ " DataFrame.drop_duplicates : Return DataFrame with duplicate rows\n",
+ " removed, optionally only considering certain columns.\n",
+ " Series.drop : Return Series with specified index labels removed.\n",
+ " \n",
+ " Examples\n",
+ " --------\n",
+ " >>> df = pd.DataFrame(np.arange(12).reshape(3,4),\n",
+ " ... columns=['A', 'B', 'C', 'D'])\n",
+ " >>> df\n",
+ " A B C D\n",
+ " 0 0 1 2 3\n",
+ " 1 4 5 6 7\n",
+ " 2 8 9 10 11\n",
+ " \n",
+ " Drop columns\n",
+ " \n",
+ " >>> df.drop(['B', 'C'], axis=1)\n",
+ " A D\n",
+ " 0 0 3\n",
+ " 1 4 7\n",
+ " 2 8 11\n",
+ " \n",
+ " >>> df.drop(columns=['B', 'C'])\n",
+ " A D\n",
+ " 0 0 3\n",
+ " 1 4 7\n",
+ " 2 8 11\n",
+ " \n",
+ " Drop a row by index\n",
+ " \n",
+ " >>> df.drop([0, 1])\n",
+ " A B C D\n",
+ " 2 8 9 10 11\n",
+ " \n",
+ " Drop columns and/or rows of MultiIndex DataFrame\n",
+ " \n",
+ " >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],\n",
+ " ... ['speed', 'weight', 'length']],\n",
+ " ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],\n",
+ " ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])\n",
+ " >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],\n",
+ " ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20],\n",
+ " ... [250, 150], [1.5, 0.8], [320, 250],\n",
+ " ... [1, 0.8], [0.3,0.2]])\n",
+ " >>> df\n",
+ " big small\n",
+ " lama speed 45.0 30.0\n",
+ " weight 200.0 100.0\n",
+ " length 1.5 1.0\n",
+ " cow speed 30.0 20.0\n",
+ " weight 250.0 150.0\n",
+ " length 1.5 0.8\n",
+ " falcon speed 320.0 250.0\n",
+ " weight 1.0 0.8\n",
+ " length 0.3 0.2\n",
+ " \n",
+ " >>> df.drop(index='cow', columns='small')\n",
+ " big\n",
+ " lama speed 45.0\n",
+ " weight 200.0\n",
+ " length 1.5\n",
+ " falcon speed 320.0\n",
+ " weight 1.0\n",
+ " length 0.3\n",
+ " \n",
+ " >>> df.drop(index='length', level=1)\n",
+ " big small\n",
+ " lama speed 45.0 30.0\n",
+ " weight 200.0 100.0\n",
+ " cow speed 30.0 20.0\n",
+ " weight 250.0 150.0\n",
+ " falcon speed 320.0 250.0\n",
+ " weight 1.0 0.8\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "help(pd.DataFrame.drop)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['nickname', 'nsid', 'photo_id', 'url'], dtype='object')"
+ ]
+ },
+ "execution_count": 59,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Index(['nsid', 'photo_id', 'url'], dtype='object')\n"
+ ]
+ }
+ ],
+ "source": [
+ "df_flickr_meta.drop(columns=['nickname', 'subdir'], inplace=True, errors='ignore')  # the loaded CSV has 'nickname' and may lack 'subdir'\n",
+ "print(df_flickr_meta.keys())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#df_flickr_meta['subdir'] = ''\n",
+ "df_flickr_meta['filepath'] = ''"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_flickr_meta.to_csv(fp_filepaths, index=False)  # already a DataFrame; no from_dict round-trip needed"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>nsid</th>\n",
+ " <th>photo_id</th>\n",
+ " <th>url</th>\n",
+ " <th>filepath</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>51576145@N02</td>\n",
+ " <td>4762068863</td>\n",
+ " <td>http://farm5.staticflickr.com/4117/4762068863_...</td>\n",
+ " <td></td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>29689383@N02</td>\n",
+ " <td>5711730606</td>\n",
+ " <td>http://farm3.staticflickr.com/2800/5711730606_...</td>\n",
+ " <td></td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>29689383@N02</td>\n",
+ " <td>5711730606</td>\n",
+ " <td>http://farm3.staticflickr.com/2800/5711730606_...</td>\n",
+ " <td></td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>27982139@N00</td>\n",
+ " <td>2439203939</td>\n",
+ " <td>http://farm3.staticflickr.com/2105/2439203939_...</td>\n",
+ " <td></td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>27982139@N00</td>\n",
+ " <td>2464402099</td>\n",
+ " <td>http://farm4.staticflickr.com/3030/2464402099_...</td>\n",
+ " <td></td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " nsid photo_id \\\n",
+ "0 51576145@N02 4762068863 \n",
+ "1 29689383@N02 5711730606 \n",
+ "2 29689383@N02 5711730606 \n",
+ "3 27982139@N00 2439203939 \n",
+ "4 27982139@N00 2464402099 \n",
+ "\n",
+ " url filepath \n",
+ "0 http://farm5.staticflickr.com/4117/4762068863_... \n",
+ "1 http://farm3.staticflickr.com/2800/5711730606_... \n",
+ "2 http://farm3.staticflickr.com/2800/5711730606_... \n",
+ "3 http://farm3.staticflickr.com/2105/2439203939_... \n",
+ "4 http://farm4.staticflickr.com/3030/2464402099_... "
+ ]
+ },
+ "execution_count": 66,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_flickr_meta.head()"
+ ]
+ },
+ {
"cell_type": "code",
"execution_count": 31,
"metadata": {},