nbs for data collection

author: adamhrv <adam@ahprojects.com> 2019-03-19 12:21:21 +0100
committer: adamhrv <adam@ahprojects.com> 2019-03-19 12:21:21 +0100
commit: a16b3cc7f796a5abe6c8c79f22b178785e6971f5 (patch)
tree: ec0a62f66479d7e7c7f0f03a5c2de79d87c51c3e /megapixels/notebooks/datasets/lfpw/lfpw_csv_download.ipynb
parent: 70f79c37278d7c47bee29cdf091bde448aae9240 (diff)
1 files changed, 115 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/lfpw/lfpw_csv_download.ipynb b/megapixels/notebooks/datasets/lfpw/lfpw_csv_download.ipynb
new file mode 100644
index 00000000..dd038a8f
--- /dev/null
+++ b/megapixels/notebooks/datasets/lfpw/lfpw_csv_download.ipynb
@@ -0,0 +1,115 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Prepare URL CSV for LFPW"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%reload_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "import os\n",
+    "from os.path import join\n",
+    "from glob import glob\n",
+    "from pathlib import Path\n",
+    "\n",
+    "from tqdm import tqdm_notebook as tqdm\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import hashlib\n",
+    "\n",
+    "import sys\n",
+    "sys.path.append('/work/megapixels_dev/megapixels')\n",
+    "from app.utils import api_utils, identity_utils"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create CSV for Image Download"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fp_in_train = '/data_store/datasets/people/lfpw/downloads/kbvt_lfpw_v1_train.csv'\n",
+    "fp_in_test = '/data_store/datasets/people/lfpw/downloads/kbvt_lfpw_v1_test.csv'\n",
+    "fp_out = '/data_store/datasets/people/lfpw/downloads/urls.csv'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_train, df_test = (pd.read_csv(fp, sep='\\t') for fp in (fp_in_train, fp_in_test))  # tab-separated inputs\n",
+    "frames = [df_test, df_train]  # test rows first, then train rows\n",
+    "df = pd.concat(frames, sort=False)\n",
+    "records = df.to_dict(orient='records')  # one dict per row"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from urllib.parse import urlparse  # local import: only this cell needs it\n",
+    "urlmaps = []  # one {'url', 'filepath'} dict per record\n",
+    "for record in records:\n",
+    "  url = record['imgurl']\n",
+    "  ext = Path(urlparse(url).path).suffix.lower()  # suffix of the URL path only, not query/host\n",
+    "  # '.jpeg' and a missing suffix both become '.jpg'; other suffixes (e.g. '.png') are kept\n",
+    "  if ext in ('.jpeg', ''):\n",
+    "    ext = '.jpg'\n",
+    "  sha256 = hashlib.sha256(url.encode('utf-8')).hexdigest()  # stable name derived from the URL\n",
+    "  filepath = sha256 + ext\n",
+    "  urlmaps.append({'url': url, 'filepath': filepath})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_urls = pd.DataFrame(urlmaps)  # one row per url/filepath mapping\n",
+    "df_urls.to_csv(fp_out, index=False)  # leave the DataFrame index out of the CSV"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "megapixels",
+   "language": "python",
+   "name": "megapixels"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
author	adamhrv <adam@ahprojects.com>	2019-03-19 12:21:21 +0100
committer	adamhrv <adam@ahprojects.com>	2019-03-19 12:21:21 +0100
commit	a16b3cc7f796a5abe6c8c79f22b178785e6971f5 (patch)
tree	ec0a62f66479d7e7c7f0f03a5c2de79d87c51c3e /megapixels/notebooks/datasets/lfpw/lfpw_csv_download.ipynb
parent	70f79c37278d7c47bee29cdf091bde448aae9240 (diff)