1 files changed, 115 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/lfpw/lfpw_csv_download.ipynb b/megapixels/notebooks/datasets/lfpw/lfpw_csv_download.ipynb
new file mode 100644
index 00000000..dd038a8f
--- /dev/null
+++ b/megapixels/notebooks/datasets/lfpw/lfpw_csv_download.ipynb
@@ -0,0 +1,115 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Prepare URL CSV for LFPW"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%reload_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "import os\n",
+    "from os.path import join\n",
+    "from glob import glob\n",
+    "from pathlib import Path\n",
+    "\n",
+    "from tqdm import tqdm_notebook as tqdm\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import hashlib\n",
+    "\n",
+    "import sys\n",
+    "sys.path.append('/work/megapixels_dev/megapixels')\n",
+    "from app.utils import api_utils, identity_utils"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create CSV for Image Download"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Input listings for the LFPW train and test splits\n",
+    "fp_in_train, fp_in_test = (\n",
+    "  '/data_store/datasets/people/lfpw/downloads/kbvt_lfpw_v1_train.csv',\n",
+    "  '/data_store/datasets/people/lfpw/downloads/kbvt_lfpw_v1_test.csv',\n",
+    ")\n",
+    "# Destination CSV that will hold the url -> filepath table\n",
+    "fp_out = '/data_store/datasets/people/lfpw/downloads/urls.csv'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Both listings are read with the same tab-separated settings\n",
+    "df_train, df_test = (\n",
+    "  pd.read_csv(fp_listing, sep='\\t') for fp_listing in (fp_in_train, fp_in_test)\n",
+    ")\n",
+    "# Test rows are stacked ahead of train rows; sort=False leaves the columns unsorted\n",
+    "df = pd.concat([df_test, df_train], sort=False)\n",
+    "# One dict per row, keyed by column name\n",
+    "records = df.to_dict(orient='records')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Extensions kept as-is; anything else (including no extension) becomes '.jpg'\n",
+    "image_exts = ('.jpg', '.png', '.gif', '.bmp')\n",
+    "\n",
+    "def url_to_filepath(url):\n",
+    "  \"\"\"Return the local file name for an image URL.\n",
+    "\n",
+    "  The name is the SHA-256 hex digest of the full URL followed by a\n",
+    "  lower-case extension. '.jpeg' is folded into '.jpg', and a missing or\n",
+    "  unrecognized extension falls back to '.jpg'.\n",
+    "  \"\"\"\n",
+    "  # Path() is unaware of URL syntax: without this, 'a.jpg?x=1' would give\n",
+    "  # the suffix '.jpg?x=1', so drop the query string and fragment first\n",
+    "  url_path = url.split('?', 1)[0].split('#', 1)[0]\n",
+    "  ext = Path(url_path).suffix.lower()\n",
+    "  if ext == '.jpeg':\n",
+    "    ext = '.jpg'\n",
+    "  # The previous check (ext != '') was inverted: it rewrote every real\n",
+    "  # extension to '.jpg' and left extension-less URLs without any extension\n",
+    "  if ext not in image_exts:\n",
+    "    ext = '.jpg'\n",
+    "  # Hash the original URL (query included) so distinct URLs stay distinct\n",
+    "  return hashlib.sha256(url.encode('utf-8')).hexdigest() + ext\n",
+    "\n",
+    "# One {'url', 'filepath'} row per record, in record order\n",
+    "urlmaps = []\n",
+    "for record in records:\n",
+    "  url = record['imgurl']\n",
+    "  urlmaps.append({'url': url, 'filepath': url_to_filepath(url)})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Tabulate the url/filepath rows and write them without the index column\n",
+    "df_urls = pd.DataFrame(urlmaps)\n",
+    "df_urls.to_csv(path_or_buf=fp_out, index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "megapixels",
+   "language": "python",
+   "name": "megapixels"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}