new nbs

author: adamhrv <adam@ahprojects.com> 2018-12-13 14:33:05 +0100
committer: adamhrv <adam@ahprojects.com> 2018-12-13 14:33:05 +0100
commit: 49a49bebe3f972e93add837180f5672a4ae62ce0 (patch)
tree: 03175a840591518998e4e8ecd92d64e599ef4eb0 /megapixels/notebooks/datasets/lfw/lfw_names.ipynb
parent: 7891e9d0dc9adcb68749f0e8049c0c8901b4f288 (diff)
1 files changed, 226 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/lfw/lfw_names.ipynb b/megapixels/notebooks/datasets/lfw/lfw_names.ipynb
new file mode 100644
index 00000000..37a1bd8f
--- /dev/null
+++ b/megapixels/notebooks/datasets/lfw/lfw_names.ipynb
@@ -0,0 +1,226 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# LFW Names\n",
+    "\n",
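+    "- add a gender label (`m`/`f`/`x`) to each name in `lfw_names.csv`, after converting the filenames in the male/female lists to the csv name format, and save the result as `lfw_names_gendered.csv`"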
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%reload_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "import os\n",
+    "from os.path import join\n",
+    "import math\n",
+    "from glob import glob\n",
+    "from random import randint\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "%matplotlib inline\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "import sys\n",
+    "sys.path.append('/work/megapixels_dev/megapixels/')\n",
+    "from app.utils import file_utils"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "DATA_STORE = '/data_store_nas/'  # data store root\n",
+    "dir_dataset = join(DATA_STORE, 'datasets/people/lfw')\n",
+    "# csv of names plus the male/female name lists, all inside dir_dataset\n",
+    "fp_names, fp_male, fp_female = (\n",
+    "  join(dir_dataset, fn) for fn in ('lfw_names.csv', 'male_names.txt', 'female_names.txt'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['Alfred Ford', 'Craig Fitzgibbon']\n",
+      "['Claudia Coslovich', 'Allison Searing']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# load names: one dict per csv row, keyed by row index\n",
+    "df_names = pd.read_csv(fp_names)\n",
+    "names = df_names.to_dict('index')\n",
+    "# gender lists hold filenames: drop the last 9 chars, underscores become spaces\n",
+    "def to_csv_name(fn):\n",
+    "  return fn[:-9].replace('_', ' ')\n",
+    "# load gender, converting filenames to csv names\n",
+    "names_male = [to_csv_name(t) for t in file_utils.load_text(fp_male)]\n",
+    "names_female = [to_csv_name(t) for t in file_utils.load_text(fp_female)]\n",
+    "# check names\n",
+    "print(names_male[:2])\n",
+    "print(names_female[:2])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'name': 'AJ Cook', 'images': 1}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# show the first record only\n",
+    "for idx, n in list(names.items())[:1]:\n",
+    "  print(n)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# add gender to name item dict\n",
+    "# sets make each membership test O(1) instead of a list scan\n",
+    "male_set, female_set = set(names_male), set(names_female)\n",
+    "female_set.add('Tara Kirk')  # unlabeled item\n",
+    "for item in names.values():\n",
+    "  name = item['name']\n",
+    "  if name in male_set:  # the male list is checked first\n",
+    "    item['gender'] = 'm'\n",
+    "  elif name in female_set:\n",
+    "    item['gender'] = 'f'\n",
+    "  else:\n",
+    "    item['gender'] = 'x'  # name found in neither list"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'name': 'AJ Cook', 'images': 1, 'gender': 'f'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# materialize the records (gender included), then show the first one\n",
+    "names_list = [*names.values()]\n",
+    "for n in names_list[:1]:\n",
+    "  print(n)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# save to csv: one row per name record, the dict keys (row indices) are dropped\n",
+    "fp_gendered = join(dir_dataset, 'lfw_names_gendered.csv')\n",
+    "df_names_gendered = pd.DataFrame([*names.values()])\n",
+    "df_names_gendered.to_csv(fp_gendered, index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#%cat $fp_gendered | head -n2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[]\n",
+      "4277 1472 5749 None\n"
+     ]
+    }
+   ],
+   "source": [
+    "f = [v for v in names.values() if v['gender'] == 'f']  # labeled female\n",
+    "m = [v for v in names.values() if v['gender'] == 'm']  # labeled male\n",
+    "x = [v for v in names.values() if v['gender'] not in ('f', 'm')]  # any other label\n",
+    "print(len(m), len(f), len(f) + len(m), x)  # pass x itself: a nested print() would show None"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "5749\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(len(names))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [conda env:megapixels]",
+   "language": "python",
+   "name": "conda-env-megapixels-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
author	adamhrv <adam@ahprojects.com>	2018-12-13 14:33:05 +0100
committer	adamhrv <adam@ahprojects.com>	2018-12-13 14:33:05 +0100
commit	49a49bebe3f972e93add837180f5672a4ae62ce0 (patch)
tree	03175a840591518998e4e8ecd92d64e599ef4eb0 /megapixels/notebooks/datasets/lfw/lfw_names.ipynb
parent	7891e9d0dc9adcb68749f0e8049c0c8901b4f288 (diff)