summaryrefslogtreecommitdiff
path: root/megapixels/notebooks/datasets/lfw/lfw_names.ipynb
diff options
context:
space:
mode:
authoradamhrv <adam@ahprojects.com>2018-12-13 14:33:05 +0100
committeradamhrv <adam@ahprojects.com>2018-12-13 14:33:05 +0100
commit49a49bebe3f972e93add837180f5672a4ae62ce0 (patch)
tree03175a840591518998e4e8ecd92d64e599ef4eb0 /megapixels/notebooks/datasets/lfw/lfw_names.ipynb
parent7891e9d0dc9adcb68749f0e8049c0c8901b4f288 (diff)
new nbs
Diffstat (limited to 'megapixels/notebooks/datasets/lfw/lfw_names.ipynb')
-rw-r--r--megapixels/notebooks/datasets/lfw/lfw_names.ipynb226
1 files changed, 226 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/lfw/lfw_names.ipynb b/megapixels/notebooks/datasets/lfw/lfw_names.ipynb
new file mode 100644
index 00000000..37a1bd8f
--- /dev/null
+++ b/megapixels/notebooks/datasets/lfw/lfw_names.ipynb
@@ -0,0 +1,226 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# LFW Names\n",
+ "\n",
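+ "- add gender and format names: label each name in lfw_names.csv with a gender code (m/f, or x if unmatched) and save the result to lfw_names_gendered.csv"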
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%reload_ext autoreload\n",
+ "%autoreload 2\n",
+ "\n",
+ "import os\n",
+ "from os.path import join\n",
+ "import math\n",
+ "from glob import glob\n",
+ "from random import randint\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "%matplotlib inline\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "import sys\n",
+ "sys.path.append('/work/megapixels_dev/megapixels/')\n",
+ "from app.utils import file_utils"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "DATA_STORE = '/data_store_nas/' # root of the data store (hard-coded absolute path; adjust per machine)\n",
+ "dir_dataset = join(DATA_STORE, 'datasets/people/lfw') # directory holding the LFW files used below\n",
+ "fp_names = join(dir_dataset, 'lfw_names.csv') # input: name table read with pd.read_csv\n",
+ "fp_male = join(dir_dataset, 'male_names.txt') # input: male list read with file_utils.load_text\n",
+ "fp_female = join(dir_dataset, 'female_names.txt') # input: female list read with file_utils.load_text"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['Alfred Ford', 'Craig Fitzgibbon']\n",
+ "['Claudia Coslovich', 'Allison Searing']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# load the name table as {row index: {column: value}}\n",
+ "df_names = pd.read_csv(fp_names)\n",
+ "names = df_names.to_dict('index')\n",
+ "# gender lists hold image filenames, assumed to look like 'First_Last_0001.jpg' (TODO confirm)\n",
+ "n_suffix = len('_0001.jpg') # 9 trailing characters dropped from every entry\n",
+ "def _csv_name(filename):\n",
+ "    return filename.replace('_', ' ')[:-n_suffix]\n",
+ "names_male = [_csv_name(t) for t in file_utils.load_text(fp_male)]\n",
+ "names_female = [_csv_name(t) for t in file_utils.load_text(fp_female)]\n",
+ "# check names\n",
+ "print(names_male[:2])\n",
+ "print(names_female[:2])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'name': 'AJ Cook', 'images': 1}\n"
+ ]
+ }
+ ],
+ "source": [
+ "# peek at the first record to check the dict layout\n",
+ "for first in list(names.values())[:1]:\n",
+ "    print(first)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# add gender to name item dict\n",
+ "# build sets once so each membership test is O(1) instead of a scan of the whole list\n",
+ "male_set = set(names_male)\n",
+ "female_set = set(names_female)\n",
+ "for item in names.values():\n",
+ "    name = item['name']\n",
+ "    if name in male_set:\n",
+ "        item['gender'] = 'm'\n",
+ "    elif name in female_set or name == 'Tara Kirk': # 'Tara Kirk' is an unlabeled item\n",
+ "        item['gender'] = 'f'\n",
+ "    else:\n",
+ "        item['gender'] = 'x' # found in neither list"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'name': 'AJ Cook', 'images': 1, 'gender': 'f'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "names_list = list(names.values())\n",
+ "# show only the first record\n",
+ "for first in names_list[:1]:\n",
+ "    print(first)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# save to csv\n",
+ "fp_gendered = join(dir_dataset, 'lfw_names_gendered.csv')\n",
+ "df_names_gendered = pd.DataFrame(list(names.values())) # one row per record; the dict keys (row indices) are dropped\n",
+ "df_names_gendered.to_csv(fp_gendered, index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#%cat $fp_gendered | head -n2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[]\n",
+ "4277 1472 5749 None\n"
+ ]
+ }
+ ],
+ "source": [
+ "f = [v for v in names.values() if v['gender'] == 'f']\n",
+ "m = [v for v in names.values() if v['gender'] == 'm']\n",
+ "x = [v for v in names.values() if v['gender'] not in ('f', 'm')] # unlabeled records; inspect x if the last count is nonzero\n",
+ "print(len(m), len(f), len(f) + len(m), len(x)) # male, female, labeled total, unlabeled"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "5749\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(names)) # total number of name records"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python [conda env:megapixels]",
+ "language": "python",
+ "name": "conda-env-megapixels-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}