{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# LFW Names\n",
    "\n",
    "- add gender and format names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import os\n",
    "from os.path import join\n",
    "import math\n",
    "from glob import glob\n",
    "from random import randint\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import sys\n",
    "sys.path.append('/work/megapixels_dev/megapixels/')\n",
    "from app.utils import file_utils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "DATA_STORE = '/data_store_nas/'\n",
    "dir_dataset = join(DATA_STORE, 'datasets/people/lfw')\n",
    "fp_names = join(dir_dataset, 'lfw_names.csv')\n",
    "fp_male = join(dir_dataset, 'male_names.txt')\n",
    "fp_female = join(dir_dataset, 'female_names.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Alfred Ford', 'Craig Fitzgibbon']\n",
      "['Claudia Coslovich', 'Allison Searing']\n"
     ]
    }
   ],
   "source": [
    "# load names\n",
    "df_names = pd.read_csv(fp_names)\n",
    "names = df_names.to_dict('index')\n",
    "# load gender\n",
    "names_male = file_utils.load_text(fp_male)\n",
    "names_female = file_utils.load_text(fp_female)\n",
    "# convert filenames to csv names\n",
    "names_male = [t.replace('_',' ')[:-9] for t in names_male]\n",
    "names_female = [t.replace('_',' ')[:-9] for t in names_female]\n",
    "# check names\n",
    "print(names_male[:2])\n",
    "print(names_female[:2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'name': 'AJ Cook', 'images': 1}\n"
     ]
    }
   ],
   "source": [
    "for idx, n in names.items():\n",
    "  print(n)\n",
    "  break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "# add gender to name item dict\n",
    "for idx, item in names.items():\n",
    "  name = item['name']\n",
    "  if name in names_male:\n",
    "    g = 'm'\n",
    "  elif name in names_female:\n",
    "    g = 'f'\n",
    "  elif name == 'Tara Kirk':\n",
    "    g = 'f'  # unlabeled item\n",
    "  else:\n",
    "    g = 'x'\n",
    "  names[idx]['gender'] = g"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'name': 'AJ Cook', 'images': 1, 'gender': 'f'}\n"
     ]
    }
   ],
   "source": [
    "names_list = list(names.values())\n",
    "for n in names_list:\n",
    "  print(n)\n",
    "  break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save to csv\n",
    "fp_gendered = join(dir_dataset, 'lfw_names_gendered.csv')\n",
    "df_names_gendered = pd.DataFrame.from_dict(list(names.values()))  # ignore the indices\n",
    "df_names_gendered.to_csv(fp_gendered, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "#%cat $fp_names_gendered | head -n2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[]\n",
      "4277 1472 5749 None\n"
     ]
    }
   ],
   "source": [
    "f = [x for k, x in names.items() if x['gender'] == 'f']\n",
    "m = [x for k, x in names.items() if x['gender'] == 'm']\n",
    "x = [x for k, x in names.items() if x['gender'] not in ['f','m']]\n",
    "print(len(m), len(f), len(f) + len(m), print(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5749\n"
     ]
    }
   ],
   "source": [
    "print(len(names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:megapixels]",
   "language": "python",
   "name": "conda-env-megapixels-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}