diff options
Diffstat (limited to 'megapixels/notebooks/datasets/identity/lfw_names.ipynb')
| -rw-r--r-- | megapixels/notebooks/datasets/identity/lfw_names.ipynb | 226 |
1 files changed, 226 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/identity/lfw_names.ipynb b/megapixels/notebooks/datasets/identity/lfw_names.ipynb new file mode 100644 index 00000000..8c474dd7 --- /dev/null +++ b/megapixels/notebooks/datasets/identity/lfw_names.ipynb @@ -0,0 +1,226 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# LFW Names\n", + "\n", + "- add gender and format names" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import os\n", + "from os.path import join\n", + "import math\n", + "from glob import glob\n", + "from random import randint\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import sys\n", + "sys.path.append('/work/megapixels_dev/megapixels/')\n", + "from app.utils import file_utils" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "DATA_STORE = '/data_store_nas/'\n", + "dir_dataset = join(DATA_STORE, 'datasets/people/lfw')\n", + "fp_names = join(dir_dataset, 'lfw_names.csv')\n", + "fp_male = join(dir_dataset, 'male_names.txt')\n", + "fp_female = join(dir_dataset, 'female_names.txt')" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Alfred Ford', 'Craig Fitzgibbon']\n", + "['Claudia Coslovich', 'Allison Searing']\n" + ] + } + ], + "source": [ + "# load names\n", + "df_names = pd.read_csv(fp_names)\n", + "names = df_names.to_dict('index')\n", + "# load gender\n", + "names_male = file_utils.load_text(fp_male)\n", + "names_female = file_utils.load_text(fp_female)\n", + "# convert filenames to csv names\n", + "names_male = [t.replace('_',' ')[:-9] for t in names_male]\n", + "names_female = [t.replace('_',' ')[:-9] for t in names_female]\n", + "# check names\n", + "print(names_male[:2])\n", + "print(names_female[:2])" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'name': 'AJ Cook', 'images': 1}\n" + ] + } + ], + "source": [ + "for idx, n in names.items():\n", + " print(n)\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "# add gender to name item dict\n", + "for idx, item in names.items():\n", + " name = item['name']\n", + " if name in names_male:\n", + " g = 'm'\n", + " elif name in names_female:\n", + " g = 'f'\n", + " elif name == 'Tara Kirk':\n", + " g = 'f' # unlabeled item\n", + " else:\n", + " g = 'x'\n", + " names[idx]['gender'] = g" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'name': 'AJ Cook', 'images': 1, 'gender': 'f'}\n" + ] + } + ], + "source": [ + "names_list = list(names.values())\n", + "for n in names_list:\n", + " print(n)\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "# save to csv\n", + "fp_gendered = join(dir_dataset, 'lfw_names_gendered.csv')\n", + "df_names_gendered = pd.DataFrame.from_dict(list(names.values())) # ignore the indices\n", + "df_names_gendered.to_csv(fp_gendered, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "#%cat $fp_names_gendered | head -n2" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[]\n", + "4277 1472 5749 None\n" + ] + } + ], + "source": [ + "f = [x for k, x in names.items() if x['gender'] == 'f']\n", + "m = [x for k, x in names.items() if x['gender'] == 'm']\n", + "x = [x for k, x in names.items() if x['gender'] not in ['f','m']]\n", + "print(len(m), len(f), len(f) + len(m), print(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5749\n" + ] + } + ], + "source": [ + "print(len(names))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:megapixels]", + "language": "python", + "name": "conda-env-megapixels-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} |
