{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# LFW Names\n", "\n", "- add gender and format names" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "%reload_ext autoreload\n", "%autoreload 2\n", "\n", "import os\n", "from os.path import join\n", "import math\n", "from glob import glob\n", "from random import randint\n", "\n", "import numpy as np\n", "import pandas as pd\n", "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "\n", "import sys\n", "sys.path.append('/work/megapixels_dev/megapixels/')\n", "from app.utils import file_utils" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "DATA_STORE = '/data_store_nas/'\n", "dir_dataset = join(DATA_STORE, 'datasets/people/lfw')\n", "fp_names = join(dir_dataset, 'lfw_names.csv')\n", "fp_male = join(dir_dataset, 'male_names.txt')\n", "fp_female = join(dir_dataset, 'female_names.txt')" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Alfred Ford', 'Craig Fitzgibbon']\n", "['Claudia Coslovich', 'Allison Searing']\n" ] } ], "source": [ "# load names\n", "df_names = pd.read_csv(fp_names)\n", "names = df_names.to_dict('index')\n", "# load gender\n", "names_male = file_utils.load_text(fp_male)\n", "names_female = file_utils.load_text(fp_female)\n", "# convert filenames to csv names\n", "names_male = [t.replace('_',' ')[:-9] for t in names_male]\n", "names_female = [t.replace('_',' ')[:-9] for t in names_female]\n", "# check names\n", "print(names_male[:2])\n", "print(names_female[:2])" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'name': 'AJ Cook', 'images': 1}\n" ] } ], "source": [ "for idx, n in names.items():\n", " print(n)\n", " break" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "# add gender to name item dict\n", "for idx, item in names.items():\n", " name = item['name']\n", " if name in names_male:\n", " g = 'm'\n", " elif name in names_female:\n", " g = 'f'\n", " elif name == 'Tara Kirk':\n", " g = 'f' # unlabeled item\n", " else:\n", " g = 'x'\n", " names[idx]['gender'] = g" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'name': 'AJ Cook', 'images': 1, 'gender': 'f'}\n" ] } ], "source": [ "names_list = list(names.values())\n", "for n in names_list:\n", " print(n)\n", " break" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "# save to csv\n", "fp_gendered = join(dir_dataset, 'lfw_names_gendered.csv')\n", "df_names_gendered = pd.DataFrame.from_dict(list(names.values())) # ignore the indices\n", "df_names_gendered.to_csv(fp_gendered, index=False)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "#%cat $fp_names_gendered | head -n2" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[]\n", "4277 1472 5749 None\n" ] } ], "source": [ "f = [x for k, x in names.items() if x['gender'] == 'f']\n", "m = [x for k, x in names.items() if x['gender'] == 'm']\n", "x = [x for k, x in names.items() if x['gender'] not in ['f','m']]\n", "print(len(m), len(f), len(f) + len(m), print(x))" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "5749\n" ] } ], "source": [ "print(len(names))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:megapixels]", "language": "python", "name": "conda-env-megapixels-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 2 }