summaryrefslogtreecommitdiff
path: root/megapixels/notebooks/datasets/lfw/lfw_make_identity_csv.ipynb
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/notebooks/datasets/lfw/lfw_make_identity_csv.ipynb')
-rw-r--r--megapixels/notebooks/datasets/lfw/lfw_make_identity_csv.ipynb510
1 files changed, 0 insertions, 510 deletions
diff --git a/megapixels/notebooks/datasets/lfw/lfw_make_identity_csv.ipynb b/megapixels/notebooks/datasets/lfw/lfw_make_identity_csv.ipynb
deleted file mode 100644
index 039614f0..00000000
--- a/megapixels/notebooks/datasets/lfw/lfw_make_identity_csv.ipynb
+++ /dev/null
@@ -1,510 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Add identity ID to index"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "from os.path import join\n",
- "from pathlib import Path\n",
- "import difflib\n",
- "\n",
- "from tqdm import tqdm_notebook as tqdm\n",
- "import pandas as pd\n",
- "import numpy as np"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [],
- "source": [
- "# root of the data store and the dataset's metadata directory\n",
- "DATA_STORE = '/data_store_ssd/'\n",
- "dir_dataset = 'datasets/people/lfw/metadata'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {},
- "outputs": [],
- "source": [
- "# input/output CSV paths; load the old identity table keyed by its 'index' column\n",
- "fp_identity_in = join(DATA_STORE, dir_dataset, 'identities_old.csv')\n",
- "fp_identity_out = join(DATA_STORE, dir_dataset, 'identity_lookup.csv')\n",
- "\n",
- "df_identity = pd.read_csv(fp_identity_in).set_index('index')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>name</th>\n",
- " <th>name_orig</th>\n",
- " <th>description</th>\n",
- " <th>gender</th>\n",
- " <th>images</th>\n",
- " <th>image_index</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>index</th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>A. J. Cook</td>\n",
- " <td>AJ Cook</td>\n",
- " <td>Canadian actress</td>\n",
- " <td>f</td>\n",
- " <td>1</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>AJ Lamas</td>\n",
- " <td>AJ Lamas</td>\n",
- " <td>American actor</td>\n",
- " <td>m</td>\n",
- " <td>1</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>Aaron Eckhart</td>\n",
- " <td>Aaron Eckhart</td>\n",
- " <td>American actor</td>\n",
- " <td>m</td>\n",
- " <td>1</td>\n",
- " <td>2</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>Aaron Guiel</td>\n",
- " <td>Aaron Guiel</td>\n",
- " <td>Professional baseball player</td>\n",
- " <td>m</td>\n",
- " <td>1</td>\n",
- " <td>3</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>Aaron Patterson</td>\n",
- " <td>Aaron Patterson</td>\n",
- " <td>Author</td>\n",
- " <td>m</td>\n",
- " <td>1</td>\n",
- " <td>4</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " name name_orig description gender \\\n",
- "index \n",
- "0 A. J. Cook AJ Cook Canadian actress f \n",
- "1 AJ Lamas AJ Lamas American actor m \n",
- "2 Aaron Eckhart Aaron Eckhart American actor m \n",
- "3 Aaron Guiel Aaron Guiel Professional baseball player m \n",
- "4 Aaron Patterson Aaron Patterson Author m \n",
- "\n",
- " images image_index \n",
- "index \n",
- "0 1 0 \n",
- "1 1 1 \n",
- "2 1 2 \n",
- "3 1 3 \n",
- "4 1 4 "
- ]
- },
- "execution_count": 24,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_identity.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>name</th>\n",
- " <th>name_orig</th>\n",
- " <th>description</th>\n",
- " <th>gender</th>\n",
- " <th>images</th>\n",
- " <th>image_index</th>\n",
- " <th>subdir</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>index</th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>A. J. Cook</td>\n",
- " <td>AJ Cook</td>\n",
- " <td>Canadian actress</td>\n",
- " <td>f</td>\n",
- " <td>1</td>\n",
- " <td>0</td>\n",
- " <td></td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>AJ Lamas</td>\n",
- " <td>AJ Lamas</td>\n",
- " <td>American actor</td>\n",
- " <td>m</td>\n",
- " <td>1</td>\n",
- " <td>1</td>\n",
- " <td></td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " name name_orig description gender images image_index \\\n",
- "index \n",
- "0 A. J. Cook AJ Cook Canadian actress f 1 0 \n",
- "1 AJ Lamas AJ Lamas American actor m 1 1 \n",
- "\n",
- " subdir \n",
- "index \n",
- "0 \n",
- "1 "
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# add an empty 'subdir' column to every identity row (filled in from name_orig in the next cell)\n",
- "df_identity['subdir'] = [''] * len(df_identity)\n",
- "df_identity.head(2)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "ece5c11b90954b25b1f1e28fc2fe6b55",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- }
- ],
- "source": [
- "for row in tqdm(df_identity.itertuples(), total=len(df_identity)):\n",
- " name = row.name_orig\n",
- " subdir = name.replace(' ','_')\n",
- " df_identity.at[row.Index, 'subdir'] = subdir"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>name</th>\n",
- " <th>name_orig</th>\n",
- " <th>description</th>\n",
- " <th>gender</th>\n",
- " <th>images</th>\n",
- " <th>image_index</th>\n",
- " <th>subdir</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>index</th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>A. J. Cook</td>\n",
- " <td>AJ Cook</td>\n",
- " <td>Canadian actress</td>\n",
- " <td>f</td>\n",
- " <td>1</td>\n",
- " <td>0</td>\n",
- " <td>AJ_Cook</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>AJ Lamas</td>\n",
- " <td>AJ Lamas</td>\n",
- " <td>American actor</td>\n",
- " <td>m</td>\n",
- " <td>1</td>\n",
- " <td>1</td>\n",
- " <td>AJ_Lamas</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>Aaron Eckhart</td>\n",
- " <td>Aaron Eckhart</td>\n",
- " <td>American actor</td>\n",
- " <td>m</td>\n",
- " <td>1</td>\n",
- " <td>2</td>\n",
- " <td>Aaron_Eckhart</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>Aaron Guiel</td>\n",
- " <td>Aaron Guiel</td>\n",
- " <td>Professional baseball player</td>\n",
- " <td>m</td>\n",
- " <td>1</td>\n",
- " <td>3</td>\n",
- " <td>Aaron_Guiel</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>Aaron Patterson</td>\n",
- " <td>Aaron Patterson</td>\n",
- " <td>Author</td>\n",
- " <td>m</td>\n",
- " <td>1</td>\n",
- " <td>4</td>\n",
- " <td>Aaron_Patterson</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " name name_orig description gender \\\n",
- "index \n",
- "0 A. J. Cook AJ Cook Canadian actress f \n",
- "1 AJ Lamas AJ Lamas American actor m \n",
- "2 Aaron Eckhart Aaron Eckhart American actor m \n",
- "3 Aaron Guiel Aaron Guiel Professional baseball player m \n",
- "4 Aaron Patterson Aaron Patterson Author m \n",
- "\n",
- " images image_index subdir \n",
- "index \n",
- "0 1 0 AJ_Cook \n",
- "1 1 1 AJ_Lamas \n",
- "2 1 2 Aaron_Eckhart \n",
- "3 1 3 Aaron_Guiel \n",
- "4 1 4 Aaron_Patterson "
- ]
- },
- "execution_count": 27,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_identity.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {},
- "outputs": [],
- "source": [
- "df_identity.to_csv(fp_identity_out)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 138,
- "metadata": {},
- "outputs": [],
- "source": [
- "# make a clean index separate from files"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 145,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'AJ Lamas'"
- ]
- },
- "execution_count": 145,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# df_identities = pd.read_csv('identities.csv')  # NOTE: df_identities is never assigned in this notebook; load it before running the line below\n",
- "df_identities.iloc[1]['name']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 149,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "1 2 3 4\n"
- ]
- }
- ],
- "source": [
- "a = [1,2,3,4]\n",
- "\n",
- "print(*a)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python [conda env:megapixels]",
- "language": "python",
- "name": "conda-env-megapixels-py"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.6"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}