diff options
Diffstat (limited to 'megapixels/notebooks/datasets/lfw/count_images.ipynb')
| -rw-r--r-- | megapixels/notebooks/datasets/lfw/count_images.ipynb | 247 |
1 files changed, 247 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/lfw/count_images.ipynb b/megapixels/notebooks/datasets/lfw/count_images.ipynb
new file mode 100644
index 00000000..26682f8b
--- /dev/null
+++ b/megapixels/notebooks/datasets/lfw/count_images.ipynb
@@ -0,0 +1,150 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Count Images for LFW\n",
+    "\n",
+    "Count the `jpg`/`png` images in each sub-directory of the LFW `media/original` folder and save one row per sub-directory to `image_counts.csv`.\n",
+    "\n",
+    "- the sub-directory name is used as `identity_key`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%reload_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "import os\n",
+    "from os.path import join\n",
+    "from glob import glob\n",
+    "from pathlib import Path\n",
+    "from pprint import pprint\n",
+    "\n",
+    "from tqdm import tqdm_notebook as tqdm\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from slugify import slugify\n",
+    "\n",
+    "import sys\n",
+    "sys.path.append('/work/megapixels_dev/megapixels')\n",
+    "from app.utils import file_utils\n",
+    "from app.settings import types, app_cfg\n",
+    "from app.models.data_store import DataStore"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Dataset paths"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "opt_dataset = types.Dataset.LFW\n",
+    "opt_data_store = types.DataStore.HDD\n",
+    "data_store = DataStore(opt_data_store, opt_dataset)\n",
+    "# get filepath out\n",
+    "fp_records = data_store.metadata(types.Metadata.FILE_RECORD)\n",
+    "fp_img_counts = data_store.metadata(types.Metadata.IMAGE_COUNT)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# paths\n",
+    "# NOTE(review): fp_img_counts (previous cell) presumably points at the same file as fp_out - confirm, then use it instead of the literal\n",
+    "fp_dirs = '/data_store_hdd/datasets/people/lfw/media/original/'\n",
+    "fp_out = '/data_store_hdd/datasets/people/lfw/metadata/image_counts.csv'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Get image counts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# one sub-directory per identity: skip plain files matched by the glob, sort for a reproducible row order\n",
+    "dirs = sorted(d for d in glob(join(fp_dirs, '*')) if os.path.isdir(d))\n",
+    "\n",
+    "# count images\n",
+    "image_counts = []\n",
+    "\n",
+    "for d in tqdm(dirs):\n",
+    "    # get number of images\n",
+    "    files = file_utils.glob_multi(d, ['jpg', 'png'], recursive=False)\n",
+    "    # .name keeps the full directory name; .stem would drop everything after the last '.'\n",
+    "    image_counts.append({'identity_key': Path(d).name, 'count': len(files)})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Save counts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# pin the column order explicitly; this also keeps both columns when no directory was found\n",
+    "df_counts = pd.DataFrame(image_counts, columns=['count', 'identity_key'])\n",
+    "df_counts.index.name = 'index'\n",
+    "df_counts.to_csv(fp_out)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_counts.head()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [conda env:megapixels]",
+   "language": "python",
+   "name": "conda-env-megapixels-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
