{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# IMDB-WIKI Convert .mat" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import os.path as osp\n", "from os.path import join\n", "from glob import glob\n", "import random\n", "import math\n", "from datetime import datetime\n", "\n", "import cv2 as cv\n", "import pandas as pd\n", "from scipy.io import loadmat\n", "import numpy as np\n", "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "\n", "from tqdm import tqdm_notebook as tqdm\n", "%reload_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load Metadata" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "fp_mat = '/data_store_hdd/datasets/people/imdb_wiki/downloads/imdb.mat'\n", "dir_out = '/data_store_hdd/datasets/people/imdb_wiki/metadata/'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "mat_data = loadmat(fp_mat)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# row 3\n", "def load_parse_imdb_mat(mat):\n", " metadata = mat['imdb'][0][0]\n", " results = []\n", " num_records = len(metadata[0][0])\n", " print(f'loaded: {num_records} records')\n", " for i in tqdm(range(num_records), total=num_records):\n", " dob_matlab = metadata[0][0][i]\n", " dob = datetime.fromordinal(dob_matlab)\n", " dob_str = f'{dob.year}-{dob.month}-{dob.day}'\n", " year_photo = metadata[1][0][i]\n", " fp = metadata[2][0][i][0]\n", " gender_val = metadata[3][0][i]\n", " if gender_val == 0:\n", " gender = 'f'\n", " elif gender_val == 1:\n", " gender = 'm'\n", " else:\n", " gender = None\n", " name = metadata[4][0][i][0]\n", " roi = metadata[5][0][i][0]\n", " face_conf = metadata[6][0][i]\n", " face_conf_second = metadata[7][0][i]\n", " celeb_id = metadata[9][0][i]\n", " result = {\n", " 'dob': dob_str,\n", " 'year_photo': year_photo,\n", " 'filepath': fp,\n", " 'gender': gender,\n", " 'name': name,\n", " 'x1': roi[0],\n", " 'y1': roi[1],\n", " 'x2': roi[2],\n", " 'y2': roi[3],\n", " 'celeb_id': celeb_id\n", " }\n", " results.append(result)\n", " return results" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "loaded: 460723 records\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "3ab4e2fa182c402a88a29ed81c1bdeeb", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=460723), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "results_meta = load_parse_imdb_mat(mat_data)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "df_meta = pd.DataFrame.from_dict(results_meta)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
celeb_iddobfilepathgendernamex1x2y1y2year_photo
064881900-5-1101/nm0000001_rm124825600_1899-5-10_1968.jpgmFred Astaire1072.9260001214.784000161.838000303.6960001968
164881900-5-1101/nm0000001_rm3343756032_1899-5-10_1970.jpgmFred Astaire477.184000622.592000100.352000245.7600001970
264881900-5-1101/nm0000001_rm577153792_1899-5-10_1968.jpgmFred Astaire114.969643451.686572114.969643451.6865721968
364881900-5-1101/nm0000001_rm946909184_1899-5-10_1968.jpgmFred Astaire622.885506844.339008424.217504645.6710061968
464881900-5-1101/nm0000001_rm980463616_1899-5-10_1968.jpgmFred Astaire1013.8590021201.586128233.882042421.6091681968
\n", "
" ], "text/plain": [ " celeb_id dob filepath gender \\\n", "0 6488 1900-5-11 01/nm0000001_rm124825600_1899-5-10_1968.jpg m \n", "1 6488 1900-5-11 01/nm0000001_rm3343756032_1899-5-10_1970.jpg m \n", "2 6488 1900-5-11 01/nm0000001_rm577153792_1899-5-10_1968.jpg m \n", "3 6488 1900-5-11 01/nm0000001_rm946909184_1899-5-10_1968.jpg m \n", "4 6488 1900-5-11 01/nm0000001_rm980463616_1899-5-10_1968.jpg m \n", "\n", " name x1 x2 y1 y2 year_photo \n", "0 Fred Astaire 1072.926000 1214.784000 161.838000 303.696000 1968 \n", "1 Fred Astaire 477.184000 622.592000 100.352000 245.760000 1970 \n", "2 Fred Astaire 114.969643 451.686572 114.969643 451.686572 1968 \n", "3 Fred Astaire 622.885506 844.339008 424.217504 645.671006 1968 \n", "4 Fred Astaire 1013.859002 1201.586128 233.882042 421.609168 1968 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_meta.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create DataFrame for metadata" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "df_results = pd.DataFrame.from_dict(results_meta)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
celeb_iddobfilepathgendernamex1x2y1y2year_photo
064881900-5-1101/nm0000001_rm124825600_1899-5-10_1968.jpgmFred Astaire1072.9260001214.784000161.838000303.6960001968
164881900-5-1101/nm0000001_rm3343756032_1899-5-10_1970.jpgmFred Astaire477.184000622.592000100.352000245.7600001970
264881900-5-1101/nm0000001_rm577153792_1899-5-10_1968.jpgmFred Astaire114.969643451.686572114.969643451.6865721968
364881900-5-1101/nm0000001_rm946909184_1899-5-10_1968.jpgmFred Astaire622.885506844.339008424.217504645.6710061968
464881900-5-1101/nm0000001_rm980463616_1899-5-10_1968.jpgmFred Astaire1013.8590021201.586128233.882042421.6091681968
\n", "
" ], "text/plain": [ " celeb_id dob filepath gender \\\n", "0 6488 1900-5-11 01/nm0000001_rm124825600_1899-5-10_1968.jpg m \n", "1 6488 1900-5-11 01/nm0000001_rm3343756032_1899-5-10_1970.jpg m \n", "2 6488 1900-5-11 01/nm0000001_rm577153792_1899-5-10_1968.jpg m \n", "3 6488 1900-5-11 01/nm0000001_rm946909184_1899-5-10_1968.jpg m \n", "4 6488 1900-5-11 01/nm0000001_rm980463616_1899-5-10_1968.jpg m \n", "\n", " name x1 x2 y1 y2 year_photo \n", "0 Fred Astaire 1072.926000 1214.784000 161.838000 303.696000 1968 \n", "1 Fred Astaire 477.184000 622.592000 100.352000 245.760000 1970 \n", "2 Fred Astaire 114.969643 451.686572 114.969643 451.686572 1968 \n", "3 Fred Astaire 622.885506 844.339008 424.217504 645.671006 1968 \n", "4 Fred Astaire 1013.859002 1201.586128 233.882042 421.609168 1968 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_results.head()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "df_results.index.name = 'index'\n", "df_results.to_csv(join(dir_out,'imdb_wiki.csv'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Count Images per Person" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "df_name_groups = df_results.groupby('name')\n", "images_per_person = []\n", "for name, df_name in df_name_groups:\n", " images_per_person.append({'name': name, 'num_images': len(df_name)})\n", "df_images_per_person = pd.DataFrame.from_dict(images_per_person)\n", "df_images_per_person.index.name = 'index'\n", "df_images_per_person.to_csv(join(dir_out, 'imdb_images_per_person.csv'), index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Find Face Size" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "sizes = [(x['x2'] - x['x1']) for x in results_meta]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "buckets = list(range(0,500,50))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from matplotlib import pyplot as plt \n", "import numpy as np \n", "bins = list(range(0,500,20))\n", "plt.figure(figsize=(12,8))\n", "plt.hist(sizes, bins=bins)\n", "plt.title(\"Face Image Sizes\") \n", "plt.ylabel(\"Images\")\n", "plt.xlabel(\"Width (px)\")\n", "plt.yticks(range(0, 60000, 10000))\n", "plt.title('IMDB-Wiki: Face Pixel Size')\n", "\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "dob: date of birth (Matlab serial date number)\n", "photo_taken: year when the photo was taken\n", "full_path: path to file\n", "gender: 0 for female and 1 for male, NaN if unknown\n", "name: name of the celebrity\n", "face_location: location of the face. To crop the face in Matlab run\n", "\n", "img(face_location(2):face_location(4),face_location(1):face_location(3),:))\n", "\n", "face_score: detector score (the higher the better). Inf implies that no face was found in the image and the face_location then just returns the entire image\n", "second_face_score: detector score of the face with the second highest score. This is useful to ignore images with more than one face. second_face_score is NaN if no second face was detected.\n", "celeb_names (IMDB only): list of all celebrity names\n", "celeb_id (IMDB only): index of celebrity name\n", "```" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:megapixels]", "language": "python", "name": "conda-env-megapixels-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 2 }