From dd2c36288aa1e8af14588f9258f6785879b8638c Mon Sep 17 00:00:00 2001 From: adamhrv Date: Mon, 28 Jan 2019 18:11:36 +0100 Subject: add utils for analyzing identities --- .../notebooks/datasets/imdb_wiki/convert_mat.ipynb | 427 +++++++++++++++ .../notebooks/datasets/imdb_wiki/identity.ipynb | 498 ++++++++++++++++++ .../datasets/imdb_wiki/imdb_wiki_kg.ipynb | 468 ----------------- .../datasets/imdb_wiki/imdb_wiki_meta_debug.ipynb | 573 --------------------- 4 files changed, 925 insertions(+), 1041 deletions(-) create mode 100644 megapixels/notebooks/datasets/imdb_wiki/convert_mat.ipynb create mode 100644 megapixels/notebooks/datasets/imdb_wiki/identity.ipynb delete mode 100644 megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_kg.ipynb delete mode 100644 megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_meta_debug.ipynb (limited to 'megapixels/notebooks/datasets/imdb_wiki') diff --git a/megapixels/notebooks/datasets/imdb_wiki/convert_mat.ipynb b/megapixels/notebooks/datasets/imdb_wiki/convert_mat.ipynb new file mode 100644 index 00000000..1bf7b590 --- /dev/null +++ b/megapixels/notebooks/datasets/imdb_wiki/convert_mat.ipynb @@ -0,0 +1,427 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# IMDB WIKI: Convert .mat to CSVs" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import os.path as osp\n", + "from os.path import join\n", + "from glob import glob\n", + "import random\n", + "import math\n", + "from datetime import datetime\n", + "\n", + "import cv2 as cv\n", + "import pandas as pd\n", + "from scipy.io import loadmat\n", + "import numpy as np\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from tqdm import tqdm_notebook as tqdm\n", + "%reload_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Metadata" + ] + }, + { + "cell_type": "code", + 
"execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "fp_mat = '/data_store_hdd/datasets/people/imdb_wiki/downloads/imdb.mat'\n", + "dir_out = '/data_store_hdd/datasets/people/imdb_wiki/metadata/'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "mat_data = loadmat(fp_mat)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# row 3\n", + "def load_parse_imdb_mat(mat):\n", + " metadata = mat['imdb'][0][0]\n", + " results = []\n", + " num_records = len(metadata[0][0])\n", + " print(f'loaded: {num_records} records')\n", + " for i in tqdm(range(num_records), total=num_records):\n", + " dob_matlab = metadata[0][0][i]\n", + " dob = datetime.fromordinal(dob_matlab)\n", + " dob_str = f'{dob.year}-{dob.month}-{dob.day}'\n", + " year_photo = metadata[1][0][i]\n", + " fp = metadata[2][0][i][0]\n", + " gender_val = metadata[3][0][i]\n", + " if gender_val == 0:\n", + " gender = 'f'\n", + " elif gender_val == 1:\n", + " gender = 'm'\n", + " else:\n", + " gender = None\n", + " name = metadata[4][0][i][0]\n", + " roi = metadata[5][0][i][0]\n", + " face_conf = metadata[6][0][i]\n", + " face_conf_second = metadata[7][0][i]\n", + " celeb_id = metadata[9][0][i]\n", + " result = {\n", + " 'dob': dob_str,\n", + " 'year_photo': year_photo,\n", + " 'filepath': fp,\n", + " 'gender': gender,\n", + " 'name': name,\n", + " 'x1': roi[0],\n", + " 'y1': roi[1],\n", + " 'x2': roi[2],\n", + " 'y2': roi[3],\n", + " 'celeb_id': celeb_id\n", + " }\n", + " results.append(result)\n", + " return results" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "loaded: 460723 records\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8a4a106e3bee4fde89492ceef50b9c05", + "version_major": 2, + "version_minor": 0 + }, + 
"text/plain": [ + "HBox(children=(IntProgress(value=0, max=460723), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "results_meta = load_parse_imdb_mat(mat_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "df_meta = pd.DataFrame.from_dict(results_meta)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
celeb_iddobfilepathgendernamex1x2y1y2year_photo
064881900-5-1101/nm0000001_rm124825600_1899-5-10_1968.jpgmFred Astaire1072.9260001214.784000161.838000303.6960001968
164881900-5-1101/nm0000001_rm3343756032_1899-5-10_1970.jpgmFred Astaire477.184000622.592000100.352000245.7600001970
264881900-5-1101/nm0000001_rm577153792_1899-5-10_1968.jpgmFred Astaire114.969643451.686572114.969643451.6865721968
364881900-5-1101/nm0000001_rm946909184_1899-5-10_1968.jpgmFred Astaire622.885506844.339008424.217504645.6710061968
464881900-5-1101/nm0000001_rm980463616_1899-5-10_1968.jpgmFred Astaire1013.8590021201.586128233.882042421.6091681968
\n", + "
" + ], + "text/plain": [ + " celeb_id dob filepath gender \\\n", + "0 6488 1900-5-11 01/nm0000001_rm124825600_1899-5-10_1968.jpg m \n", + "1 6488 1900-5-11 01/nm0000001_rm3343756032_1899-5-10_1970.jpg m \n", + "2 6488 1900-5-11 01/nm0000001_rm577153792_1899-5-10_1968.jpg m \n", + "3 6488 1900-5-11 01/nm0000001_rm946909184_1899-5-10_1968.jpg m \n", + "4 6488 1900-5-11 01/nm0000001_rm980463616_1899-5-10_1968.jpg m \n", + "\n", + " name x1 x2 y1 y2 year_photo \n", + "0 Fred Astaire 1072.926000 1214.784000 161.838000 303.696000 1968 \n", + "1 Fred Astaire 477.184000 622.592000 100.352000 245.760000 1970 \n", + "2 Fred Astaire 114.969643 451.686572 114.969643 451.686572 1968 \n", + "3 Fred Astaire 622.885506 844.339008 424.217504 645.671006 1968 \n", + "4 Fred Astaire 1013.859002 1201.586128 233.882042 421.609168 1968 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_meta.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "df_meta.index.name = 'index'\n", + "df_meta.to_csv(join(dir_out,'imdb_mat.csv'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Count Images per Person" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# count images per person and save to CSV\n", + "df_name_groups = df_meta.groupby('name')\n", + "images_per_person = []\n", + "for name, df_name in df_name_groups:\n", + " images_per_person.append({'name': name, 'num_images': len(df_name)})\n", + "df_images_per_person = pd.DataFrame.from_dict(images_per_person)\n", + "df_images_per_person.index.name = 'index'\n", + "df_images_per_person.to_csv(join(dir_out, 'imdb_images_per_person.csv'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Find Face Size" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + 
"source": [ + "sizes = [(x['x2'] - x['x1']) for x in results_meta]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "buckets = list(range(0,500,50))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAuQAAAHwCAYAAADuC3p1AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzt3Xu4ZlV9J/jvL5SK7Y2LSCuXlGnpSdAkXhjES/J4GRElHcxovLYQRXm61UQnJnZpOkNitBunZ2JLYswwyghGJcZ4IYKNBDXGKAooARENpSkFvIAWoA7xgv7mj3eXvh5PFQeq3rM4dT6f53mfd++1195r7bOLw/esd717V3cHAAAY46dGdwAAANYzgRwAAAYSyAEAYCCBHAAABhLIAQBgIIEcAAAGEsgB1oCqekZVvW9uvavqPtup+62q+pnV691iVNXrq+plu+A4H66q37gV++0x/SwP3tk+AOyIQA6sG1W1par+l2n5N6ZQ++oldY6Zyt84rW+c1r81vb5aVe+pqscsc+x/mepcV1VnVdVBO+jLZ6vqKXPrD5vaWVr2zara0N1v7u4jV3Ke3X3n7v78Cn8mH6yqb8+d37eq6iEr2XdXmMLytvavraq3V9W/TpLufk53/5cFt793Vb2xqr5SVd+YrsvvTu1/f/pZfnGRfQAQyIH17HNJnlxVG+bKjkvyT8vU3au775zkF5Ocm+Sdy4y6/rupzj2TfDXJn+yg7Q8l+eW59V9O8pllyj7a3Tet4Fx2xgum4Lnt9dEFt7fUf5h+bj+bZL8k/+cqtn1ykttPbe+V5AmZ/bsAWDUCObCefSXJpUkemyRVtU+ShyY5c3s7dPdXuvs1Sf4gyauq6id+j3b3t5O8PcmhO2h7aSD/pSSvWqbsQ1PffqOqPrzcgarq4VV1ZVU9Ylrf7nSWW6Kq/rSqrppGji+oqofObdtQVb9fVZ+btl9YVfeath1aVX9bVVur6jNV9cSVtNfdX0/yjiT3m47zF1X1B9Py71XVR6pqj2n9N6vq0qq6w7T+sKo6v6qur6qLq+qXt9PMUv9zkrd09/Xd/YPuvry73zF3jj19SnLwkk8RbqyqH/6hVFXPmc71uqp6744+HQFYSiAH1rvTkxw7LT81ybuTfGcF+70jyT2S/E9LN1TVv0rylCTn72D/DyW5b1XtM4X6w5L8ZZK95soeNtXbrqo6Kslbkzyxuz+4zPanV9UlKzif5XwsyS8k2SezPzD+alsATvK7SZ6U5KjMRpafk+TbVXXnzD5BOD2zn88zkpxSVT/xc1qmr/sl+V+TfHKZzScl6SQvraqfTfLyJM/o7u9M4ffMJCdOfd2U5B1Vte8KzvH8JP91+oPnkO1V6u4vzn+KkORvMvu5Z/qD43eTHJPZCP/HkrxlBW0DJBHIAd6Z5BFVdbfMgvnpK9zvS9P7PnNl76qq65PckOQxSf7b9nbu7i8k+WJmo+C/mOSK7v6XJP8wV3b7zMLd9vx6kv87yeO6++Pbaect3f0LN3MuJ08jy9dX1Sfm9n1Td2+dpsz8H0nummTbyPtzkrysu6+YRpYv7u6tmYXSf+ru07v7pu6+KMm7Mgvv2/Nn08/t4uln8jvLnMf3M7s+vz0d779097Y/NI5NcmZ3nzP1
5X8k+cfM/li4Oc/L7A+h30pyeVVdUVU7nKtfVb+X5N5JnjsV/YepP5+dflavSHJ4VR2wgvYBBHJgfZtC8FlJ/nOSfbv7H1a467awtXWu7AndvVeSPZO8IMnfVdW/XjrdYa7+tmkrv5zk76eyD8+Vfby7dzRa/6Ikb+vuT62wz9vzW9291/R64LbCqnrJNA3jhiTXJblTkrtPmw/K8nOtfzrJw+YC/vWZfVpwzx20/7yp7QO6+5nT1JWf0N2fy+zndFCS1y1p82lL2jwiyb1u7sS7+8bufsV03vtm9snHX09/oP2Eqvp3mYX4X5umJm1r/7VzbX8tyQ+SHHhz7QMkAjlAMhsVf3GSv7gF+/xakmuSfHbphunuHO9I8v0kD19musM22wL5L+VHgfzv58p2OF0lsxHyJ1TVC29Bv1ekqh6Z2Wj0EzObkrJ3km8lqanKlUn+zTK7XpnkvLmAv9d03i/YBX06JsmDkvxdZvPt59v8f5e0eafu3u4nFMvp7huS/Nckd06ycZn2fy7JqUme1N1XL2n/+CXt37G7d/TpBsAPCeQAs4D3mOz4rihJkqrav6pekNl85Zd29w+WqVNTeNw7yeU7ONyHkjwgswC+bWT+0symQzwyNx/Iv5Tk0UleWFX/8eb6fgvdJclNmY323i6zL7HeaW7765O8oqr+zXS+95++FHtmZnPjn15Vt5teh69kDvmOVNU9kpyS5NmZ3QnniVX12Gnzm5L8WlU9pmb3Dt+zqh657UumN3PcE6vqsKq6fVXtmdnUla1JrlhSb6/Mvl/wkmXuQvPnSX5vCuypqr2qakdTdAB+jEAOrHs9c940B3p7rq+q/y+zwPz4JL/e3acuqfM305SUbyR5ZZLjuvuyHbT7T0muTfKV7r5+KvtBko9nNl/7Iyvo+xczC+Wbquo5S7fX7IFC2+3DDpyd5G8zC6ZbMjunL89t/2+ZzeU+b9p2SpI9p1Hmxyb591P9r2Q26nyH7JzXJ/mr7n5fd1+b2fztN1TV3t29JbNPLH4/s5/nFzP7xGOl/487LcnXM/sD5xFJju7uG5fUOSzJIUn+ZG760bZr9ldJ/jizL71+I8klme7cA7AS1d2j+wAAAOuWEXIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYaMPoDqy2u9/97r1x48bR3QAAYDd20UUXfa2791tJ3XUXyDdu3JgLL7xwdDcAANiNVdUXVlrXlBUAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYSCAHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYSCAHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYSCAHAICBBHIAABhow+gOwHI2bjpr4W1sOenohbcBAHBzjJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADLRhdAdYezZuOmt0FwAAdhtGyAEAYCCBHAAABhLIAQBgIIEcAAAGEsgBAGAggRwAAAYSyAEAYCCBHAAABlpoIK+qLVV1aVVdXFUXTmX7VNW5VXXF9L73VF5VdXJVba6qS6rqgXPHOW6q
f0VVHTdX/qDp+JunfWuR5wMAALvaaoyQP7K779/dh03rm5Kc192HJDlvWk+SxyU5ZHqdkOR1ySzAJzkxyYOTHJ7kxG0hfqrz3Ln9jlr86QAAwK4zYsrKMUlOm5ZPS/KEufLTe+b8JHtV1T2TPDbJud29tbuvS3JukqOmbXft7vO7u5OcPncsAABYExYdyDvJ+6rqoqo6YSrbv7u/PC1/Jcn+0/IBSa6c2/eqqWxH5VctU/4TquqEqrqwqi689tprd+Z8AABgl9qw4OM/vLuvrqp7JDm3qj4zv7G7u6p6wX1Id5+S5JQkOeywwxbeHgAArNRCR8i7++rp/Zok78xsDvhXp+kmmd6vmapfneSgud0PnMp2VH7gMuUAALBmLCyQV9Wdquou25aTHJnkU0nOTLLtTinHJXn3tHxmkmOnu60ckeSGaWrLOUmOrKq9py9zHpnknGnbN6rqiOnuKsfOHQsAANaERU5Z2T/JO6c7EW5I8pbu/h9VdUGSt1XV8Um+kOTJU/2zkzw+yeYkNyZ5VpJ099aq+qMkF0z1Xt7dW6fl5yV5Y5I7Jnnv9AIAgDVjYYG8uz+f5BeXKf96kkcvU95Jnr+dY52a5NRlyi9Mcr+d7iwAAAziSZ0AADCQQA4AAAMJ5AAAMJBADgAAAwnkAAAwkEAOAAADCeQAADCQQA4AAAMJ5AAAMJBADgAAAwnkAAAwkEAOAAADCeQAADCQQA4AAAMJ5AAAMJBADgAAAwnkAAAwkEAOAAADCeQAADCQQA4AAAMJ5AAAMJBADgAAAwnkAAAwkEAOAAADCeQAADCQQA4AAAMJ5AAAMJBADgAAAwnkAAAwkEAOAAADCeQAADCQQA4AAAMJ5AAAMJBADgAAAwnkAAAwkEAOAAADCeQAADCQQA4AAAMJ5AAAMJBADgAAAwnkAAAwkEAOAAADCeQAADCQQA4AAAMJ5AAAMJBADgAAAwnkAAAwkEAOAAADCeQAADCQQA4AAANtGN0BYO3YuOmshbex5aSjF94GANyWGCEHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYSCAHAICBFh7Iq2qPqvpkVb1nWr93VX2sqjZX1V9W1e2n8jtM65un7RvnjvHSqfyzVfXYufKjprLNVbVp0ecCAAC72mqMkL8wyeVz669K8uruvk+S65IcP5Ufn+S6qfzVU71U1aFJnprkvkmOSvJnU8jfI8lrkzwuyaFJnjbVBQCANWOhgbyqDkxydJLXT+uV5FFJ3j5VOS3JE6blY6b1TNsfPdU/JskZ3f2d7v7nJJuTHD69Nnf357v7u0nOmOoCAMCasegR8v+e5CVJfjCt75vk+u6+aVq/KskB0/IBSa5Mkmn7DVP9H5Yv2Wd75QAAsGYsLJBX1a8kuaa7L1pUG7egLydU1YVVdeG11147ujsAAPBDixwhf1iSX62qLZlNJ3lUktck2auqNkx1Dkxy9bR8dZKDkmTafrckX58vX7LP9sp/Qnef0t2Hdfdh++23386fGQAA7CILC+Td/dLuPrC7N2b2pcz3d/czknwgyZOmasclefe0fOa0nmn7+7u7p/KnTndhuXeSQ5J8PMkFSQ6Z7tpy+6mNMxd1PgAAsAgbbr7KLvefkpxRVa9I8skkb5jK35DkTVW1OcnWzAJ2uvuyqnpbkk8nuSnJ87v7+0lSVS9Ick6SPZKc2t2XreqZAADATlqVQN7dH0zywWn585ndIWVpnW8n+fXt7P/KJK9cpvzsJGfvwq4CAMCq8qROAAAYSCAHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYSCAHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYSCAHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYSCAHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYSCAHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYaEWB
vKpeWFV3rZk3VNUnqurIRXcOAAB2dysdIX92d38jyZFJ9k7yzCQnLaxXAACwTqw0kNf0/vgkb+ruy+bKAACAW2mlgfyiqnpfZoH8nKq6S5IfLK5bAACwPmxYYb3jk9w/yee7+8aq2jfJsxbXLQAAWB9WOkLeSQ5N8lvT+p2S7LmQHgEAwDqy0kD+Z0kekuRp0/o3k7x2IT0CAIB1ZKVTVh7c3Q+sqk8mSXdfV1W3X2C/AABgXVjpCPn3qmqPzKaupKr2iy91AgDATlvpCPnJSd6Z5B5V9cokT0rynxfWK1gFGzedtSrtbDnp6FVpBwBYm1YUyLv7zVV1UZJHZ3b/8Sd09+UL7RkAAKwDKwrkVbVPkmuSvHWu7Hbd/b1FdQwAANaDlc4h/0SSa5P8U5IrpuUtVfWJqnrQojoHAAC7u5UG8nOTPL67797d+yZ5XJL3JHleZrdEBAAAboWVBvIjuvucbSvd/b4kD+nu85PcYSE9AwCAdWCld1n5clX9pyRnTOtPSfLV6VaIbn8IAAC30kpHyJ+e5MAk75peB09leyR58mK6BgAAu7+V3vbwa0l+czubN++67gAAwPqy0tse7pfkJUnum2TPbeXd/agF9QsAANaFlU5ZeXOSzyS5d5I/TLIlyQUL6hMAAKwbKw3k+3b3G5J8r7v/rrufncToOAAA7KSV3mVl2xM5v1xVRyf5UpJ9FtMlAABYP1YayF9RVXdL8uIkf5Lkrkn+t4X1CgAA1omV3mXlPdPiDUkeubjuAADA+rLSu6zcO7PbHm6c36e7f3Ux3QIAgPVhpVNW3pXkDUn+Jp7MCQAAu8xKA/m3u/vkhfYEAADWoZXe9vA1VXViVT2kqh647bWjHapqz6r6eFX9Y1VdVlV/OJXfu6o+VlWbq+ovq+r2U/kdpvXN0/aNc8d66VT+2ap67Fz5UVPZ5qradIvPHgAABlvpCPnPJ3lmZvce3zZlpbPje5F/J8mjuvtbVXW7JB+uqvcm+e0kr+7uM6rqz5Mcn+R10/t13X2fqnpqklcleUpVHZrkqZk9JfReSf62qv7t1MZrkzwmyVVJLqiqM7v70ys8JwAAGG6lgfzXk/xMd393pQfu7k7yrWn1dtNrW4h/+lR+WpI/yCyQHzMtJ8nbk/xpVdVUfkZ3fyfJP1fV5iSHT/U2d/fnk6SqzpjqCuQAAKwZK52y8qkke93Sg1fVHlV1cZJrkpyb5HNJru/um6YqVyU5YFo+IMmVSTJtvyHJvvPlS/bZXjkAAKwZKx0h3yvJZ6rqgsymoiS5+dsedvf3k9y/qvZK8s4kP3trO7ozquqEJCckycEHHzyiCwAAsKyVBvITd6aR7r6+qj6Q5CFJ9qqqDdMo+IFJrp6qXZ3koCRXVdWGJHdL8vW58m3m99le+dL2T0lySpIcdthhvTPnAgAAu9JKn9T5d7f0wFW1X5LvTWH8jpl9+fJVST6Q5ElJzkhyXJJ3T7ucOa1/dNr+/u7uqjozyVuq6o8z+1LnIUk+nqSSHDI9tOjqzL74uW1uOgAArAk7DORV9c3Mvoj5E5sy+97mXXew+z2TnFZVe2Q2V/1t3f2eqvp0kjOq6hVJPpnZA4cyvb9p+tLm1swCdrr7sqp6W2Zf1rwpyfOnqTCpqhckOSfJHklO7e7LVnLSAABwW7HDQN7dd7m1B+7uS5I8YJnyz+dHd0mZL/92ZndzWe5Yr0zyymXKz05y9q3tIwAAjLbSu6wAAAALIJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5
AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMtGF0BwDmbdx01sLb2HLS0QtvAwBWygg5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAG0Z3AHZ3GzedtfA2tpx09MLbAAAWY2Ej5FV1UFV9oKo+XVWXVdULp/J9qurcqrpiet97Kq+qOrmqNlfVJVX1wLljHTfVv6Kqjpsrf1BVXTrtc3JV1aLOBwAAFmGRU1ZuSvLi7j40yRFJnl9VhybZlOS87j4kyXnTepI8Lskh0+uEJK9LZgE+yYlJHpzk8CQnbgvxU53nzu131ALPBwAAdrmFBfLu/nJ3f2Ja/maSy5MckOSYJKdN1U5L8oRp+Zgkp/fM+Un2qqp7JnlsknO7e2t3X5fk3CRHTdvu2t3nd3cnOX3uWAAAsCasypc6q2pjkgck+ViS/bv7y9OmryTZf1o+IMmVc7tdNZXtqPyqZcqXa/+Eqrqwqi689tprd+pcAABgV1p4IK+qOyf56yQv6u5vzG+bRrZ70X3o7lO6+7DuPmy//fZbdHMAALBiCw3kVXW7zML4m7v7HVPxV6fpJpner5nKr05y0NzuB05lOyo/cJlyAABYMxZ5l5VK8oYkl3f3H89tOjPJtjulHJfk3XPlx053WzkiyQ3T1JZzkhxZVXtPX+Y8Msk507ZvVNURU1vHzh0LAADWhEXeh/xhSZ6Z5NKqungqe1mSk5K8raqOT/KFJE+etp2d5PFJNie5McmzkqS7t1bVHyW5YKr38u7eOi0/L8kbk9wxyXunFwAArBkLC+Td/eEk27sv+KOXqd9Jnr+dY52a5NRlyi9Mcr+d6CYAAAy1KndZAQAAlieQAwDAQAI5AAAMtMgvdQKrZOOms0Z3AQC4lYyQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAy0YXQHAGC1bNx01sLb2HLS0QtvA9i9GCEHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYSCAHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgTaM7gAAcMtt3HTWwtvYctLRC28DMEIOAABDCeQAADCQQA4AAAMJ5AAAMJBADgAAAwnkAAAwkEAOAAADCeQAADCQBwMB685qPFAlWZ2Hqng4DMDaZ4QcAAAGEsgBAGAgU1YAYBdarSlRwO5DIAdYEMEMgJUwZQUAAAYSyAEAYCCBHAAABhLIAQBgIIEcAAAGEsgBAGAgtz3cjbjFGgDA2mOEHAAABhLIAQBgIIEcAAAGEsgBAGAggRwAAAZaWCCvqlOr6pqq+tRc2T5VdW5VXTG97z2VV1WdXFWbq+qSqnrg3D7HTfWvqKrj5sofVFWXTvucXFW1qHMBAIBFWeQI+RuTHLWkbFOS87r7kCTnTetJ8rgkh0yvE5K8LpkF+CQnJnlwksOTnLgtxE91nju339K2AADgNm9hgby7P5Rk65LiY5KcNi2fluQJc+Wn98z5SfaqqnsmeWySc7t7a3dfl+TcJEdN2+7a3ed3dyc5fe5YAACwZqz2HPL9u/vL0/JXkuw/LR+Q5Mq5eldNZTsqv2qZcgAAWFOGPamzu7uqejXaqqoTMpsKk4MPPng1mgQAbkNW42nWW046euFtsHta7RHyr07TTTK9XzOVX53koLl6B05lOyo/cJnyZXX3Kd19WHcftt9+++30SQAAwK6y2oH8zCTb7pRyXJJ3z5UfO91t5YgkN0xTW85JcmRV7T19mfPIJOdM275RVUdMd1c5du5YAACwZixsykpVvTXJI5LcvaquyuxuKScleVtVHZ/kC0mePFU/O8njk2xOcmOSZyVJd2+tqj9KcsFU7+Xdve2L
os/L7E4ud0zy3ukFAABrysICeXc/bTubHr1M3U7y/O0c59Qkpy5TfmGS++1MHwEAYDRP6gQAgIEEcgAAGEggBwCAgQRyAAAYSCAHAICBBHIAABhIIAcAgIEWdh9yAAB2vY2bzlp4G1tOOnrhbfAjRsgBAGAggRwAAAYSyAEAYCCBHAAABhLIAQBgIIEcAAAGEsgBAGAggRwAAAYSyAEAYCBP6gQAhlqNJ0/CbZkRcgAAGEggBwCAgQRyAAAYSCAHAICBBHIAABjIXVYA2KHVugPGlpOOXpV2AG5rBHIAAH7Mavwh7o/wHzFlBQAABhLIAQBgIFNWAAB2AU8c5dYSyAEAWHW+MP4jpqwAAMBAAjkAAAxkygoAtwnm3wLrlRFyAAAYSCAHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYSCAHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYaMPoDgAAt00bN501uguwLhghBwCAgQRyAAAYSCAHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYSCAHAICBBHIAABhIIAcAgIEEcgAAGGjNB/KqOqqqPltVm6tq0+j+AADALbGmA3lV7ZHktUkel+TQJE+rqkPH9goAAFZuTQfyJIcn2dzdn+/u7yY5I8kxg/sEAAArttYD+QFJrpxbv2oqAwCANWHD6A6shqo6IckJ0+q3quqzA7px9yRfG9Au47n265drvz657uuXa38bVK9alWaWu/Y/vdKd13ogvzrJQXPrB05lP6a7T0lyymp1ajlVdWF3HzayD4zh2q9frv365LqvX679+rWz136tT1m5IMkhVXXvqrp9kqcmOXNwnwAAYMXW9Ah5d99UVS9Ick6SPZKc2t2XDe4WAACs2JoO5EnS3WcnOXt0P1Zg6JQZhnLt1y/Xfn1y3dcv13792qlrX929qzoCAADcQmt9DjkAAKxpAvkqqKqjquqzVbW5qjaN7g+7VlWdWlXXVNWn5sr2qapzq+qK6X3vqbyq6uTp38IlVfXAcT1nZ1TVQVX1gar6dFVdVlUvnMpd+91cVe1ZVR+vqn+crv0fTuX3rqqPTdf4L6ebDaSq7jCtb562bxzZf3ZOVe1RVZ+sqvdM6677OlBVW6rq0qq6uKounMp22e97gXzBqmqPJK9N8rgkhyZ5WlUdOrZX7GJvTHLUkrJNSc7r7kOSnDetJ7N/B4dMrxOSvG6V+siud1OSF3f3oUmOSPL86b9t1373950kj+ruX0xy/yRHVdURSV6V5NXdfZ8k1yU5fqp/fJLrpvJXT/VYu16Y5PK5ddd9/Xhkd99/7vaGu+z3vUC+eIcn2dzdn+/u7yY5I8kxg/vELtTdH0qydUnxMUlOm5ZPS/KEufLTe+b8JHtV1T1Xp6fsSt395e7+xLT8zcz+B31AXPvd3nQNvzWt3m56dZJHJXnWA83gAAAFkklEQVT7VL702m/7N/H2JI+uqlql7rILVdWBSY5O8vppveK6r2e77Pe9QL54ByS5cm79qqmM3dv+3f3lafkrSfaflv172A1NH0U/IMnH4tqvC9O0hYuTXJPk3CSfS3J9d980VZm/vj+89tP2G5Lsu7o9Zhf570lekuQH0/q+cd3Xi07yvqq6aHoCfLILf9+v+dsewm1dd3dVuZ3Rbqqq7pzkr5O8qLu/MT8A5trvvrr7+0nuX1V7JXlnkp8d3CUWrKp+Jck13X1RVT1idH9YdQ/v7qur6h5Jzq2qz8xv3Nnf90bIF+/qJAfNrR84lbF7++q2j6em92umcv8ediNVdbvMwvibu/sdU7Frv4509/VJPpDkIZl9LL1toGv++v7w2k/b75bk66vcVXbew5L8alVtyWz66aOSvCau+7rQ3VdP79dk9kf44dmFv+8F8sW7IMkh07ewb5/kqUnOHNwnFu/MJMdNy8clefdc+bHTN7CPSHLD3MddrCHTXNA3JLm8u/94bpNrv5urqv2mkfFU
1R2TPCaz7xB8IMmTpmpLr/22fxNPSvL+9hCQNae7X9rdB3b3xsz+X/7+7n5GXPfdXlXdqarusm05yZFJPpVd+Pveg4FWQVU9PrN5Z3skObW7Xzm4S+xCVfXWJI9IcvckX01yYpJ3JXlbkoOTfCHJk7t76xTi/jSzu7LcmORZ3X3hiH6zc6rq4Un+Psml+dF80pdlNo/ctd+NVdUvZPYFrj0yG9h6W3e/vKp+JrOR032SfDLJv+/u71TVnknelNn3DLYmeWp3f35M79kVpikrv9Pdv+K67/6ma/zOaXVDkrd09yurat/sot/3AjkAAAxkygoAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADrFFV9eqqetHc+jlV9fq59f+rqn67qu5VVW/fzjE+WFWHTcsvmyvfWFWfWmE/XlRVx97Kc3hBVT371uwLsLsQyAHWrn9I8tAkqaqfyuxe+Ped2/7QJB/p7i9195OW2X+pl918lR83PYHw2Uneckv3nZya5Ddv5b4AuwWBHGDt+khmj2xPZkH8U0m+WVV7V9Udkvxckk/Mj3ZX1R2r6oyquryq3pnkjlP5SUnuWFUXV9Wbp2PuUVX/T1VdVlXvm55KudSjknyiu2+ajvPBqnrNdJxPVdXhU/lrqup/n5YfW1Ufqqqf6u4bk2zZVg9gPRLIAdao7v5Skpuq6uDMRsM/mtmTQh+S5LAkl3b3d5fs9h+T3NjdP5fZU2UfNB1rU5J/6e77T48DT5JDkry2u++b5PokT1ymGw9LctGSsn/V3fdP8rzMRsCT5KVJnlJVj0xycmZPrtv2hNMLk/zSLf4BAOwmBHKAte0jmYXxbYH8o3Pr/7BM/V9O8hdJ0t2XJLlkB8f+5+6+eFq+KMnGZercM8m1S8reOh3/Q0nuWlV7TSPhz01ybpI/7e7PzdW/Jsm9dtAPgN2aQA6wtm2bR/7zmU1ZOT+zEfKHZhbWd8Z35pa/n2TDMnX+JcmeS8p6O+s/n+Tr+cnwved0HIB1SSAHWNs+kuRXkmzt7u9399Yke2UWypcL5B9K8vQkqar7JfmFuW3fq6rb3cL2L09ynyVlT5mO//AkN3T3DVX100lenOQBSR5XVQ+eq/9vM/tjAmBdEsgB1rZLM7u7yvlLym7o7q8tU/91Se5cVZcneXl+fP73KUkumftS50q8N7NpMPO+XVWfTPLnSY6vqkryhiS/M817Pz7J66tq28j6wzKbygKwLlX30k8WAWDlpru1vKS7r6iqD2YWvC9c4b4PSPLb3f3MRfYR4LbMCDkAO2tTZl/uvDXunuT3d2FfANYcI+QAADCQEXIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYSCAHAICB/n8av9LMcfvPhwAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from matplotlib import pyplot as plt \n", + "import numpy as np \n", + "bins = list(range(0,500,20))\n", + "plt.figure(figsize=(12,8))\n", + "plt.hist(sizes, bins=bins)\n", + "plt.title(\"Face Image Sizes\") \n", + "plt.ylabel(\"Images\")\n", + "plt.xlabel(\"Width (px)\")\n", + "plt.yticks(range(0, 60000, 10000))\n", + "plt.title('IMDB-Wiki: Face Pixel Size')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```\n", + "dob: date of birth (Matlab serial date number)\n", + "photo_taken: year when the photo was taken\n", + "full_path: path to file\n", + "gender: 0 for female and 1 for male, NaN if unknown\n", + "name: name of the celebrity\n", + "face_location: location of the face. To crop the face in Matlab run\n", + "\n", + "img(face_location(2):face_location(4),face_location(1):face_location(3),:))\n", + "\n", + "face_score: detector score (the higher the better). Inf implies that no face was found in the image and the face_location then just returns the entire image\n", + "second_face_score: detector score of the face with the second highest score. This is useful to ignore images with more than one face. 
second_face_score is NaN if no second face was detected.\n", + "celeb_names (IMDB only): list of all celebrity names\n", + "celeb_id (IMDB only): index of celebrity name\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:megapixels]", + "language": "python", + "name": "conda-env-megapixels-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/megapixels/notebooks/datasets/imdb_wiki/identity.ipynb b/megapixels/notebooks/datasets/imdb_wiki/identity.ipynb new file mode 100644 index 00000000..40d7bd86 --- /dev/null +++ b/megapixels/notebooks/datasets/imdb_wiki/identity.ipynb @@ -0,0 +1,498 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# IMDB-WIKI Knowledge Graph\n", + "\n", + "- convert names to Knowledge Graph entity IDs\n", + "- The `imdb.mat` file contains only full names, need KG ids `/m/12345`" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import os\n", + "import os.path as osp\n", + "from os.path import join\n", + "from glob import glob\n", + "from pathlib import Path\n", + "import random\n", + "import math\n", + "from datetime import datetime\n", + "import requests\n", + "import json\n", + "import time\n", + "from pprint import pprint\n", + "from multiprocessing.pool import ThreadPool\n", + "import threading\n", + "import urllib.request\n", + "\n", + "from tqdm import tqdm_notebook as tqdm\n", + "import pandas as pd\n", + "from scipy.io import loadmat\n", + "import numpy as np\n", + 
"%matplotlib inline\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load IMDB Metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
celeb_iddobfilepathgendernamex1x2y1y2year_photo
index
064881900-5-1101/nm0000001_rm124825600_1899-5-10_1968.jpgmFred Astaire1072.9261214.784161.838303.6961968
164881900-5-1101/nm0000001_rm3343756032_1899-5-10_1970.jpgmFred Astaire477.184622.592100.352245.7601970
\n", + "
" + ], + "text/plain": [ + " celeb_id dob filepath \\\n", + "index \n", + "0 6488 1900-5-11 01/nm0000001_rm124825600_1899-5-10_1968.jpg \n", + "1 6488 1900-5-11 01/nm0000001_rm3343756032_1899-5-10_1970.jpg \n", + "\n", + " gender name x1 x2 y1 y2 year_photo \n", + "index \n", + "0 m Fred Astaire 1072.926 1214.784 161.838 303.696 1968 \n", + "1 m Fred Astaire 477.184 622.592 100.352 245.760 1970 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fp_meta_imdb = '/data_store_hdd/datasets/people/imdb_wiki/metadata/imdb_mat.csv'\n", + "df_meta_imdb = pd.read_csv(fp_meta_imdb).set_index('index')\n", + "df_meta_imdb.head(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Google Knowledge Graph API" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# read API key\n", + "api_key = open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key').read()\n", + "url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "def _get_kg_meta(result_obj, params):\n", + " global api_key, url_kg_api\n", + " \n", + " params['indent'] = True\n", + " params['key'] = api_key\n", + " params['limit'] = 1\n", + " \n", + " url = f'{url_kg_api}?{urllib.parse.urlencode(params)}'\n", + " try:\n", + " json_response = urllib.request.urlopen(url).read()\n", + " except Exception as e:\n", + " result['error'] = str(e)\n", + " else:\n", + " try:\n", + " response = json.loads(json_response)\n", + " items = response.get('itemListElement', [])\n", + " result_obj['accessed'] = True\n", + " if items:\n", + " item = items[0]\n", + " item_result = item.get('result', [])\n", + " result_obj['description'] = item_result.get('description', '')\n", + " det_desc = item_result.get('detailedDescription', '')\n", + " if not 
result_obj['kg_id']:\n", + " result_obj['kg_id'] = item_result.get('@id', '').replace('kg:','')\n", + " if det_desc:\n", + " result_obj['description_extended'] = det_desc.get('articleBody','')\n", + " result_obj['description_license'] = det_desc.get('license','')\n", + " result_obj['description_url'] = det_desc.get('url','')\n", + " else:\n", + " result_obj['description_extended'] = ''\n", + " result_obj['description_license'] = ''\n", + " result_obj['description_url'] = ''\n", + " result_img = item_result.get('image', '')\n", + " if result_img:\n", + " result_obj['image_url'] = result_img.get('contentUrl', '')\n", + " result_obj['name'] = item_result.get('name', '')\n", + " result_obj['score'] = item.get('resultScore', 0.0)\n", + " result_obj['url'] = item_result.get('url', '')\n", + " except Exception as e:\n", + " result_obj['error'] = str(e)\n", + " return result_obj\n", + " \n", + "def get_kg_from_name(obj):\n", + " if obj['accessed']:\n", + " return obj\n", + " params = {'query': obj['query']}\n", + " return _get_kg_meta(obj, params)\n", + " \n", + "def get_kg_from_kg_id(obj):\n", + " if obj['accessed']:\n", + " return obj\n", + " params = {'ids': obj['kg_id']}\n", + " return _get_kg_meta(obj, params)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'accessed': True,\n", + " 'description': 'American singer',\n", + " 'description_extended': 'Taylor Alison Swift is an American '\n", + " \"singer-songwriter. 
As one of the world's leading \"\n", + " 'contemporary recording artists, she is known for '\n", + " 'narrative songs about her personal life, which has '\n", + " 'received widespread media coverage.\\n',\n", + " 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',\n", + " 'description_url': 'https://en.wikipedia.org/wiki/Taylor_Swift',\n", + " 'image_url': 'http://t0.gstatic.com/images?q=tbn:ANd9GcST848UJ0u31E6aoQfb2nnKZFyad7rwNF0ZLOCACGpu4jnboEzV',\n", + " 'kg_id': '/m/0dl567',\n", + " 'name': 'Taylor Swift',\n", + " 'query': 'Taylor Swift',\n", + " 'score': 1241.476318,\n", + " 'url': 'http://taylorswift.com/'}\n" + ] + } + ], + "source": [ + "# make a test query to check if API works\n", + "obj = {'query': 'Taylor Swift', 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False} # default\n", + "result = get_kg_from_name(obj)\n", + "pprint(obj)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "obj = {'query': 'Taylor Swift', 'score': 0.0, 'description': '', 'url':'', 'accessed': False} # default\n", + "result = get_kg_from_id(obj)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "# build mapped_person objects\n", + "mapped_persons = []\n", + "count = 0\n", + "df_person_groups = df_meta_imdb.groupby('name')\n", + "for group_name, df_name_group in df_person_groups:\n", + " obj = {'query': group_name, 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False}\n", + " mapped_persons.append(obj)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "# define thread mapping function\n", + "def pool_map_persons(obj):\n", + " global pbar\n", + " pbar.update(1)\n", + " kg_obj = get_kg_from_name(obj)\n", + " return kg_obj" + ] + }, + { + "cell_type": "code", + 
"execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "87f6a2be42284199b8a67458f4090497", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=20284), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0/20284 remaining\n" + ] + } + ], + "source": [ + "num_threads = 2\n", + "pbar = tqdm(total=len(mapped_persons))\n", + "\n", + "num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n", + "print(f'{num_non_accessed}/{len(mapped_persons)} remaining')\n", + "\n", + "# convert to thread pool\n", + "while num_non_accessed > 0:\n", + " print(f'{num_non_accessed}/{len(mapped_persons)} remaining')\n", + " pool = ThreadPool(num_threads)\n", + "\n", + " # start threading\n", + " with tqdm(total=len(mapped_persons)) as pbar:\n", + " mapped_persons = pool.map(pool_map_persons, mapped_persons)\n", + "\n", + " # close tqdm\n", + " pbar.close()\n", + "\n", + " num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n", + " if num_non_accessed > 0:\n", + " print(f'{num_non_accessed}/{len(mapped_persons)} remaining. Sleeping...')\n", + " time.sleep(60*20) # wait 20 minutes" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'query': \"'Lee' George Quinones\", 'kg_id': '/m/08hvx1', 'score': 280.322754, 'description': 'Artist', 'url': 'http://www.leequinones.com/', 'accessed': True, 'description_extended': 'George Lee Quiñones is a Puerto Rican artist and actor. 
He is one of several artists to gain fame from the New York City Subway graffiti movement.\\n', 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License', 'description_url': 'https://en.wikipedia.org/wiki/Lee_Qui%C3%B1ones', 'name': 'Lee QuiƱones'}\n" + ] + } + ], + "source": [ + "# test output for a person\n", + "print(mapped_persons[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n" + ] + } + ], + "source": [ + "# reduce CC attribution string. the default string from Google Knowledge Graph is too verbose\n", + "cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'\n", + "cc_short = 'CC BY-SA 3.0'\n", + "nchanged = 0\n", + "for mapped_person in mapped_persons:\n", + " license = mapped_person.get('description_license', None)\n", + " if license == cc_long:\n", + " nchanged += 1\n", + " mapped_person['description_license'] = cc_short\n", + "print(nchanged)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n" + ] + } + ], + "source": [ + "# find number not accessed\n", + "n_empty = 0\n", + "for mapped_person in mapped_persons:\n", + " if not mapped_person.get('accessed', False):\n", + " n_empty += 1\n", + " print(mapped_person['kg_id'])\n", + "print(n_empty)" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [], + "source": [ + "# create dataframe for mapped persons\n", + "df_mapped_persons = pd.DataFrame.from_dict(mapped_persons)\n", + "df_mapped_persons.index.name = 'index'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# check output\n", + "df_mapped_persons.head()" + ] + }, + { +
"cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [], + "source": [ + "# save\n", + "fp_out = '/data_store_hdd/datasets/people/imdb_wiki/metadata/identity_kg.csv'\n", + "df_mapped_persons.to_csv(fp_out, encoding = 'utf-16')" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [], + "source": [ + "# create small version\n", + "limit = 1000\n", + "fpp_out = Path(fp_out)\n", + "fp_out_sm = join(fpp_out.parent, f'{fpp_out.stem}_0_{limit}.csv')\n", + "df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons[0:limit])\n", + "df_mapped_persons_sm.index.name = 'index'\n", + "df_mapped_persons_sm.to_csv(fp_out_sm, encoding = 'utf-16')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:megapixels]", + "language": "python", + "name": "conda-env-megapixels-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_kg.ipynb b/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_kg.ipynb deleted file mode 100644 index b9a77fda..00000000 --- a/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_kg.ipynb +++ /dev/null @@ -1,468 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# IMDB-WIKI Knowledge Graph" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import os.path as osp\n", - "from os.path import join\n", - "from glob import glob\n", - "import random\n", - "import math\n", - "from datetime import datetime\n", - "import requests\n", - "import json\n", - "import urllib\n", - "\n", - "import cv2 as cv\n", - "import pandas as pd\n", - "from scipy.io 
import loadmat\n", - "import numpy as np\n", - "%matplotlib inline\n", - "import matplotlib.pyplot as plt\n", - "\n", - "from tqdm import tqdm_notebook as tqdm\n", - "%reload_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load Metadata" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "fp_meta = '/data_store_hdd/datasets/people/imdb_wiki/metadata/imdb_wiki.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "df_meta = pd.read_csv(fp_meta).set_index('index')" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
celeb_iddobfilepathgendernamex1x2y1y2year_photo
index
064881900-5-1101/nm0000001_rm124825600_1899-5-10_1968.jpgmFred Astaire1072.9260001214.784000161.838000303.6960001968
164881900-5-1101/nm0000001_rm3343756032_1899-5-10_1970.jpgmFred Astaire477.184000622.592000100.352000245.7600001970
264881900-5-1101/nm0000001_rm577153792_1899-5-10_1968.jpgmFred Astaire114.969643451.686572114.969643451.6865721968
364881900-5-1101/nm0000001_rm946909184_1899-5-10_1968.jpgmFred Astaire622.885506844.339008424.217504645.6710061968
464881900-5-1101/nm0000001_rm980463616_1899-5-10_1968.jpgmFred Astaire1013.8590021201.586128233.882042421.6091681968
\n", - "
" - ], - "text/plain": [ - " celeb_id dob filepath \\\n", - "index \n", - "0 6488 1900-5-11 01/nm0000001_rm124825600_1899-5-10_1968.jpg \n", - "1 6488 1900-5-11 01/nm0000001_rm3343756032_1899-5-10_1970.jpg \n", - "2 6488 1900-5-11 01/nm0000001_rm577153792_1899-5-10_1968.jpg \n", - "3 6488 1900-5-11 01/nm0000001_rm946909184_1899-5-10_1968.jpg \n", - "4 6488 1900-5-11 01/nm0000001_rm980463616_1899-5-10_1968.jpg \n", - "\n", - " gender name x1 x2 y1 y2 \\\n", - "index \n", - "0 m Fred Astaire 1072.926000 1214.784000 161.838000 303.696000 \n", - "1 m Fred Astaire 477.184000 622.592000 100.352000 245.760000 \n", - "2 m Fred Astaire 114.969643 451.686572 114.969643 451.686572 \n", - "3 m Fred Astaire 622.885506 844.339008 424.217504 645.671006 \n", - "4 m Fred Astaire 1013.859002 1201.586128 233.882042 421.609168 \n", - "\n", - " year_photo \n", - "index \n", - "0 1968 \n", - "1 1970 \n", - "2 1968 \n", - "3 1968 \n", - "4 1968 " - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_meta.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ids" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "api_key = open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key').read()\n", - "\n", - "def get_knowledge(q, api_key):\n", - " service_url = 'https://kgsearch.googleapis.com/v1/entities:search'\n", - " params = {\n", - " 'query': q,\n", - " 'limit': 5,\n", - " 'indent': True,\n", - " 'key': api_key,\n", - " }\n", - " url = service_url + '?' 
+ urllib.parse.urlencode(params) # TODO: use requests\n", - " response = json.loads(urllib.request.urlopen(url).read())\n", - " response = response.get('itemListElement', [])\n", - " if len(response) > 0:\n", - " result = response[0].get('result', [])\n", - " result['score'] = response[0]['resultScore']\n", - " return result\n", - " else:\n", - " return []" - ] - }, - { - "cell_type": "code", - "execution_count": 106, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "General Secretary of the Communist Party of China\n", - "Xi Jinping\n" - ] - }, - { - "ename": "KeyError", - "evalue": "'url'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m--------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'description'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'name'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'url'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'score'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyError\u001b[0m: 'url'" - ] - } - ], - "source": [ - "# test\n", - "q = 'Xi Jinping'\n", - "r = get_knowledge(q, api_key)\n", - "print(r['description'])\n", - "print(r['name'])\n", - 
"print(r['url'])\n", - "print(r['score'])" - ] - }, - { - "cell_type": "code", - "execution_count": 107, - "metadata": {}, - "outputs": [], - "source": [ - "from pprint import pprint" - ] - }, - { - "cell_type": "code", - "execution_count": 108, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "kg:/m/06ff60\n" - ] - } - ], - "source": [ - "print(r['@id'])" - ] - }, - { - "cell_type": "code", - "execution_count": 89, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'@id': 'kg:/g/11f4ksbzcm',\n", - " '@type': ['Thing', 'Event'],\n", - " 'detailedDescription': {'articleBody': 'On February 14, 2018, a gunman opened '\n", - " 'fire at Marjory Stoneman Douglas High '\n", - " 'School in Parkland, Florida, killing '\n", - " 'seventeen students and staff members '\n", - " 'and injuring seventeen others. ',\n", - " 'license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',\n", - " 'url': 'https://en.wikipedia.org/wiki/Stoneman_Douglas_High_School_shooting'},\n", - " 'image': {'contentUrl': 'http://t1.gstatic.com/images?q=tbn:ANd9GcQmY7VqmGt4zEJU8Rc4EwPWroYd-L0QQ5wkZfiFO-WRqNBC-FPN',\n", - " 'url': 'https://en.wikipedia.org/wiki/Stoneman_Douglas_High_School_shooting'},\n", - " 'name': 'Stoneman Douglas High School shooting',\n", - " 'score': 60.411652}\n" - ] - } - ], - "source": [ - "pprint(r)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "dir_msceleb = '/data_store_hdd/datasets/people/msceleb/media/original/'" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": {}, - "outputs": [], - "source": [ - "kgs_msceleb = os.listdir(dir_msceleb)\n", - "kgs_msceleb = ['/' + x.replace('.','/') for x in kgs_msceleb]" - ] - }, - { - "cell_type": "code", - "execution_count": 109, - "metadata": {}, - "outputs": [ - { - "data": { - 
"text/plain": [ - "True" - ] - }, - "execution_count": 109, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "'/m/06ff60' in kgs_msceleb" - ] - }, - { - "cell_type": "code", - "execution_count": 111, - "metadata": {}, - "outputs": [], - "source": [ - "def get_kg_by_id(kg_id, api_key):\n", - " service_url = 'https://kgsearch.googleapis.com/v1/entities:search'\n", - " params = {\n", - " 'ids': kg_id,\n", - " 'limit': 1,\n", - " 'indent': True,\n", - " 'key': api_key,\n", - " }\n", - " url = service_url + '?' + urllib.parse.urlencode(params) # TODO: use requests\n", - " try:\n", - " response = json.loads(urllib.request.urlopen(url).read())\n", - " response = response.get('itemListElement', [])\n", - " result = response[0].get('result', [])\n", - " result['score'] = response[0]['resultScore']\n", - " return result\n", - " except Exception as e:\n", - " return []" - ] - }, - { - "cell_type": "code", - "execution_count": 122, - "metadata": {}, - "outputs": [], - "source": [ - "a = get_kg_by_id('/m/0100n5bs', api_key)" - ] - }, - { - "cell_type": "code", - "execution_count": 123, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 123, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "a" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:megapixels]", - "language": "python", - "name": "conda-env-megapixels-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.6" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_meta_debug.ipynb 
b/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_meta_debug.ipynb deleted file mode 100644 index 648fb9ac..00000000 --- a/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_meta_debug.ipynb +++ /dev/null @@ -1,573 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 06: Face pose dlib/MTCNN" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import os.path as osp\n", - "from os.path import join\n", - "from glob import glob\n", - "import random\n", - "import math\n", - "from datetime import datetime\n", - "\n", - "import cv2 as cv\n", - "import pandas as pd\n", - "from scipy.io import loadmat\n", - "import numpy as np\n", - "%matplotlib inline\n", - "import matplotlib.pyplot as plt\n", - "\n", - "from tqdm import tqdm_notebook as tqdm\n", - "%reload_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load Metadata" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "fp_mat = '/data_store_hdd/datasets/people/imdb_wiki/downloads/imdb.mat'\n", - "dir_out = '/data_store_hdd/datasets/people/imdb_wiki/metadata/'" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "mat_data = loadmat(fp_mat)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# row 3\n", - "def load_parse_imdb_mat(mat):\n", - " metadata = mat['imdb'][0][0]\n", - " results = []\n", - " num_records = len(metadata[0][0])\n", - " print(f'loaded: {num_records} records')\n", - " for i in tqdm(range(num_records), total=num_records):\n", - " dob_matlab = metadata[0][0][i]\n", - " dob = datetime.fromordinal(dob_matlab)\n", - " dob_str = f'{dob.year}-{dob.month}-{dob.day}'\n", - " year_photo = metadata[1][0][i]\n", - " fp = metadata[2][0][i][0]\n", - " 
gender_val = metadata[3][0][i]\n", - " if gender_val == 0:\n", - " gender = 'f'\n", - " elif gender_val == 1:\n", - " gender = 'm'\n", - " else:\n", - " gender = None\n", - " name = metadata[4][0][i][0]\n", - " roi = metadata[5][0][i][0]\n", - " face_conf = metadata[6][0][i]\n", - " face_conf_second = metadata[7][0][i]\n", - " celeb_id = metadata[9][0][i]\n", - " result = {\n", - " 'dob': dob_str,\n", - " 'year_photo': year_photo,\n", - " 'filepath': fp,\n", - " 'gender': gender,\n", - " 'name': name,\n", - " 'x1': roi[0],\n", - " 'y1': roi[1],\n", - " 'x2': roi[2],\n", - " 'y2': roi[3],\n", - " 'celeb_id': celeb_id\n", - " }\n", - " results.append(result)\n", - " return results" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "loaded: 460723 records\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d50c6e22d1694b54815a86d85cda6241", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, max=460723), HTML(value='')))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "results_meta = load_parse_imdb_mat(mat_data)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "df_meta = pd.DataFrame.from_dict(results_meta)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
celeb_iddobfilepathgendernamex1x2y1y2year_photo
064881900-5-1101/nm0000001_rm124825600_1899-5-10_1968.jpgmFred Astaire1072.9260001214.784000161.838000303.6960001968
164881900-5-1101/nm0000001_rm3343756032_1899-5-10_1970.jpgmFred Astaire477.184000622.592000100.352000245.7600001970
264881900-5-1101/nm0000001_rm577153792_1899-5-10_1968.jpgmFred Astaire114.969643451.686572114.969643451.6865721968
364881900-5-1101/nm0000001_rm946909184_1899-5-10_1968.jpgmFred Astaire622.885506844.339008424.217504645.6710061968
464881900-5-1101/nm0000001_rm980463616_1899-5-10_1968.jpgmFred Astaire1013.8590021201.586128233.882042421.6091681968
\n", - "
" - ], - "text/plain": [ - " celeb_id dob filepath gender \\\n", - "0 6488 1900-5-11 01/nm0000001_rm124825600_1899-5-10_1968.jpg m \n", - "1 6488 1900-5-11 01/nm0000001_rm3343756032_1899-5-10_1970.jpg m \n", - "2 6488 1900-5-11 01/nm0000001_rm577153792_1899-5-10_1968.jpg m \n", - "3 6488 1900-5-11 01/nm0000001_rm946909184_1899-5-10_1968.jpg m \n", - "4 6488 1900-5-11 01/nm0000001_rm980463616_1899-5-10_1968.jpg m \n", - "\n", - " name x1 x2 y1 y2 year_photo \n", - "0 Fred Astaire 1072.926000 1214.784000 161.838000 303.696000 1968 \n", - "1 Fred Astaire 477.184000 622.592000 100.352000 245.760000 1970 \n", - "2 Fred Astaire 114.969643 451.686572 114.969643 451.686572 1968 \n", - "3 Fred Astaire 622.885506 844.339008 424.217504 645.671006 1968 \n", - "4 Fred Astaire 1013.859002 1201.586128 233.882042 421.609168 1968 " - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_meta.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create DataFrame for metadata" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "df_results = pd.DataFrame.from_dict(results_meta)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
celeb_iddobfilepathgendernamex1x2y1y2year_photo
064881900-5-1101/nm0000001_rm124825600_1899-5-10_1968.jpgmFred Astaire1072.9260001214.784000161.838000303.6960001968
164881900-5-1101/nm0000001_rm3343756032_1899-5-10_1970.jpgmFred Astaire477.184000622.592000100.352000245.7600001970
264881900-5-1101/nm0000001_rm577153792_1899-5-10_1968.jpgmFred Astaire114.969643451.686572114.969643451.6865721968
364881900-5-1101/nm0000001_rm946909184_1899-5-10_1968.jpgmFred Astaire622.885506844.339008424.217504645.6710061968
464881900-5-1101/nm0000001_rm980463616_1899-5-10_1968.jpgmFred Astaire1013.8590021201.586128233.882042421.6091681968
\n", - "
" - ], - "text/plain": [ - " celeb_id dob filepath gender \\\n", - "0 6488 1900-5-11 01/nm0000001_rm124825600_1899-5-10_1968.jpg m \n", - "1 6488 1900-5-11 01/nm0000001_rm3343756032_1899-5-10_1970.jpg m \n", - "2 6488 1900-5-11 01/nm0000001_rm577153792_1899-5-10_1968.jpg m \n", - "3 6488 1900-5-11 01/nm0000001_rm946909184_1899-5-10_1968.jpg m \n", - "4 6488 1900-5-11 01/nm0000001_rm980463616_1899-5-10_1968.jpg m \n", - "\n", - " name x1 x2 y1 y2 year_photo \n", - "0 Fred Astaire 1072.926000 1214.784000 161.838000 303.696000 1968 \n", - "1 Fred Astaire 477.184000 622.592000 100.352000 245.760000 1970 \n", - "2 Fred Astaire 114.969643 451.686572 114.969643 451.686572 1968 \n", - "3 Fred Astaire 622.885506 844.339008 424.217504 645.671006 1968 \n", - "4 Fred Astaire 1013.859002 1201.586128 233.882042 421.609168 1968 " - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_results.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "df_results.to_csv(join(dir_out,'imdb_wiki.csv'), index=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Count Images per Person" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "df_name_groups = df_results.groupby('name')\n", - "images_per_person = []\n", - "for name, df_name in df_name_groups:\n", - " images_per_person.append({'name': name, 'num_images': len(df_name)})\n", - "df_images_per_person = pd.DataFrame.from_dict(images_per_person)\n", - "df_images_per_person.to_csv(join(dir_out, 'imdb_images_per_person.csv'), index=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Find Face Size" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "sizes = [(x['x2'] - x['x1']) for x in results_meta]" - ] - }, - { - "cell_type": 
"code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "buckets = list(range(0,500,50))" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAuQAAAHwCAYAAADuC3p1AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzt3Xu4ZlV9J/jvL5SK7Y2LSCuXlGnpSdAkXhjES/J4GRElHcxovLYQRXm61UQnJnZpOkNitBunZ2JLYswwyghGJcZ4IYKNBDXGKAooARENpSkFvIAWoA7xgv7mj3eXvh5PFQeq3rM4dT6f53mfd++1195r7bOLw/esd717V3cHAAAY46dGdwAAANYzgRwAAAYSyAEAYCCBHAAABhLIAQBgIIEcAAAGEsgB1oCqekZVvW9uvavqPtup+62q+pnV691iVNXrq+plu+A4H66q37gV++0x/SwP3tk+AOyIQA6sG1W1par+l2n5N6ZQ++oldY6Zyt84rW+c1r81vb5aVe+pqscsc+x/mepcV1VnVdVBO+jLZ6vqKXPrD5vaWVr2zara0N1v7u4jV3Ke3X3n7v78Cn8mH6yqb8+d37eq6iEr2XdXmMLytvavraq3V9W/TpLufk53/5cFt793Vb2xqr5SVd+YrsvvTu1/f/pZfnGRfQAQyIH17HNJnlxVG+bKjkvyT8vU3au775zkF5Ocm+Sdy4y6/rupzj2TfDXJn+yg7Q8l+eW59V9O8pllyj7a3Tet4Fx2xgum4Lnt9dEFt7fUf5h+bj+bZL8k/+cqtn1ykttPbe+V5AmZ/bsAWDUCObCefSXJpUkemyRVtU+ShyY5c3s7dPdXuvs1Sf4gyauq6id+j3b3t5O8PcmhO2h7aSD/pSSvWqbsQ1PffqOqPrzcgarq4VV1ZVU9Ylrf7nSWW6Kq/rSqrppGji+oqofObdtQVb9fVZ+btl9YVfeath1aVX9bVVur6jNV9cSVtNfdX0/yjiT3m47zF1X1B9Py71XVR6pqj2n9N6vq0qq6w7T+sKo6v6qur6qLq+qXt9PMUv9zkrd09/Xd/YPuvry73zF3jj19SnLwkk8RbqyqH/6hVFXPmc71uqp6744+HQFYSiAH1rvTkxw7LT81ybuTfGcF+70jyT2S/E9LN1TVv0rylCTn72D/DyW5b1XtM4X6w5L8ZZK95soeNtXbrqo6Kslbkzyxuz+4zPanV9UlKzif5XwsyS8k2SezPzD+alsATvK7SZ6U5KjMRpafk+TbVXXnzD5BOD2zn88zkpxSVT/xc1qmr/sl+V+TfHKZzScl6SQvraqfTfLyJM/o7u9M4ffMJCdOfd2U5B1Vte8KzvH8JP91+oPnkO1V6u4vzn+KkORvMvu5Z/qD43eTHJPZCP/HkrxlBW0DJBHIAd6Z5BFVdbfMgvnpK9zvS9P7PnNl76qq65PckOQxSf7b9nbu7i8k+WJmo+C/mOSK7v6XJP8wV3b7zMLd9vx6kv87yeO6++Pbaect3f0LN3MuJ08jy9dX1Sfm9n1Td2+dpsz8H0nummTbyPtzkrysu6+YRpYv7u6tmYXSf+ru07v7pu6+KMm7Mgvv2/Nn08/t4uln8jvLnMf3M7s+vz0d779097Y/NI5NcmZ3nzP15X8k+cfM/li4Oc/L7A+h30pyeVVdUVU7nKtfVb+X5N5JnjsV/YepP5+dflavSHJ4VR2wgvYBBHJgfZtC8FlJ/nOSfbv7H1a4
67awtXWu7AndvVeSPZO8IMnfVdW/XjrdYa7+tmkrv5zk76eyD8+Vfby7dzRa/6Ikb+vuT62wz9vzW9291/R64LbCqnrJNA3jhiTXJblTkrtPmw/K8nOtfzrJw+YC/vWZfVpwzx20/7yp7QO6+5nT1JWf0N2fy+zndFCS1y1p82lL2jwiyb1u7sS7+8bufsV03vtm9snHX09/oP2Eqvp3mYX4X5umJm1r/7VzbX8tyQ+SHHhz7QMkAjlAMhsVf3GSv7gF+/xakmuSfHbphunuHO9I8v0kD19musM22wL5L+VHgfzv58p2OF0lsxHyJ1TVC29Bv1ekqh6Z2Wj0EzObkrJ3km8lqanKlUn+zTK7XpnkvLmAv9d03i/YBX06JsmDkvxdZvPt59v8f5e0eafu3u4nFMvp7huS/Nckd06ycZn2fy7JqUme1N1XL2n/+CXt37G7d/TpBsAPCeQAs4D3mOz4rihJkqrav6pekNl85Zd29w+WqVNTeNw7yeU7ONyHkjwgswC+bWT+0symQzwyNx/Iv5Tk0UleWFX/8eb6fgvdJclNmY323i6zL7HeaW7765O8oqr+zXS+95++FHtmZnPjn15Vt5teh69kDvmOVNU9kpyS5NmZ3QnniVX12Gnzm5L8WlU9pmb3Dt+zqh657UumN3PcE6vqsKq6fVXtmdnUla1JrlhSb6/Mvl/wkmXuQvPnSX5vCuypqr2qakdTdAB+jEAOrHs9c940B3p7rq+q/y+zwPz4JL/e3acuqfM305SUbyR5ZZLjuvuyHbT7T0muTfKV7r5+KvtBko9nNl/7Iyvo+xczC+Wbquo5S7fX7IFC2+3DDpyd5G8zC6ZbMjunL89t/2+ZzeU+b9p2SpI9p1Hmxyb591P9r2Q26nyH7JzXJ/mr7n5fd1+b2fztN1TV3t29JbNPLH4/s5/nFzP7xGOl/487LcnXM/sD5xFJju7uG5fUOSzJIUn+ZG760bZr9ldJ/jizL71+I8klme7cA7AS1d2j+wAAAOuWEXIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYaMPoDqy2u9/97r1x48bR3QAAYDd20UUXfa2791tJ3XUXyDdu3JgLL7xwdDcAANiNVdUXVlrXlBUAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYSCAHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYSCAHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYSCAHAICBBHIAABhow+gOwHI2bjpr4W1sOenohbcBAHBzjJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADLRhdAdYezZuOmt0FwAAdhtGyAEAYCCBHAAABhLIAQBgIIEcAAAGEsgBAGAggRwAAAYSyAEAYCCBHAAABlpoIK+qLVV1aVVdXFUXTmX7VNW5VXXF9L73VF5VdXJVba6qS6rqgXPHOW6qf0VVHTdX/qDp+JunfWuR5wMAALvaaoyQP7K779/dh03rm5Kc192HJDlvWk+SxyU5ZHqdkOR1ySzAJzkxyYOTHJ7kxG0hfqrz
3Ln9jlr86QAAwK4zYsrKMUlOm5ZPS/KEufLTe+b8JHtV1T2TPDbJud29tbuvS3JukqOmbXft7vO7u5OcPncsAABYExYdyDvJ+6rqoqo6YSrbv7u/PC1/Jcn+0/IBSa6c2/eqqWxH5VctU/4TquqEqrqwqi689tprd+Z8AABgl9qw4OM/vLuvrqp7JDm3qj4zv7G7u6p6wX1Id5+S5JQkOeywwxbeHgAArNRCR8i7++rp/Zok78xsDvhXp+kmmd6vmapfneSgud0PnMp2VH7gMuUAALBmLCyQV9Wdquou25aTHJnkU0nOTLLtTinHJXn3tHxmkmOnu60ckeSGaWrLOUmOrKq9py9zHpnknGnbN6rqiOnuKsfOHQsAANaERU5Z2T/JO6c7EW5I8pbu/h9VdUGSt1XV8Um+kOTJU/2zkzw+yeYkNyZ5VpJ099aq+qMkF0z1Xt7dW6fl5yV5Y5I7Jnnv9AIAgDVjYYG8uz+f5BeXKf96kkcvU95Jnr+dY52a5NRlyi9Mcr+d7iwAAAziSZ0AADCQQA4AAAMJ5AAAMJBADgAAAwnkAAAwkEAOAAADCeQAADCQQA4AAAMJ5AAAMJBADgAAAwnkAAAwkEAOAAADCeQAADCQQA4AAAMJ5AAAMJBADgAAAwnkAAAwkEAOAAADCeQAADCQQA4AAAMJ5AAAMJBADgAAAwnkAAAwkEAOAAADCeQAADCQQA4AAAMJ5AAAMJBADgAAAwnkAAAwkEAOAAADCeQAADCQQA4AAAMJ5AAAMJBADgAAAwnkAAAwkEAOAAADCeQAADCQQA4AAAMJ5AAAMJBADgAAAwnkAAAwkEAOAAADCeQAADCQQA4AAAMJ5AAAMJBADgAAAwnkAAAwkEAOAAADCeQAADCQQA4AAANtGN0BYO3YuOmshbex5aSjF94GANyWGCEHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYSCAHAICBFh7Iq2qPqvpkVb1nWr93VX2sqjZX1V9W1e2n8jtM65un7RvnjvHSqfyzVfXYufKjprLNVbVp0ecCAAC72mqMkL8wyeVz669K8uruvk+S65IcP5Ufn+S6qfzVU71U1aFJnprkvkmOSvJnU8jfI8lrkzwuyaFJnjbVBQCANWOhgbyqDkxydJLXT+uV5FFJ3j5VOS3JE6blY6b1TNsfPdU/JskZ3f2d7v7nJJuTHD69Nnf357v7u0nOmOoCAMCasegR8v+e5CVJfjCt75vk+u6+aVq/KskB0/IBSa5Mkmn7DVP9H5Yv2Wd75QAAsGYsLJBX1a8kuaa7L1pUG7egLydU1YVVdeG11147ujsAAPBDixwhf1iSX62qLZlNJ3lUktck2auqNkx1Dkxy9bR8dZKDkmTafrckX58vX7LP9sp/Qnef0t2Hdfdh++23386fGQAA7CILC+Td/dLuPrC7N2b2pcz3d/czknwgyZOmasclefe0fOa0nmn7+7u7p/KnTndhuXeSQ5J8PMkFSQ6Z7tpy+6mNMxd1PgAAsAgbbr7KLvefkpxRVa9I8skkb5jK35DkTVW1OcnWzAJ2uvuyqnpbkk8nuSnJ87v7+0lSVS9Ick6SPZKc2t2XreqZAADATlqVQN7dH0zywWn585ndIWVpnW8n+fXt7P/KJK9cpvzsJGfvwq4CAMCq8qROAAAYSCAHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYSCAHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYSCAHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYSCAHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYSCAHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYaEWBvKpeWFV3rZk3VNUnqurIRXcOAAB2dysdIX92d38jyZFJ9k7yzCQnLaxXAACwTqw0kNf0/vgkb+ruy+bKAACAW2mlgfyiqnpf
ZoH8nKq6S5IfLK5bAACwPmxYYb3jk9w/yee7+8aq2jfJsxbXLQAAWB9WOkLeSQ5N8lvT+p2S7LmQHgEAwDqy0kD+Z0kekuRp0/o3k7x2IT0CAIB1ZKVTVh7c3Q+sqk8mSXdfV1W3X2C/AABgXVjpCPn3qmqPzKaupKr2iy91AgDATlvpCPnJSd6Z5B5V9cokT0rynxfWK1gFGzedtSrtbDnp6FVpBwBYm1YUyLv7zVV1UZJHZ3b/8Sd09+UL7RkAAKwDKwrkVbVPkmuSvHWu7Hbd/b1FdQwAANaDlc4h/0SSa5P8U5IrpuUtVfWJqnrQojoHAAC7u5UG8nOTPL67797d+yZ5XJL3JHleZrdEBAAAboWVBvIjuvucbSvd/b4kD+nu85PcYSE9AwCAdWCld1n5clX9pyRnTOtPSfLV6VaIbn8IAAC30kpHyJ+e5MAk75peB09leyR58mK6BgAAu7+V3vbwa0l+czubN++67gAAwPqy0tse7pfkJUnum2TPbeXd/agF9QsAANaFlU5ZeXOSzyS5d5I/TLIlyQUL6hMAAKwbKw3k+3b3G5J8r7v/rrufncToOAAA7KSV3mVl2xM5v1xVRyf5UpJ9FtMlAABYP1YayF9RVXdL8uIkf5Lkrkn+t4X1CgAA1omV3mXlPdPiDUkeubjuAADA+rLSu6zcO7PbHm6c36e7f3Ux3QIAgPVhpVNW3pXkDUn+Jp7MCQAAu8xKA/m3u/vkhfYEAADWoZXe9vA1VXViVT2kqh647bWjHapqz6r6eFX9Y1VdVlV/OJXfu6o+VlWbq+ovq+r2U/kdpvXN0/aNc8d66VT+2ap67Fz5UVPZ5qradIvPHgAABlvpCPnPJ3lmZvce3zZlpbPje5F/J8mjuvtbVXW7JB+uqvcm+e0kr+7uM6rqz5Mcn+R10/t13X2fqnpqklcleUpVHZrkqZk9JfReSf62qv7t1MZrkzwmyVVJLqiqM7v70ys8JwAAGG6lgfzXk/xMd393pQfu7k7yrWn1dtNrW4h/+lR+WpI/yCyQHzMtJ8nbk/xpVdVUfkZ3fyfJP1fV5iSHT/U2d/fnk6SqzpjqCuQAAKwZK52y8qkke93Sg1fVHlV1cZJrkpyb5HNJru/um6YqVyU5YFo+IMmVSTJtvyHJvvPlS/bZXjkAAKwZKx0h3yvJZ6rqgsymoiS5+dsedvf3k9y/qvZK8s4kP3trO7ozquqEJCckycEHHzyiCwAAsKyVBvITd6aR7r6+qj6Q5CFJ9qqqDdMo+IFJrp6qXZ3koCRXVdWGJHdL8vW58m3m99le+dL2T0lySpIcdthhvTPnAgAAu9JKn9T5d7f0wFW1X5LvTWH8jpl9+fJVST6Q5ElJzkhyXJJ3T7ucOa1/dNr+/u7uqjozyVuq6o8z+1LnIUk+nqSSHDI9tOjqzL74uW1uOgAArAk7DORV9c3Mvoj5E5sy+97mXXew+z2TnFZVe2Q2V/1t3f2eqvp0kjOq6hVJPpnZA4cyvb9p+tLm1swCdrr7sqp6W2Zf1rwpyfOnqTCpqhckOSfJHklO7e7LVnLSAABwW7HDQN7dd7m1B+7uS5I8YJnyz+dHd0mZL/92ZndzWe5Yr0zyymXKz05y9q3tIwAAjLbSu6wAAAALIJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMtGF0BwDmbdx01sLb2HLS0QtvAwBWygg5AAAMJJADAMBAAjkA
AAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAG0Z3AHZ3GzedtfA2tpx09MLbAAAWY2Ej5FV1UFV9oKo+XVWXVdULp/J9qurcqrpiet97Kq+qOrmqNlfVJVX1wLljHTfVv6Kqjpsrf1BVXTrtc3JV1aLOBwAAFmGRU1ZuSvLi7j40yRFJnl9VhybZlOS87j4kyXnTepI8Lskh0+uEJK9LZgE+yYlJHpzk8CQnbgvxU53nzu131ALPBwAAdrmFBfLu/nJ3f2Ja/maSy5MckOSYJKdN1U5L8oRp+Zgkp/fM+Un2qqp7JnlsknO7e2t3X5fk3CRHTdvu2t3nd3cnOX3uWAAAsCasypc6q2pjkgck+ViS/bv7y9OmryTZf1o+IMmVc7tdNZXtqPyqZcqXa/+Eqrqwqi689tprd+pcAABgV1p4IK+qOyf56yQv6u5vzG+bRrZ70X3o7lO6+7DuPmy//fZbdHMAALBiCw3kVXW7zML4m7v7HVPxV6fpJpner5nKr05y0NzuB05lOyo/cJlyAABYMxZ5l5VK8oYkl3f3H89tOjPJtjulHJfk3XPlx053WzkiyQ3T1JZzkhxZVXtPX+Y8Msk507ZvVNURU1vHzh0LAADWhEXeh/xhSZ6Z5NKqungqe1mSk5K8raqOT/KFJE+etp2d5PFJNie5McmzkqS7t1bVHyW5YKr38u7eOi0/L8kbk9wxyXunFwAArBkLC+Td/eEk27sv+KOXqd9Jnr+dY52a5NRlyi9Mcr+d6CYAAAy1KndZAQAAlieQAwDAQAI5AAAMtMgvdQKrZOOms0Z3AQC4lYyQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADAMBAAjkAAAy0YXQHAGC1bNx01sLb2HLS0QtvA9i9GCEHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYSCAHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgTaM7gAAcMtt3HTWwtvYctLRC28DMEIOAABDCeQAADCQQA4AAAMJ5AAAMJBADgAAAwnkAAAwkEAOAAADCeQAADCQBwMB685qPFAlWZ2Hqng4DMDaZ4QcAAAGEsgBAGAgU1YAYBdarSlRwO5DIAdYEMEMgJUwZQUAAAYSyAEAYCCBHAAABhLIAQBgIIEcAAAGEsgBAGAgtz3cjbjFGgDA2mOEHAAABhLIAQBgIIEcAAAGEsgBAGAggRwAAAZaWCCvqlOr6pqq+tRc2T5VdW5VXTG97z2VV1WdXFWbq+qSqnrg3D7HTfWvqKrj5sofVFWXTvucXFW1qHMBAIBFWeQI+RuTHLWkbFOS87r7kCTnTetJ8rgkh0yvE5K8LpkF+CQnJnlwksOTnLgtxE91nju339K2AADgNm9hgby7P5Rk65LiY5KcNi2fluQJc+Wn98z5SfaqqnsmeWySc7t7a3dfl+TcJEdN2+7a3ed3dyc5fe5YAACwZqz2HPL9u/vL0/JXkuw/LR+Q5Mq5eldNZTsqv2qZcgAAWFOGPamzu7uqejXaqqoTMpsKk4MPPng1mgQAbkNW42nWW046euFtsHta7RHyr07TTTK9XzOVX53koLl6B05lOyo/cJnyZXX3Kd19WHcftt9+++30SQAAwK6y2oH8zCTb7pRyXJJ3z5UfO91t5YgkN0xTW85JcmRV7T19mfPIJOdM275RVUdMd1c5du5YAACwZixsykpVvTXJI5LcvaquyuxuKScleVtVHZ/kC0mePFU/O8njk2xOcmOSZyVJd2+tqj9KcsFU7+Xdve2Los/L7E4ud0zy3ukFAABrysICeXc/bTubHr1M3U7y/O0c59Qkpy5TfmGS++1MHwEAYDRP6gQAgIEEcgAAGEggBwCAgQRyAAAY
SCAHAICBBHIAABhIIAcAgIEWdh9yAAB2vY2bzlp4G1tOOnrhbfAjRsgBAGAggRwAAAYSyAEAYCCBHAAABhLIAQBgIIEcAAAGEsgBAGAggRwAAAYSyAEAYCBP6gQAhlqNJ0/CbZkRcgAAGEggBwCAgQRyAAAYSCAHAICBBHIAABjIXVYA2KHVugPGlpOOXpV2AG5rBHIAAH7Mavwh7o/wHzFlBQAABhLIAQBgIFNWAAB2AU8c5dYSyAEAWHW+MP4jpqwAAMBAAjkAAAxkygoAtwnm3wLrlRFyAAAYSCAHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYSCAHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYaMPoDgAAt00bN501uguwLhghBwCAgQRyAAAYSCAHAICBBHIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYSCAHAICBBHIAABhIIAcAgIEEcgAAGGjNB/KqOqqqPltVm6tq0+j+AADALbGmA3lV7ZHktUkel+TQJE+rqkPH9goAAFZuTQfyJIcn2dzdn+/u7yY5I8kxg/sEAAArttYD+QFJrpxbv2oqAwCANWHD6A6shqo6IckJ0+q3quqzA7px9yRfG9Au47n265drvz657uuXa38bVK9alWaWu/Y/vdKd13ogvzrJQXPrB05lP6a7T0lyymp1ajlVdWF3HzayD4zh2q9frv365LqvX679+rWz136tT1m5IMkhVXXvqrp9kqcmOXNwnwAAYMXW9Ah5d99UVS9Ick6SPZKc2t2XDe4WAACs2JoO5EnS3WcnOXt0P1Zg6JQZhnLt1y/Xfn1y3dcv13792qlrX929qzoCAADcQmt9DjkAAKxpAvkqqKqjquqzVbW5qjaN7g+7VlWdWlXXVNWn5sr2qapzq+qK6X3vqbyq6uTp38IlVfXAcT1nZ1TVQVX1gar6dFVdVlUvnMpd+91cVe1ZVR+vqn+crv0fTuX3rqqPTdf4L6ebDaSq7jCtb562bxzZf3ZOVe1RVZ+sqvdM6677OlBVW6rq0qq6uKounMp22e97gXzBqmqPJK9N8rgkhyZ5WlUdOrZX7GJvTHLUkrJNSc7r7kOSnDetJ7N/B4dMrxOSvG6V+siud1OSF3f3oUmOSPL86b9t1373950kj+ruX0xy/yRHVdURSV6V5NXdfZ8k1yU5fqp/fJLrpvJXT/VYu16Y5PK5ddd9/Xhkd99/7vaGu+z3vUC+eIcn2dzdn+/u7yY5I8kxg/vELtTdH0qydUnxMUlOm5ZPS/KEufLTe+b8JHtV1T1Xp6fsSt395e7+xLT8zcz+B31AXPvd3nQNvzWt3m56dZJHJXnWA83gAAAFkklEQVT7VL702m/7N/H2JI+uqlql7rILVdWBSY5O8vppveK6r2e77Pe9QL54ByS5cm79qqmM3dv+3f3lafkrSfaflv172A1NH0U/IMnH4tqvC9O0hYuTXJPk3CSfS3J9d980VZm/vj+89tP2G5Lsu7o9Zhf570lekuQH0/q+cd3Xi07yvqq6aHoCfLILf9+v+dsewm1dd3dVuZ3Rbqqq7pzkr5O8qLu/MT8A5trvvrr7+0nuX1V7JXlnkp8d3CUWrKp+Jck13X1RVT1idH9YdQ/v7qur6h5Jzq2qz8xv3Nnf90bIF+/qJAfNrR84lbF7++q2j6em92umcv8ediNVdbvMwvibu/sdU7Frv4509/VJPpDkIZl9LL1toGv++v7w2k/b75bk66vcVXbew5L8alVtyWz66aOSvCau+7rQ3VdP79dk9kf44dmFv+8F8sW7IMkh07ewb5/kqUnOHNwnFu/MJMdNy8clefdc+bHTN7CPSHLD3MddrCHTXNA3JLm8u/94bpNrv5urqv2mkfFU1R2TPCaz7xB8IMmTpmpLr/22fxNPSvL+9hCQNae7X9rdB3b3xsz+X/7+7n5GXPfdXlXdqarusm05yZFJPpVd+Pveg4FWQVU9
PrN5Z3skObW7Xzm4S+xCVfXWJI9IcvckX01yYpJ3JXlbkoOTfCHJk7t76xTi/jSzu7LcmORZ3X3hiH6zc6rq4Un+Psml+dF80pdlNo/ctd+NVdUvZPYFrj0yG9h6W3e/vKp+JrOR032SfDLJv+/u71TVnknelNn3DLYmeWp3f35M79kVpikrv9Pdv+K67/6ma/zOaXVDkrd09yurat/sot/3AjkAAAxkygoAAAwkkAMAwEACOQAADCSQAwDAQAI5AAAMJJADrFFV9eqqetHc+jlV9fq59f+rqn67qu5VVW/fzjE+WFWHTcsvmyvfWFWfWmE/XlRVx97Kc3hBVT371uwLsLsQyAHWrn9I8tAkqaqfyuxe+Ped2/7QJB/p7i9195OW2X+pl918lR83PYHw2Uneckv3nZya5Ddv5b4AuwWBHGDt+khmj2xPZkH8U0m+WVV7V9Udkvxckk/Mj3ZX1R2r6oyquryq3pnkjlP5SUnuWFUXV9Wbp2PuUVX/T1VdVlXvm55KudSjknyiu2+ajvPBqnrNdJxPVdXhU/lrqup/n5YfW1Ufqqqf6u4bk2zZVg9gPRLIAdao7v5Skpuq6uDMRsM/mtmTQh+S5LAkl3b3d5fs9h+T3NjdP5fZU2UfNB1rU5J/6e77T48DT5JDkry2u++b5PokT1ymGw9LctGSsn/V3fdP8rzMRsCT5KVJnlJVj0xycmZPrtv2hNMLk/zSLf4BAOwmBHKAte0jmYXxbYH8o3Pr/7BM/V9O8hdJ0t2XJLlkB8f+5+6+eFq+KMnGZercM8m1S8reOh3/Q0nuWlV7TSPhz01ybpI/7e7PzdW/Jsm9dtAPgN2aQA6wtm2bR/7zmU1ZOT+zEfKHZhbWd8Z35pa/n2TDMnX+JcmeS8p6O+s/n+Tr+cnwved0HIB1SSAHWNs+kuRXkmzt7u9399Yke2UWypcL5B9K8vQkqar7JfmFuW3fq6rb3cL2L09ynyVlT5mO//AkN3T3DVX100lenOQBSR5XVQ+eq/9vM/tjAmBdEsgB1rZLM7u7yvlLym7o7q8tU/91Se5cVZcneXl+fP73KUkumftS50q8N7NpMPO+XVWfTPLnSY6vqkryhiS/M817Pz7J66tq28j6wzKbygKwLlX30k8WAWDlpru1vKS7r6iqD2YWvC9c4b4PSPLb3f3MRfYR4LbMCDkAO2tTZl/uvDXunuT3d2FfANYcI+QAADCQEXIAABhIIAcAgIEEcgAAGEggBwCAgQRyAAAYSCAHAICB/n8av9LMcfvPhwAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from matplotlib import pyplot as plt \n", - "import numpy as np \n", - "bins = list(range(0,500,20))\n", - "plt.figure(figsize=(12,8))\n", - "plt.hist(sizes, bins=bins)\n", - "plt.title(\"Face Image Sizes\") \n", - "plt.ylabel(\"Images\")\n", - "plt.xlabel(\"Width (px)\")\n", - "plt.yticks(range(0, 60000, 10000))\n", - "plt.title('IMDB-Wiki: Face Pixel Size')\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```\n", - "dob: date of birth (Matlab serial date number)\n", - "photo_taken: year when the photo was taken\n", - "full_path: path to file\n", - "gender: 0 for female and 1 for male, NaN if unknown\n", - "name: name of the celebrity\n", - "face_location: location of the face. To crop the face in Matlab run\n", - "\n", - "img(face_location(2):face_location(4),face_location(1):face_location(3),:))\n", - "\n", - "face_score: detector score (the higher the better). Inf implies that no face was found in the image and the face_location then just returns the entire image\n", - "second_face_score: detector score of the face with the second highest score. This is useful to ignore images with more than one face. 
second_face_score is NaN if no second face was detected.\n", - "celeb_names (IMDB only): list of all celebrity names\n", - "celeb_id (IMDB only): index of celebrity name\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:megapixels]", - "language": "python", - "name": "conda-env-megapixels-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.6" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} -- cgit v1.2.3-70-g09d2