summaryrefslogtreecommitdiff
path: root/megapixels/notebooks/datasets/identity/umd_faces_identity.ipynb
diff options
context:
space:
mode:
authoradamhrv <adam@ahprojects.com>2019-02-12 15:18:46 +0100
committeradamhrv <adam@ahprojects.com>2019-02-12 15:18:46 +0100
commita5bdab8e798fcdc7885cfdabb0e5dd8076fa1d40 (patch)
tree1e7a45a8d2c746994584cc5f8e4ccdabad82f8d8 /megapixels/notebooks/datasets/identity/umd_faces_identity.ipynb
parente95455a8a4013dafdeb7e41cfa8fb1f3ccc28dbb (diff)
reorder nbs
Diffstat (limited to 'megapixels/notebooks/datasets/identity/umd_faces_identity.ipynb')
-rw-r--r--megapixels/notebooks/datasets/identity/umd_faces_identity.ipynb675
1 files changed, 675 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/identity/umd_faces_identity.ipynb b/megapixels/notebooks/datasets/identity/umd_faces_identity.ipynb
new file mode 100644
index 00000000..a3da9d58
--- /dev/null
+++ b/megapixels/notebooks/datasets/identity/umd_faces_identity.ipynb
@@ -0,0 +1,675 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# UMD Faces Knowledge Graph Identities\n",
+ "\n",
+ "- convert underscore-separated filename names to plain names\n",
+ "- fetch Google Knowledge Graph entity IDs for each name\n",
+ "- save KG IDs to CSV"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%reload_ext autoreload\n",
+ "%autoreload 2\n",
+ "\n",
+ "import os\n",
+ "import os.path as osp\n",
+ "from os.path import join\n",
+ "from glob import glob\n",
+ "from pathlib import Path\n",
+ "import random\n",
+ "import math\n",
+ "from datetime import datetime\n",
+ "import requests\n",
+ "import json\n",
+ "import time\n",
+ "from pprint import pprint\n",
+ "from multiprocessing.pool import ThreadPool\n",
+ "import threading\n",
+ "import urllib.request\n",
+ "\n",
+ "from tqdm import tqdm_notebook as tqdm\n",
+ "import pandas as pd\n",
+ "from scipy.io import loadmat\n",
+ "import numpy as np\n",
+ "%matplotlib inline\n",
+ "import matplotlib.pyplot as plt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load UMD Faces Filenames"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fp_filenames = '/data_store_hdd/datasets/people/umd_faces/downloads/filenames.txt'\n",
+ "with open(fp_filenames, 'r') as fp:\n",
+ " filenames = fp.readlines()\n",
+ "_ = filenames.pop(0)\n",
+ "filenames = [x.replace('_', ' ').strip() for x in filenames]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "aaron rodgers\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(filenames[0])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Google Knowledge Graph API"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "b'{\\n \"YourFuckingIPAddress\": \"78.55.72.54\",\\n \"YourFuckingLocation\": \"Berlin, BE, Germany\",\\n \"YourFuckingHostname\": \"x4e374836.dyn.telefonica.de\",\\n \"YourFuckingISP\": \"O2 Deutschland\",\\n \"YourFuckingTorExit\": \"false\",\\n \"YourFuckingCountryCode\": \"DE\"\\n}\\n'"
+ ]
+ },
+ "execution_count": 62,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "urllib.request.urlopen('https://wtfismyip.com/json').read()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# read the API key from a local file; strip surrounding whitespace so a trailing\n",
+ "# newline in the key file is not sent as part of the 'key' request parameter\n",
+ "with open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key') as fp_key:\n",
+ "    api_key = fp_key.read().strip()\n",
+ "url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def _get_kg_meta(result_obj, params):\n",
+ "    \"\"\"Query the Knowledge Graph Search API and copy the top result into result_obj.\n",
+ "\n",
+ "    result_obj: dict that is updated in place and returned; must already hold a 'kg_id' key\n",
+ "    params: dict of request parameters; 'indent', 'key' and 'limit' are added to it\n",
+ "\n",
+ "    'accessed' is set to True once a response has been decoded. Request and parsing\n",
+ "    failures are stored as text under 'error' instead of being raised.\n",
+ "    \"\"\"\n",
+ "    global api_key, url_kg_api\n",
+ "\n",
+ "    params['indent'] = True\n",
+ "    params['key'] = api_key\n",
+ "    params['limit'] = 1\n",
+ "\n",
+ "    url = f'{url_kg_api}?{urllib.parse.urlencode(params)}'\n",
+ "    try:\n",
+ "        # close the HTTP response as soon as the body has been read\n",
+ "        with urllib.request.urlopen(url) as http_response:\n",
+ "            json_response = http_response.read()\n",
+ "    except Exception as e:\n",
+ "        # store the failure on the object being processed (not on a notebook-level name);\n",
+ "        # 'accessed' is left unchanged so a later pass can retry this entry\n",
+ "        result_obj['error'] = str(e)\n",
+ "    else:\n",
+ "        try:\n",
+ "            response = json.loads(json_response)\n",
+ "            items = response.get('itemListElement', [])\n",
+ "            result_obj['accessed'] = True\n",
+ "            if items:\n",
+ "                item = items[0]\n",
+ "                # dict defaults keep the .get() lookups below valid when a section is missing\n",
+ "                item_result = item.get('result', {})\n",
+ "                result_obj['description'] = item_result.get('description', '')\n",
+ "                det_desc = item_result.get('detailedDescription', {})\n",
+ "                # keep a kg_id that was already present on the object\n",
+ "                if not result_obj['kg_id']:\n",
+ "                    result_obj['kg_id'] = item_result.get('@id', '').replace('kg:','')\n",
+ "                if det_desc:\n",
+ "                    result_obj['description_extended'] = det_desc.get('articleBody','')\n",
+ "                    result_obj['description_license'] = det_desc.get('license','')\n",
+ "                    result_obj['description_url'] = det_desc.get('url','')\n",
+ "                else:\n",
+ "                    result_obj['description_extended'] = ''\n",
+ "                    result_obj['description_license'] = ''\n",
+ "                    result_obj['description_url'] = ''\n",
+ "                # 'image_url' is only added when the result carries an image section\n",
+ "                result_img = item_result.get('image', {})\n",
+ "                if result_img:\n",
+ "                    result_obj['image_url'] = result_img.get('contentUrl', '')\n",
+ "                result_obj['name'] = item_result.get('name', '')\n",
+ "                result_obj['score'] = item.get('resultScore', 0.0)\n",
+ "                result_obj['url'] = item_result.get('url', '')\n",
+ "        except Exception as e:\n",
+ "            result_obj['error'] = str(e)\n",
+ "    return result_obj\n",
+ " \n",
+ "def get_kg_from_name(obj):\n",
+ "    \"\"\"Search by obj['query'] unless obj is already marked as accessed; returns obj.\"\"\"\n",
+ "    already_fetched = obj['accessed']\n",
+ "    if not already_fetched:\n",
+ "        return _get_kg_meta(obj, {'query': obj['query']})\n",
+ "    return obj\n",
+ " \n",
+ "def get_kg_from_kg_id(obj):\n",
+ "    \"\"\"Search by obj['kg_id'] unless obj is already marked as accessed; returns obj.\"\"\"\n",
+ "    already_fetched = obj['accessed']\n",
+ "    if not already_fetched:\n",
+ "        return _get_kg_meta(obj, {'ids': obj['kg_id']})\n",
+ "    return obj"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'accessed': False,\n",
+ " 'description': '',\n",
+ " 'error': '<urlopen error [Errno -2] Name or service not known>',\n",
+ " 'kg_id': '',\n",
+ " 'query': 'Taylor Swift',\n",
+ " 'score': 0.0,\n",
+ " 'url': ''}\n"
+ ]
+ }
+ ],
+ "source": [
+ "pprint(obj)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'accessed': True,\n",
+ " 'description': 'American singer',\n",
+ " 'description_extended': 'Taylor Alison Swift is an American '\n",
+ " \"singer-songwriter. As one of the world's leading \"\n",
+ " 'contemporary recording artists, she is known for '\n",
+ " 'narrative songs about her personal life, which has '\n",
+ " 'received widespread media coverage.\\n',\n",
+ " 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',\n",
+ " 'description_url': 'https://en.wikipedia.org/wiki/Taylor_Swift',\n",
+ " 'image_url': 'http://t0.gstatic.com/images?q=tbn:ANd9GcST848UJ0u31E6aoQfb2nnKZFyad7rwNF0ZLOCACGpu4jnboEzV',\n",
+ " 'kg_id': '/m/0dl567',\n",
+ " 'name': 'Taylor Swift',\n",
+ " 'query': 'Taylor Swift',\n",
+ " 'score': 1241.476318,\n",
+ " 'url': 'http://taylorswift.com/'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "# test get from name\n",
+ "obj = {'query': 'Taylor Swift', 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False} # default\n",
+ "result = get_kg_from_name(obj)\n",
+ "pprint(obj)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# worker function handed to ThreadPool.map\n",
+ "def pool_map_persons(obj):\n",
+ "    \"\"\"Advance the notebook-level progress bar by one, then look obj up by name.\"\"\"\n",
+ "    global pbar\n",
+ "    pbar.update(1)\n",
+ "    return get_kg_from_name(obj)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# build mapped_person objects\n",
+ "mapped_persons = []\n",
+ "for fn in filenames:\n",
+ " obj = {'query': fn, 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False}\n",
+ " mapped_persons.append(obj)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "3107\n",
+ "['aaron rodgers', 'aaron ruell', 'aaron staton', 'abel ferrara', 'abigail klein', 'abraham benrubi', 'abyshamble', 'adabel guerrero', 'adam ant', 'adam buxton']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(mapped_persons))\n",
+ "print(filenames[0:10])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "667\n"
+ ]
+ }
+ ],
+ "source": [
+ "num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n",
+ "print(num_non_accessed)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d38371156f594787ba242f451a3da650",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=3107), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "3/3107 remaining\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d7c35975a7ad48fba2b9a02eb8ea2277",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=3107), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "num_threads = 20\n",
+ "\n",
+ "num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n",
+ "\n",
+ "# repeat full passes until every entry is marked as accessed;\n",
+ "# entries that are already accessed are returned unchanged by get_kg_from_name\n",
+ "while num_non_accessed > 0:\n",
+ "    print(f'{num_non_accessed}/{len(mapped_persons)} remaining')\n",
+ "\n",
+ "    # pbar is read as a global by pool_map_persons; the context managers release\n",
+ "    # the worker threads and close the progress bar at the end of each pass\n",
+ "    with ThreadPool(num_threads) as pool, tqdm(total=len(mapped_persons)) as pbar:\n",
+ "        mapped_persons = pool.map(pool_map_persons, mapped_persons)\n",
+ "\n",
+ "    num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n",
+ "    if num_non_accessed > 0:\n",
+ "        print(f'{num_non_accessed} remaining. Sleeping...')\n",
+ "        time.sleep(60*10) # wait 10 minutes before the next pass"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'query': 'aaron rodgers', 'kg_id': '/m/04q06_', 'score': 919.404602, 'description': 'Football quarterback', 'url': '', 'accessed': True, 'description_extended': 'Aaron Charles Rodgers is an American football quarterback for the Green Bay Packers of the National Football League. Rodgers played college football for the California Golden Bears, where he set several career passing records, including lowest single-season and career interception rates. ', 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License', 'description_url': 'https://en.wikipedia.org/wiki/Aaron_Rodgers', 'image_url': 'http://t3.gstatic.com/images?q=tbn:ANd9GcTH_uiKmj_Y71Lc1kNCJK5HDiZsUSh3AxEBI9Jz_lp5q_89QZ9d', 'name': 'Aaron Rodgers'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "# test output for a person\n",
+ "print(mapped_persons[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# reduce CC attribution string. the default string from Google Knowledge Graph is too verbose\n",
+ "cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'\n",
+ "cc_short = 'CC BY-SA 3.0'\n",
+ "nchanged = 0\n",
+ "for mapped_person in mapped_persons:\n",
+ " license = mapped_person.get('description_license', None)\n",
+ " if license == cc_long:\n",
+ " nchanged += 1\n",
+ " mapped_person['description_license'] = cc_short\n",
+ "print(nchanged)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# find number not accessed\n",
+ "n_empty = 0\n",
+ "for mapped_person in mapped_persons:\n",
+ " if not mapped_person.get('accessed', False):\n",
+ " n_empty += 1\n",
+ " print(mapped_person['kg_id'])\n",
+ "print(n_empty)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create dataframe for mapped persons\n",
+ "df_mapped_persons = pd.DataFrame.from_dict(mapped_persons)\n",
+ "df_mapped_persons.index.name = 'index'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 75,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>accessed</th>\n",
+ " <th>description</th>\n",
+ " <th>description_extended</th>\n",
+ " <th>description_license</th>\n",
+ " <th>description_url</th>\n",
+ " <th>image_url</th>\n",
+ " <th>kg_id</th>\n",
+ " <th>name</th>\n",
+ " <th>query</th>\n",
+ " <th>score</th>\n",
+ " <th>url</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>index</th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>True</td>\n",
+ " <td>Football quarterback</td>\n",
+ " <td>Aaron Charles Rodgers is an American football ...</td>\n",
+ " <td>CC BY-SA 3.0</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Aaron_Rodgers</td>\n",
+ " <td>http://t3.gstatic.com/images?q=tbn:ANd9GcTH_ui...</td>\n",
+ " <td>/m/04q06_</td>\n",
+ " <td>Aaron Rodgers</td>\n",
+ " <td>aaron rodgers</td>\n",
+ " <td>919.404602</td>\n",
+ " <td></td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>True</td>\n",
+ " <td>American director</td>\n",
+ " <td>Derek Aaron Ruell, is an American director and...</td>\n",
+ " <td>CC BY-SA 3.0</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Aaron_Ruell</td>\n",
+ " <td>http://t3.gstatic.com/images?q=tbn:ANd9GcSzGg8...</td>\n",
+ " <td>/m/05yf80</td>\n",
+ " <td>Aaron Ruell</td>\n",
+ " <td>aaron ruell</td>\n",
+ " <td>439.912476</td>\n",
+ " <td></td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>True</td>\n",
+ " <td>American actor</td>\n",
+ " <td>Aaron Staton is an American actor. He is best ...</td>\n",
+ " <td>CC BY-SA 3.0</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Aaron_Staton</td>\n",
+ " <td>http://t3.gstatic.com/images?q=tbn:ANd9GcTTmBV...</td>\n",
+ " <td>/m/06_vpyq</td>\n",
+ " <td>Aaron Staton</td>\n",
+ " <td>aaron staton</td>\n",
+ " <td>500.833344</td>\n",
+ " <td></td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>True</td>\n",
+ " <td>American filmmaker</td>\n",
+ " <td>Abel Ferrara is an American filmmaker, known f...</td>\n",
+ " <td>CC BY-SA 3.0</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Abel_Ferrara</td>\n",
+ " <td>http://t2.gstatic.com/images?q=tbn:ANd9GcRAhy-...</td>\n",
+ " <td>/m/056ryy</td>\n",
+ " <td>Abel Ferrara</td>\n",
+ " <td>abel ferrara</td>\n",
+ " <td>522.177734</td>\n",
+ " <td>http://www.abelferrara.com/</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>True</td>\n",
+ " <td>Actress</td>\n",
+ " <td></td>\n",
+ " <td></td>\n",
+ " <td></td>\n",
+ " <td>NaN</td>\n",
+ " <td>/m/0pbm3jf</td>\n",
+ " <td>Abigail Klein</td>\n",
+ " <td>abigail klein</td>\n",
+ " <td>341.831482</td>\n",
+ " <td></td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " accessed description \\\n",
+ "index \n",
+ "0 True Football quarterback \n",
+ "1 True American director \n",
+ "2 True American actor \n",
+ "3 True American filmmaker \n",
+ "4 True Actress \n",
+ "\n",
+ " description_extended description_license \\\n",
+ "index \n",
+ "0 Aaron Charles Rodgers is an American football ... CC BY-SA 3.0 \n",
+ "1 Derek Aaron Ruell, is an American director and... CC BY-SA 3.0 \n",
+ "2 Aaron Staton is an American actor. He is best ... CC BY-SA 3.0 \n",
+ "3 Abel Ferrara is an American filmmaker, known f... CC BY-SA 3.0 \n",
+ "4 \n",
+ "\n",
+ " description_url \\\n",
+ "index \n",
+ "0 https://en.wikipedia.org/wiki/Aaron_Rodgers \n",
+ "1 https://en.wikipedia.org/wiki/Aaron_Ruell \n",
+ "2 https://en.wikipedia.org/wiki/Aaron_Staton \n",
+ "3 https://en.wikipedia.org/wiki/Abel_Ferrara \n",
+ "4 \n",
+ "\n",
+ " image_url kg_id \\\n",
+ "index \n",
+ "0 http://t3.gstatic.com/images?q=tbn:ANd9GcTH_ui... /m/04q06_ \n",
+ "1 http://t3.gstatic.com/images?q=tbn:ANd9GcSzGg8... /m/05yf80 \n",
+ "2 http://t3.gstatic.com/images?q=tbn:ANd9GcTTmBV... /m/06_vpyq \n",
+ "3 http://t2.gstatic.com/images?q=tbn:ANd9GcRAhy-... /m/056ryy \n",
+ "4 NaN /m/0pbm3jf \n",
+ "\n",
+ " name query score url \n",
+ "index \n",
+ "0 Aaron Rodgers aaron rodgers 919.404602 \n",
+ "1 Aaron Ruell aaron ruell 439.912476 \n",
+ "2 Aaron Staton aaron staton 500.833344 \n",
+ "3 Abel Ferrara abel ferrara 522.177734 http://www.abelferrara.com/ \n",
+ "4 Abigail Klein abigail klein 341.831482 "
+ ]
+ },
+ "execution_count": 75,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# check output\n",
+ "df_mapped_persons.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# save\n",
+ "fp_out = '/data_store_hdd/datasets/people/umd_faces/metadata/identity_kg.csv'\n",
+ "df_mapped_persons.to_csv(fp_out, encoding = 'utf-16')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create small version\n",
+ "limit = 1000\n",
+ "fpp_out = Path(fp_out)\n",
+ "fp_out_sm = join(fpp_out.parent, f'{fpp_out.stem}_0_{limit}.csv')\n",
+ "df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons[0:limit])\n",
+ "df_mapped_persons_sm.index.name = 'index'\n",
+ "df_mapped_persons_sm.to_csv(fp_out_sm, encoding = 'utf-16')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python [conda env:megapixels]",
+ "language": "python",
+ "name": "conda-env-megapixels-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}