diff options
| author | adamhrv <adam@ahprojects.com> | 2019-02-12 15:18:46 +0100 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2019-02-12 15:18:46 +0100 |
| commit | a5bdab8e798fcdc7885cfdabb0e5dd8076fa1d40 (patch) | |
| tree | 1e7a45a8d2c746994584cc5f8e4ccdabad82f8d8 /megapixels/notebooks/datasets/knowledge_graph/identity.ipynb | |
| parent | e95455a8a4013dafdeb7e41cfa8fb1f3ccc28dbb (diff) | |
reorder nbs
Diffstat (limited to 'megapixels/notebooks/datasets/knowledge_graph/identity.ipynb')
| -rw-r--r-- | megapixels/notebooks/datasets/knowledge_graph/identity.ipynb | 792 |
1 files changed, 0 insertions, 792 deletions
def get_names(enum_dataset, dir_dataset=None):
    """Return the identity names of a dataset, derived from its directory listing.

    Entry names are converted from filename form to display form by replacing
    underscores with spaces (e.g. ``Kim_Clijsters`` -> ``Kim Clijsters``).

    Args:
        enum_dataset: a ``types.Dataset`` member selecting the dataset.
        dir_dataset: optional directory to list. LFW falls back to its
            hard-coded media directory when omitted. No YouTube Faces directory
            is defined in this notebook, so it must be supplied for that dataset.

    Returns:
        list of str: one name per directory entry.

    Raises:
        ValueError: if the dataset is unsupported, or YOUTUBE_FACES is requested
            without ``dir_dataset``.
    """
    if enum_dataset == types.Dataset.LFW:
        dir_names = dir_dataset or '/data_store_hdd/datasets/people/lfw/media/original/'
        excluded = None
    elif enum_dataset == types.Dataset.YOUTUBE_FACES:
        if dir_dataset is None:
            # The original branch filtered `names` before it was ever assigned
            # (UnboundLocalError); fail with an actionable message instead.
            raise ValueError('dir_dataset is required for YOUTUBE_FACES')
        dir_names = dir_dataset
        # NOTE(review): the filter string contains a space, so it presumably
        # targets "labeled_faces.txt" after underscore replacement - confirm.
        excluded = 'labeled faces.txt'
    else:
        raise ValueError(f'Unsupported dataset: {enum_dataset}')

    names = [x.replace('_', ' ') for x in os.listdir(dir_names)]
    if excluded is not None:
        names = [x for x in names if excluded not in x]
    return names
def _ascii_fold(text):
    """Transliterate `text` to ASCII; uses unidecode when installed, else NFKD stripping."""
    try:
        from unidecode import unidecode as _unidecode
    except ImportError:
        import unicodedata
        return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    return _unidecode(text)


def _name_tokens(text):
    """Split `text` on whitespace into lower-cased, ASCII-folded, non-empty words."""
    folded = (_ascii_fold(word.lower()) for word in text.split())
    # folding can reduce a symbol-only word to '', which would match anything
    return [word for word in folded if word]


def same_person(query, name, sim_min=.9, word_match_min=0.75, verbose=False):
    """Loosely decide whether `name` (an API result) names the person in `query`.

    Each query word is compared against the remaining result words with
    ``difflib.SequenceMatcher``. A pair matches when its similarity ratio reaches
    ``(shorter_len / longer_len) * word_match_min``, so abbreviations such as
    "Don" vs "Donald" are tolerated. A matched result word is consumed and cannot
    match a second query word. The names are considered the same person when the
    number of matched words is at least the word count of the shorter name.

    Args:
        query: the name being looked up.
        name: the name returned by the knowledge source. A parenthesised
            qualifier, e.g. "Wang Fei (female footballer)", is discarded.
        sim_min: unused; kept only for signature compatibility.
        word_match_min: scale factor applied to the per-word length ratio to
            obtain the minimum similarity (previously hard-coded to 0.75).
        verbose: print every word comparison and the final decision.

    Returns:
        bool: True when the names match; False otherwise, including when either
        name is empty/None or contains no usable words.
    """
    if not query or not name:
        return False
    # check and remove if WP added parenthesis
    if '(' in name and ')' in name:
        name = name.split('(')[0]

    query_strings = _name_tokens(query)
    result_strings = _name_tokens(name)
    if not query_strings or not result_strings:
        # without this an empty side gives min_str_len == 0 and a vacuous match
        return False
    min_str_len = min(len(result_strings), len(query_strings))

    matched_strings = []
    for a in query_strings:
        for j, b in enumerate(result_strings):
            # the threshold shrinks as the two word lengths diverge
            min_ratio = (min(len(a), len(b)) / max(len(a), len(b)) * word_match_min)
            ratio = difflib.SequenceMatcher(a=a.lower(), b=b.lower()).ratio()
            result = (ratio >= min_ratio)
            if verbose:
                print(f'comapre "{a}" to "{b}" ratio was: {ratio:.2f} min: {min_ratio:.2}, passed: {result}')
            if result:
                # consume the matched word; safe because we stop iterating at once
                matched_strings.append(result_strings.pop(j))
                break

    matched = len(matched_strings) >= min_str_len
    if verbose:
        print(f'{matched} because {len(matched_strings)} >= {min_str_len}')
    return matched
# Smoke-test the name similarity matcher on a few hand-picked pairs.
test_sim_match = True
if test_sim_match:
    # (query, candidate name, verbose); the first case runs quietly
    sim_cases = [
        ('Adoors Gopalakarishnan ok', 'Adoor Gopalakrishnan', False),
        ('Dave Letterman', 'David Letterman', True),
        ('Charles Dickens', 'Charles Booker', True),
        ('Donald Trump', 'Don J. Trump', True),
        ('Wang Fei', 'Wang Fei (female footballer)', True),
    ]
    # the Knowledge Graph name for the last query; recorded but not compared
    kg_name = 'Faye Wong'

    for case_idx, (query, wp_name, be_verbose) in enumerate(sim_cases):
        if case_idx:
            # blank line between consecutive cases, none after the last
            print('')
        matched = same_person(query, wp_name, verbose=be_verbose)
        print(f'({wp_name} == {query}) = {matched}')
def pool_map_persons(obj):
    """Thread-pool worker: look up one person in both knowledge sources.

    Advances the module-level progress bar `pbar`, then queries the Knowledge
    Graph client followed by the Wikipedia client with the same `obj`.

    Returns:
        dict: the Knowledge Graph result overlaid with the Wikipedia result
        (Wikipedia values win on duplicate keys).
    """
    pbar.update(1)
    person = dict(kg_api.get_kg_from_name(obj))
    person.update(wp_api.get_meta(obj))
    return person


def num_non_accessed(mps):
    """Count records that lack a truthy 'kg_accessed' or a truthy 'wp_accessed' flag."""
    pending = 0
    for person in mps:
        fully_accessed = person.get('kg_accessed', False) and person.get('wp_accessed', False)
        if not fully_accessed:
            pending += 1
    return pending
# Query both APIs for every person, retrying until every record reports that
# the Knowledge Graph and Wikipedia were both reached.
num_threads_max = 5
sleep_min = 1  # minutes to wait between retry passes

nna = num_non_accessed(mapped_persons)
print(f'{nna}/{len(mapped_persons)} remaining')

while nna > 0:
    num_threads = max(1, min(num_threads_max, nna))
    print(f'{nna}/{len(mapped_persons)} remaining. Using {num_threads} threads')

    # Both context managers release their resources at the end of the pass:
    # previously a new ThreadPool was created per pass and never closed, and an
    # extra progress bar created before the loop was left dangling.
    # `pbar` is the module-level name that pool_map_persons updates.
    with ThreadPool(num_threads) as pool, tqdm(total=len(mapped_persons)) as pbar:
        mapped_persons = pool.map(pool_map_persons, mapped_persons)

    nna = num_non_accessed(mapped_persons)
    if nna > 0:
        print(f'{nna} remaining. Sleeping for {sleep_min} minutes...')
        time.sleep(60 * sleep_min)

print(f'Done. {nna} remaining.')
# Knowledge Graph score above which a double name match counts as a "supermatch"
KG_SCORE_SUPERMATCH = 100


def _classify_match(kg_matches, wp_matches, kg_score):
    """Map the two name-match results to ``(match_status, source)``.

    match_status: 2 = both matched with a high KG score (supermatch),
    1 = both matched, 0 = only one matched (needs review), -1 = none matched.
    source: 'wp', 'kg' or None. Wikipedia is preferred whenever it matched
    because its descriptions are more appropriate/updated.
    """
    if kg_matches and wp_matches:
        return (2 if kg_score > KG_SCORE_SUPERMATCH else 1), 'wp'
    if kg_matches:
        return 0, 'kg'
    if wp_matches:
        return 0, 'wp'
    return -1, None


# Annotate every record in place with its match status and the chosen
# name / description / bio.
for mp in mapped_persons:
    name_orig = mp.get('source_name') or ''
    # same_person splits on spaces, so an underscore-joined source name would be
    # compared as one long token; normalise it before matching
    name_query = name_orig.replace('_', ' ')

    try:
        kg_score = int(float(mp.get('kg_score') or 0))
    except (TypeError, ValueError):
        # blank or non-numeric score
        kg_score = 0

    kg_matches = same_person(name_query, mp.get('kg_name') or '')
    wp_matches = same_person(name_query, mp.get('wp_name') or '')
    match_status, source = _classify_match(kg_matches, wp_matches, kg_score)

    mp_bio = mp.get('kg_bio', '')
    wp_desc = mp.get('wp_description') or ''

    if source == 'kg':
        # google knowledge graph
        mp_name = mp['kg_name']
        mp_description = mp.get('kg_description', '')
    elif source == 'wp':
        # wikipedia
        mp_name = mp['wp_name']
        mp_description = mp.get('wp_description', '')
    else:
        # No source matched. These were previously left unset, so the record
        # raised NameError or silently inherited the previous person's values.
        mp_name = ''
        mp_description = ''

    if 'disambiguation' in wp_desc.lower():
        # a disambiguation page is not a person: blank the fields, flag for review
        match_status = 0
        mp_name = ''
        mp_description = ''
        mp_bio = ''

    mp['source_url'] = f"http://vis-www.cs.umass.edu/lfw/person/{name_orig.replace(' ', '_')}.html"
    mp['mp_slug'] = slugify.slugify(name_orig, separator='_')
    mp['matched'] = match_status
    mp['mp_bio'] = mp_bio
    mp['mp_name'] = mp_name
    mp['mp_description'] = mp_description
# Save the mapped persons to CSV, plus a truncated copy for quick inspection.
fp_out = '/data_store_hdd/datasets/people/lfw/metadata/identity_kg.csv'
limit = 1000  # number of rows kept in the small version
# run-time bookkeeping columns that are not persisted
cols_internal = ['kg_accessed', 'wp_accessed', 'kg_error', 'wp_error']

# Build the frame fresh from `mapped_persons` and drop without `inplace`, so the
# cell can be re-run; the in-place drop raised KeyError on a second run.
df_mapped_persons = (
    pd.DataFrame.from_dict(mapped_persons)
    .drop(columns=cols_internal, errors='ignore')
)
df_mapped_persons.index.name = 'index'
df_mapped_persons.to_csv(fp_out, encoding='utf-16')

# create small version
# Slice the cleaned frame so both files share the same columns; the small file
# was previously rebuilt from the raw records and kept the bookkeeping columns.
fpp_out = Path(fp_out)
fp_out_sm = join(fpp_out.parent, f'{fpp_out.stem}_0_{limit}.csv')
df_mapped_persons_sm = df_mapped_persons.iloc[:limit]
df_mapped_persons_sm.to_csv(fp_out_sm, encoding='utf-16')
<td>https://en.wikipedia.org/wiki/Kim_Clijsters</td>\n", - " <td>Belgian tennis player</td>\n", - " <td>/m/01m_gh</td>\n", - " <td>http://t3.gstatic.com/images?q=tbn:ANd9GcQ4yRK...</td>\n", - " <td>Kim Clijsters</td>\n", - " <td>618.272705</td>\n", - " <td></td>\n", - " <td>2</td>\n", - " <td>Kim Antonie Lode Clijsters is a Belgian former...</td>\n", - " <td>Belgian tennis player</td>\n", - " <td>Kim Clijsters</td>\n", - " <td>kim_clijsters</td>\n", - " <td>Kim Clijsters</td>\n", - " <td>lfw</td>\n", - " <td>Kim_Clijsters</td>\n", - " <td>http://vis-www.cs.umass.edu/lfw/person/Kim_Cli...</td>\n", - " <td>Belgian tennis player</td>\n", - " <td>Kim Clijsters</td>\n", - " <td>262793</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>William Rosenberg was an American entrepreneur...</td>\n", - " <td>https://en.wikipedia.org/wiki/William_Rosenberg</td>\n", - " <td>American entrepreneur</td>\n", - " <td>/m/07dy4z</td>\n", - " <td></td>\n", - " <td>William Rosenberg</td>\n", - " <td>367.879730</td>\n", - " <td></td>\n", - " <td>2</td>\n", - " <td>William Rosenberg was an American entrepreneur...</td>\n", - " <td>American businessman</td>\n", - " <td>William Rosenberg</td>\n", - " <td>william_rosenberg</td>\n", - " <td>William Rosenberg</td>\n", - " <td>lfw</td>\n", - " <td>William_Rosenberg</td>\n", - " <td>http://vis-www.cs.umass.edu/lfw/person/William...</td>\n", - " <td>American businessman</td>\n", - " <td>William Rosenberg</td>\n", - " <td>2.44981e+06</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " kg_bio \\\n", - "index \n", - "0 Kim Antonie Lode Clijsters is a Belgian former... \n", - "1 William Rosenberg was an American entrepreneur... 
\n", - "\n", - " kg_bio_url kg_description \\\n", - "index \n", - "0 https://en.wikipedia.org/wiki/Kim_Clijsters Belgian tennis player \n", - "1 https://en.wikipedia.org/wiki/William_Rosenberg American entrepreneur \n", - "\n", - " kg_id kg_image_url \\\n", - "index \n", - "0 /m/01m_gh http://t3.gstatic.com/images?q=tbn:ANd9GcQ4yRK... \n", - "1 /m/07dy4z \n", - "\n", - " kg_name kg_score kg_url matched \\\n", - "index \n", - "0 Kim Clijsters 618.272705 2 \n", - "1 William Rosenberg 367.879730 2 \n", - "\n", - " mp_bio \\\n", - "index \n", - "0 Kim Antonie Lode Clijsters is a Belgian former... \n", - "1 William Rosenberg was an American entrepreneur... \n", - "\n", - " mp_description mp_name mp_slug \\\n", - "index \n", - "0 Belgian tennis player Kim Clijsters kim_clijsters \n", - "1 American businessman William Rosenberg william_rosenberg \n", - "\n", - " query source source_name \\\n", - "index \n", - "0 Kim Clijsters lfw Kim_Clijsters \n", - "1 William Rosenberg lfw William_Rosenberg \n", - "\n", - " source_url \\\n", - "index \n", - "0 http://vis-www.cs.umass.edu/lfw/person/Kim_Cli... \n", - "1 http://vis-www.cs.umass.edu/lfw/person/William... 
# Normalise each record in place before saving.
for mp in mapped_persons:
    # store the source name in filename form (underscores instead of spaces)
    mp['source_name'] = mp['source_name'].replace(' ', '_')
    # drop the 'wp_bio' field when present; pop() with a default replaces the
    # former bare `except: pass`, which would also have hidden unrelated errors
    mp.pop('wp_bio', None)
"text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.6" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} |
