reorder nbs

author: adamhrv <adam@ahprojects.com> 2019-02-12 15:18:46 +0100
committer: adamhrv <adam@ahprojects.com> 2019-02-12 15:18:46 +0100
commit: a5bdab8e798fcdc7885cfdabb0e5dd8076fa1d40 (patch)
tree: 1e7a45a8d2c746994584cc5f8e4ccdabad82f8d8 /megapixels/notebooks/datasets/knowledge_graph/identity.ipynb
parent: e95455a8a4013dafdeb7e41cfa8fb1f3ccc28dbb (diff)
1 files changed, 0 insertions, 792 deletions
diff --git a/megapixels/notebooks/datasets/knowledge_graph/identity.ipynb b/megapixels/notebooks/datasets/knowledge_graph/identity.ipynb
deleted file mode 100644
index 81a74faf..00000000
--- a/megapixels/notebooks/datasets/knowledge_graph/identity.ipynb
+++ /dev/null
@@ -1,792 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Knowledge Graph Identities\n",
-    "\n",
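-    "- convert dataset directory names (e.g. `Kim_Clijsters`) to person names\n",
-    "- fetch Google Knowledge Graph entity IDs and Wikipedia metadata for each name\n",
-    "- check that the returned names match the query, then save the results (including KG IDs) to CSV"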
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 48,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%reload_ext autoreload\n",
-    "%autoreload 2\n",
-    "\n",
-    "import os\n",
-    "import os.path as osp\n",
-    "from os.path import join\n",
-    "from glob import glob\n",
-    "from pathlib import Path\n",
-    "import random\n",
-    "import math\n",
-    "from datetime import datetime\n",
-    "import requests\n",
-    "import json\n",
-    "import time\n",
-    "from pprint import pprint\n",
-    "from multiprocessing.pool import ThreadPool\n",
-    "import threading\n",
-    "import urllib.request\n",
-    "import difflib\n",
-    "import unidecode\n",
-    "import slugify\n",
-    "\n",
-    "from tqdm import tqdm_notebook as tqdm\n",
-    "import pandas as pd\n",
-    "from scipy.io import loadmat\n",
-    "import numpy as np\n",
-    "%matplotlib inline\n",
-    "import matplotlib.pyplot as plt\n",
-    "\n",
-    "import sys\n",
-    "sys.path.append('/work/megapixels_dev/megapixels')\n",
-    "from app.utils import api_utils\n",
-    "from app.settings import types"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Get List of Names"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def get_names(enum_dataset, dir_media=None):\n",
-    "  \"\"\"Lists identity names for a dataset from the entries of its media directory.\n",
-    "\n",
-    "  :param enum_dataset: member of types.Dataset; LFW and YOUTUBE_FACES are handled\n",
-    "  :param dir_media: directory holding one entry per identity. Optional for LFW\n",
-    "    (falls back to the original hard-coded location), required for YOUTUBE_FACES\n",
-    "  :returns: list of name strings\n",
-    "  :raises ValueError: if the dataset is unsupported or no directory is available\n",
-    "  \"\"\"\n",
-    "  if enum_dataset == types.Dataset.LFW:\n",
-    "    dir_lfw = dir_media or '/data_store_hdd/datasets/people/lfw/media/original/'\n",
-    "    # swap underscores for spaces to turn directory names into query names\n",
-    "    return [x.replace('_', ' ') for x in os.listdir(dir_lfw)]\n",
-    "  if enum_dataset == types.Dataset.YOUTUBE_FACES:\n",
-    "    # this branch used to filter a `names` list that was never assigned\n",
-    "    # (UnboundLocalError); the directory to list must come from the caller\n",
-    "    # NOTE(review): underscores are left untouched here, as in the original filter\n",
-    "    if dir_media is None:\n",
-    "      raise ValueError('dir_media is required for YOUTUBE_FACES')\n",
-    "    return [x for x in os.listdir(dir_media) if 'labeled faces.txt' not in x]\n",
-    "  raise ValueError(f'Unsupported dataset: {enum_dataset}')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "['Kim Clijsters', 'William Rosenberg', 'John Brady', 'Juan Ignacio Chela', 'Floyd Keith', 'Sam Gerald', 'Imad Khadduri', 'Anna Kournikova', 'Jacques Rogge', 'Wilbert Elki Meza Majino']\n"
-     ]
-    }
-   ],
-   "source": [
-    "# list the LFW identity names and preview the first ten\n",
-    "names = get_names(types.Dataset.LFW)\n",
-    "print(names[0:10])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Google Knowledge Graph API"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# read API key; the context manager closes the file and strip() removes any\n",
-    "# trailing newline so it does not end up inside the key\n",
-    "with open('/work/megapixels_dev/env/google_knowledge_graph_api.env') as fp_key:\n",
-    "  api_key = fp_key.read().strip()\n",
-    "kg_api = api_utils.GoogleKnowledgeGraph(api_key)\n",
-    "wp_api = api_utils.WikipediaAPI()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 241,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "wp\n",
-      "{'wp_accessed': True, 'wp_description': '', 'wp_name': '', 'wp_page_id': ''}\n",
-      "kg\n",
-      "{'kg_accessed': True,\n",
-      " 'kg_bio': '',\n",
-      " 'kg_bio_url': '',\n",
-      " 'kg_description': '',\n",
-      " 'kg_id': '',\n",
-      " 'kg_image_url': '',\n",
-      " 'kg_name': '',\n",
-      " 'kg_score': 0,\n",
-      " 'kg_url': '',\n",
-      " 'query': 'Jeff Dederian'}\n"
-     ]
-    }
-   ],
-   "source": [
-    "# query each API client once and pretty-print the returned metadata\n",
-    "#wp_api.test_access()\n",
-    "print('wp')\n",
-    "pprint(wp_api.get_meta({'query': 'Florecita Cobian'}))\n",
-    "print('kg')\n",
-    "pprint(kg_api.get_kg_from_name({'query':'Jeff Dederian'}))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Test Name Similarity Matching"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 242,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def same_person(query, name, sim_min=.9, word_match_min=0.75, verbose=False):\n",
-    "  \"\"\"Decides whether an API result name plausibly refers to the queried person.\n",
-    "\n",
-    "  Both strings are lower-cased, transliterated with unidecode and split into words.\n",
-    "  Each query word is compared (difflib ratio) with the result words that are still\n",
-    "  unmatched; the first result word that passes is consumed.\n",
-    "\n",
-    "  :param query: name that was searched for\n",
-    "  :param name: name returned by the API; when it contains both '(' and ')' the\n",
-    "    text from the first '(' onwards is dropped\n",
-    "  :param sim_min: unused, kept so existing callers keep working\n",
-    "  :param word_match_min: factor applied to the length ratio of two words to get\n",
-    "    the minimum difflib ratio required for a word match\n",
-    "  :param verbose: print every comparison and the final decision\n",
-    "  :returns: True if at least min(#query words, #result words) words matched\n",
-    "  \"\"\"\n",
-    "  # nothing to compare (also covers None coming from dict.get)\n",
-    "  if not query or not name:\n",
-    "    return False\n",
-    "  # check and remove if WP added parenthesis\n",
-    "  if '(' in name and ')' in name:\n",
-    "    name = name.split('(')[0]\n",
-    "  \n",
-    "  # split() without a separator drops the empty tokens produced by repeated spaces;\n",
-    "  # an empty token used to match any word (minimum ratio 0) or to divide by zero\n",
-    "  # against another empty token. Tokens emptied by transliteration are dropped too.\n",
-    "  query_strings = [unidecode.unidecode(x.lower()) for x in query.split()]  # query\n",
-    "  query_strings = [x for x in query_strings if x]\n",
-    "  result_strings = [unidecode.unidecode(x.lower()) for x in name.split()]  # result\n",
-    "  result_strings = [x for x in result_strings if x]\n",
-    "  if not query_strings or not result_strings:\n",
-    "    return False\n",
-    "  min_str_len = min(len(result_strings), len(query_strings))\n",
-    "  # match each word in the query\n",
-    "  matched_strings = []\n",
-    "  \n",
-    "  for a in query_strings:\n",
-    "    # scan the result words that have not been matched yet\n",
-    "    for j, b in enumerate(result_strings):\n",
-    "      # the more two words differ in length, the lower the required ratio\n",
-    "      lengths = [len(a), len(b)]\n",
-    "      min_ratio = (min(lengths) / max(lengths) * word_match_min)\n",
-    "      ratio = difflib.SequenceMatcher(a=a.lower(), b=b.lower()).ratio()\n",
-    "      result = (ratio >= min_ratio)\n",
-    "      if verbose:\n",
-    "        print(f'comapre \"{a}\" to \"{b}\" ratio was: {ratio:.2f} min: {min_ratio:.2}, passed: {result}')\n",
-    "      if result:\n",
-    "        # consume the matched word; safe because the scan stops right away\n",
-    "        matched_strings.append(result_strings.pop(j))\n",
-    "        break\n",
-    "\n",
-    "  matched = len(matched_strings) >= min_str_len\n",
-    "  if verbose:\n",
-    "      print(f'{matched} because {len(matched_strings)} >= {min_str_len}')\n",
-    "  return matched"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 245,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(Adoor Gopalakrishnan == Adoors Gopalakarishnan ok) = True\n",
-      "\n",
-      "comapre \"dave\" to \"david\" ratio was: 0.67 min: 0.6, passed: True\n",
-      "comapre \"letterman\" to \"letterman\" ratio was: 1.00 min: 0.75, passed: True\n",
-      "True because 2 >= 2\n",
-      "(David Letterman == Dave Letterman) = True\n",
-      "\n",
-      "comapre \"charles\" to \"charles\" ratio was: 1.00 min: 0.75, passed: True\n",
-      "comapre \"dickens\" to \"booker\" ratio was: 0.31 min: 0.64, passed: False\n",
-      "False because 1 >= 2\n",
-      "(Charles Booker == Charles Dickens) = False\n",
-      "\n",
-      "comapre \"donald\" to \"don\" ratio was: 0.67 min: 0.38, passed: True\n",
-      "comapre \"trump\" to \"j.\" ratio was: 0.00 min: 0.3, passed: False\n",
-      "comapre \"trump\" to \"trump\" ratio was: 1.00 min: 0.75, passed: True\n",
-      "True because 2 >= 2\n",
-      "(Don J. Trump == Donald Trump) = True\n",
-      "\n",
-      "comapre \"wang\" to \"wang\" ratio was: 1.00 min: 0.75, passed: True\n",
-      "comapre \"fei\" to \"fei\" ratio was: 1.00 min: 0.75, passed: True\n",
-      "True because 2 >= 2\n",
-      "(Wang Fei (female footballer) == Wang Fei) = True\n"
-     ]
-    }
-   ],
-   "source": [
-    "test_sim_match = True\n",
-    "if test_sim_match:\n",
-    "  # Test name similarity search: one (query, result name, verbose) triple per case\n",
-    "  kg_name = 'Faye Wong'\n",
-    "  sim_cases = [\n",
-    "    ('Adoors Gopalakarishnan ok', 'Adoor Gopalakrishnan', False),\n",
-    "    ('Dave Letterman', 'David Letterman', True),\n",
-    "    ('Charles Dickens', 'Charles Booker', True),\n",
-    "    ('Donald Trump', 'Don J. Trump', True),\n",
-    "    ('Wang Fei', 'Wang Fei (female footballer)', True),\n",
-    "  ]\n",
-    "  for case_idx, (query, wp_name, case_verbose) in enumerate(sim_cases):\n",
-    "    if case_idx:\n",
-    "      # blank line between consecutive cases\n",
-    "      print('')\n",
-    "    matched = same_person(query, wp_name, verbose=case_verbose)\n",
-    "    print(f'({wp_name} == {query}) = {matched}')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 246,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# define thread mapping function\n",
-    "def pool_map_persons(obj):\n",
-    "  \"\"\"Thread-pool worker: queries both APIs for one person and merges the results.\n",
-    "\n",
-    "  Advances the module-level progress bar `pbar` before querying. Keys returned by\n",
-    "  the Wikipedia lookup override equal keys from the Knowledge Graph lookup.\n",
-    "  \"\"\"\n",
-    "  pbar.update(1)\n",
-    "  merged = {}\n",
-    "  merged.update(kg_api.get_kg_from_name(obj))\n",
-    "  merged.update(wp_api.get_meta(obj))\n",
-    "  return merged\n",
-    "\n",
-    "def num_non_accessed(mps):\n",
-    "  \"\"\"Counts the records missing a truthy 'kg_accessed' or a truthy 'wp_accessed' flag.\"\"\"\n",
-    "  pending = [x for x in mps if not (x.get('kg_accessed', False) and x.get('wp_accessed', False))]\n",
-    "  return len(pending)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Load existing CSV"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 36,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# load the previously saved identities, indexed by the stored 'index' column,\n",
-    "# with missing values replaced by empty strings\n",
-    "fp_csv = '/data_store_hdd/datasets/people/lfw/metadata/identity_kg.csv'\n",
-    "df = pd.read_csv(fp_csv, encoding='utf-16').set_index('index').fillna('')\n",
-    "# one dict per row, each with the two error columns set to ''\n",
-    "mapped_persons = [{**rec, 'wp_error': '', 'kg_error': ''} for rec in df.to_dict('records')]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Get Knowledge Graph and Wikipedia Data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 40,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "5507f5c19de746df94aa5445e3c7cf46",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "832/5749 remaining\n",
-      "832/5749 remaining. Using 5 threads\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "411d08f873174d13a1de1f8b21f9f993",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Done. 0 remaining.\n"
-     ]
-    }
-   ],
-   "source": [
-    "num_threads_max = 5\n",
-    "sleep_min = 1\n",
-    "\n",
-    "nna = num_non_accessed(mapped_persons)\n",
-    "print(f'{nna}/{len(mapped_persons)} remaining')\n",
-    "\n",
-    "# repeat until every record is flagged as accessed by both APIs\n",
-    "while nna > 0:\n",
-    "  num_threads = max(1, min(num_threads_max, nna))\n",
-    "  print(f'{nna}/{len(mapped_persons)} remaining. Using {num_threads} threads')\n",
-    "\n",
-    "  # `pbar` is bound at module level because pool_map_persons updates it.\n",
-    "  # Both context managers clean up after each pass: the pool used to be\n",
-    "  # re-created on every pass without being closed, and an extra progress bar\n",
-    "  # created before the loop was never closed.\n",
-    "  with tqdm(total=len(mapped_persons)) as pbar, ThreadPool(num_threads) as pool:\n",
-    "    mapped_persons = pool.map(pool_map_persons, mapped_persons)\n",
-    "\n",
-    "  nna = num_non_accessed(mapped_persons)\n",
-    "  if nna > 0:\n",
-    "    print(f'{nna} remaining. Sleeping for {sleep_min} minutes...')\n",
-    "    time.sleep(60 * sleep_min)\n",
-    "\n",
-    "print(f'Done. {nna} remaining.')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Match Knowledge Graph and Wikipedia names"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 220,
-   "metadata": {
-    "scrolled": false
-   },
-   "outputs": [],
-   "source": [
-    "for i, mp in enumerate(mapped_persons):\n",
-    "  # `or ''` turns a missing/None name into '' so same_person reports no match\n",
-    "  kg_name = mp.get('kg_name') or ''\n",
-    "  wp_name = mp.get('wp_name') or ''\n",
-    "  query = mp.get('query')\n",
-    "  name_orig = mp.get('source_name')\n",
-    "  # a blank score (e.g. '' left by fillna after loading the CSV) counts as 0;\n",
-    "  # float() first so numeric strings such as '618.27' also convert\n",
-    "  kg_score = int(float(mp.get('kg_score') or 0))\n",
-    "\n",
-    "  # compare on the space-separated form: source_name may already hold the\n",
-    "  # underscore form written by the clean-up cell\n",
-    "  name_spaced = name_orig.replace('_', ' ')\n",
-    "  kg_matches = same_person(name_spaced, kg_name)\n",
-    "  wp_matches = same_person(name_spaced, wp_name)\n",
-    "\n",
-    "  # match_status: 2 = both sources match with a high KG score, 1 = both match,\n",
-    "  # 0 = only one source matches (needs review), -1 = neither matches\n",
-    "  if kg_matches and wp_matches and kg_score > 100:\n",
-    "    # very likely a match, confirm it\n",
-    "    match_status = 2  # supermatch\n",
-    "    # default to using wp because descriptions are more appropriate/updated\n",
-    "    source = 'wp'\n",
-    "  elif kg_matches and wp_matches:\n",
-    "    match_status = 1\n",
-    "    # default to using wp because descriptions are more appropriate/updated\n",
-    "    source = 'wp'\n",
-    "  elif kg_matches and not wp_matches:\n",
-    "    # if the KG score is medium-high, but wp failed, needs review\n",
-    "    source = 'kg'\n",
-    "    match_status = 0\n",
-    "  elif wp_matches and not kg_matches:\n",
-    "    # only the wikipedia name matched the query, needs review\n",
-    "    source = 'wp'\n",
-    "    match_status = 0\n",
-    "  else:\n",
-    "    # no information available\n",
-    "    match_status = -1\n",
-    "    source = None\n",
-    "      \n",
-    "  slug = slugify.slugify(name_orig, separator='_')\n",
-    "  # NOTE(review): mp_bio is taken from kg_bio even when the KG name did not match\n",
-    "  mp_bio = mp.get('kg_bio', '')\n",
-    "  wp_desc = mp.get('wp_description', '')\n",
-    "  source_url = f\"http://vis-www.cs.umass.edu/lfw/person/{name_orig.replace(' ', '_')}.html\"\n",
-    "  \n",
-    "  if source == 'kg':\n",
-    "    # google knowledge graph\n",
-    "    mp_name = mp['kg_name']\n",
-    "    mp_description = mp.get('kg_description', '')\n",
-    "  elif source == 'wp':\n",
-    "    # wikipedia\n",
-    "    mp_name = mp['wp_name']\n",
-    "    mp_description = mp.get('wp_description', '')\n",
-    "  else:\n",
-    "    # no usable source: reset explicitly, otherwise the values of the previous\n",
-    "    # person leak into this record (or the first record raises NameError)\n",
-    "    mp_name = ''\n",
-    "    mp_description = ''\n",
-    "  \n",
-    "  if 'disambiguation' in wp_desc.lower():\n",
-    "    #print(f\"disambiguate: {name_orig}\")\n",
-    "    match_status = 0  # needs review if \"disambiguation appears\"\n",
-    "    mp_name = ''\n",
-    "    mp_description = ''\n",
-    "    mp_bio = ''\n",
-    "  \n",
-    "  mp['source_url'] = source_url\n",
-    "  mp['mp_slug'] = slug\n",
-    "  mp['matched'] = match_status\n",
-    "  mp['mp_bio'] = mp_bio\n",
-    "  mp['mp_name'] = mp_name\n",
-    "  mp['mp_description'] = mp_description"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 221,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "match: 4359\n",
-      "review: 718\n",
-      "fail: 672\n",
-      "no kg accessed: 0\n",
-      "no wp accessed: 0\n"
-     ]
-    }
-   ],
-   "source": [
-    "# tally the match categories and the per-API access flags\n",
-    "statuses = [x.get('matched') for x in mapped_persons]\n",
-    "print(f\"match: {sum(s > 0 for s in statuses)}\")\n",
-    "print(f\"review: {statuses.count(0)}\")\n",
-    "print(f\"fail: {statuses.count(-1)}\")\n",
-    "\n",
-    "print(f\"no kg accessed: {sum(not x.get('kg_accessed', False) for x in mapped_persons)}\")\n",
-    "print(f\"no wp accessed: {sum(not x.get('wp_accessed', False) for x in mapped_persons)}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Save data to CSV"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 235,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# build a frame with one row per person and label its index 'index'\n",
-    "df_mapped_persons = pd.DataFrame(mapped_persons)\n",
-    "df_mapped_persons.index.name = 'index'"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 236,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# save\n",
-    "fp_out = '/data_store_hdd/datasets/people/lfw/metadata/identity_kg.csv'\n",
-    "# drop the bookkeeping columns by rebinding rather than in place; errors='ignore'\n",
-    "# keeps the cell re-runnable once the columns are already gone\n",
-    "cols_internal = ['kg_accessed', 'wp_accessed', 'kg_error', 'wp_error']\n",
-    "df_mapped_persons = df_mapped_persons.drop(columns=cols_internal, errors='ignore')\n",
-    "df_mapped_persons.to_csv(fp_out, encoding='utf-16')\n",
-    "# create small version from the same frame so it has the same columns as the full\n",
-    "# file (it used to be rebuilt from mapped_persons and kept the bookkeeping columns)\n",
-    "limit = 1000\n",
-    "fpp_out = Path(fp_out)\n",
-    "fp_out_sm = join(fpp_out.parent, f'{fpp_out.stem}_0_{limit}.csv')\n",
-    "df_mapped_persons_sm = df_mapped_persons.iloc[:limit]\n",
-    "df_mapped_persons_sm.to_csv(fp_out_sm, encoding='utf-16')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 237,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>kg_bio</th>\n",
-       "      <th>kg_bio_url</th>\n",
-       "      <th>kg_description</th>\n",
-       "      <th>kg_id</th>\n",
-       "      <th>kg_image_url</th>\n",
-       "      <th>kg_name</th>\n",
-       "      <th>kg_score</th>\n",
-       "      <th>kg_url</th>\n",
-       "      <th>matched</th>\n",
-       "      <th>mp_bio</th>\n",
-       "      <th>mp_description</th>\n",
-       "      <th>mp_name</th>\n",
-       "      <th>mp_slug</th>\n",
-       "      <th>query</th>\n",
-       "      <th>source</th>\n",
-       "      <th>source_name</th>\n",
-       "      <th>source_url</th>\n",
-       "      <th>wp_description</th>\n",
-       "      <th>wp_name</th>\n",
-       "      <th>wp_page_id</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>index</th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>Kim Antonie Lode Clijsters is a Belgian former...</td>\n",
-       "      <td>https://en.wikipedia.org/wiki/Kim_Clijsters</td>\n",
-       "      <td>Belgian tennis player</td>\n",
-       "      <td>/m/01m_gh</td>\n",
-       "      <td>http://t3.gstatic.com/images?q=tbn:ANd9GcQ4yRK...</td>\n",
-       "      <td>Kim Clijsters</td>\n",
-       "      <td>618.272705</td>\n",
-       "      <td></td>\n",
-       "      <td>2</td>\n",
-       "      <td>Kim Antonie Lode Clijsters is a Belgian former...</td>\n",
-       "      <td>Belgian tennis player</td>\n",
-       "      <td>Kim Clijsters</td>\n",
-       "      <td>kim_clijsters</td>\n",
-       "      <td>Kim Clijsters</td>\n",
-       "      <td>lfw</td>\n",
-       "      <td>Kim_Clijsters</td>\n",
-       "      <td>http://vis-www.cs.umass.edu/lfw/person/Kim_Cli...</td>\n",
-       "      <td>Belgian tennis player</td>\n",
-       "      <td>Kim Clijsters</td>\n",
-       "      <td>262793</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>William Rosenberg was an American entrepreneur...</td>\n",
-       "      <td>https://en.wikipedia.org/wiki/William_Rosenberg</td>\n",
-       "      <td>American entrepreneur</td>\n",
-       "      <td>/m/07dy4z</td>\n",
-       "      <td></td>\n",
-       "      <td>William Rosenberg</td>\n",
-       "      <td>367.879730</td>\n",
-       "      <td></td>\n",
-       "      <td>2</td>\n",
-       "      <td>William Rosenberg was an American entrepreneur...</td>\n",
-       "      <td>American businessman</td>\n",
-       "      <td>William Rosenberg</td>\n",
-       "      <td>william_rosenberg</td>\n",
-       "      <td>William Rosenberg</td>\n",
-       "      <td>lfw</td>\n",
-       "      <td>William_Rosenberg</td>\n",
-       "      <td>http://vis-www.cs.umass.edu/lfw/person/William...</td>\n",
-       "      <td>American businessman</td>\n",
-       "      <td>William Rosenberg</td>\n",
-       "      <td>2.44981e+06</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                                  kg_bio  \\\n",
-       "index                                                      \n",
-       "0      Kim Antonie Lode Clijsters is a Belgian former...   \n",
-       "1      William Rosenberg was an American entrepreneur...   \n",
-       "\n",
-       "                                            kg_bio_url         kg_description  \\\n",
-       "index                                                                           \n",
-       "0          https://en.wikipedia.org/wiki/Kim_Clijsters  Belgian tennis player   \n",
-       "1      https://en.wikipedia.org/wiki/William_Rosenberg  American entrepreneur   \n",
-       "\n",
-       "           kg_id                                       kg_image_url  \\\n",
-       "index                                                                 \n",
-       "0      /m/01m_gh  http://t3.gstatic.com/images?q=tbn:ANd9GcQ4yRK...   \n",
-       "1      /m/07dy4z                                                      \n",
-       "\n",
-       "                 kg_name    kg_score kg_url  matched  \\\n",
-       "index                                                  \n",
-       "0          Kim Clijsters  618.272705               2   \n",
-       "1      William Rosenberg  367.879730               2   \n",
-       "\n",
-       "                                                  mp_bio  \\\n",
-       "index                                                      \n",
-       "0      Kim Antonie Lode Clijsters is a Belgian former...   \n",
-       "1      William Rosenberg was an American entrepreneur...   \n",
-       "\n",
-       "              mp_description            mp_name            mp_slug  \\\n",
-       "index                                                                \n",
-       "0      Belgian tennis player      Kim Clijsters      kim_clijsters   \n",
-       "1       American businessman  William Rosenberg  william_rosenberg   \n",
-       "\n",
-       "                   query source        source_name  \\\n",
-       "index                                                \n",
-       "0          Kim Clijsters    lfw      Kim_Clijsters   \n",
-       "1      William Rosenberg    lfw  William_Rosenberg   \n",
-       "\n",
-       "                                              source_url  \\\n",
-       "index                                                      \n",
-       "0      http://vis-www.cs.umass.edu/lfw/person/Kim_Cli...   \n",
-       "1      http://vis-www.cs.umass.edu/lfw/person/William...   \n",
-       "\n",
-       "              wp_description            wp_name   wp_page_id  \n",
-       "index                                                         \n",
-       "0      Belgian tennis player      Kim Clijsters       262793  \n",
-       "1       American businessman  William Rosenberg  2.44981e+06  "
-      ]
-     },
-     "execution_count": 237,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# preview the first two rows\n",
-    "df_mapped_persons.head(2)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Clean data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 225,
-   "metadata": {
-    "scrolled": false
-   },
-   "outputs": [],
-   "source": [
-    "for mp in mapped_persons:\n",
-    "  # store source names in their underscore form\n",
-    "  mp['source_name'] = mp['source_name'].replace(' ', '_')\n",
-    "  # drop the 'wp_bio' field when present; the default argument replaces a bare\n",
-    "  # try/except that silenced every exception, not just a missing key\n",
-    "  mp.pop('wp_bio', None)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python [conda env:megapixels]",
-   "language": "python",
-   "name": "conda-env-megapixels-py"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.6"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
author	adamhrv <adam@ahprojects.com>	2019-02-12 15:18:46 +0100
committer	adamhrv <adam@ahprojects.com>	2019-02-12 15:18:46 +0100
commit	a5bdab8e798fcdc7885cfdabb0e5dd8076fa1d40 (patch)
tree	1e7a45a8d2c746994584cc5f8e4ccdabad82f8d8 /megapixels/notebooks/datasets/knowledge_graph/identity.ipynb
parent	e95455a8a4013dafdeb7e41cfa8fb1f3ccc28dbb (diff)