# Knowledge Graph Identities

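- Convert filename-style names (e.g. `Kim_Clijsters`) to plain query names (e.g. `Kim Clijsters`)
- Fetch Google Knowledge Graph entity IDs (and Wikipedia metadata) for each name
- Save the KG IDs and matched metadata to CSV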

In [186]:
%reload_ext autoreload
%autoreload 2

import os
import os.path as osp
from os.path import join
from glob import glob
from pathlib import Path
import random
import math
from datetime import datetime
import requests
import json
import time
from pprint import pprint
from multiprocessing.pool import ThreadPool
import threading
import urllib.request
import difflib
import unidecode
import slugify

from tqdm import tqdm_notebook as tqdm
import pandas as pd
from scipy.io import loadmat
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

import sys
sys.path.append('/work/megapixels_dev/megapixels')
from app.utils import api_utils, identity_utils
from app.settings import app_cfg
from app.settings import types

## Get List of Names

In [188]:
# Load the identity names for the dataset. The original call was truncated at
# `types.Dataset.` (a syntax error); LFW is inferred from the LFW paths and
# URLs used throughout the rest of this notebook.
# NOTE(review): confirm that `LFW` is the exact member name in app.settings.types.
names = identity_utils.get_names(types.Dataset.LFW)
# Preview the first ten query-ready names and the matching original names
print(names['names_query'][0:10])
print(names['names_orig'][0:10])

['Kim Clijsters', 'William Rosenberg', 'John Brady', 'Juan Ignacio Chela', 'Floyd Keith', 'Sam Gerald', 'Imad Khadduri', 'Anna Kournikova', 'Jacques Rogge', 'Wilbert Elki Meza Majino']
['Kim_Clijsters', 'William_Rosenberg', 'John_Brady', 'Juan_Ignacio_Chela', 'Floyd_Keith', 'Sam_Gerald', 'Imad_Khadduri', 'Anna_Kournikova', 'Jacques_Rogge', 'Wilbert_Elki_Meza_Majino']


## Google Knowledge Graph API

- Quota: about 100,000 requests per 24 hours

In [164]:
# Instantiate the API helper clients from app.utils.api_utils.
# NOTE(review): presumably the Knowledge Graph client reads its API key on
# construction (the original note here said "read API key") — confirm in api_utils.
kg_api = api_utils.GoogleKnowledgeGraph()
wp_api = api_utils.WikipediaAPI()

## Test API Access

In [165]:
# Smoke test: query both API helpers for one well-known name and pretty-print
# the metadata dicts they return.
print('wp----')
# NOTE(review): verbose=True appears to make the helper print the request URL
# (see the recorded output below) — confirm in api_utils.
pprint(wp_api.get_meta({'query': 'Vicente Fox'}, verbose=True))
print('kg----')
pprint(kg_api.get_kg_from_name({'query':'Vicente Fox'}))

wp----
https://en.wikipedia.org/w/api.php?redirects=&ppprop=displaytitle&prop=pageprops%7Cpageimages%7Cdescription&generator=prefixsearch&action=query&format=json&piprop=thumbnail&pilimit=1&gpssearch=Vicente+Fox&gpsnamespace=0&gpslimit=1
{'wp_accessed': True,
 'wp_description': 'President of Mexico',
 'wp_name': 'Vicente Fox',
 'wp_page_id': '32836'}
kg----
{'kg_accessed': True,
 'kg_bio': '',
 'kg_bio_url': '',
 'kg_description': 'Former President of Mexico',
 'kg_error': '',
 'kg_id': '/m/081f4',
 'kg_image_url': 'http://t2.gstatic.com/images?q=tbn:ANd9GcQqs1Z0NhSLve9OyfdC0AHFWKWlTpHO4tCnU7dedSSz2kzCRk60',
 'kg_name': 'Vicente Fox',
 'kg_score': 610.987427,
 'kg_url': '',
 'query': 'Vicente Fox'}


### Test Name Similarity Matching

In [168]:
# Spot-check identity_utils.names_match on near-miss name pairs.
# The commented-out calls are further example pairs kept for manual experiments.
#print(identity_utils.names_match('Andréss Iniestas', 'Andres Iniestalossas Jr.', as_float=True))
#print(identity_utils.names_match('Adoor Gopalakrishnan', 'Adoors Gopalakarishnan', as_float=True))
#print(identity_utils.names_match('Dave Letterman', 'David Letterman', as_float=True))
# Same pair scored with and without compound_score, to compare the two float scores
print(identity_utils.names_match('Charles Dickens', 'Charles Boooker', as_float=True, compound_score=True))
print(identity_utils.names_match('Charles Dickens', 'Charles Boooker', as_float=True, compound_score=False))
#print(identity_utils.names_match('Donald Trump', 'Donald J. Trump', as_float=True))
#print(identity_utils.names_match('Wang Fei', 'Fei Wang  III', as_float=True))

0.7714285714285716
0.7142857142857143


In [126]:
# define thread mapping function
def pool_map_persons(obj):
  """Fetch Knowledge Graph and Wikipedia metadata for one person record.

  Used as the worker function for `ThreadPool.map`. Relies on the
  notebook-level globals `kg_api`, `wp_api` and `pbar`.

  :param obj: dict describing one person; passed unchanged to
    `kg_api.get_kg_from_name` and `wp_api.get_meta`
  :returns: `obj` itself when both its 'kg_accessed' and 'wp_accessed' flags
    are already truthy; otherwise a new dict merging the two API results
    (Wikipedia values win on duplicate keys)
  """
  global pbar
  try:
    if obj.get('kg_accessed', False) and obj.get('wp_accessed', False):
      # Both lookups already succeeded on an earlier pass: re-querying would
      # only burn API quota, so hand the record back untouched.
      return obj
    kg_obj = kg_api.get_kg_from_name(obj)
    wp_obj = wp_api.get_meta(obj)
    return {**kg_obj, **wp_obj}
  finally:
    # Tick the bar once the record is actually finished (or has failed),
    # so the bar reflects completed work rather than started work.
    pbar.update(1)

def num_non_accessed(mps):
  """Count the records that still lack a successful lookup.

  A record counts as pending unless both its 'kg_accessed' and 'wp_accessed'
  entries are truthy; missing keys are treated as False.

  :param mps: iterable of person dicts
  :returns: number of pending records (int)
  """
  pending = 0
  for record in mps:
    fetched = record.get('kg_accessed', False) and record.get('wp_accessed', False)
    if not fetched:
      pending += 1
  return pending

## Load existing CSV

In [36]:
# Read the previously saved identity table (UTF-16 CSV keyed by its 'index' column)
fp_csv = '/data_store_hdd/datasets/people/lfw/metadata/identity_kg.csv'
# Missing values become empty strings
df = pd.read_csv(fp_csv, encoding='utf-16').set_index('index').fillna('')
# One dict per row, each given blank 'wp_error' / 'kg_error' fields
mapped_persons = [
  {**record, 'wp_error': '', 'kg_error': ''}
  for record in df.to_dict('records')
]

## Get Knowledge Graph and Wikipedia Data

In [40]:
# Fetch metadata for every record, retrying until none are pending.
num_threads_max = 5  # upper bound on concurrent API requests
sleep_min = 1        # minutes to wait between retry passes

nna = num_non_accessed(mapped_persons)
print(f'{nna}/{len(mapped_persons)} remaining')

while nna > 0:
  # never start more threads than there are pending records
  num_threads = max(1, min(num_threads_max, nna))
  print(f'{nna}/{len(mapped_persons)} remaining. Using {num_threads} threads')

  # Both context managers clean up after each pass: the pool's worker threads
  # are released and the progress bar is closed, so nothing piles up across
  # retries. `pbar` is bound at notebook scope because pool_map_persons
  # updates it as a global.
  with ThreadPool(num_threads) as pool, tqdm(total=len(mapped_persons)) as pbar:
    mapped_persons = pool.map(pool_map_persons, mapped_persons)

  nna = num_non_accessed(mapped_persons)
  if nna > 0:
    print(f'{nna} remaining. Sleeping for {sleep_min} minutes...')
    time.sleep(60 * sleep_min)

print(f'Done. {nna} remaining.')

HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))

832/5749 remaining
832/5749 remaining. Using 5 threads


HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))

Done. 0 remaining.


### Reconcile Knowledge Graph and Wikipedia matches

In [220]:
# Decide, per person, whether the Knowledge Graph (kg) and/or Wikipedia (wp)
# result matches the source name, then store the chosen name/description.
#
# 'matched' codes written to each record:
#    2  both sources match and the KG score is above 100 (strong match)
#    1  both sources match
#    0  needs manual review (only one source matches, or a disambiguation page)
#   -1  neither source matches
#
# NOTE(review): `same_person` is not defined anywhere in this notebook; it must
# already exist in the kernel. Confirm where it comes from (possibly a wrapper
# around identity_utils.names_match).
for mp in mapped_persons:
  kg_name = mp.get('kg_name')
  wp_name = mp.get('wp_name')
  name_orig = mp.get('source_name')
  # kg_score may be missing or '' (the CSV load cell fills nulls with ''),
  # and int('') would raise, so fall back to 0
  kg_score = int(mp.get('kg_score') or 0)

  kg_matches = same_person(name_orig, kg_name)
  wp_matches = same_person(name_orig, wp_name)

  if kg_matches and wp_matches:
    # both agree: prefer wp because its descriptions are more appropriate/updated
    match_status = 2 if kg_score > 100 else 1
    match_source = 'wp'
  elif kg_matches:
    # only the Knowledge Graph matched: needs review
    match_status = 0
    match_source = 'kg'
  elif wp_matches:
    # only Wikipedia matched: needs review
    match_status = 0
    match_source = 'wp'
  else:
    # no information available
    match_status = -1
    match_source = None

  slug = slugify.slugify(name_orig, separator='_')
  mp_bio = mp.get('kg_bio', '')
  wp_desc = mp.get('wp_description', '')
  source_url = f"http://vis-www.cs.umass.edu/lfw/person/{name_orig.replace(' ', '_')}.html"

  if match_source == 'kg':
    # google knowledge graph
    mp_name = mp['kg_name']
    mp_description = mp.get('kg_description', '')
  elif match_source == 'wp':
    # wikipedia
    mp_name = mp['wp_name']
    mp_description = mp.get('wp_description', '')
  else:
    # No match: reset explicitly, otherwise the previous person's name and
    # description would carry over (or the first record would hit a NameError).
    mp_name = ''
    mp_description = ''

  if 'disambiguation' in wp_desc.lower():
    # a Wikipedia disambiguation page is not a person: blank it and flag for review
    match_status = 0
    mp_name = ''
    mp_description = ''
    mp_bio = ''

  mp['source_url'] = source_url
  mp['mp_slug'] = slug
  mp['matched'] = match_status
  mp['mp_bio'] = mp_bio
  mp['mp_name'] = mp_name
  mp['mp_description'] = mp_description

In [221]:
# Tally the match outcomes: positive = matched, 0 = needs review, -1 = failed
n_match = sum(1 for person in mapped_persons if person.get('matched') > 0)
print(f"match: {n_match}")
n_review = sum(1 for person in mapped_persons if person.get('matched') == 0)
print(f"review: {n_review}")
n_fail = sum(1 for person in mapped_persons if person.get('matched') == -1)
print(f"fail: {n_fail}")

# Records whose Knowledge Graph / Wikipedia lookup never succeeded
n_no_kg = sum(1 for person in mapped_persons if not person.get('kg_accessed', False))
print(f"no kg accessed: {n_no_kg}")
n_no_wp = sum(1 for person in mapped_persons if not person.get('wp_accessed', False))
print(f"no wp accessed: {n_no_wp}")

match: 4359
review: 718
fail: 672
no kg accessed: 0
no wp accessed: 0


### Save data to CSV

In [235]:
# Build a table with one row per person record and name its index 'index'
df_mapped_persons = pd.DataFrame(mapped_persons).rename_axis('index')

In [236]:
# save
fp_out = '/data_store_hdd/datasets/people/lfw/metadata/identity_kg.csv'
# Drop the bookkeeping columns before writing. Reassigning (instead of
# inplace=True) with errors='ignore' keeps this cell re-runnable: a second run
# no longer raises KeyError for columns that are already gone.
df_mapped_persons = df_mapped_persons.drop(
  columns=['kg_accessed', 'wp_accessed', 'kg_error', 'wp_error'], errors='ignore')
df_mapped_persons.to_csv(fp_out, encoding='utf-16')

# create small version: the first `limit` rows of the same cleaned table, so
# the sample file has exactly the same columns as the full file
limit = 1000
fpp_out = Path(fp_out)
fp_out_sm = join(fpp_out.parent, f'{fpp_out.stem}_0_{limit}.csv')
df_mapped_persons_sm = df_mapped_persons.iloc[:limit]
df_mapped_persons_sm.to_csv(fp_out_sm, encoding='utf-16')

In [237]:
# Preview the first two rows of df_mapped_persons
df_mapped_persons.head(2)

Unnamed: 0_level_0,kg_bio,kg_bio_url,kg_description,kg_id,kg_image_url,kg_name,kg_score,kg_url,matched,mp_bio,mp_description,mp_name,mp_slug,query,source,source_name,source_url,wp_description,wp_name,wp_page_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,Kim Antonie Lode Clijsters is a Belgian former...,https://en.wikipedia.org/wiki/Kim_Clijsters,Belgian tennis player,/m/01m_gh,http://t3.gstatic.com/images?q=tbn:ANd9GcQ4yRK...,Kim Clijsters,618.272705,,2,Kim Antonie Lode Clijsters is a Belgian former...,Belgian tennis player,Kim Clijsters,kim_clijsters,Kim Clijsters,lfw,Kim_Clijsters,http://vis-www.cs.umass.edu/lfw/person/Kim_Cli...,Belgian tennis player,Kim Clijsters,262793.0
1,William Rosenberg was an American entrepreneur...,https://en.wikipedia.org/wiki/William_Rosenberg,American entrepreneur,/m/07dy4z,,William Rosenberg,367.87973,,2,William Rosenberg was an American entrepreneur...,American businessman,William Rosenberg,william_rosenberg,William Rosenberg,lfw,William_Rosenberg,http://vis-www.cs.umass.edu/lfw/person/William...,American businessman,William Rosenberg,2449810.0
