# Knowledge Graph Identities

- convert file-name style names (e.g. `Kim_Clijsters`) to plain names (`Kim Clijsters`)
- fetch Google Knowledge Graph entity IDs for each name
- save KG IDs to CSV

In [48]:
%reload_ext autoreload
%autoreload 2

import os
import os.path as osp
from os.path import join
from glob import glob
from pathlib import Path
import random
import math
from datetime import datetime
import requests
import json
import time
from pprint import pprint
from multiprocessing.pool import ThreadPool
import threading
import urllib.request
import difflib
import unidecode
import slugify

from tqdm import tqdm_notebook as tqdm
import pandas as pd
from scipy.io import loadmat
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

import sys
sys.path.append('/work/megapixels_dev/megapixels')
from app.utils import api_utils
from app.settings import types

## Get List of Names

In [2]:
def get_names(enum_dataset, dir_media=None):
  """Build a list of person names from the entries of a dataset's media directory.

  Every directory entry has its underscores replaced by spaces.

  :param enum_dataset: member of types.Dataset; LFW and YOUTUBE_FACES are handled
  :param dir_media: directory to list. Optional for LFW (falls back to the local
    LFW path below). Required for YOUTUBE_FACES because no default location
    for that dataset is defined in this notebook.
  :returns: list of name strings
  :raises ValueError: for YOUTUBE_FACES without dir_media, or an unhandled dataset
  """
  if enum_dataset == types.Dataset.LFW:
    dir_lfw = dir_media or '/data_store_hdd/datasets/people/lfw/media/original/'
    return [x.replace('_', ' ') for x in os.listdir(dir_lfw)]

  if enum_dataset == types.Dataset.YOUTUBE_FACES:
    # this branch used to filter `names` before it was ever assigned
    if dir_media is None:
      raise ValueError('dir_media is required for YOUTUBE_FACES')
    names = [x.replace('_', ' ') for x in os.listdir(dir_media)]
    # skip the label file, whose name also had its underscores replaced
    return [x for x in names if 'labeled faces.txt' not in x]

  raise ValueError(f'Unsupported dataset: {enum_dataset}')

In [3]:
# build the LFW name list and preview its first ten entries
names = get_names(types.Dataset.LFW)
print(names[:10])

['Kim Clijsters', 'William Rosenberg', 'John Brady', 'Juan Ignacio Chela', 'Floyd Keith', 'Sam Gerald', 'Imad Khadduri', 'Anna Kournikova', 'Jacques Rogge', 'Wilbert Elki Meza Majino']


## Google Knowledge Graph API

In [4]:
# read API key
# the context manager closes the file handle; strip() keeps a trailing newline
# or stray whitespace in the file from becoming part of the key
with open('/work/megapixels_dev/env/google_knowledge_graph_api.env') as fp_key:
  api_key = fp_key.read().strip()
kg_api = api_utils.GoogleKnowledgeGraph(api_key)
wp_api = api_utils.WikipediaAPI()

In [241]:
#wp_api.test_access()
# smoke-test both clients with a single query each and pretty-print what they return
print('wp')
pprint(wp_api.get_meta({'query': 'Florecita Cobian'}))
print('kg')
pprint(kg_api.get_kg_from_name({'query':'Jeff Dederian'}))

wp
{'wp_accessed': True, 'wp_description': '', 'wp_name': '', 'wp_page_id': ''}
kg
{'kg_accessed': True,
 'kg_bio': '',
 'kg_bio_url': '',
 'kg_description': '',
 'kg_id': '',
 'kg_image_url': '',
 'kg_name': '',
 'kg_score': 0,
 'kg_url': '',
 'query': 'Jeff Dederian'}


### Test Name Similarity Matching

In [242]:
def same_person(query, name, sim_min=.9, word_match_min=0.75, verbose=False):
  """Decide whether a returned entity name plausibly refers to the queried person.

  Both strings are split on whitespace, lowercased and transliterated with
  unidecode. Each query word is compared against the result words that have
  not been matched yet using difflib.SequenceMatcher; the first result word
  whose ratio reaches the threshold is consumed. The names match when the
  number of matched words is at least the word count of the shorter name.

  :param query: the name that was searched for
  :param name: the name returned by the lookup; a parenthetical suffix such as
    "(female footballer)" is ignored
  :param sim_min: unused; kept so existing callers keep working
  :param word_match_min: base similarity threshold for a word pair. It is scaled
    by len(shorter word) / len(longer word), so pairs of unequal length get a
    lower bar.
  :param verbose: print every word comparison and the final decision
  :returns: True when the names match; False otherwise, including when either
    name is empty/None or contains no words
  """
  if not query or not name:
    return False
  # check and remove if WP added parenthesis
  if '(' in name and ')' in name:
    name = name.split('(')[0]

  def to_words(text):
    # split() without an argument drops the empty tokens that repeated spaces
    # would produce; an empty token has a threshold of 0 and would match anything
    words = (unidecode.unidecode(w.lower()).lower().strip() for w in text.split())
    return [w for w in words if w]

  query_strings = to_words(query)
  result_strings = to_words(name)
  if not query_strings or not result_strings:
    # nothing to compare: without this guard 0 >= 0 would count as a match
    return False

  min_str_len = min(len(result_strings), len(query_strings))
  matched_strings = []

  for a in query_strings:
    for j, b in enumerate(result_strings):
      shorter, longer = sorted((len(a), len(b)))
      min_ratio = (shorter / longer * word_match_min)
      ratio = difflib.SequenceMatcher(a=a, b=b).ratio()
      result = (ratio >= min_ratio)
      if verbose:
        print(f'comapre "{a}" to "{b}" ratio was: {ratio:.2f} min: {min_ratio:.2}, passed: {result}')
      if result:
        # consume the matched word so it cannot satisfy another query word;
        # popping is safe here because the inner loop stops immediately
        matched_strings.append(result_strings.pop(j))
        break

  matched = len(matched_strings) >= min_str_len
  if verbose:
      print(f'{matched} because {len(matched_strings)} >= {min_str_len}')
  return matched

In [245]:
test_sim_match = True
if test_sim_match:
  # Test name similarity search
  # each case: (query, candidate name, verbose flag passed to same_person)
  sim_cases = [
    ('Adoors Gopalakarishnan ok', 'Adoor Gopalakrishnan', False),
    ('Dave Letterman', 'David Letterman', True),
    ('Charles Dickens', 'Charles Booker', True),
    ('Donald Trump', 'Don J. Trump', True),
    ('Wang Fei', 'Wang Fei (female footballer)', True),
  ]
  # recorded next to the 'Wang Fei' case; not used by the comparison itself
  kg_name = 'Faye Wong'

  for case_idx, (query, wp_name, be_verbose) in enumerate(sim_cases):
    matched = same_person(query, wp_name, verbose=be_verbose)
    print(f'({wp_name} == {query}) = {matched}')
    # blank separator line between cases, none after the last one
    if case_idx < len(sim_cases) - 1:
      print('')

(Adoor Gopalakrishnan == Adoors Gopalakarishnan ok) = True

comapre "dave" to "david" ratio was: 0.67 min: 0.6, passed: True
comapre "letterman" to "letterman" ratio was: 1.00 min: 0.75, passed: True
True because 2 >= 2
(David Letterman == Dave Letterman) = True

comapre "charles" to "charles" ratio was: 1.00 min: 0.75, passed: True
comapre "dickens" to "booker" ratio was: 0.31 min: 0.64, passed: False
False because 1 >= 2
(Charles Booker == Charles Dickens) = False

comapre "donald" to "don" ratio was: 0.67 min: 0.38, passed: True
comapre "trump" to "j." ratio was: 0.00 min: 0.3, passed: False
comapre "trump" to "trump" ratio was: 1.00 min: 0.75, passed: True
True because 2 >= 2
(Don J. Trump == Donald Trump) = True

comapre "wang" to "wang" ratio was: 1.00 min: 0.75, passed: True
comapre "fei" to "fei" ratio was: 1.00 min: 0.75, passed: True
True because 2 >= 2
(Wang Fei (female footballer) == Wang Fei) = True


In [246]:
# define thread mapping function
def pool_map_persons(obj):
  """Worker for the thread pool: look one person up in both APIs.

  Advances the module-level progress bar `pbar`, passes `obj` to
  kg_api.get_kg_from_name and wp_api.get_meta, and returns one dict holding
  the keys of both results (Wikipedia values win on a key clash).
  """
  # pbar is only read here, so no `global` declaration is required
  pbar.update(1)
  kg_result = kg_api.get_kg_from_name(obj)
  wp_result = wp_api.get_meta(obj)
  merged = dict(kg_result)
  merged.update(wp_result)
  return merged

def num_non_accessed(mps):
  """Count the records that are not flagged as accessed by both APIs.

  A record counts as pending unless both its 'kg_accessed' and 'wp_accessed'
  entries are truthy; a missing flag is treated as False.
  """
  pending = 0
  for record in mps:
    kg_done = record.get('kg_accessed', False)
    wp_done = record.get('wp_accessed', False)
    if not (kg_done and wp_done):
      pending += 1
  return pending

## Load existing CSV

In [36]:
# load existing CSV
fp_csv = '/data_store_hdd/datasets/people/lfw/metadata/identity_kg.csv'
# read, index by the saved 'index' column and blank out nulls in one chain
df = pd.read_csv(fp_csv, encoding='utf-16').set_index('index').fillna('')
mapped_persons = df.to_dict('records')
# add columns
for mp in mapped_persons:
  mp.update(wp_error='', kg_error='')

## Get Knowledge Graph and Wikipedia Data

In [40]:
num_threads_max = 5
sleep_min = 1

nna = num_non_accessed(mapped_persons)
print(f'{nna}/{len(mapped_persons)} remaining')

# keep running rounds until every record is flagged as accessed by both APIs
while nna > 0:
  num_threads = max(1, min(num_threads_max, nna))
  print(f'{nna}/{len(mapped_persons)} remaining. Using {num_threads} threads')

  # pool_map_persons updates the module-level `pbar`, so the bar must be bound
  # to that name. Both context managers clean up when the round ends: the bar
  # is closed and the pool's worker threads are released instead of piling up
  # with every retry.
  with ThreadPool(num_threads) as pool, tqdm(total=len(mapped_persons)) as pbar:
    mapped_persons = pool.map(pool_map_persons, mapped_persons)

  nna = num_non_accessed(mapped_persons)
  if nna > 0:
    # back off before the next round
    print(f'{nna} remaining. Sleeping for {sleep_min} minutes...')
    time.sleep(60 * sleep_min)

print(f'Done. {nna} remaining.')

HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))

832/5749 remaining
832/5749 remaining. Using 5 threads


HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))

Done. 0 remaining.


### Match Knowledge Graph and Wikipedia results

In [220]:
# Reconcile the Knowledge Graph (kg) and Wikipedia (wp) results of every person
# and write the merged mp_* fields plus a match status back into the record.
# match status: 2 = both match and kg_score > 100, 1 = both match,
#               0 = needs review, -1 = no information available
for mp in mapped_persons:
  kg_name = mp.get('kg_name', '')
  wp_name = mp.get('wp_name', '')
  name_orig = mp.get('source_name')
  # source_name may be stored in file-name form (underscores); same_person
  # splits on spaces, so compare using the spaced form
  name_query = name_orig.replace('_', ' ')

  # kg_score can be a float, or '' once nulls were blanked; treat anything
  # that cannot be parsed as 0 instead of aborting the whole loop
  try:
    kg_score = int(float(mp.get('kg_score') or 0))
  except (TypeError, ValueError, OverflowError):
    kg_score = 0

  kg_matches = same_person(name_query, kg_name)
  wp_matches = same_person(name_query, wp_name)

  if kg_matches and wp_matches:
    # very likely a match; a high KG score upgrades it to a supermatch
    match_status = 2 if kg_score > 100 else 1
    # default to using wp because descriptions are more appropriate/updated
    source = 'wp'
  elif kg_matches:
    # KG matched but wp failed, needs review
    source = 'kg'
    match_status = 0
  elif wp_matches:
    # only the wikipedia text matched the query, needs review
    source = 'wp'
    match_status = 0
  else:
    # no information available
    match_status = -1
    source = None

  slug = slugify.slugify(name_orig, separator='_')
  mp_bio = mp.get('kg_bio', '')
  wp_desc = mp.get('wp_description', '')
  source_url = f"http://vis-www.cs.umass.edu/lfw/person/{name_orig.replace(' ', '_')}.html"

  # start from empty values so an unmatched person never inherits the name and
  # description left over from the previous loop iteration
  mp_name = ''
  mp_description = ''
  if source == 'kg':
    # google knowledge graph
    mp_name = mp['kg_name']
    mp_description = mp.get('kg_description', '')
  elif source == 'wp':
    # wikipedia
    mp_name = mp['wp_name']
    mp_description = mp.get('wp_description', '')

  if 'disambiguation' in wp_desc.lower():
    #print(f"disambiguate: {name_orig}")
    match_status = 0  # needs review if "disambiguation" appears
    mp_name = ''
    mp_description = ''
    mp_bio = ''

  mp['source_url'] = source_url
  mp['mp_slug'] = slug
  mp['matched'] = match_status
  mp['mp_bio'] = mp_bio
  mp['mp_name'] = mp_name
  mp['mp_description'] = mp_description

In [221]:
# tally records per match status (booleans sum as 0/1)
print(f"match: {sum(x.get('matched') > 0 for x in mapped_persons)}")
print(f"review: {sum(x.get('matched') == 0 for x in mapped_persons)}")
print(f"fail: {sum(x.get('matched') == -1 for x in mapped_persons)}")

# records whose lookup flag is missing or falsy
print(f"no kg accessed: {sum(not x.get('kg_accessed', False) for x in mapped_persons)}")
print(f"no wp accessed: {sum(not x.get('wp_accessed', False) for x in mapped_persons)}")

match: 4359
review: 718
fail: 672
no kg accessed: 0
no wp accessed: 0


### Save data to CSV

In [235]:
# create dataframe for mapped persons
df_mapped_persons = pd.DataFrame.from_dict(mapped_persons)
df_mapped_persons.index.name = 'index'

In [236]:
# save
fp_out = f'/data_store_hdd/datasets/people/lfw/metadata/identity_kg.csv'
df_mapped_persons.drop(['kg_accessed', 'wp_accessed', 'kg_error', 'wp_error'], axis=1, inplace=True)
df_mapped_persons.to_csv(fp_out, encoding = 'utf-16')
# create small version
limit = 1000
fpp_out = Path(fp_out)
fp_out_sm = join(fpp_out.parent, f'{fpp_out.stem}_0_{limit}.csv')
df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons[0:limit])
df_mapped_persons_sm.index.name = 'index'
df_mapped_persons_sm.to_csv(fp_out_sm, encoding = 'utf-16')

In [237]:
# preview the first two rows of the mapped-persons frame
df_mapped_persons.head(2)

Unnamed: 0_level_0,kg_bio,kg_bio_url,kg_description,kg_id,kg_image_url,kg_name,kg_score,kg_url,matched,mp_bio,mp_description,mp_name,mp_slug,query,source,source_name,source_url,wp_description,wp_name,wp_page_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,Kim Antonie Lode Clijsters is a Belgian former...,https://en.wikipedia.org/wiki/Kim_Clijsters,Belgian tennis player,/m/01m_gh,http://t3.gstatic.com/images?q=tbn:ANd9GcQ4yRK...,Kim Clijsters,618.272705,,2,Kim Antonie Lode Clijsters is a Belgian former...,Belgian tennis player,Kim Clijsters,kim_clijsters,Kim Clijsters,lfw,Kim_Clijsters,http://vis-www.cs.umass.edu/lfw/person/Kim_Cli...,Belgian tennis player,Kim Clijsters,262793.0
1,William Rosenberg was an American entrepreneur...,https://en.wikipedia.org/wiki/William_Rosenberg,American entrepreneur,/m/07dy4z,,William Rosenberg,367.87973,,2,William Rosenberg was an American entrepreneur...,American businessman,William Rosenberg,william_rosenberg,William Rosenberg,lfw,William_Rosenberg,http://vis-www.cs.umass.edu/lfw/person/William...,American businessman,William Rosenberg,2449810.0


## Clean data

In [225]:
for mp in mapped_persons:
  # store the source name in its file-name form (underscores instead of spaces)
  mp['source_name'] = mp['source_name'].replace(' ', '_')
  # discard any 'wp_bio' entry; pop() with a default is a no-op when the key is
  # absent, so no bare except is needed to swallow the KeyError
  mp.pop('wp_bio', None)