# UMD Faces Knowledge Graph Identities

- convert filename-style names (underscore-separated) to plain names
- fetch Google Knowledge Graph entity IDs for each name
- save KG IDs to CSV

In [78]:
%reload_ext autoreload
%autoreload 2

import os
import os.path as osp
from os.path import join
from glob import glob
from pathlib import Path
import random
import math
from datetime import datetime
import requests
import json
import time
from pprint import pprint
from multiprocessing.pool import ThreadPool
import threading
import urllib.request

from tqdm import tqdm_notebook as tqdm
import pandas as pd
from scipy.io import loadmat
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

## Load UMD Faces Filenames

In [38]:
# Read the list of identity names: one per line, words joined by underscores.
fp_filenames = '/data_store_hdd/datasets/people/umd_faces/downloads/filenames.txt'
with open(fp_filenames, 'r') as fp:
  raw_lines = fp.readlines()
_ = raw_lines.pop(0)  # discard the first line of the file
# turn "first_last\n" into "first last"
filenames = []
for raw_line in raw_lines:
  filenames.append(raw_line.replace('_', ' ').strip())

In [39]:
# sanity check: first name after underscore replacement and stripping
print(filenames[0])

aaron rodgers


## Google Knowledge Graph API

In [62]:
# Fetch the raw JSON bytes that wtfismyip.com reports for this machine (IP, location, ISP).
# NOTE(review): presumably an egress-IP / connectivity check before calling the API — confirm.
# The cell output contains the caller's IP address; consider clearing it before sharing.
urllib.request.urlopen('https://wtfismyip.com/json').read()

b'{\n   "YourFuckingIPAddress": "78.55.72.54",\n   "YourFuckingLocation": "Berlin, BE, Germany",\n   "YourFuckingHostname": "x4e374836.dyn.telefonica.de",\n   "YourFuckingISP": "O2 Deutschland",\n   "YourFuckingTorExit": "false",\n   "YourFuckingCountryCode": "DE"\n}\n'

In [25]:
# read API key
# - `with` closes the key file instead of leaving the handle to the garbage collector
# - strip() keeps a trailing newline in the key file out of the request URL
with open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key') as fp_key:
  api_key = fp_key.read().strip()
url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'

In [59]:
def _get_kg_meta(result_obj, params):
  global api_key, url_kg_api
  
  params['indent'] = True
  params['key'] = api_key
  params['limit'] = 1
  
  url = f'{url_kg_api}?{urllib.parse.urlencode(params)}'
  try:
    json_response = urllib.request.urlopen(url).read()
  except Exception as e:
    result['error'] = str(e)
  else:
    try:
      response = json.loads(json_response)
      items = response.get('itemListElement', [])
      result_obj['accessed'] = True
      if items:
        item = items[0]
        item_result = item.get('result', [])
        result_obj['description'] = item_result.get('description', '')
        det_desc = item_result.get('detailedDescription', '')
        if not result_obj['kg_id']:
          result_obj['kg_id'] = item_result.get('@id', '').replace('kg:','')
        if det_desc:
          result_obj['description_extended'] = det_desc.get('articleBody','')
          result_obj['description_license'] = det_desc.get('license','')
          result_obj['description_url'] = det_desc.get('url','')
        else:
          result_obj['description_extended'] = ''
          result_obj['description_license'] = ''
          result_obj['description_url'] = ''
        result_img = item_result.get('image', '')
        if result_img:
          result_obj['image_url'] = result_img.get('contentUrl', '')
        result_obj['name'] = item_result.get('name', '')
        result_obj['score'] = item.get('resultScore', 0.0)
        result_obj['url'] = item_result.get('url', '')
    except Exception as e:
      result_obj['error'] = str(e)
  return result_obj
  
def get_kg_from_name(obj):
  """Look up a person by the free-text name in obj['query'].

  Objects already flagged as 'accessed' are returned untouched; otherwise the
  lookup result from _get_kg_meta (the same, updated dict) is returned.
  """
  already_fetched = obj['accessed']
  if not already_fetched:
    return _get_kg_meta(obj, {'query': obj['query']})
  return obj
  
def get_kg_from_kg_id(obj):
  """Look up a person by the Knowledge Graph id stored in obj['kg_id'].

  Objects already flagged as 'accessed' are returned untouched; otherwise the
  lookup result from _get_kg_meta (the same, updated dict) is returned.
  """
  already_fetched = obj['accessed']
  if not already_fetched:
    return _get_kg_meta(obj, {'ids': obj['kg_id']})
  return obj

In [60]:
# Pretty-print the current `obj` dict.
# NOTE(review): as far as this notebook shows, `obj` is only assigned in later cells,
# so this cell raises NameError on a fresh top-to-bottom run — move it below the test cell.
pprint(obj)

{'accessed': False,
 'description': '',
 'error': '<urlopen error [Errno -2] Name or service not known>',
 'kg_id': '',
 'query': 'Taylor Swift',
 'score': 0.0,
 'url': ''}


In [63]:
# smoke test: look up one well-known name, starting from the default (empty) record
obj = dict(query='Taylor Swift', kg_id='', score=0.0, description='', url='', accessed=False)
result = get_kg_from_name(obj)  # the lookup updates `obj` in place and returns it
pprint(obj)

{'accessed': True,
 'description': 'American singer',
 'description_extended': 'Taylor Alison Swift is an American '
                         "singer-songwriter. As one of the world's leading "
                         'contemporary recording artists, she is known for '
                         'narrative songs about her personal life, which has '
                         'received widespread media coverage.\n',
 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',
 'description_url': 'https://en.wikipedia.org/wiki/Taylor_Swift',
 'image_url': 'http://t0.gstatic.com/images?q=tbn:ANd9GcST848UJ0u31E6aoQfb2nnKZFyad7rwNF0ZLOCACGpu4jnboEzV',
 'kg_id': '/m/0dl567',
 'name': 'Taylor Swift',
 'query': 'Taylor Swift',
 'score': 1241.476318,
 'url': 'http://taylorswift.com/'}


In [40]:
# define thread mapping function
def pool_map_persons(obj):
  """Thread-pool worker: tick the shared progress bar, then look up one person by name.

  Relies on a module-level `pbar`; returns whatever get_kg_from_name returns.
  """
  global pbar
  pbar.update(1)
  return get_kg_from_name(obj)

In [41]:
# build one lookup record per name, each starting from the same empty defaults
person_defaults = {'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False}
mapped_persons = []
for fn in filenames:
  obj = {'query': fn, **person_defaults}
  mapped_persons.append(obj)

In [42]:
# sanity check: number of person records and the first ten parsed names
print(len(mapped_persons))
print(filenames[0:10])

3107
['aaron rodgers', 'aaron ruell', 'aaron staton', 'abel ferrara', 'abigail klein', 'abraham benrubi', 'abyshamble', 'adabel guerrero', 'adam ant', 'adam buxton']


In [50]:
# how many records still have no parsed API response
num_non_accessed = sum(1 for person in mapped_persons if not person['accessed'])
print(num_non_accessed)

667


In [69]:
# Look up every person in parallel and repeat until each record has been accessed.
num_threads = 20
retry_wait_secs = 60*10  # pause between passes when some lookups failed

num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)

while num_non_accessed > 0:
  print(f'{num_non_accessed}/{len(mapped_persons)} remaining')

  # `pbar` has to be a module-level name because pool_map_persons updates it
  # via `global pbar`. Both context managers clean up on exit: the progress bar
  # is closed and the pool's worker threads are shut down, so repeated passes
  # do not pile up threads or stray progress widgets.
  with ThreadPool(num_threads) as pool, tqdm(total=len(mapped_persons)) as pbar:
    # already-accessed records return immediately, so only failures are retried
    mapped_persons = pool.map(pool_map_persons, mapped_persons)

  num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)
  if num_non_accessed > 0:
    print(f'{num_non_accessed} remaining. Sleeping...')
    time.sleep(retry_wait_secs)  # wait X minutes

HBox(children=(IntProgress(value=0, max=3107), HTML(value='')))

3/3107 remaining


HBox(children=(IntProgress(value=0, max=3107), HTML(value='')))

In [70]:
# spot-check the first record after the lookup pass
print(mapped_persons[0])

{'query': 'aaron rodgers', 'kg_id': '/m/04q06_', 'score': 919.404602, 'description': 'Football quarterback', 'url': '', 'accessed': True, 'description_extended': 'Aaron Charles Rodgers is an American football quarterback for the Green Bay Packers of the National Football League. Rodgers played college football for the California Golden Bears, where he set several career passing records, including lowest single-season and career interception rates. ', 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License', 'description_url': 'https://en.wikipedia.org/wiki/Aaron_Rodgers', 'image_url': 'http://t3.gstatic.com/images?q=tbn:ANd9GcTH_uiKmj_Y71Lc1kNCJK5HDiZsUSh3AxEBI9Jz_lp5q_89QZ9d', 'name': 'Aaron Rodgers'}


In [72]:
# Shorten the CC attribution string: the default string from the Google
# Knowledge Graph is too verbose. Prints how many records were changed.
cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'
cc_short = 'CC BY-SA 3.0'
nchanged = 0
for mapped_person in mapped_persons:
  if mapped_person.get('description_license', None) != cc_long:
    continue
  nchanged += 1
  mapped_person['description_license'] = cc_short
print(nchanged)

0


In [73]:
# list the kg_id of every record that was never accessed, then print how many there are
n_empty = 0
for mapped_person in mapped_persons:
  if mapped_person.get('accessed', False):
    continue
  n_empty += 1
  print(mapped_person['kg_id'])
print(n_empty)

0


In [74]:
# one row per person record; name the index so it gets an "index" header in the CSV
df_mapped_persons = pd.DataFrame(mapped_persons).rename_axis('index')

In [75]:
# check output: preview the first rows of the dataframe
df_mapped_persons.head()

Unnamed: 0_level_0,accessed,description,description_extended,description_license,description_url,image_url,kg_id,name,query,score,url
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,True,Football quarterback,Aaron Charles Rodgers is an American football ...,CC BY-SA 3.0,https://en.wikipedia.org/wiki/Aaron_Rodgers,http://t3.gstatic.com/images?q=tbn:ANd9GcTH_ui...,/m/04q06_,Aaron Rodgers,aaron rodgers,919.404602,
1,True,American director,"Derek Aaron Ruell, is an American director and...",CC BY-SA 3.0,https://en.wikipedia.org/wiki/Aaron_Ruell,http://t3.gstatic.com/images?q=tbn:ANd9GcSzGg8...,/m/05yf80,Aaron Ruell,aaron ruell,439.912476,
2,True,American actor,Aaron Staton is an American actor. He is best ...,CC BY-SA 3.0,https://en.wikipedia.org/wiki/Aaron_Staton,http://t3.gstatic.com/images?q=tbn:ANd9GcTTmBV...,/m/06_vpyq,Aaron Staton,aaron staton,500.833344,
3,True,American filmmaker,"Abel Ferrara is an American filmmaker, known f...",CC BY-SA 3.0,https://en.wikipedia.org/wiki/Abel_Ferrara,http://t2.gstatic.com/images?q=tbn:ANd9GcRAhy-...,/m/056ryy,Abel Ferrara,abel ferrara,522.177734,http://www.abelferrara.com/
4,True,Actress,,,,,/m/0pbm3jf,Abigail Klein,abigail klein,341.831482,


In [76]:
# save the full table (including the named index column) as CSV
# NOTE: the file is written as UTF-16, so readers must pass the same encoding
# (e.g. pd.read_csv(fp_out, encoding='utf-16')).
fp_out = '/data_store_hdd/datasets/people/umd_faces/metadata/identity_kg.csv'
df_mapped_persons.to_csv(fp_out, encoding = 'utf-16')

In [79]:
# write a second CSV holding only the first `limit` records,
# next to the full file and named "<stem>_0_<limit>.csv"
limit = 1000
fpp_out = Path(fp_out)
fn_out_sm = f'{fpp_out.stem}_0_{limit}.csv'
fp_out_sm = join(fpp_out.parent, fn_out_sm)
df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons[:limit])
df_mapped_persons_sm.index.name = 'index'
df_mapped_persons_sm.to_csv(fp_out_sm, encoding='utf-16')