# PubFig83 Knowledge Graph Identities

- convert underscore-separated file names (e.g. `julia_stiles`) to plain query names (e.g. `julia stiles`)
- fetch Google Knowledge Graph entity IDs for each name
- save KG IDs to CSV

In [1]:
%reload_ext autoreload
%autoreload 2

import os
import os.path as osp
from os.path import join
from glob import glob
import random
import math
from pathlib import Path
from datetime import datetime
import requests
import json
import time
from pprint import pprint
from multiprocessing.pool import ThreadPool
import threading
import urllib.request

from tqdm import tqdm_notebook as tqdm
import pandas as pd
from scipy.io import loadmat
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

## Get List of Names

In [7]:
# NOTE: despite the `lfw` in the variable name, this path points at the PubFig83 originals
dir_lfw = '/data_store_hdd/datasets/people/pubfig83/media/original/'
# raw entry names as they appear on disk, e.g. 'julia_stiles'
names_orig = [x for x in os.listdir(dir_lfw)]
# search-query form of each name: underscores replaced by spaces
names_query = [x.replace('_', ' ') for x in names_orig]
# later cells refer to the query names as `names`; bind it here so the
# notebook does not depend on leftover kernel state (Restart & Run All)
names = names_query
print(len(names))

83


In [8]:
print(names_orig[0])

julia_stiles


## Google Knowledge Graph API

In [9]:
# read API key; the context manager closes the file, and strip() keeps a
# trailing newline in the key file from being URL-encoded into the `key` parameter
with open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key') as fp_api_key:
  api_key = fp_api_key.read().strip()
url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'

In [10]:
def _get_kg_meta(result_obj, params):
  """Query the Knowledge Graph Search endpoint and copy the first hit into `result_obj`.

  Uses the module-level `api_key` and `url_kg_api`. The request asks for a
  single result (`limit=1`).

  Args:
    result_obj: dict describing one person; it is updated in place. Fields
      written on success: 'accessed', 'description', 'kg_id' (only when it is
      currently empty), 'description_extended', 'description_license',
      'description_url', 'image_url', 'name', 'score', 'url'.
    params: dict of extra query parameters (e.g. {'query': ...} or
      {'ids': ...}). It is copied, not modified.

  Returns:
    The same `result_obj`. If the request or the parsing fails, the message is
    stored under 'error'. A failed request leaves 'accessed' untouched so the
    caller can retry.
  """
  # build the query on a copy so the caller's dict is left untouched
  query = dict(params, indent=True, key=api_key, limit=1)
  url = f'{url_kg_api}?{urllib.parse.urlencode(query)}'

  try:
    # context manager closes the HTTP response once the body is read
    with urllib.request.urlopen(url) as response_fp:
      json_response = response_fp.read()
  except Exception as e:
    # record the failure on THIS record (the original wrote to an unrelated name)
    result_obj['error'] = str(e)
    return result_obj

  try:
    response = json.loads(json_response)
    items = response.get('itemListElement', [])
    result_obj['accessed'] = True
    # a successful fetch supersedes any error left over from an earlier attempt
    result_obj.pop('error', None)
    if items:
      item = items[0]
      # default to a dict so the .get() calls below work when 'result' is absent
      item_result = item.get('result', {})
      result_obj['description'] = item_result.get('description', '')
      # keep a kg_id the caller already supplied; only fill it in when empty
      if not result_obj.get('kg_id'):
        result_obj['kg_id'] = item_result.get('@id', '').replace('kg:','')
      # missing detailedDescription -> all three derived fields become ''
      det_desc = item_result.get('detailedDescription') or {}
      result_obj['description_extended'] = det_desc.get('articleBody','')
      result_obj['description_license'] = det_desc.get('license','')
      result_obj['description_url'] = det_desc.get('url','')
      # always set image_url so every fetched record carries the same keys
      result_img = item_result.get('image') or {}
      result_obj['image_url'] = result_img.get('contentUrl', '')
      result_obj['name'] = item_result.get('name', '')
      result_obj['score'] = item.get('resultScore', 0.0)
      result_obj['url'] = item_result.get('url', '')
  except Exception as e:
    result_obj['error'] = str(e)
  return result_obj
  
def get_kg_from_name(obj):
  """Look up `obj` by its 'query' text unless it is already marked 'accessed'.

  Returns `obj` unchanged when obj['accessed'] is truthy; otherwise returns
  the result of `_get_kg_meta` called with a {'query': ...} parameter dict.
  """
  already_fetched = obj['accessed']
  if not already_fetched:
    return _get_kg_meta(obj, dict(query=obj['query']))
  return obj
  
def get_kg_from_kg_id(obj):
  """Look up `obj` by its 'kg_id' unless it is already marked 'accessed'.

  Returns `obj` unchanged when obj['accessed'] is truthy; otherwise returns
  the result of `_get_kg_meta` called with an {'ids': ...} parameter dict.
  """
  already_fetched = obj['accessed']
  if not already_fetched:
    return _get_kg_meta(obj, dict(ids=obj['kg_id']))
  return obj

In [11]:
# smoke test: fetch one person by name and inspect the record that was filled in
q = 'Adoor Gopalakrishnan'
obj = dict(query=q, kg_id='', score=0.0, description='', url='', accessed=False)  # default record
result = get_kg_from_name(obj)
pprint(obj)

{'accessed': True,
 'description': 'Indian film director',
 'description_extended': 'Adoor Gopalakrishnan is an Indian film director, '
                         'script writer, and producer. Adoor Gopalakrishnan '
                         'had a major role in revolutioning Malayalam cinema '
                         'during the 1970s and is regarded as one of the most '
                         'notable filmmakers of India. ',
 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',
 'description_url': 'https://en.wikipedia.org/wiki/Adoor_Gopalakrishnan',
 'image_url': 'http://t2.gstatic.com/images?q=tbn:ANd9GcQA-_aEYy_goHLhGJjmn558S1VEwcALB98m83I9HwUTV_gUsded',
 'kg_id': '/m/07s7wk',
 'name': 'Adoor Gopalakrishnan',
 'query': 'Adoor Gopalakrishnan',
 'score': 501.590881,
 'url': 'http://www.adoorgopalakrishnan.com'}


In [9]:
# define thread mapping function
def pool_map_persons(obj):
  """Thread-pool worker: tick the shared progress bar, then look up one record by name.

  Reads the module-level `pbar` and returns whatever `get_kg_from_name`
  returns for `obj`.
  """
  global pbar
  pbar.update(1)
  return get_kg_from_name(obj)

In [10]:
# build mapped_person objects: one default, not-yet-accessed record per query name.
# iterates names_query (defined in the "Get List of Names" cell) so this cell
# does not rely on a `names` variable left over in the kernel
mapped_persons = [
  {'query': fn, 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False}
  for fn in names_query
]

In [11]:
print(len(mapped_persons))
# show the first queries straight from the records rather than from a separate
# `names` variable that may not exist on a fresh kernel
print([x['query'] for x in mapped_persons[0:10]])

83
['julia stiles', 'orlando bloom', 'adam sandler', 'victoria beckham', 'martha stewart', 'george clooney', 'steve carell', 'jennifer lopez', 'harrison ford', 'jessica alba']


In [13]:
num_threads = 5
max_rounds = 20     # safety cap: a permanently failing request must not loop forever
sleep_seconds = 60  # pause between retry rounds

num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)

# keep re-running the pool until every record has been accessed (or the cap is hit);
# records already accessed are returned immediately by get_kg_from_name
n_rounds = 0
while num_non_accessed > 0 and n_rounds < max_rounds:
  n_rounds += 1
  print(f'{num_non_accessed}/{len(mapped_persons)} remaining')

  # `pbar` has to be a module-level name because pool_map_persons updates it via
  # `global pbar`; both context managers clean up the bar and the worker threads
  with tqdm(total=len(mapped_persons)) as pbar, ThreadPool(num_threads) as pool:
    mapped_persons = pool.map(pool_map_persons, mapped_persons)

  num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)
  if num_non_accessed > 0 and n_rounds < max_rounds:
    print(f'{num_non_accessed} remaining. Sleeping...')
    time.sleep(sleep_seconds)

if num_non_accessed > 0:
  print(f'giving up after {n_rounds} rounds: {num_non_accessed} still not accessed')

HBox(children=(IntProgress(value=0, max=83), HTML(value='')))

12/83 remaining


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))

9 remaining. Sleeping...
9/83 remaining


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))


6 remaining. Sleeping...
6/83 remaining


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))


5 remaining. Sleeping...
5/83 remaining


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))


2 remaining. Sleeping...
2/83 remaining


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))




### Clean data

In [16]:
# reduce CC attribution string. the default string from Google Knowledge Graph is too verbose
cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'
cc_short = 'CC BY-SA 3.0'
nchanged = 0
for mapped_person in mapped_persons:
  # compare directly instead of binding a variable called `license`,
  # which would shadow the builtin of the same name
  if mapped_person.get('description_license', None) == cc_long:
    nchanged += 1
    mapped_person['description_license'] = cc_short
# NOTE: re-running this cell reports 0 because the strings are already shortened
print(f'updated CC license: {nchanged}')

# find number not accessed
n_empty = 0
for mapped_person in mapped_persons:
  if not mapped_person.get('accessed', False):
    n_empty += 1
    # kg_id is empty for records that were never fetched, so fall back to the query text
    print(mapped_person.get('kg_id') or mapped_person.get('query'))
print(f'items w/o KG meta: {n_empty}')

updated CC license: 0
items w/o KG meta: 0


### Create dataframe

In [17]:
# one row per mapped person; the row index is labelled 'index'
df_mapped_persons = pd.DataFrame(mapped_persons)
df_mapped_persons.index.name = 'index'

In [19]:
# preview the first two rows
df_mapped_persons.head(n=2)

Unnamed: 0_level_0,accessed,description,description_extended,description_license,description_url,image_url,kg_id,name,query,score,url
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,True,American actress,Julia O'Hara Stiles is an American actress. Bo...,CC BY-SA 3.0,https://en.wikipedia.org/wiki/Julia_Stiles,http://t1.gstatic.com/images?q=tbn:ANd9GcToFqB...,/m/02jtjz,Julia Stiles,julia stiles,637.113647,http://www.juliastilesblog.com
1,True,Actor,Orlando Jonathan Blanchard Bloom is an English...,CC BY-SA 3.0,https://en.wikipedia.org/wiki/Orlando_Bloom,http://t0.gstatic.com/images?q=tbn:ANd9GcQ2eYc...,/m/09wj5,Orlando Bloom,orlando bloom,689.364319,


In [20]:
# write the full table to CSV (UTF-16 encoded)
fp_out = '/data_store_hdd/datasets/people/pubfig83/metadata/identity_kg.csv'
df_mapped_persons.to_csv(
  fp_out,
  encoding='utf-16',
)

In [23]:
# write a truncated copy (first `limit` records) next to the full CSV,
# named '<stem>_0_<limit>.csv'
limit = 1000
fpp_out = Path(fp_out)
fp_out_sm = str(fpp_out.parent / f'{fpp_out.stem}_0_{limit}.csv')
df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons[:limit])
df_mapped_persons_sm.index.name = 'index'
df_mapped_persons_sm.to_csv(fp_out_sm, encoding='utf-16')