# IMDB-WIKI Knowledge Graph

- Convert person names to Google Knowledge Graph entity IDs
- The `imdb.mat` file contains only full names; we need Knowledge Graph IDs of the form `/m/12345`

In [87]:
%reload_ext autoreload
%autoreload 2

import os
import os.path as osp
from os.path import join
from glob import glob
from pathlib import Path
import random
import math
from datetime import datetime
import requests
import json
import time
from pprint import pprint
from multiprocessing.pool import ThreadPool
import threading
import urllib.request

from tqdm import tqdm_notebook as tqdm
import pandas as pd
from scipy.io import loadmat
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

## Load IMDB Metadata

In [13]:
# IMDB metadata CSV; the `index` column becomes the dataframe index
fp_meta_imdb = '/data_store_hdd/datasets/people/imdb_wiki/metadata/imdb_mat.csv'
df_meta_imdb = pd.read_csv(fp_meta_imdb)
df_meta_imdb = df_meta_imdb.set_index('index')
# last expression of the cell: display the first two rows
df_meta_imdb.head(2)

Unnamed: 0_level_0,celeb_id,dob,filepath,gender,name,x1,x2,y1,y2,year_photo
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,6488,1900-5-11,01/nm0000001_rm124825600_1899-5-10_1968.jpg,m,Fred Astaire,1072.926,1214.784,161.838,303.696,1968
1,6488,1900-5-11,01/nm0000001_rm3343756032_1899-5-10_1970.jpg,m,Fred Astaire,477.184,622.592,100.352,245.76,1970


## Google Knowledge Graph API

In [14]:
# read API key
# the context manager closes the file handle; strip() removes a trailing
# newline/whitespace that would otherwise be URL-encoded into the `key` parameter
with open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key') as fp_api_key:
  api_key = fp_api_key.read().strip()
# endpoint of the Knowledge Graph Search API used by the lookup helpers
url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'

In [45]:
def _get_kg_meta(result_obj, params):
  """Query the Knowledge Graph Search API and copy the top hit into `result_obj`.

  Args:
    result_obj: dict describing one person; it is updated in place. Once a
      response has been parsed, `accessed` is set to True (also when no
      entity matched). When an entity is returned, `description`,
      `description_extended`, `description_license`, `description_url`,
      `name`, `score`, `url` and (only if the entity has an image)
      `image_url` are filled in. `kg_id` is written only when it is
      missing or empty.
    params: dict of query-string parameters selecting the lookup (the
      callers in this file pass `query` or `ids`). It is not modified.

  Returns:
    The same `result_obj`. Errors from the HTTP request or from parsing the
    response are not raised; their text is stored in `result_obj['error']`.
    A failed request leaves `accessed` unchanged so the record can be retried.
  """
  global api_key, url_kg_api

  # work on a copy so the caller's dict is left untouched
  query_params = dict(params)
  query_params['indent'] = True
  query_params['key'] = api_key
  query_params['limit'] = 1  # only the top-ranked entity is used below

  url = f'{url_kg_api}?{urllib.parse.urlencode(query_params)}'
  try:
    # the context manager closes the HTTP response once it has been read
    with urllib.request.urlopen(url) as http_response:
      json_response = http_response.read()
  except Exception as e:
    # record the failure on the object being processed (this used to write to
    # an unrelated name `result`, losing the error or raising NameError)
    result_obj['error'] = str(e)
    return result_obj

  try:
    response = json.loads(json_response)
    items = response.get('itemListElement', [])
    result_obj['accessed'] = True
    if items:
      item = items[0]
      # `or {}` keeps the .get() calls below valid when a field is absent or null
      item_result = item.get('result') or {}
      result_obj['description'] = item_result.get('description', '')
      det_desc = item_result.get('detailedDescription') or {}
      # never overwrite an ID supplied by the caller
      if not result_obj.get('kg_id'):
        result_obj['kg_id'] = item_result.get('@id', '').replace('kg:', '')
      result_obj['description_extended'] = det_desc.get('articleBody', '')
      result_obj['description_license'] = det_desc.get('license', '')
      result_obj['description_url'] = det_desc.get('url', '')
      result_img = item_result.get('image') or {}
      if result_img:
        result_obj['image_url'] = result_img.get('contentUrl', '')
      result_obj['name'] = item_result.get('name', '')
      result_obj['score'] = item.get('resultScore', 0.0)
      result_obj['url'] = item_result.get('url', '')
  except Exception as e:
    result_obj['error'] = str(e)
  return result_obj
  
def get_kg_from_name(obj):
  """Fill `obj` from a Knowledge Graph search for the name in `obj['query']`.

  Records whose `accessed` flag is already truthy are returned unchanged
  without issuing a request.
  """
  already_fetched = obj['accessed']
  if not already_fetched:
    obj = _get_kg_meta(obj, {'query': obj['query']})
  return obj
  
def get_kg_from_kg_id(obj):
  """Fill `obj` from a Knowledge Graph lookup of the ID in `obj['kg_id']`.

  Records whose `accessed` flag is already truthy are returned unchanged
  without issuing a request.
  """
  already_fetched = obj['accessed']
  if not already_fetched:
    obj = _get_kg_meta(obj, {'ids': obj['kg_id']})
  return obj

In [46]:
# smoke test: one lookup by name to confirm the API key and endpoint work
obj = {
  'query': 'Taylor Swift',
  'kg_id': '',
  'score': 0.0,
  'description': '',
  'url': '',
  'accessed': False,
}  # default
result = get_kg_from_name(obj)
# the record is updated in place, so printing `obj` shows the fetched fields
pprint(obj)

{'accessed': True,
 'description': 'American singer',
 'description_extended': 'Taylor Alison Swift is an American '
                         "singer-songwriter. As one of the world's leading "
                         'contemporary recording artists, she is known for '
                         'narrative songs about her personal life, which has '
                         'received widespread media coverage.\n',
 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',
 'description_url': 'https://en.wikipedia.org/wiki/Taylor_Swift',
 'image_url': 'http://t0.gstatic.com/images?q=tbn:ANd9GcST848UJ0u31E6aoQfb2nnKZFyad7rwNF0ZLOCACGpu4jnboEzV',
 'kg_id': '/m/0dl567',
 'name': 'Taylor Swift',
 'query': 'Taylor Swift',
 'score': 1241.476318,
 'url': 'http://taylorswift.com/'}


In [None]:
# smoke test: look an entity up by its Knowledge Graph ID instead of by name
# (the helper is named get_kg_from_kg_id and reads obj['kg_id'], so the record needs an ID)
obj = {'query': 'Taylor Swift', 'kg_id': '/m/0dl567', 'score': 0.0, 'description': '', 'url':'', 'accessed': False}  # default
result = get_kg_from_kg_id(obj)
pprint(obj)

In [49]:
# build one placeholder record per distinct name in the IMDB metadata
count = 0  # NOTE(review): not read by any cell visible here
df_person_groups = df_meta_imdb.groupby('name')
# only the group key (the person's name) is needed; the per-name rows are ignored
mapped_persons = [
  {'query': group_name, 'kg_id': '', 'score': 0.0, 'description': '', 'url': '', 'accessed': False}
  for group_name, _ in df_person_groups
]

In [55]:
# worker function handed to the thread pool
def pool_map_persons(obj):
  """Advance the shared progress bar, then resolve one person record by name."""
  global pbar
  # the bar is ticked before the lookup starts, for every record passed in
  pbar.update(1)
  return get_kg_from_name(obj)

In [79]:
num_threads = 2

num_non_accessed = sum(1 for x in mapped_persons if not x['accessed'])
print(f'{num_non_accessed}/{len(mapped_persons)} remaining')

# repeat full passes until every record is marked accessed; records that are
# already accessed are returned immediately by get_kg_from_name
while num_non_accessed > 0:
  # `pbar` has to be this module-level name because pool_map_persons updates it
  # via `global pbar`. Both context managers clean up after each pass (worker
  # threads and progress bar), so nothing accumulates across retries.
  with ThreadPool(num_threads) as pool, tqdm(total=len(mapped_persons)) as pbar:
    mapped_persons = pool.map(pool_map_persons, mapped_persons)

  num_non_accessed = sum(1 for x in mapped_persons if not x['accessed'])
  if num_non_accessed > 0:
    print(f'{num_non_accessed}/{len(mapped_persons)} remaining. Sleeping...')
    time.sleep(60*20)  # wait 20 minutes before retrying the failed lookups

HBox(children=(IntProgress(value=0, max=20284), HTML(value='')))

0/20284 remaining


In [80]:
# test output for a person: print the first record of mapped_persons
print(mapped_persons[0])

{'query': "'Lee' George Quinones", 'kg_id': '/m/08hvx1', 'score': 280.322754, 'description': 'Artist', 'url': 'http://www.leequinones.com/', 'accessed': True, 'description_extended': 'George Lee Quiñones is a Puerto Rican artist and actor. He is one of several artists to gain fame from the New York City Subway graffiti movement.\n', 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License', 'description_url': 'https://en.wikipedia.org/wiki/Lee_Qui%C3%B1ones', 'name': 'Lee Quiñones'}


In [82]:
# reduce CC attribution string. the default string from Google Knowledge Graph is too verbose
cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'
cc_short = 'CC BY-SA 3.0'
nchanged = 0
for mapped_person in mapped_persons:
  # compare directly instead of binding a variable named `license`, which
  # would shadow the interpreter's built-in of the same name
  if mapped_person.get('description_license') != cc_long:
    continue
  mapped_person['description_license'] = cc_short
  nchanged += 1
# number of records rewritten by this run (0 when the cell has already been run)
print(nchanged)

0


In [83]:
# find number not accessed
not_accessed = [p for p in mapped_persons if not p.get('accessed', False)]
for mapped_person in not_accessed:
  # kg_id is still empty for a name lookup that never completed, so report the
  # query string (and any recorded error) to make the entry identifiable
  print(mapped_person.get('query', ''), mapped_person.get('error', ''))
n_empty = len(not_accessed)
print(n_empty)

0


In [84]:
# one row per person record; columns are the record keys
df_mapped_persons = pd.DataFrame(mapped_persons)
# name the index so it is written out under an `index` header
df_mapped_persons.index.name = 'index'

In [None]:
# check output: preview the first rows of the mapped-persons dataframe
df_mapped_persons.head()

In [85]:
# write the full identity table to disk; note the file is encoded as UTF-16
fp_out = '/data_store_hdd/datasets/people/imdb_wiki/metadata/identity_kg.csv'
df_mapped_persons.to_csv(fp_out, encoding='utf-16')

In [88]:
# write a truncated copy holding only the first `limit` records
limit = 1000
fpp_out = Path(fp_out)
# same directory as the full file, with `_0_<limit>` appended to the stem
fp_out_sm = str(fpp_out.parent / f'{fpp_out.stem}_0_{limit}.csv')
df_mapped_persons_sm = pd.DataFrame(mapped_persons[:limit])
df_mapped_persons_sm.index.name = 'index'
df_mapped_persons_sm.to_csv(fp_out_sm, encoding='utf-16')