# Knowledge Graph MS Celeb

In [1]:
import os
import os.path as osp
from os.path import join
from glob import glob
import random
import math
import time
from datetime import datetime

import requests

import json
import urllib
from multiprocessing.pool import ThreadPool
import threading
from urllib.request import urlopen
import urllib.request

import pandas as pd
from scipy.io import loadmat
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm
%reload_ext autoreload
%autoreload 2
import sys
sys.path.append('/work/megapixels_dev/megapixels/')
from app.utils import file_utils, im_utils

## Load Metadata

In [2]:
# Folder holding the original MS-Celeb media; each entry name is turned into a
# Knowledge Graph id by rewriting 'm.' as '/m/' (presumably entries look like
# 'm.0abc' -- verify against the folder contents).
dir_msceleb = '/data_store_hdd/datasets/people/msceleb/media/original/'
kg_ids_msceleb = [
  entry_name.replace('m.', '/m/')
  for entry_name in os.listdir(dir_msceleb)
]

In [3]:
# Load the Knowledge Graph API key from a local file kept outside the notebook.
# The file is closed deterministically, and surrounding whitespace is stripped so
# a trailing newline in the key file is not URL-encoded into every request.
with open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key') as fp_api_key:
  api_key = fp_api_key.read().strip()
# Endpoint of the Knowledge Graph Entity Search API
url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'

In [50]:
def get_kg_meta(obj, url):
  """Placeholder: this helper was declared without a body.

  The original empty `def` was a syntax error that prevented the whole cell
  from compiling. Until it is implemented, calling it fails loudly.

  Args:
    obj: unused for now (presumably a person record dict -- TODO confirm).
    url: unused for now (presumably a fully built request URL -- TODO confirm).

  Raises:
    NotImplementedError: always.
  """
  raise NotImplementedError('get_kg_meta is not implemented')

def get_kg_from_name(obj):
  """Placeholder: this helper was declared without a body.

  The original empty `def` was a syntax error that prevented the whole cell
  from compiling. Until it is implemented, calling it fails loudly.

  Args:
    obj: unused for now (presumably a person record dict -- TODO confirm).

  Raises:
    NotImplementedError: always.
  """
  raise NotImplementedError('get_kg_from_name is not implemented')

def get_kg_from_kg_id(obj):
  """Fetch Knowledge Graph metadata for one entity id.

  Reads the module-level `api_key` and `url_kg_api`.

  Args:
    obj: dict with a 'kg_id' entry. It may also carry an 'accessed' flag from a
      previous pass; when that flag is truthy the dict is returned unchanged,
      so the function can be mapped again over earlier results to retry only
      the records that failed.

  Returns:
    dict with 'kg_id', 'score', 'description', 'url' and 'accessed'. When the
    API returns an item, 'name', 'description_extended', 'description_license'
    and 'description_url' are filled in too, and 'image_url' is added only if
    the item has an image. 'error' holds the exception text if the request or
    the parsing failed.
  """
  # A missing flag means "not fetched yet" (fresh records only carry 'kg_id').
  if obj.get('accessed'):
    return obj
  kg_id = obj['kg_id']
  params = {
      'ids': kg_id,
      'limit': 1,
      'indent': True,
      'key': api_key,
      }
  url = f'{url_kg_api}?{urllib.parse.urlencode(params)}'
  result = {'kg_id': kg_id, 'score': 0.0, 'description': '', 'url':'', 'accessed': False}  # default
  try:
    # urlopen raises HTTPError for error statuses (e.g. 503 service unavailable),
    # so 'accessed' stays False and the record can be retried on a later pass.
    with urllib.request.urlopen(url) as http_response:
      json_response = http_response.read()
  except Exception as e:
    result['error'] = str(e)
    return result
  try:
    response = json.loads(json_response)
    items = response.get('itemListElement', [])
    # The API answered; an empty item list still counts as accessed.
    result['accessed'] = True
    if items:
      item = items[0]
      # Fall back to empty dicts so the .get() lookups below stay valid.
      item_result = item.get('result') or {}
      det_desc = item_result.get('detailedDescription') or {}
      result['description'] = item_result.get('description', '')
      result['description_extended'] = det_desc.get('articleBody', '')
      result['description_license'] = det_desc.get('license', '')
      result['description_url'] = det_desc.get('url', '')
      result_img = item_result.get('image')
      if result_img:
        result['image_url'] = result_img.get('contentUrl', '')
      result['name'] = item_result.get('name', '')
      result['score'] = item.get('resultScore', 0.0)
      result['url'] = item_result.get('url', '')
  except Exception as e:
    result['error'] = str(e)
  return result

In [5]:
# Number of worker threads used for the API lookups
opt_threads = 10
# One record per identity, carrying only its Knowledge Graph id
unmapped_persons = [dict(kg_id=kg_id) for kg_id in kg_ids_msceleb]
# Progress bar sized to the number of identities
pbar = tqdm(total=len(unmapped_persons))

In [None]:
# define thread mapping function
def pool_map_persons(obj):
  """Thread-pool worker: look up one person record and advance the progress bar.

  Args:
    obj: person record dict, passed straight to `get_kg_from_kg_id`.

  Returns:
    The dict returned by `get_kg_from_kg_id` for this record.
  """
  global pbar
  # The lookup function defined in this notebook is get_kg_from_kg_id;
  # the previously called get_kg_from_kg_obj does not exist.
  kg_obj = get_kg_from_kg_id(obj)
  # Tick after the lookup so the bar reflects finished work.
  pbar.update(1)
  return kg_obj

In [47]:
#mapped_persons_bkup = mapped_persons.copy()

In [85]:
# Pick the input list: on a fresh kernel only `unmapped_persons` exists; on a
# re-run, feed the previous results back in so only failed records are retried.
try:
  persons_todo = mapped_persons
except NameError:
  persons_todo = unmapped_persons

# Run the lookups on a thread pool. Both context managers clean up on exit, so
# the worker threads are released and the progress bar is closed.
with ThreadPool(opt_threads) as pool, tqdm(total=len(persons_todo)) as pbar:
  mapped_persons = pool.map(pool_map_persons, persons_todo)

HBox(children=(IntProgress(value=0, max=93418), HTML(value='')))

In [71]:
# Sanity check: number of records returned by the mapping run
len(mapped_persons)

93418

In [77]:
# Peek at the last three records; a negative slice works for any dataset size
mapped_persons[-3:]

[{'kg_id': '/m/0dlnwb0', 'score': 14.806737, 'description': 'American internet celebrity', 'url': '', 'accessed': True, 'description_extended': 'Keenan Cahill is an American Internet celebrity from Chicago, Illinois who lip-syncs to popular songs on YouTube.\nCahill launched his first famous lipsynced YouTube video on August 28, 2010 on the Katy Perry song Teenage Dream. ', 'description_license': 'CC BY-SA 3.0', 'description_url': 'https://en.wikipedia.org/wiki/Keenan_Cahill', 'name': 'Keenan Cahill'}, {'kg_id': '/m/047rtd1', 'score': 12.298853, 'description': 'Canadian film actor', 'url': '', 'accessed': True, 'description_extended': '', 'description_license': '', 'description_url': '', 'name': 'Nicholas Elia'}, {'kg_id': '/m/04j9rz9', 'score': 11.539564, 'description': 'Investor', 'url': '', 'accessed': True, 'description_extended': 'Nick Leslau is an English commercial property investor, with an estimated fortune in the Sunday Times Rich List of £350 million. Leslau is Chairman and 

In [86]:
# Replace the long Wikipedia licence URL with its short label, in place,
# and report how many records were changed.
cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'
cc_short = 'CC BY-SA 3.0'
nchanged = 0
for mapped_person in mapped_persons:
  if mapped_person.get('description_license') != cc_long:
    continue
  mapped_person['description_license'] = cc_short
  nchanged += 1
print(nchanged)

5


In [87]:
# List the ids of records whose lookup never succeeded, then print the count
n_empty = 0
for mapped_person in mapped_persons:
  if mapped_person.get('accessed', False):
    continue
  n_empty += 1
  print(mapped_person['kg_id'])
print(n_empty)

0


In [88]:
# Build the full identity table (one row per record, index column named
# 'index') and write it out as a UTF-16 CSV.
fp_mapped_persons = '/data_store_hdd/datasets/people/msceleb/metadata/identity_kg.csv'
df_mapped_persons = pd.DataFrame(mapped_persons).rename_axis('index')
df_mapped_persons.to_csv(fp_mapped_persons, encoding='utf-16')

In [40]:
# Preview the first rows of the identity table
df_mapped_persons.head()

In [89]:
# Write a truncated copy holding only the first `limit` records,
# in the same layout as the full CSV.
limit = 1000
fp_mapped_persons_sm = f'/data_store_hdd/datasets/people/msceleb/metadata/identity_kg_0_{limit}.csv'
df_mapped_persons_sm = pd.DataFrame(mapped_persons[:limit]).rename_axis('index')
df_mapped_persons_sm.to_csv(fp_mapped_persons_sm, encoding='utf-16')

In [84]:
#a = get_kg_from_kg_id({'kg_id': '/m/03c2nqz', 'accessed': False})
#print(a)

{'kg_id': '/m/03c2nqz', 'score': 14.279573, 'description': 'Brazilian soccer player', 'url': '', 'accessed': True, 'description_extended': 'Cleiton Ribeiro Xavier is a Brazilian professional footballer who plays as an attacking midfielder for Vitória. He is known by his powerful and accurate free kicks, dribbling skills and passes.', 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License', 'description_url': 'https://en.wikipedia.org/wiki/Cleiton_Xavier', 'image_url': 'http://t3.gstatic.com/images?q=tbn:ANd9GcSPzkNDBjtWX3f_oov7vOTlTxBNFrfIqEaIwJR26AsLfsBbP8H9', 'name': 'Cleiton Xavier'}
