{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Knowledge Graph Identities\n", "\n", "- convert filename-names to names\n", "- fetch Google Knowledge Graph entity IDs for each name\n", "- save KG IDs to CSV" ] }, { "cell_type": "code", "execution_count": 155, "metadata": {}, "outputs": [], "source": [ "%reload_ext autoreload\n", "%autoreload 2\n", "\n", "import os\n", "import os.path as osp\n", "from os.path import join\n", "from glob import glob\n", "from pathlib import Path\n", "import random\n", "import math\n", "from datetime import datetime\n", "import requests\n", "import json\n", "import time\n", "from pprint import pprint\n", "from multiprocessing.pool import ThreadPool\n", "import threading\n", "import urllib.request\n", "import difflib\n", "import unidecode\n", "import slugify\n", "\n", "from tqdm import tqdm_notebook as tqdm\n", "import pandas as pd\n", "from scipy.io import loadmat\n", "import numpy as np\n", "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "\n", "import sys\n", "sys.path.append('/work/megapixels_dev/megapixels')\n", "from app.utils import api_utils, identity_utils\n", "from app.settings import app_cfg\n", "from app.settings import types" ] }, { "cell_type": "code", "execution_count": 159, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/media/adam/ah8tb/work/megapixels_dev/env/google_knowledge_graph_api.env\n" ] } ], "source": [ "print(app_cfg.FP_KNOWLEDGE_GRAPH_ENV)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get List of Names" ] }, { "cell_type": "code", "execution_count": 160, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Kim Clijsters', 'William Rosenberg', 'John Brady', 'Juan Ignacio Chela', 'Floyd Keith', 'Sam Gerald', 'Imad Khadduri', 'Anna Kournikova', 'Jacques Rogge', 'Wilbert Elki Meza Majino']\n", "['Kim_Clijsters', 'William_Rosenberg', 'John_Brady', 'Juan_Ignacio_Chela', 'Floyd_Keith', 'Sam_Gerald', 'Imad_Khadduri', 'Anna_Kournikova', 'Jacques_Rogge', 'Wilbert_Elki_Meza_Majino']\n" ] } ], "source": [ "names = identity_utils.get_names(types.Dataset.LFW)\n", "print(names['names_query'][0:10])\n", "print(names['names_orig'][0:10])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Google Knowledge Graph API\n", "\n", "- about 100.000 requests per 24 hours" ] }, { "cell_type": "code", "execution_count": 161, "metadata": {}, "outputs": [], "source": [ "# read API key\n", "\n", "api_key = open(app_cfg.FP_KNOWLEDGE_GRAPH_ENV).read()\n", "kg_api = api_utils.GoogleKnowledgeGraph(api_key)\n", "wp_api = api_utils.WikipediaAPI()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Test API Access" ] }, { "cell_type": "code", "execution_count": 128, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "wp\n", "https://en.wikipedia.org/w/api.php?redirects=&ppprop=displaytitle&prop=pageprops%7Cpageimages%7Cdescription&generator=prefixsearch&action=query&format=json&piprop=thumbnail&pilimit=1&gpssearch=Vicente+Fox&gpsnamespace=0&gpslimit=1\n", "{'wp_accessed': True,\n", " 'wp_description': 'President of Mexico',\n", " 'wp_name': 'Vicente Fox',\n", " 'wp_page_id': '32836'}\n", "kg\n", "{'kg_accessed': True,\n", " 'kg_bio': 'Vicente Fox Quesada, RSerafO is a Mexican businessman and '\n", " 'politician who served as the 55th President of Mexico from 1 '\n", " 'December 2000 to 30 November 2006.\\n',\n", " 'kg_bio_url': 'https://en.wikipedia.org/wiki/Vicente_Fox',\n", " 'kg_description': 'Former President of Mexico',\n", " 'kg_error': '',\n", " 'kg_id': '/m/081f4',\n", " 'kg_image_url': 'http://t2.gstatic.com/images?q=tbn:ANd9GcQqs1Z0NhSLve9OyfdC0AHFWKWlTpHO4tCnU7dedSSz2kzCRk60',\n", " 'kg_name': 'Vicente Fox',\n", " 'kg_score': 610.987427,\n", " 'kg_url': '',\n", " 'query': 'Vicente Fox'}\n" ] } ], "source": [ "print('wp----')\n", "pprint(wp_api.get_meta({'query': 'Vicente Fox'}, verbose=True))\n", "print('kg----')\n", "pprint(kg_api.get_kg_from_name({'query':'Vicente Fox'}))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Test Name Similarity Matching" ] }, { "cell_type": "code", "execution_count": 162, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.7714285714285716\n" ] } ], "source": [ "#print(identity_utils.names_match('Andréss Iniestas', 'Andres Iniestalossas Jr.', as_float=True))\n", "#print(identity_utils.names_match('Adoor Gopalakrishnan', 'Adoors Gopalakarishnan', as_float=True))\n", "#print(identity_utils.names_match('Dave Letterman', 'David Letterman', as_float=True))\n", "print(identity_utils.names_match('Charles Dickens', 'Charles Boooker', as_float=True))\n", "#print(identity_utils.names_match('Donald Trump', 'Donald J. Trump', as_float=True))\n", "#print(identity_utils.names_match('Wang Fei', 'Fei Wang III', as_float=True))" ] }, { "cell_type": "code", "execution_count": 126, "metadata": {}, "outputs": [], "source": [ "# define thread mapping function\n", "def pool_map_persons(obj):\n", " global pbar\n", " pbar.update(1)\n", " kg_obj = kg_api.get_kg_from_name(obj)\n", " wp_obj = wp_api.get_meta(obj)\n", " person_obj = {**kg_obj, **wp_obj}\n", " return person_obj\n", "\n", "def num_non_accessed(mps):\n", " return sum(0 if (x.get('kg_accessed', False) and x.get('wp_accessed', False)) else 1 for x in mps)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load existing CSV" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "# load existing CSV\n", "fp_csv = '/data_store_hdd/datasets/people/lfw/metadata/identity_kg.csv'\n", "df = pd.read_csv(fp_csv, encoding = 'utf-16').set_index('index')\n", "# fill nulls\n", "df.fillna('', inplace = True)\n", "mapped_persons = df.to_dict('records')\n", "# add columns\n", "for mp in mapped_persons:\n", " mp['wp_error'] = ''\n", " mp['kg_error'] = ''" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get Knowledge Graph Data" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5507f5c19de746df94aa5445e3c7cf46", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "832/5749 remaining\n", "832/5749 remaining. Using 5 threads\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "411d08f873174d13a1de1f8b21f9f993", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Done. 0 remaining.\n" ] } ], "source": [ "num_threads_max = 5\n", "sleep_min = 1\n", "pbar = tqdm(total=len(mapped_persons))\n", "\n", "nna = num_non_accessed(mapped_persons)\n", "print(f'{nna}/{len(mapped_persons)} remaining')\n", "\n", "# convert to thread pool\n", "while nna > 0:\n", " num_threads = max(1, min(num_threads_max, nna))\n", " print(f'{nna}/{len(mapped_persons)} remaining. Using {num_threads} threads')\n", " pool = ThreadPool(num_threads)\n", "\n", " # start threading\n", " with tqdm(total=len(mapped_persons)) as pbar:\n", " mapped_persons = pool.map(pool_map_persons, mapped_persons)\n", "\n", " # close tqdm\n", " pbar.close()\n", "\n", " nna = num_non_accessed(mapped_persons)\n", " if nna > 0:\n", " print(f'{nna} remaining. Sleeping for {sleep_min} minutes...')\n", " time.sleep(60 * sleep_min)\n", "\n", "print(f'Done. {nna} remaining.')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Get Wikipedia API data" ] }, { "cell_type": "code", "execution_count": 220, "metadata": { "scrolled": false }, "outputs": [], "source": [ "for i, mp in enumerate(mapped_persons):\n", " kg_name = mp.get('kg_name')\n", " wp_name = mp.get('wp_name')\n", " query = mp.get('query')\n", " name_orig = mp.get('source_name')\n", " kg_score = int(mp.get('kg_score',0))\n", "\n", " kg_matches = same_person(name_orig, kg_name)\n", " wp_matches = same_person(name_orig, wp_name)\n", "\n", " if kg_matches and wp_matches and kg_score > 100:\n", " # very likely a match, confirm it\n", " match_status = 2 # supermatch\n", " # default to using wp because descriptions are more appropriate/udpated\n", " source = 'wp'\n", " elif kg_matches and wp_matches:\n", " match_status = 1\n", " # default to using wp because descriptions are more appropriate/udpated\n", " source = 'wp'\n", " elif kg_matches and not wp_matches:\n", " # if the KG score is medium-high, but wp failed, needs review\n", " source = 'kg'\n", " match_status = 0\n", " elif wp_matches and not kg_matches:\n", " # if wikipedia text matched the query, then confirm\n", " source = 'wp'\n", " match_status = 0\n", " else:\n", " # no information available\n", " match_status = -1\n", " source = None\n", " \n", " slug = slugify.slugify(name_orig, separator='_')\n", " mp_bio = mp.get('kg_bio', '')\n", " wp_desc = mp.get('wp_description', '')\n", " source_url = f\"http://vis-www.cs.umass.edu/lfw/person/{name_orig.replace(' ', '_')}.html\"\n", " \n", " if source == 'kg':\n", " # google knowledge graph\n", " mp_name = mp['kg_name']\n", " mp_description = mp.get('kg_description', '')\n", " elif source == 'wp':\n", " # wikipedia\n", " mp_name = mp['wp_name']\n", " mp_description = mp.get('wp_description', '')\n", " \n", " if 'disambiguation' in wp_desc.lower():\n", " #print(f\"disambiguate: {name_orig}\")\n", " match_status = 0 # needs review if \"disambiguation appears\"\n", " mp_name = ''\n", " mp_description = ''\n", " mp_bio = ''\n", " \n", " mp['source_url'] = source_url\n", " mp['mp_slug'] = slug\n", " mp['matched'] = match_status\n", " mp['mp_bio'] = mp_bio\n", " mp['mp_name'] = mp_name\n", " mp['mp_description'] = mp_description" ] }, { "cell_type": "code", "execution_count": 221, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "match: 4359\n", "review: 718\n", "fail: 672\n", "no kg accessed: 0\n", "no wp accessed: 0\n" ] } ], "source": [ "print(f\"match: {sum(1 if (x.get('matched') > 0) else 0 for x in mapped_persons)}\")\n", "print(f\"review: {sum(1 if (x.get('matched') == 0) else 0 for x in mapped_persons)}\")\n", "print(f\"fail: {sum(1 if (x.get('matched') == -1) else 0 for x in mapped_persons)}\")\n", "\n", "print(f\"no kg accessed: {sum(0 if (x.get('kg_accessed', False)) else 1 for x in mapped_persons)}\")\n", "print(f\"no wp accessed: {sum(0 if (x.get('wp_accessed', False)) else 1 for x in mapped_persons)}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Save data to CSV" ] }, { "cell_type": "code", "execution_count": 235, "metadata": {}, "outputs": [], "source": [ "# create dataframe for mapped persons\n", "df_mapped_persons = pd.DataFrame.from_dict(mapped_persons)\n", "df_mapped_persons.index.name = 'index'" ] }, { "cell_type": "code", "execution_count": 236, "metadata": {}, "outputs": [], "source": [ "# save\n", "fp_out = f'/data_store_hdd/datasets/people/lfw/metadata/identity_kg.csv'\n", "df_mapped_persons.drop(['kg_accessed', 'wp_accessed', 'kg_error', 'wp_error'], axis=1, inplace=True)\n", "df_mapped_persons.to_csv(fp_out, encoding = 'utf-16')\n", "# create small version\n", "limit = 1000\n", "fpp_out = Path(fp_out)\n", "fp_out_sm = join(fpp_out.parent, f'{fpp_out.stem}_0_{limit}.csv')\n", "df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons[0:limit])\n", "df_mapped_persons_sm.index.name = 'index'\n", "df_mapped_persons_sm.to_csv(fp_out_sm, encoding = 'utf-16')" ] }, { "cell_type": "code", "execution_count": 237, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | kg_bio | \n", "kg_bio_url | \n", "kg_description | \n", "kg_id | \n", "kg_image_url | \n", "kg_name | \n", "kg_score | \n", "kg_url | \n", "matched | \n", "mp_bio | \n", "mp_description | \n", "mp_name | \n", "mp_slug | \n", "query | \n", "source | \n", "source_name | \n", "source_url | \n", "wp_description | \n", "wp_name | \n", "wp_page_id | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| index | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
| 0 | \n", "Kim Antonie Lode Clijsters is a Belgian former... | \n", "https://en.wikipedia.org/wiki/Kim_Clijsters | \n", "Belgian tennis player | \n", "/m/01m_gh | \n", "http://t3.gstatic.com/images?q=tbn:ANd9GcQ4yRK... | \n", "Kim Clijsters | \n", "618.272705 | \n", "\n", " | 2 | \n", "Kim Antonie Lode Clijsters is a Belgian former... | \n", "Belgian tennis player | \n", "Kim Clijsters | \n", "kim_clijsters | \n", "Kim Clijsters | \n", "lfw | \n", "Kim_Clijsters | \n", "http://vis-www.cs.umass.edu/lfw/person/Kim_Cli... | \n", "Belgian tennis player | \n", "Kim Clijsters | \n", "262793 | \n", "
| 1 | \n", "William Rosenberg was an American entrepreneur... | \n", "https://en.wikipedia.org/wiki/William_Rosenberg | \n", "American entrepreneur | \n", "/m/07dy4z | \n", "\n", " | William Rosenberg | \n", "367.879730 | \n", "\n", " | 2 | \n", "William Rosenberg was an American entrepreneur... | \n", "American businessman | \n", "William Rosenberg | \n", "william_rosenberg | \n", "William Rosenberg | \n", "lfw | \n", "William_Rosenberg | \n", "http://vis-www.cs.umass.edu/lfw/person/William... | \n", "American businessman | \n", "William Rosenberg | \n", "2.44981e+06 | \n", "