{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Knowledge Graph Identities\n", "\n", "- convert filename-names to names\n", "- fetch Google Knowledge Graph entity IDs for each name\n", "- save KG IDs to CSV" ] }, { "cell_type": "code", "execution_count": 155, "metadata": {}, "outputs": [], "source": [ "%reload_ext autoreload\n", "%autoreload 2\n", "\n", "import os\n", "import os.path as osp\n", "from os.path import join\n", "from glob import glob\n", "from pathlib import Path\n", "import random\n", "import math\n", "from datetime import datetime\n", "import requests\n", "import json\n", "import time\n", "from pprint import pprint\n", "from multiprocessing.pool import ThreadPool\n", "import threading\n", "import urllib.request\n", "import difflib\n", "import unidecode\n", "import slugify\n", "\n", "from tqdm import tqdm_notebook as tqdm\n", "import pandas as pd\n", "from scipy.io import loadmat\n", "import numpy as np\n", "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "\n", "import sys\n", "sys.path.append('/work/megapixels_dev/megapixels')\n", "from app.utils import api_utils, identity_utils\n", "from app.settings import app_cfg\n", "from app.settings import types" ] }, { "cell_type": "code", "execution_count": 159, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/media/adam/ah8tb/work/megapixels_dev/env/google_knowledge_graph_api.env\n" ] } ], "source": [ "print(app_cfg.FP_KNOWLEDGE_GRAPH_ENV)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get List of Names" ] }, { "cell_type": "code", "execution_count": 160, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Kim Clijsters', 'William Rosenberg', 'John Brady', 'Juan Ignacio Chela', 'Floyd Keith', 'Sam Gerald', 'Imad Khadduri', 'Anna Kournikova', 'Jacques Rogge', 'Wilbert Elki Meza Majino']\n", "['Kim_Clijsters', 'William_Rosenberg', 'John_Brady', 'Juan_Ignacio_Chela', 'Floyd_Keith', 'Sam_Gerald', 'Imad_Khadduri', 'Anna_Kournikova', 'Jacques_Rogge', 'Wilbert_Elki_Meza_Majino']\n" ] } ], "source": [ "names = identity_utils.get_names(types.Dataset.LFW)\n", "print(names['names_query'][0:10])\n", "print(names['names_orig'][0:10])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Google Knowledge Graph API\n", "\n", "- about 100.000 requests per 24 hours" ] }, { "cell_type": "code", "execution_count": 161, "metadata": {}, "outputs": [], "source": [ "# read API key\n", "\n", "api_key = open(app_cfg.FP_KNOWLEDGE_GRAPH_ENV).read()\n", "kg_api = api_utils.GoogleKnowledgeGraph(api_key)\n", "wp_api = api_utils.WikipediaAPI()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Test API Access" ] }, { "cell_type": "code", "execution_count": 128, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "wp\n", "https://en.wikipedia.org/w/api.php?redirects=&ppprop=displaytitle&prop=pageprops%7Cpageimages%7Cdescription&generator=prefixsearch&action=query&format=json&piprop=thumbnail&pilimit=1&gpssearch=Vicente+Fox&gpsnamespace=0&gpslimit=1\n", "{'wp_accessed': True,\n", " 'wp_description': 'President of Mexico',\n", " 'wp_name': 'Vicente Fox',\n", " 'wp_page_id': '32836'}\n", "kg\n", "{'kg_accessed': True,\n", " 'kg_bio': 'Vicente Fox Quesada, RSerafO is a Mexican businessman and '\n", " 'politician who served as the 55th President of Mexico from 1 '\n", " 'December 2000 to 30 November 2006.\\n',\n", " 'kg_bio_url': 'https://en.wikipedia.org/wiki/Vicente_Fox',\n", " 'kg_description': 'Former President of Mexico',\n", " 'kg_error': '',\n", " 'kg_id': '/m/081f4',\n", " 'kg_image_url': 'http://t2.gstatic.com/images?q=tbn:ANd9GcQqs1Z0NhSLve9OyfdC0AHFWKWlTpHO4tCnU7dedSSz2kzCRk60',\n", " 'kg_name': 'Vicente Fox',\n", " 'kg_score': 610.987427,\n", " 'kg_url': '',\n", " 'query': 'Vicente Fox'}\n" ] } ], "source": [ "print('wp----')\n", "pprint(wp_api.get_meta({'query': 'Vicente Fox'}, verbose=True))\n", "print('kg----')\n", "pprint(kg_api.get_kg_from_name({'query':'Vicente Fox'}))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Test Name Similarity Matching" ] }, { "cell_type": "code", "execution_count": 162, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.7714285714285716\n" ] } ], "source": [ "#print(identity_utils.names_match('Andréss Iniestas', 'Andres Iniestalossas Jr.', as_float=True))\n", "#print(identity_utils.names_match('Adoor Gopalakrishnan', 'Adoors Gopalakarishnan', as_float=True))\n", "#print(identity_utils.names_match('Dave Letterman', 'David Letterman', as_float=True))\n", "print(identity_utils.names_match('Charles Dickens', 'Charles Boooker', as_float=True))\n", "#print(identity_utils.names_match('Donald Trump', 'Donald J. Trump', as_float=True))\n", "#print(identity_utils.names_match('Wang Fei', 'Fei Wang III', as_float=True))" ] }, { "cell_type": "code", "execution_count": 126, "metadata": {}, "outputs": [], "source": [ "# define thread mapping function\n", "def pool_map_persons(obj):\n", " global pbar\n", " pbar.update(1)\n", " kg_obj = kg_api.get_kg_from_name(obj)\n", " wp_obj = wp_api.get_meta(obj)\n", " person_obj = {**kg_obj, **wp_obj}\n", " return person_obj\n", "\n", "def num_non_accessed(mps):\n", " return sum(0 if (x.get('kg_accessed', False) and x.get('wp_accessed', False)) else 1 for x in mps)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load existing CSV" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "# load existing CSV\n", "fp_csv = '/data_store_hdd/datasets/people/lfw/metadata/identity_kg.csv'\n", "df = pd.read_csv(fp_csv, encoding = 'utf-16').set_index('index')\n", "# fill nulls\n", "df.fillna('', inplace = True)\n", "mapped_persons = df.to_dict('records')\n", "# add columns\n", "for mp in mapped_persons:\n", " mp['wp_error'] = ''\n", " mp['kg_error'] = ''" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get Knowledge Graph Data" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5507f5c19de746df94aa5445e3c7cf46", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "832/5749 remaining\n", "832/5749 remaining. Using 5 threads\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "411d08f873174d13a1de1f8b21f9f993", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Done. 0 remaining.\n" ] } ], "source": [ "num_threads_max = 5\n", "sleep_min = 1\n", "pbar = tqdm(total=len(mapped_persons))\n", "\n", "nna = num_non_accessed(mapped_persons)\n", "print(f'{nna}/{len(mapped_persons)} remaining')\n", "\n", "# convert to thread pool\n", "while nna > 0:\n", " num_threads = max(1, min(num_threads_max, nna))\n", " print(f'{nna}/{len(mapped_persons)} remaining. Using {num_threads} threads')\n", " pool = ThreadPool(num_threads)\n", "\n", " # start threading\n", " with tqdm(total=len(mapped_persons)) as pbar:\n", " mapped_persons = pool.map(pool_map_persons, mapped_persons)\n", "\n", " # close tqdm\n", " pbar.close()\n", "\n", " nna = num_non_accessed(mapped_persons)\n", " if nna > 0:\n", " print(f'{nna} remaining. Sleeping for {sleep_min} minutes...')\n", " time.sleep(60 * sleep_min)\n", "\n", "print(f'Done. {nna} remaining.')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Get Wikipedia API data" ] }, { "cell_type": "code", "execution_count": 220, "metadata": { "scrolled": false }, "outputs": [], "source": [ "for i, mp in enumerate(mapped_persons):\n", " kg_name = mp.get('kg_name')\n", " wp_name = mp.get('wp_name')\n", " query = mp.get('query')\n", " name_orig = mp.get('source_name')\n", " kg_score = int(mp.get('kg_score',0))\n", "\n", " kg_matches = same_person(name_orig, kg_name)\n", " wp_matches = same_person(name_orig, wp_name)\n", "\n", " if kg_matches and wp_matches and kg_score > 100:\n", " # very likely a match, confirm it\n", " match_status = 2 # supermatch\n", " # default to using wp because descriptions are more appropriate/udpated\n", " source = 'wp'\n", " elif kg_matches and wp_matches:\n", " match_status = 1\n", " # default to using wp because descriptions are more appropriate/udpated\n", " source = 'wp'\n", " elif kg_matches and not wp_matches:\n", " # if the KG score is medium-high, but wp failed, needs review\n", " source = 'kg'\n", " match_status = 0\n", " elif wp_matches and not kg_matches:\n", " # if wikipedia text matched the query, then confirm\n", " source = 'wp'\n", " match_status = 0\n", " else:\n", " # no information available\n", " match_status = -1\n", " source = None\n", " \n", " slug = slugify.slugify(name_orig, separator='_')\n", " mp_bio = mp.get('kg_bio', '')\n", " wp_desc = mp.get('wp_description', '')\n", " source_url = f\"http://vis-www.cs.umass.edu/lfw/person/{name_orig.replace(' ', '_')}.html\"\n", " \n", " if source == 'kg':\n", " # google knowledge graph\n", " mp_name = mp['kg_name']\n", " mp_description = mp.get('kg_description', '')\n", " elif source == 'wp':\n", " # wikipedia\n", " mp_name = mp['wp_name']\n", " mp_description = mp.get('wp_description', '')\n", " \n", " if 'disambiguation' in wp_desc.lower():\n", " #print(f\"disambiguate: {name_orig}\")\n", " match_status = 0 # needs review if \"disambiguation appears\"\n", " mp_name = ''\n", " mp_description = ''\n", " mp_bio = ''\n", " \n", " mp['source_url'] = source_url\n", " mp['mp_slug'] = slug\n", " mp['matched'] = match_status\n", " mp['mp_bio'] = mp_bio\n", " mp['mp_name'] = mp_name\n", " mp['mp_description'] = mp_description" ] }, { "cell_type": "code", "execution_count": 221, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "match: 4359\n", "review: 718\n", "fail: 672\n", "no kg accessed: 0\n", "no wp accessed: 0\n" ] } ], "source": [ "print(f\"match: {sum(1 if (x.get('matched') > 0) else 0 for x in mapped_persons)}\")\n", "print(f\"review: {sum(1 if (x.get('matched') == 0) else 0 for x in mapped_persons)}\")\n", "print(f\"fail: {sum(1 if (x.get('matched') == -1) else 0 for x in mapped_persons)}\")\n", "\n", "print(f\"no kg accessed: {sum(0 if (x.get('kg_accessed', False)) else 1 for x in mapped_persons)}\")\n", "print(f\"no wp accessed: {sum(0 if (x.get('wp_accessed', False)) else 1 for x in mapped_persons)}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Save data to CSV" ] }, { "cell_type": "code", "execution_count": 235, "metadata": {}, "outputs": [], "source": [ "# create dataframe for mapped persons\n", "df_mapped_persons = pd.DataFrame.from_dict(mapped_persons)\n", "df_mapped_persons.index.name = 'index'" ] }, { "cell_type": "code", "execution_count": 236, "metadata": {}, "outputs": [], "source": [ "# save\n", "fp_out = f'/data_store_hdd/datasets/people/lfw/metadata/identity_kg.csv'\n", "df_mapped_persons.drop(['kg_accessed', 'wp_accessed', 'kg_error', 'wp_error'], axis=1, inplace=True)\n", "df_mapped_persons.to_csv(fp_out, encoding = 'utf-16')\n", "# create small version\n", "limit = 1000\n", "fpp_out = Path(fp_out)\n", "fp_out_sm = join(fpp_out.parent, f'{fpp_out.stem}_0_{limit}.csv')\n", "df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons[0:limit])\n", "df_mapped_persons_sm.index.name = 'index'\n", "df_mapped_persons_sm.to_csv(fp_out_sm, encoding = 'utf-16')" ] }, { "cell_type": "code", "execution_count": 237, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
kg_biokg_bio_urlkg_descriptionkg_idkg_image_urlkg_namekg_scorekg_urlmatchedmp_biomp_descriptionmp_namemp_slugquerysourcesource_namesource_urlwp_descriptionwp_namewp_page_id
index
0Kim Antonie Lode Clijsters is a Belgian former...https://en.wikipedia.org/wiki/Kim_ClijstersBelgian tennis player/m/01m_ghhttp://t3.gstatic.com/images?q=tbn:ANd9GcQ4yRK...Kim Clijsters618.2727052Kim Antonie Lode Clijsters is a Belgian former...Belgian tennis playerKim Clijsterskim_clijstersKim ClijsterslfwKim_Clijstershttp://vis-www.cs.umass.edu/lfw/person/Kim_Cli...Belgian tennis playerKim Clijsters262793
1William Rosenberg was an American entrepreneur...https://en.wikipedia.org/wiki/William_RosenbergAmerican entrepreneur/m/07dy4zWilliam Rosenberg367.8797302William Rosenberg was an American entrepreneur...American businessmanWilliam Rosenbergwilliam_rosenbergWilliam RosenberglfwWilliam_Rosenberghttp://vis-www.cs.umass.edu/lfw/person/William...American businessmanWilliam Rosenberg2.44981e+06
\n", "
" ], "text/plain": [ " kg_bio \\\n", "index \n", "0 Kim Antonie Lode Clijsters is a Belgian former... \n", "1 William Rosenberg was an American entrepreneur... \n", "\n", " kg_bio_url kg_description \\\n", "index \n", "0 https://en.wikipedia.org/wiki/Kim_Clijsters Belgian tennis player \n", "1 https://en.wikipedia.org/wiki/William_Rosenberg American entrepreneur \n", "\n", " kg_id kg_image_url \\\n", "index \n", "0 /m/01m_gh http://t3.gstatic.com/images?q=tbn:ANd9GcQ4yRK... \n", "1 /m/07dy4z \n", "\n", " kg_name kg_score kg_url matched \\\n", "index \n", "0 Kim Clijsters 618.272705 2 \n", "1 William Rosenberg 367.879730 2 \n", "\n", " mp_bio \\\n", "index \n", "0 Kim Antonie Lode Clijsters is a Belgian former... \n", "1 William Rosenberg was an American entrepreneur... \n", "\n", " mp_description mp_name mp_slug \\\n", "index \n", "0 Belgian tennis player Kim Clijsters kim_clijsters \n", "1 American businessman William Rosenberg william_rosenberg \n", "\n", " query source source_name \\\n", "index \n", "0 Kim Clijsters lfw Kim_Clijsters \n", "1 William Rosenberg lfw William_Rosenberg \n", "\n", " source_url \\\n", "index \n", "0 http://vis-www.cs.umass.edu/lfw/person/Kim_Cli... \n", "1 http://vis-www.cs.umass.edu/lfw/person/William... \n", "\n", " wp_description wp_name wp_page_id \n", "index \n", "0 Belgian tennis player Kim Clijsters 262793 \n", "1 American businessman William Rosenberg 2.44981e+06 " ] }, "execution_count": 237, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_mapped_persons.head(2)" ] } ], "metadata": { "kernelspec": { "display_name": "megapixels", "language": "python", "name": "megapixels" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }