{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Knowledge Graph Identities\n", "\n", "- convert filename-names to names\n", "- fetch Google Knowledge Graph entity IDs for each name\n", "- save KG IDs to CSV" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "%reload_ext autoreload\n", "%autoreload 2\n", "\n", "import os\n", "import os.path as osp\n", "from os.path import join\n", "from glob import glob\n", "from pathlib import Path\n", "import random\n", "import math\n", "from datetime import datetime\n", "import requests\n", "import json\n", "import time\n", "from pprint import pprint\n", "from multiprocessing.pool import ThreadPool\n", "import threading\n", "import urllib.request\n", "import difflib\n", "import unidecode\n", "import slugify\n", "\n", "from tqdm import tqdm_notebook as tqdm\n", "import pandas as pd\n", "from scipy.io import loadmat\n", "import numpy as np\n", "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "\n", "import sys\n", "sys.path.append('/work/megapixels_dev/megapixels')\n", "from app.utils import api_utils\n", "from app.settings import types" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get List of Names" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def get_names(enum_dataset):\n", " if enum_dataset == types.Dataset.LFW:\n", " dir_lfw = '/data_store_hdd/datasets/people/lfw/media/original/'\n", " names = [x.replace('_', ' ') for x in os.listdir(dir_lfw)]\n", " elif enum_dataset == types.Dataset.YOUTUBE_FACES:\n", " names = [x for x in names if 'labeled faces.txt' not in x]\n", " return names" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Kim Clijsters', 'William Rosenberg', 'John Brady', 'Juan Ignacio Chela', 'Floyd Keith', 'Sam Gerald', 'Imad Khadduri', 'Anna Kournikova', 'Jacques Rogge', 'Wilbert Elki Meza Majino']\n" ] } ], "source": [ "names = get_names(types.Dataset.LFW)\n", "print(names[0:10])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Google Knowledge Graph API" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# read API key\n", "api_key = open('/work/megapixels_dev/env/google_knowledge_graph_api.env').read()\n", "kg_api = api_utils.GoogleKnowledgeGraph(api_key)\n", "wp_api = api_utils.WikipediaAPI()" ] }, { "cell_type": "code", "execution_count": 241, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "wp\n", "{'wp_accessed': True, 'wp_description': '', 'wp_name': '', 'wp_page_id': ''}\n", "kg\n", "{'kg_accessed': True,\n", " 'kg_bio': '',\n", " 'kg_bio_url': '',\n", " 'kg_description': '',\n", " 'kg_id': '',\n", " 'kg_image_url': '',\n", " 'kg_name': '',\n", " 'kg_score': 0,\n", " 'kg_url': '',\n", " 'query': 'Jeff Dederian'}\n" ] } ], "source": [ "#wp_api.test_access()\n", "print('wp')\n", "pprint(wp_api.get_meta({'query': 'Florecita Cobian'}))\n", "print('kg')\n", "pprint(kg_api.get_kg_from_name({'query':'Jeff Dederian'}))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Test Name Similarity Matching" ] }, { "cell_type": "code", "execution_count": 242, "metadata": {}, "outputs": [], "source": [ "def same_person(query, name, sim_min=.9, word_match_min=0.75, verbose=False):\n", " if name == '':\n", " return False\n", " # check and remove if WP added parenthesis\n", " if '(' in name and ')' in name:\n", " name = name.split('(')[0]\n", " \n", " # then strip spaces and split into list\n", " query_strings = [unidecode.unidecode(x.strip().lower()) for x in query.strip().split(' ')] # query\n", " result_strings = [unidecode.unidecode(x.strip().lower()) for x in name.strip().split(' ')] # result\n", " min_str_len = min(len(result_strings), len(query_strings))\n", " # match each word in the query\n", " matched_strings = []\n", " \n", " for i in range(len(query_strings)):\n", " # for each word in the shorter text string\n", " result_strings_tmp = result_strings.copy()\n", " for j in range(len(result_strings_tmp)):\n", " a = query_strings[i]\n", " b = result_strings_tmp[j]\n", " # make a the shorter string\n", " lengths = [len(a), len(b)]\n", " min_ratio = (min(lengths) / max(lengths) * .75)\n", " ratio = difflib.SequenceMatcher(a=a.lower(), b=b.lower()).ratio()\n", " result = (ratio >= min_ratio)\n", " if verbose:\n", " print(f'comapre \"{a}\" to \"{b}\" ratio was: {ratio:.2f} min: {min_ratio:.2}, passed: {result}')\n", " if result:\n", " # remove this item from result strings\n", " matched_string = result_strings.pop(j)\n", " matched_strings.append(matched_string)\n", " break # exit loop and use shortened result string haystack\n", "\n", " matched = len(matched_strings) >= min_str_len\n", " if verbose:\n", " print(f'{matched} because {len(matched_strings)} >= {min_str_len}')\n", " return matched" ] }, { "cell_type": "code", "execution_count": 245, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(Adoor Gopalakrishnan == Adoors Gopalakarishnan ok) = True\n", "\n", "comapre \"dave\" to \"david\" ratio was: 0.67 min: 0.6, passed: True\n", "comapre \"letterman\" to \"letterman\" ratio was: 1.00 min: 0.75, passed: True\n", "True because 2 >= 2\n", "(David Letterman == Dave Letterman) = True\n", "\n", "comapre \"charles\" to \"charles\" ratio was: 1.00 min: 0.75, passed: True\n", "comapre \"dickens\" to \"booker\" ratio was: 0.31 min: 0.64, passed: False\n", "False because 1 >= 2\n", "(Charles Booker == Charles Dickens) = False\n", "\n", "comapre \"donald\" to \"don\" ratio was: 0.67 min: 0.38, passed: True\n", "comapre \"trump\" to \"j.\" ratio was: 0.00 min: 0.3, passed: False\n", "comapre \"trump\" to \"trump\" ratio was: 1.00 min: 0.75, passed: True\n", "True because 2 >= 2\n", "(Don J. Trump == Donald Trump) = True\n", "\n", "comapre \"wang\" to \"wang\" ratio was: 1.00 min: 0.75, passed: True\n", "comapre \"fei\" to \"fei\" ratio was: 1.00 min: 0.75, passed: True\n", "True because 2 >= 2\n", "(Wang Fei (female footballer) == Wang Fei) = True\n" ] } ], "source": [ "test_sim_match = True\n", "if test_sim_match:\n", " # Test name similarity search\n", " query = 'Adoors Gopalakarishnan ok'\n", " wp_name = 'Adoor Gopalakrishnan'\n", " matched = same_person(query, wp_name)\n", " print(f'({wp_name} == {query}) = {matched}')\n", " print('')\n", "\n", " query = 'Dave Letterman'\n", " wp_name = 'David Letterman'\n", " matched = same_person(query, wp_name, verbose=True)\n", " print(f'({wp_name} == {query}) = {matched}')\n", " print('')\n", "\n", " query = 'Charles Dickens'\n", " wp_name = 'Charles Booker'\n", " matched = same_person(query, wp_name, verbose=True)\n", " print(f'({wp_name} == {query}) = {matched}')\n", " print('')\n", "\n", " query = 'Donald Trump'\n", " wp_name = 'Don J. Trump'\n", " matched = same_person(query, wp_name, verbose=True)\n", " print(f'({wp_name} == {query}) = {matched}')\n", " print('')\n", " \n", " query = 'Wang Fei'\n", " kg_name = 'Faye Wong'\n", " wp_name = 'Wang Fei (female footballer)'\n", " matched = same_person(query, wp_name, verbose=True)\n", " print(f'({wp_name} == {query}) = {matched}')" ] }, { "cell_type": "code", "execution_count": 246, "metadata": {}, "outputs": [], "source": [ "# define thread mapping function\n", "def pool_map_persons(obj):\n", " global pbar\n", " pbar.update(1)\n", " kg_obj = kg_api.get_kg_from_name(obj)\n", " wp_obj = wp_api.get_meta(obj)\n", " person_obj = {**kg_obj, **wp_obj}\n", " return person_obj\n", "\n", "def num_non_accessed(mps):\n", " return sum(0 if (x.get('kg_accessed', False) and x.get('wp_accessed', False)) else 1 for x in mps)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load existing CSV" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "# load existing CSV\n", "fp_csv = '/data_store_hdd/datasets/people/lfw/metadata/identity_kg.csv'\n", "df = pd.read_csv(fp_csv, encoding = 'utf-16').set_index('index')\n", "# fill nulls\n", "df.fillna('', inplace = True)\n", "mapped_persons = df.to_dict('records')\n", "# add columns\n", "for mp in mapped_persons:\n", " mp['wp_error'] = ''\n", " mp['kg_error'] = ''" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get Knowledge Graph Data" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5507f5c19de746df94aa5445e3c7cf46", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "832/5749 remaining\n", "832/5749 remaining. Using 5 threads\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "411d08f873174d13a1de1f8b21f9f993", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Done. 0 remaining.\n" ] } ], "source": [ "num_threads_max = 5\n", "sleep_min = 1\n", "pbar = tqdm(total=len(mapped_persons))\n", "\n", "nna = num_non_accessed(mapped_persons)\n", "print(f'{nna}/{len(mapped_persons)} remaining')\n", "\n", "# convert to thread pool\n", "while nna > 0:\n", " num_threads = max(1, min(num_threads_max, nna))\n", " print(f'{nna}/{len(mapped_persons)} remaining. Using {num_threads} threads')\n", " pool = ThreadPool(num_threads)\n", "\n", " # start threading\n", " with tqdm(total=len(mapped_persons)) as pbar:\n", " mapped_persons = pool.map(pool_map_persons, mapped_persons)\n", "\n", " # close tqdm\n", " pbar.close()\n", "\n", " nna = num_non_accessed(mapped_persons)\n", " if nna > 0:\n", " print(f'{nna} remaining. Sleeping for {sleep_min} minutes...')\n", " time.sleep(60 * sleep_min)\n", "\n", "print(f'Done. {nna} remaining.')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Get Wikipedia API data" ] }, { "cell_type": "code", "execution_count": 220, "metadata": { "scrolled": false }, "outputs": [], "source": [ "for i, mp in enumerate(mapped_persons):\n", " kg_name = mp.get('kg_name')\n", " wp_name = mp.get('wp_name')\n", " query = mp.get('query')\n", " name_orig = mp.get('source_name')\n", " kg_score = int(mp.get('kg_score',0))\n", "\n", " kg_matches = same_person(name_orig, kg_name)\n", " wp_matches = same_person(name_orig, wp_name)\n", "\n", " if kg_matches and wp_matches and kg_score > 100:\n", " # very likely a match, confirm it\n", " match_status = 2 # supermatch\n", " # default to using wp because descriptions are more appropriate/udpated\n", " source = 'wp'\n", " elif kg_matches and wp_matches:\n", " match_status = 1\n", " # default to using wp because descriptions are more appropriate/udpated\n", " source = 'wp'\n", " elif kg_matches and not wp_matches:\n", " # if the KG score is medium-high, but wp failed, needs review\n", " source = 'kg'\n", " match_status = 0\n", " elif wp_matches and not kg_matches:\n", " # if wikipedia text matched the query, then confirm\n", " source = 'wp'\n", " match_status = 0\n", " else:\n", " # no information available\n", " match_status = -1\n", " source = None\n", " \n", " slug = slugify.slugify(name_orig, separator='_')\n", " mp_bio = mp.get('kg_bio', '')\n", " wp_desc = mp.get('wp_description', '')\n", " source_url = f\"http://vis-www.cs.umass.edu/lfw/person/{name_orig.replace(' ', '_')}.html\"\n", " \n", " if source == 'kg':\n", " # google knowledge graph\n", " mp_name = mp['kg_name']\n", " mp_description = mp.get('kg_description', '')\n", " elif source == 'wp':\n", " # wikipedia\n", " mp_name = mp['wp_name']\n", " mp_description = mp.get('wp_description', '')\n", " \n", " if 'disambiguation' in wp_desc.lower():\n", " #print(f\"disambiguate: {name_orig}\")\n", " match_status = 0 # needs review if \"disambiguation appears\"\n", " mp_name = ''\n", " mp_description = ''\n", " mp_bio = ''\n", " \n", " mp['source_url'] = source_url\n", " mp['mp_slug'] = slug\n", " mp['matched'] = match_status\n", " mp['mp_bio'] = mp_bio\n", " mp['mp_name'] = mp_name\n", " mp['mp_description'] = mp_description" ] }, { "cell_type": "code", "execution_count": 221, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "match: 4359\n", "review: 718\n", "fail: 672\n", "no kg accessed: 0\n", "no wp accessed: 0\n" ] } ], "source": [ "print(f\"match: {sum(1 if (x.get('matched') > 0) else 0 for x in mapped_persons)}\")\n", "print(f\"review: {sum(1 if (x.get('matched') == 0) else 0 for x in mapped_persons)}\")\n", "print(f\"fail: {sum(1 if (x.get('matched') == -1) else 0 for x in mapped_persons)}\")\n", "\n", "print(f\"no kg accessed: {sum(0 if (x.get('kg_accessed', False)) else 1 for x in mapped_persons)}\")\n", "print(f\"no wp accessed: {sum(0 if (x.get('wp_accessed', False)) else 1 for x in mapped_persons)}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Save data to CSV" ] }, { "cell_type": "code", "execution_count": 235, "metadata": {}, "outputs": [], "source": [ "# create dataframe for mapped persons\n", "df_mapped_persons = pd.DataFrame.from_dict(mapped_persons)\n", "df_mapped_persons.index.name = 'index'" ] }, { "cell_type": "code", "execution_count": 236, "metadata": {}, "outputs": [], "source": [ "# save\n", "fp_out = f'/data_store_hdd/datasets/people/lfw/metadata/identity_kg.csv'\n", "df_mapped_persons.drop(['kg_accessed', 'wp_accessed', 'kg_error', 'wp_error'], axis=1, inplace=True)\n", "df_mapped_persons.to_csv(fp_out, encoding = 'utf-16')\n", "# create small version\n", "limit = 1000\n", "fpp_out = Path(fp_out)\n", "fp_out_sm = join(fpp_out.parent, f'{fpp_out.stem}_0_{limit}.csv')\n", "df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons[0:limit])\n", "df_mapped_persons_sm.index.name = 'index'\n", "df_mapped_persons_sm.to_csv(fp_out_sm, encoding = 'utf-16')" ] }, { "cell_type": "code", "execution_count": 237, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | kg_bio | \n", "kg_bio_url | \n", "kg_description | \n", "kg_id | \n", "kg_image_url | \n", "kg_name | \n", "kg_score | \n", "kg_url | \n", "matched | \n", "mp_bio | \n", "mp_description | \n", "mp_name | \n", "mp_slug | \n", "query | \n", "source | \n", "source_name | \n", "source_url | \n", "wp_description | \n", "wp_name | \n", "wp_page_id | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| index | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
| 0 | \n", "Kim Antonie Lode Clijsters is a Belgian former... | \n", "https://en.wikipedia.org/wiki/Kim_Clijsters | \n", "Belgian tennis player | \n", "/m/01m_gh | \n", "http://t3.gstatic.com/images?q=tbn:ANd9GcQ4yRK... | \n", "Kim Clijsters | \n", "618.272705 | \n", "\n", " | 2 | \n", "Kim Antonie Lode Clijsters is a Belgian former... | \n", "Belgian tennis player | \n", "Kim Clijsters | \n", "kim_clijsters | \n", "Kim Clijsters | \n", "lfw | \n", "Kim_Clijsters | \n", "http://vis-www.cs.umass.edu/lfw/person/Kim_Cli... | \n", "Belgian tennis player | \n", "Kim Clijsters | \n", "262793 | \n", "
| 1 | \n", "William Rosenberg was an American entrepreneur... | \n", "https://en.wikipedia.org/wiki/William_Rosenberg | \n", "American entrepreneur | \n", "/m/07dy4z | \n", "\n", " | William Rosenberg | \n", "367.879730 | \n", "\n", " | 2 | \n", "William Rosenberg was an American entrepreneur... | \n", "American businessman | \n", "William Rosenberg | \n", "william_rosenberg | \n", "William Rosenberg | \n", "lfw | \n", "William_Rosenberg | \n", "http://vis-www.cs.umass.edu/lfw/person/William... | \n", "American businessman | \n", "William Rosenberg | \n", "2.44981e+06 | \n", "