{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Knowledge Graph Identities\n", "\n", "- convert filename-names to names\n", "- fetch Google Knowledge Graph entity IDs for each name\n", "- save KG IDs to CSV" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "%reload_ext autoreload\n", "%autoreload 2\n", "\n", "import os\n", "import os.path as osp\n", "from os.path import join\n", "from glob import glob\n", "from pathlib import Path\n", "import random\n", "import math\n", "from datetime import datetime\n", "import requests\n", "import json\n", "import time\n", "from pprint import pprint\n", "from multiprocessing.pool import ThreadPool\n", "import threading\n", "import urllib.request\n", "import difflib\n", "import unidecode\n", "import slugify\n", "\n", "from tqdm import tqdm_notebook as tqdm\n", "import pandas as pd\n", "from scipy.io import loadmat\n", "import numpy as np\n", "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "\n", "import sys\n", "sys.path.append('/work/megapixels_dev/megapixels')\n", "from app.utils import api_utils\n", "from app.settings import types" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get List of Names" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def get_names(enum_dataset):\n", " if enum_dataset == types.Dataset.LFW:\n", " dir_lfw = '/data_store_hdd/datasets/people/lfw/media/original/'\n", " names = [x.replace('_', ' ') for x in os.listdir(dir_lfw)]\n", " elif enum_dataset == types.Dataset.YOUTUBE_FACES:\n", " names = [x for x in names if 'labeled faces.txt' not in x]\n", " return names" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Kim Clijsters', 'William Rosenberg', 'John Brady', 'Juan Ignacio Chela', 'Floyd Keith', 'Sam Gerald', 'Imad Khadduri', 'Anna Kournikova', 'Jacques Rogge', 'Wilbert Elki Meza Majino']\n" ] } ], "source": [ "names = get_names(types.Dataset.LFW)\n", "print(names[0:10])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Google Knowledge Graph API" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# read API key\n", "api_key = open('/work/megapixels_dev/env/google_knowledge_graph_api.env').read()\n", "kg_api = api_utils.GoogleKnowledgeGraph(api_key)\n", "wp_api = api_utils.WikipediaAPI()" ] }, { "cell_type": "code", "execution_count": 241, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "wp\n", "{'wp_accessed': True, 'wp_description': '', 'wp_name': '', 'wp_page_id': ''}\n", "kg\n", "{'kg_accessed': True,\n", " 'kg_bio': '',\n", " 'kg_bio_url': '',\n", " 'kg_description': '',\n", " 'kg_id': '',\n", " 'kg_image_url': '',\n", " 'kg_name': '',\n", " 'kg_score': 0,\n", " 'kg_url': '',\n", " 'query': 'Jeff Dederian'}\n" ] } ], "source": [ "#wp_api.test_access()\n", "print('wp')\n", "pprint(wp_api.get_meta({'query': 'Florecita Cobian'}))\n", "print('kg')\n", "pprint(kg_api.get_kg_from_name({'query':'Jeff Dederian'}))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Test Name Similarity Matching" ] }, { "cell_type": "code", "execution_count": 242, "metadata": {}, "outputs": [], "source": [ "def same_person(query, name, sim_min=.9, word_match_min=0.75, verbose=False):\n", " if name == '':\n", " return False\n", " # check and remove if WP added parenthesis\n", " if '(' in name and ')' in name:\n", " name = name.split('(')[0]\n", " \n", " # then strip spaces and split into list\n", " query_strings = [unidecode.unidecode(x.strip().lower()) for x in query.strip().split(' ')] # query\n", " result_strings = [unidecode.unidecode(x.strip().lower()) for x in name.strip().split(' ')] # result\n", " min_str_len = min(len(result_strings), len(query_strings))\n", " # match each word in the query\n", " matched_strings = []\n", " \n", " for i in range(len(query_strings)):\n", " # for each word in the shorter text string\n", " result_strings_tmp = result_strings.copy()\n", " for j in range(len(result_strings_tmp)):\n", " a = query_strings[i]\n", " b = result_strings_tmp[j]\n", " # make a the shorter string\n", " lengths = [len(a), len(b)]\n", " min_ratio = (min(lengths) / max(lengths) * .75)\n", " ratio = difflib.SequenceMatcher(a=a.lower(), b=b.lower()).ratio()\n", " result = (ratio >= min_ratio)\n", " if verbose:\n", " print(f'comapre \"{a}\" to \"{b}\" ratio was: {ratio:.2f} min: {min_ratio:.2}, passed: {result}')\n", " if result:\n", " # remove this item from result strings\n", " matched_string = result_strings.pop(j)\n", " matched_strings.append(matched_string)\n", " break # exit loop and use shortened result string haystack\n", "\n", " matched = len(matched_strings) >= min_str_len\n", " if verbose:\n", " print(f'{matched} because {len(matched_strings)} >= {min_str_len}')\n", " return matched" ] }, { "cell_type": "code", "execution_count": 245, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(Adoor Gopalakrishnan == Adoors Gopalakarishnan ok) = True\n", "\n", "comapre \"dave\" to \"david\" ratio was: 0.67 min: 0.6, passed: True\n", "comapre \"letterman\" to \"letterman\" ratio was: 1.00 min: 0.75, passed: True\n", "True because 2 >= 2\n", "(David Letterman == Dave Letterman) = True\n", "\n", "comapre \"charles\" to \"charles\" ratio was: 1.00 min: 0.75, passed: True\n", "comapre \"dickens\" to \"booker\" ratio was: 0.31 min: 0.64, passed: False\n", "False because 1 >= 2\n", "(Charles Booker == Charles Dickens) = False\n", "\n", "comapre \"donald\" to \"don\" ratio was: 0.67 min: 0.38, passed: True\n", "comapre \"trump\" to \"j.\" ratio was: 0.00 min: 0.3, passed: False\n", "comapre \"trump\" to \"trump\" ratio was: 1.00 min: 0.75, passed: True\n", "True because 2 >= 2\n", "(Don J. Trump == Donald Trump) = True\n", "\n", "comapre \"wang\" to \"wang\" ratio was: 1.00 min: 0.75, passed: True\n", "comapre \"fei\" to \"fei\" ratio was: 1.00 min: 0.75, passed: True\n", "True because 2 >= 2\n", "(Wang Fei (female footballer) == Wang Fei) = True\n" ] } ], "source": [ "test_sim_match = True\n", "if test_sim_match:\n", " # Test name similarity search\n", " query = 'Adoors Gopalakarishnan ok'\n", " wp_name = 'Adoor Gopalakrishnan'\n", " matched = same_person(query, wp_name)\n", " print(f'({wp_name} == {query}) = {matched}')\n", " print('')\n", "\n", " query = 'Dave Letterman'\n", " wp_name = 'David Letterman'\n", " matched = same_person(query, wp_name, verbose=True)\n", " print(f'({wp_name} == {query}) = {matched}')\n", " print('')\n", "\n", " query = 'Charles Dickens'\n", " wp_name = 'Charles Booker'\n", " matched = same_person(query, wp_name, verbose=True)\n", " print(f'({wp_name} == {query}) = {matched}')\n", " print('')\n", "\n", " query = 'Donald Trump'\n", " wp_name = 'Don J. Trump'\n", " matched = same_person(query, wp_name, verbose=True)\n", " print(f'({wp_name} == {query}) = {matched}')\n", " print('')\n", " \n", " query = 'Wang Fei'\n", " kg_name = 'Faye Wong'\n", " wp_name = 'Wang Fei (female footballer)'\n", " matched = same_person(query, wp_name, verbose=True)\n", " print(f'({wp_name} == {query}) = {matched}')" ] }, { "cell_type": "code", "execution_count": 246, "metadata": {}, "outputs": [], "source": [ "# define thread mapping function\n", "def pool_map_persons(obj):\n", " global pbar\n", " pbar.update(1)\n", " kg_obj = kg_api.get_kg_from_name(obj)\n", " wp_obj = wp_api.get_meta(obj)\n", " person_obj = {**kg_obj, **wp_obj}\n", " return person_obj\n", "\n", "def num_non_accessed(mps):\n", " return sum(0 if (x.get('kg_accessed', False) and x.get('wp_accessed', False)) else 1 for x in mps)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load existing CSV" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "# load existing CSV\n", "fp_csv = '/data_store_hdd/datasets/people/lfw/metadata/identity_kg.csv'\n", "df = pd.read_csv(fp_csv, encoding = 'utf-16').set_index('index')\n", "# fill nulls\n", "df.fillna('', inplace = True)\n", "mapped_persons = df.to_dict('records')\n", "# add columns\n", "for mp in mapped_persons:\n", " mp['wp_error'] = ''\n", " mp['kg_error'] = ''" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get Knowledge Graph Data" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5507f5c19de746df94aa5445e3c7cf46", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "832/5749 remaining\n", "832/5749 remaining. Using 5 threads\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "411d08f873174d13a1de1f8b21f9f993", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Done. 0 remaining.\n" ] } ], "source": [ "num_threads_max = 5\n", "sleep_min = 1\n", "pbar = tqdm(total=len(mapped_persons))\n", "\n", "nna = num_non_accessed(mapped_persons)\n", "print(f'{nna}/{len(mapped_persons)} remaining')\n", "\n", "# convert to thread pool\n", "while nna > 0:\n", " num_threads = max(1, min(num_threads_max, nna))\n", " print(f'{nna}/{len(mapped_persons)} remaining. Using {num_threads} threads')\n", " pool = ThreadPool(num_threads)\n", "\n", " # start threading\n", " with tqdm(total=len(mapped_persons)) as pbar:\n", " mapped_persons = pool.map(pool_map_persons, mapped_persons)\n", "\n", " # close tqdm\n", " pbar.close()\n", "\n", " nna = num_non_accessed(mapped_persons)\n", " if nna > 0:\n", " print(f'{nna} remaining. Sleeping for {sleep_min} minutes...')\n", " time.sleep(60 * sleep_min)\n", "\n", "print(f'Done. {nna} remaining.')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Get Wikipedia API data" ] }, { "cell_type": "code", "execution_count": 220, "metadata": { "scrolled": false }, "outputs": [], "source": [ "for i, mp in enumerate(mapped_persons):\n", " kg_name = mp.get('kg_name')\n", " wp_name = mp.get('wp_name')\n", " query = mp.get('query')\n", " name_orig = mp.get('source_name')\n", " kg_score = int(mp.get('kg_score',0))\n", "\n", " kg_matches = same_person(name_orig, kg_name)\n", " wp_matches = same_person(name_orig, wp_name)\n", "\n", " if kg_matches and wp_matches and kg_score > 100:\n", " # very likely a match, confirm it\n", " match_status = 2 # supermatch\n", " # default to using wp because descriptions are more appropriate/udpated\n", " source = 'wp'\n", " elif kg_matches and wp_matches:\n", " match_status = 1\n", " # default to using wp because descriptions are more appropriate/udpated\n", " source = 'wp'\n", " elif kg_matches and not wp_matches:\n", " # if the KG score is medium-high, but wp failed, needs review\n", " source = 'kg'\n", " match_status = 0\n", " elif wp_matches and not kg_matches:\n", " # if wikipedia text matched the query, then confirm\n", " source = 'wp'\n", " match_status = 0\n", " else:\n", " # no information available\n", " match_status = -1\n", " source = None\n", " \n", " slug = slugify.slugify(name_orig, separator='_')\n", " mp_bio = mp.get('kg_bio', '')\n", " wp_desc = mp.get('wp_description', '')\n", " source_url = f\"http://vis-www.cs.umass.edu/lfw/person/{name_orig.replace(' ', '_')}.html\"\n", " \n", " if source == 'kg':\n", " # google knowledge graph\n", " mp_name = mp['kg_name']\n", " mp_description = mp.get('kg_description', '')\n", " elif source == 'wp':\n", " # wikipedia\n", " mp_name = mp['wp_name']\n", " mp_description = mp.get('wp_description', '')\n", " \n", " if 'disambiguation' in wp_desc.lower():\n", " #print(f\"disambiguate: {name_orig}\")\n", " match_status = 0 # needs review if \"disambiguation appears\"\n", " mp_name = ''\n", " mp_description = ''\n", " mp_bio = ''\n", " \n", " mp['source_url'] = source_url\n", " mp['mp_slug'] = slug\n", " mp['matched'] = match_status\n", " mp['mp_bio'] = mp_bio\n", " mp['mp_name'] = mp_name\n", " mp['mp_description'] = mp_description" ] }, { "cell_type": "code", "execution_count": 221, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "match: 4359\n", "review: 718\n", "fail: 672\n", "no kg accessed: 0\n", "no wp accessed: 0\n" ] } ], "source": [ "print(f\"match: {sum(1 if (x.get('matched') > 0) else 0 for x in mapped_persons)}\")\n", "print(f\"review: {sum(1 if (x.get('matched') == 0) else 0 for x in mapped_persons)}\")\n", "print(f\"fail: {sum(1 if (x.get('matched') == -1) else 0 for x in mapped_persons)}\")\n", "\n", "print(f\"no kg accessed: {sum(0 if (x.get('kg_accessed', False)) else 1 for x in mapped_persons)}\")\n", "print(f\"no wp accessed: {sum(0 if (x.get('wp_accessed', False)) else 1 for x in mapped_persons)}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Save data to CSV" ] }, { "cell_type": "code", "execution_count": 235, "metadata": {}, "outputs": [], "source": [ "# create dataframe for mapped persons\n", "df_mapped_persons = pd.DataFrame.from_dict(mapped_persons)\n", "df_mapped_persons.index.name = 'index'" ] }, { "cell_type": "code", "execution_count": 236, "metadata": {}, "outputs": [], "source": [ "# save\n", "fp_out = f'/data_store_hdd/datasets/people/lfw/metadata/identity_kg.csv'\n", "df_mapped_persons.drop(['kg_accessed', 'wp_accessed', 'kg_error', 'wp_error'], axis=1, inplace=True)\n", "df_mapped_persons.to_csv(fp_out, encoding = 'utf-16')\n", "# create small version\n", "limit = 1000\n", "fpp_out = Path(fp_out)\n", "fp_out_sm = join(fpp_out.parent, f'{fpp_out.stem}_0_{limit}.csv')\n", "df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons[0:limit])\n", "df_mapped_persons_sm.index.name = 'index'\n", "df_mapped_persons_sm.to_csv(fp_out_sm, encoding = 'utf-16')" ] }, { "cell_type": "code", "execution_count": 237, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
kg_biokg_bio_urlkg_descriptionkg_idkg_image_urlkg_namekg_scorekg_urlmatchedmp_biomp_descriptionmp_namemp_slugquerysourcesource_namesource_urlwp_descriptionwp_namewp_page_id
index
0Kim Antonie Lode Clijsters is a Belgian former...https://en.wikipedia.org/wiki/Kim_ClijstersBelgian tennis player/m/01m_ghhttp://t3.gstatic.com/images?q=tbn:ANd9GcQ4yRK...Kim Clijsters618.2727052Kim Antonie Lode Clijsters is a Belgian former...Belgian tennis playerKim Clijsterskim_clijstersKim ClijsterslfwKim_Clijstershttp://vis-www.cs.umass.edu/lfw/person/Kim_Cli...Belgian tennis playerKim Clijsters262793
1William Rosenberg was an American entrepreneur...https://en.wikipedia.org/wiki/William_RosenbergAmerican entrepreneur/m/07dy4zWilliam Rosenberg367.8797302William Rosenberg was an American entrepreneur...American businessmanWilliam Rosenbergwilliam_rosenbergWilliam RosenberglfwWilliam_Rosenberghttp://vis-www.cs.umass.edu/lfw/person/William...American businessmanWilliam Rosenberg2.44981e+06
\n", "
" ], "text/plain": [ " kg_bio \\\n", "index \n", "0 Kim Antonie Lode Clijsters is a Belgian former... \n", "1 William Rosenberg was an American entrepreneur... \n", "\n", " kg_bio_url kg_description \\\n", "index \n", "0 https://en.wikipedia.org/wiki/Kim_Clijsters Belgian tennis player \n", "1 https://en.wikipedia.org/wiki/William_Rosenberg American entrepreneur \n", "\n", " kg_id kg_image_url \\\n", "index \n", "0 /m/01m_gh http://t3.gstatic.com/images?q=tbn:ANd9GcQ4yRK... \n", "1 /m/07dy4z \n", "\n", " kg_name kg_score kg_url matched \\\n", "index \n", "0 Kim Clijsters 618.272705 2 \n", "1 William Rosenberg 367.879730 2 \n", "\n", " mp_bio \\\n", "index \n", "0 Kim Antonie Lode Clijsters is a Belgian former... \n", "1 William Rosenberg was an American entrepreneur... \n", "\n", " mp_description mp_name mp_slug \\\n", "index \n", "0 Belgian tennis player Kim Clijsters kim_clijsters \n", "1 American businessman William Rosenberg william_rosenberg \n", "\n", " query source source_name \\\n", "index \n", "0 Kim Clijsters lfw Kim_Clijsters \n", "1 William Rosenberg lfw William_Rosenberg \n", "\n", " source_url \\\n", "index \n", "0 http://vis-www.cs.umass.edu/lfw/person/Kim_Cli... \n", "1 http://vis-www.cs.umass.edu/lfw/person/William... \n", "\n", " wp_description wp_name wp_page_id \n", "index \n", "0 Belgian tennis player Kim Clijsters 262793 \n", "1 American businessman William Rosenberg 2.44981e+06 " ] }, "execution_count": 237, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_mapped_persons.head(2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Clean data" ] }, { "cell_type": "code", "execution_count": 225, "metadata": { "scrolled": false }, "outputs": [], "source": [ "for mp in mapped_persons:\n", " mp['source_name'] = mp['source_name'].replace(' ', '_')\n", "# mp['kg_description'] = mp['kg_description'].strip()\n", "# mp['kg_name'] = mp['kg_name'].strip()\n", "# mp['kg_bio_url'] = mp['kg_bio_url'].strip()\n", "# mp['kg_bio'] = mp['kg_bio'].strip()\n", "# mp['kg_url'] = mp['kg_url'].strip()\n", " \n", "# mp['wp_description'] = mp['wp_description'].strip()\n", "# mp['wp_name'] = mp['wp_name'].strip()\n", " \n", "# mp['mp_name'] = ''\n", "# mp['mp_bio'] = ''\n", "# mp['mp_description'] = ''\n", "# mp['mp_slug'] = ''\n", " \n", " #mp.setdefault('kg_description','')\n", "# if mp.get('kg_score', 0) == 0:\n", "# mp['kg_image_url'] = ''\n", "# mp['kg_bio_url'] = ''\n", "# mp['kg_id'] = ''\n", "# mp['kg_url'] = ''\n", "# mp['kg_description'] = ''\n", "# mp['kg_bio_url'] = ''\n", "# mp['kg_name'] = ''\n", "# if mp['kg_url'] == [] or mp['kg_url'] == '[]':\n", "# mp['kg_url'] = ''\n", "\n", " try:\n", " _ = mp.pop('wp_bio')\n", " except:\n", " pass" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:megapixels]", "language": "python", "name": "conda-env-megapixels-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 2 }