summaryrefslogtreecommitdiff
path: root/megapixels/notebooks/datasets/identity
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/notebooks/datasets/identity')
-rw-r--r--megapixels/notebooks/datasets/identity/identity_master.ipynb633
-rw-r--r--megapixels/notebooks/datasets/identity/identity_testing.ipynb50
2 files changed, 649 insertions, 34 deletions
diff --git a/megapixels/notebooks/datasets/identity/identity_master.ipynb b/megapixels/notebooks/datasets/identity/identity_master.ipynb
new file mode 100644
index 00000000..a48a7ba1
--- /dev/null
+++ b/megapixels/notebooks/datasets/identity/identity_master.ipynb
@@ -0,0 +1,633 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Identity Master List\n",
+ "\n",
+ "- start with MS Celeb Top1M\n",
+ "- then progressively add smaller datasets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%reload_ext autoreload\n",
+ "%autoreload 2\n",
+ "\n",
+ "import os\n",
+ "from os.path import join\n",
+ "from glob import glob\n",
+ "from pathlib import Path\n",
+ "import requests\n",
+ "import json\n",
+ "from pprint import pprint\n",
+ "from multiprocessing.pool import ThreadPool\n",
+ "import threading\n",
+ "import urllib.request\n",
+ "import difflib\n",
+ "import unidecode\n",
+ "\n",
+ "import slugify\n",
+ "from tqdm import tqdm_notebook as tqdm\n",
+ "import pandas as pd\n",
+ "from scipy.io import loadmat\n",
+ "import numpy as np\n",
+ "%matplotlib inline\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "import sys\n",
+ "sys.path.append('/work/megapixels_dev/megapixels')\n",
+ "from app.utils import api_utils, identity_utils\n",
+ "from app.settings import app_cfg\n",
+ "from app.settings import types"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Output path of the master identity CSV (default target of save_identity_master)\n",
+ "fp_master_identities = '/data_store_hdd/apps/megapixels/metadata/identities_master.csv'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## MS Celeb Top 1M\n",
+ "\n",
+ "- add column for each spelling of name\n",
+ "- convert kg id to standard google format"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# MS Celeb Top1M name list: tab-separated, one (id_kg, name) row per spelling of a name\n",
+ "fp_msceleb_top1m = '/data_store_hdd/datasets/people/msceleb/downloads/Top1M_MidList.Name.tsv'\n",
+ "# quoting=3 is csv.QUOTE_NONE: names can contain literal double-quote characters, and with\n",
+ "# the default quote handling an unbalanced quote swallows the line break and merges two rows\n",
+ "# (it showed up downstream as a column named 'ms_name_hu' + CRLF + 'm.03zytg' + TAB + name)\n",
+ "df_msceleb_top1m = pd.read_csv(fp_msceleb_top1m, delimiter='\\t', header=None, encoding='utf-8',\n",
+ "                               names=['id_kg', 'name'], quoting=3)\n",
+ "# one group per knowledge graph id, holding all spellings of that identity\n",
+ "df_msceleb_top1m_groups = df_msceleb_top1m.groupby('id_kg')\n",
+ "n_groups = df_msceleb_top1m_groups.ngroups\n",
+ "print(f'{n_groups} groups')\n",
+ "df_msceleb_top1m.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 110,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# one dict per TSV row, with the keys 'id_kg' and 'name'\n",
+ "mseleb_top1m_records = df_msceleb_top1m.to_dict('records')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 106,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#df_msceleb_top1m.head(100)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Regional language tag -> tag to file the name under\n",
+ "# (not referenced by any other cell of this notebook yet)\n",
+ "abbrev_mappings = {\n",
+ "    'en-US': 'en',\n",
+ "    'en-GB': 'en',\n",
+ "    'es-419': 'es-419',\n",
+ "    # this entry was a bare 'es' key without a value (SyntaxError);\n",
+ "    # an identity mapping is assumed -- TODO confirm\n",
+ "    'es': 'es',\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# id_kg -> {lang: name}; reset again at the top of the cell that fills it\n",
+ "msceleb_identities = {}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 120,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def split_name_lang(name_lang):\n",
+ "    '''Separate a 'name@lang' string at its last '@'.\n",
+ "\n",
+ "    Returns a dict with the keys 'name' and 'lang'. When the input has\n",
+ "    no '@', 'name' is the whole input and 'lang' is ''.\n",
+ "    '''\n",
+ "    if '@' not in name_lang:\n",
+ "        return {'name': name_lang, 'lang': ''}\n",
+ "    # cut at the right-most '@' so a name that contains '@' itself stays whole\n",
+ "    head, _, tail = name_lang.rpartition('@')\n",
+ "    return {'name': head, 'lang': tail}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 122,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'name': 'r@destiny', 'lang': 'en-417'}"
+ ]
+ },
+ "execution_count": 122,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# check: only the last '@' separates the language tag, the name keeps its own '@'\n",
+ "split_name_lang('r@destiny@en')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 141,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "0120e006a7564f5c82729a7050ef0386",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=3481186), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# collect every spelling per identity: id_kg -> {lang: name}\n",
+ "msceleb_identities = {}\n",
+ "for mseleb_top1m_record in tqdm(mseleb_top1m_records):\n",
+ "    id_kg = mseleb_top1m_record['id_kg']\n",
+ "    names_by_lang = msceleb_identities.setdefault(id_kg, {})\n",
+ "    name_lang = split_name_lang(mseleb_top1m_record['name'])\n",
+ "    name, lang = name_lang['name'], name_lang['lang']\n",
+ "    # a later row with the same language tag replaces the earlier spelling\n",
+ "    names_by_lang[lang] = name"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 142,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import itertools\n",
+ "# small sample (the first 10 identities) for experimenting\n",
+ "first_items = itertools.islice(msceleb_identities.items(), 10)\n",
+ "msceleb_identities_sm = dict(first_items)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 145,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Patrick Cummins en\n",
+ "Patrick Cummins pt\n",
+ "Mohamed Guessous en\n",
+ "Mohamed Guessous fr\n",
+ "محمد جسوس ar\n",
+ "Tsvetta Kaleynska en\n",
+ "Tsvetta Kaleynska es\n",
+ "Tsvetta Kaleynska fr\n",
+ "Цвета Калейнска bg\n",
+ "Цвета Калейнска ru\n",
+ "Caio Henrique Siqueira Sanchez en\n",
+ "Кајо Санчез sr\n",
+ "Julio Ríos Gallego ca\n",
+ "Julio Ríos Gallego en\n",
+ "Julio Ríos Gallego es\n",
+ "Nilson Ricardo da Silva Júnior en\n",
+ "ニルソン・リカルド・ダ・シルバ・ジュニオール ja\n",
+ "니우송 히카르두 다 시우바 주니오르 ko\n",
+ "Aleksej Aleksandrovič Starobinski sl\n",
+ "Alexei Alexandrowitsch Starobinski de\n",
+ "Alexei Starobinski pt\n",
+ "Alexei Starobinsky en\n",
+ "Alexeï Starobinski fr\n",
+ "Алексей Александрович Старобинский ru\n",
+ "Старобінський Олексій Олександрович uk\n",
+ "アレクセイ・スタロビンスキー ja\n",
+ "Hilda Rix Nicholas en\n",
+ "هیلدا ریکس نیکولاس fa\n",
+ "Behrouz Makvandi en\n",
+ "Бехруз Макванди ru\n",
+ "بهروز مکوندی fa\n",
+ "Borislav Terzić en\n",
+ "Борислав Терзић sr\n"
+ ]
+ }
+ ],
+ "source": [
+ "# de-duplicate names that use same spelling for multiple languages\n",
+ "# NOTE(review): nothing is de-duplicated yet -- name_en is assigned but never read,\n",
+ "# and the loop only prints (name, lang) pairs from the 10-identity sample\n",
+ "for id_kg, name_langs in msceleb_identities_sm.items():\n",
+ " if 'en' in name_langs.keys():\n",
+ " name_en = name_langs['en']\n",
+ " for lang, name in name_langs.items():\n",
+ " print(name, lang)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 103,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "374a55f504084f14bd4d77fed0e2f4e4",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "n2 split is long: zh-Hant\n",
+ "n2 split is long: es-419\n",
+ "n2 split is long: fil\n",
+ "n2 split is long: en-GB\n",
+ "n2 split is long: en-US\n",
+ "n2 split is long: zh-HK\n",
+ "n2 split is long: fr-CA\n",
+ "n2 split is long: pt-PT\n",
+ "n2 split is long: ceb\n",
+ "n2 split is long: zorbla.de\n",
+ "n2 split is long: N\n",
+ "n2 split is long: hu\n",
+ "m.03zytg\tΑστέριος\"\n",
+ "n2 split is long: destiny\n",
+ "n2 split is long: Teng Boon Soon\n",
+ "n2 split is long: Yong Khoon Seng\n",
+ "n2 split is long: Tiki Anak Lafe\n",
+ "n2 split is long: Marcus Mojigoh\n",
+ "n2 split is long: Nyallau Anak Badak\n",
+ "n2 split is long: Bousou P\n",
+ "n2 split is long: evleaks\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Survey of the text that follows the first '@' of each name: prints, once per distinct\n",
+ "# message, every candidate language tag that is not exactly two characters long\n",
+ "# NOTE(review): a name containing '@' always splits into at least two parts, so the\n",
+ "# 'only one split' branch looks unreachable -- confirm\n",
+ "messages = []\n",
+ "\n",
+ "for id_kg, msceleb_group in tqdm(df_msceleb_top1m_groups, total=n_groups):\n",
+ " id_kg = id_kg.replace('m.', '/m/')\n",
+ " for df_row in msceleb_group.itertuples():\n",
+ " if '@' in df_row.name:\n",
+ " splits = df_row.name.split('@')\n",
+ " if not len(splits) > 1:\n",
+ " msg = f'only one split: {df_row.name}'\n",
+ " if not msg in messages:\n",
+ " print(msg)\n",
+ " messages.append(msg)\n",
+ " elif len(splits) > 1:\n",
+ " if len(splits[1]) != 2:\n",
+ " msg = f'n2 split is long: {splits[1]}'\n",
+ " if not msg in messages:\n",
+ " print(msg)\n",
+ " messages.append(msg)\n",
+ " else:\n",
+ " print(df_row.name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "475871ac6d08484cbec44d5ccf099bd8",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import re\n",
+ "\n",
+ "# shape of the language tags used in the list, e.g. 'en', 'fil', 'en-GB', 'es-419', 'zh-Hant'\n",
+ "RE_LANG_TAG = re.compile('[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*')\n",
+ "\n",
+ "# iterate groups and flatten language variations into named columns\n",
+ "# NOTE(review): this appends one record per (identity, language) row, not one record\n",
+ "# per identity -- confirm that is the intended layout\n",
+ "identities = []\n",
+ "for id_kg, msceleb_group in tqdm(df_msceleb_top1m_groups, total=n_groups):\n",
+ "    id_kg = id_kg.replace('m.', '/m/')\n",
+ "    for df_row in msceleb_group.itertuples():\n",
+ "        # the tag follows the LAST '@'; a name may contain '@' itself (e.g. 'r@destiny@en'),\n",
+ "        # so taking the text after the first '@' produced columns such as 'ms_name_destiny'\n",
+ "        name, sep, lang = df_row.name.rpartition('@')\n",
+ "        if not sep or not RE_LANG_TAG.fullmatch(lang):\n",
+ "            # no '@' at all, or the trailing part is not a language tag:\n",
+ "            # keep the whole string as the name and default to 'en'\n",
+ "            name, lang = df_row.name, 'en'\n",
+ "        identities.append({'id_kg': id_kg, f'ms_name_{lang}': name})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[{'id_kg': 'm/01008l47', 'ms_name_en': 'Patrick Cummins'}, {'id_kg': 'm/01008l47', 'ms_name_pt': 'Patrick Cummins'}]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# peek at the first flattened records\n",
+ "print(identities[0:10])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 91,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# temp save DataFrame to CSV\n",
+ "def save_identity_master(identities, fp_out=fp_master_identities):\n",
+ "    '''Write the identity records to a CSV file.\n",
+ "\n",
+ "    :param identities: list of record dicts; each dict becomes one row, keys become columns\n",
+ "    :param fp_out: destination CSV path (defaults to fp_master_identities)\n",
+ "    '''\n",
+ "    df_identities_master = pd.DataFrame.from_dict(identities)\n",
+ "    # the row index is written as the first CSV column, labelled 'id'\n",
+ "    df_identities_master.index.name = 'id'\n",
+ "    # write to the requested path; the fp_out argument used to be ignored\n",
+ "    df_identities_master.to_csv(fp_out)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Add image count data for MS Celeb"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# load lines (line break stripped, blank lines skipped)\n",
+ "fp_msceleb_clean = '/data_store_hdd/datasets/people/msceleb/downloads/MS-Celeb-1M_clean_list.txt'\n",
+ "with open(fp_msceleb_clean,'r') as fp:\n",
+ "    msceleb_lines = [line.strip() for line in fp if line.strip()]\n",
+ "\n",
+ "# group the file names by identity: id_kg -> [fname, ...]\n",
+ "msceleb_files = {}\n",
+ "for filepath in msceleb_lines:\n",
+ "    # every line must hold exactly one '/', otherwise the unpacking raises ValueError\n",
+ "    id_kg, fname = filepath.split('/')\n",
+ "    id_kg = id_kg.replace('m.', '/m/')\n",
+ "    msceleb_files.setdefault(id_kg, []).append(fname)\n",
+ "\n",
+ "# add count (0 for identities that have no file in the clean list)\n",
+ "for identity in identities:\n",
+ "    identity['msceleb_count'] = len(msceleb_files.get(identity['id_kg'], []))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# write all identity records to the master CSV (takes 30 seconds)\n",
+ "save_identity_master(identities) # encoding='utf-16' ??"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 95,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['id_kg',\n",
+ " 'ms_name_ Marcus Mojigoh',\n",
+ " 'ms_name_ Nyallau Anak Badak',\n",
+ " 'ms_name_ Teng Boon Soon',\n",
+ " 'ms_name_ Tiki Anak Lafe',\n",
+ " 'ms_name_ Yong Khoon Seng',\n",
+ " 'ms_name_Bousou P',\n",
+ " 'ms_name_N',\n",
+ " 'ms_name_af',\n",
+ " 'ms_name_am',\n",
+ " 'ms_name_ar',\n",
+ " 'ms_name_az',\n",
+ " 'ms_name_be',\n",
+ " 'ms_name_bg',\n",
+ " 'ms_name_bm',\n",
+ " 'ms_name_bn',\n",
+ " 'ms_name_bo',\n",
+ " 'ms_name_br',\n",
+ " 'ms_name_bs',\n",
+ " 'ms_name_ca',\n",
+ " 'ms_name_ceb',\n",
+ " 'ms_name_ck',\n",
+ " 'ms_name_co',\n",
+ " 'ms_name_cr',\n",
+ " 'ms_name_cs',\n",
+ " 'ms_name_cy',\n",
+ " 'ms_name_da',\n",
+ " 'ms_name_de',\n",
+ " 'ms_name_destiny',\n",
+ " 'ms_name_dz',\n",
+ " 'ms_name_el',\n",
+ " 'ms_name_en',\n",
+ " 'ms_name_en-GB',\n",
+ " 'ms_name_en-US',\n",
+ " 'ms_name_eo',\n",
+ " 'ms_name_es',\n",
+ " 'ms_name_es-419',\n",
+ " 'ms_name_et',\n",
+ " 'ms_name_eu',\n",
+ " 'ms_name_evleaks',\n",
+ " 'ms_name_fa',\n",
+ " 'ms_name_fi',\n",
+ " 'ms_name_fil',\n",
+ " 'ms_name_fo',\n",
+ " 'ms_name_fr',\n",
+ " 'ms_name_fr-CA',\n",
+ " 'ms_name_fy',\n",
+ " 'ms_name_ga',\n",
+ " 'ms_name_gd',\n",
+ " 'ms_name_gl',\n",
+ " 'ms_name_gn',\n",
+ " 'ms_name_gu',\n",
+ " 'ms_name_ha',\n",
+ " 'ms_name_hi',\n",
+ " 'ms_name_hr',\n",
+ " 'ms_name_ht',\n",
+ " 'ms_name_hu',\n",
+ " 'ms_name_hu\\r\\nm.03zytg\\tΑστέριος\"',\n",
+ " 'ms_name_hy',\n",
+ " 'ms_name_id',\n",
+ " 'ms_name_ig',\n",
+ " 'ms_name_is',\n",
+ " 'ms_name_it',\n",
+ " 'ms_name_iw',\n",
+ " 'ms_name_ja',\n",
+ " 'ms_name_ka',\n",
+ " 'ms_name_kk',\n",
+ " 'ms_name_kl',\n",
+ " 'ms_name_km',\n",
+ " 'ms_name_kn',\n",
+ " 'ms_name_ko',\n",
+ " 'ms_name_ku',\n",
+ " 'ms_name_ky',\n",
+ " 'ms_name_la',\n",
+ " 'ms_name_lb',\n",
+ " 'ms_name_lo',\n",
+ " 'ms_name_lt',\n",
+ " 'ms_name_lv',\n",
+ " 'ms_name_mg',\n",
+ " 'ms_name_mi',\n",
+ " 'ms_name_mk',\n",
+ " 'ms_name_ml',\n",
+ " 'ms_name_mn',\n",
+ " 'ms_name_mr',\n",
+ " 'ms_name_ms',\n",
+ " 'ms_name_mt',\n",
+ " 'ms_name_my',\n",
+ " 'ms_name_ne',\n",
+ " 'ms_name_nl',\n",
+ " 'ms_name_nn',\n",
+ " 'ms_name_no',\n",
+ " 'ms_name_nv',\n",
+ " 'ms_name_ny',\n",
+ " 'ms_name_oc',\n",
+ " 'ms_name_or',\n",
+ " 'ms_name_pa',\n",
+ " 'ms_name_pl',\n",
+ " 'ms_name_ps',\n",
+ " 'ms_name_pt',\n",
+ " 'ms_name_pt-PT',\n",
+ " 'ms_name_ro',\n",
+ " 'ms_name_ru',\n",
+ " 'ms_name_rw',\n",
+ " 'ms_name_sa',\n",
+ " 'ms_name_sc',\n",
+ " 'ms_name_se',\n",
+ " 'ms_name_si',\n",
+ " 'ms_name_sk',\n",
+ " 'ms_name_sl',\n",
+ " 'ms_name_sn',\n",
+ " 'ms_name_so',\n",
+ " 'ms_name_sq',\n",
+ " 'ms_name_sr',\n",
+ " 'ms_name_st',\n",
+ " 'ms_name_su',\n",
+ " 'ms_name_sv',\n",
+ " 'ms_name_sw',\n",
+ " 'ms_name_ta',\n",
+ " 'ms_name_te',\n",
+ " 'ms_name_tg',\n",
+ " 'ms_name_th',\n",
+ " 'ms_name_tr',\n",
+ " 'ms_name_ug',\n",
+ " 'ms_name_uk',\n",
+ " 'ms_name_ur',\n",
+ " 'ms_name_uz',\n",
+ " 'ms_name_vi',\n",
+ " 'ms_name_xh',\n",
+ " 'ms_name_yi',\n",
+ " 'ms_name_yo',\n",
+ " 'ms_name_zh',\n",
+ " 'ms_name_zh-HK',\n",
+ " 'ms_name_zh-Hant',\n",
+ " 'ms_name_zorbla.de',\n",
+ " 'ms_name_zu']"
+ ]
+ },
+ "execution_count": 95,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# build the frame in this cell: save_identity_master() keeps its DataFrame local, so\n",
+ "# df_identities_master would be undefined here on a fresh kernel\n",
+ "df_identities_master = pd.DataFrame.from_dict(identities)\n",
+ "list(df_identities_master.keys())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "megapixels",
+ "language": "python",
+ "name": "megapixels"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/megapixels/notebooks/datasets/identity/identity_testing.ipynb b/megapixels/notebooks/datasets/identity/identity_testing.ipynb
index 384cca93..3975d0c6 100644
--- a/megapixels/notebooks/datasets/identity/identity_testing.ipynb
+++ b/megapixels/notebooks/datasets/identity/identity_testing.ipynb
@@ -13,7 +13,7 @@
},
{
"cell_type": "code",
- "execution_count": 155,
+ "execution_count": 186,
"metadata": {},
"outputs": [],
"source": [
@@ -54,23 +54,6 @@
]
},
{
- "cell_type": "code",
- "execution_count": 159,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "/media/adam/ah8tb/work/megapixels_dev/env/google_knowledge_graph_api.env\n"
- ]
- }
- ],
- "source": [
- "print(app_cfg.FP_KNOWLEDGE_GRAPH_ENV)"
- ]
- },
- {
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -79,7 +62,7 @@
},
{
"cell_type": "code",
- "execution_count": 160,
+ "execution_count": 188,
"metadata": {},
"outputs": [
{
@@ -92,7 +75,8 @@
}
],
"source": [
- "names = identity_utils.get_names(types.Dataset.LFW)\n",
+ "names = identity_utils.get_names(\n",
+ "    types.Dataset.LFW)  # member name was left blank (SyntaxError); LFW is the previous value -- TODO confirm\n",
"print(names['names_query'][0:10])\n",
"print(names['names_orig'][0:10])"
]
@@ -108,14 +92,12 @@
},
{
"cell_type": "code",
- "execution_count": 161,
+ "execution_count": 164,
"metadata": {},
"outputs": [],
"source": [
"# read API key\n",
- "\n",
- "api_key = open(app_cfg.FP_KNOWLEDGE_GRAPH_ENV).read()\n",
- "kg_api = api_utils.GoogleKnowledgeGraph(api_key)\n",
+ "kg_api = api_utils.GoogleKnowledgeGraph()\n",
"wp_api = api_utils.WikipediaAPI()"
]
},
@@ -128,25 +110,23 @@
},
{
"cell_type": "code",
- "execution_count": 128,
+ "execution_count": 165,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "wp\n",
+ "wp----\n",
"https://en.wikipedia.org/w/api.php?redirects=&ppprop=displaytitle&prop=pageprops%7Cpageimages%7Cdescription&generator=prefixsearch&action=query&format=json&piprop=thumbnail&pilimit=1&gpssearch=Vicente+Fox&gpsnamespace=0&gpslimit=1\n",
"{'wp_accessed': True,\n",
" 'wp_description': 'President of Mexico',\n",
" 'wp_name': 'Vicente Fox',\n",
" 'wp_page_id': '32836'}\n",
- "kg\n",
+ "kg----\n",
"{'kg_accessed': True,\n",
- " 'kg_bio': 'Vicente Fox Quesada, RSerafO is a Mexican businessman and '\n",
- " 'politician who served as the 55th President of Mexico from 1 '\n",
- " 'December 2000 to 30 November 2006.\\n',\n",
- " 'kg_bio_url': 'https://en.wikipedia.org/wiki/Vicente_Fox',\n",
+ " 'kg_bio': '',\n",
+ " 'kg_bio_url': '',\n",
" 'kg_description': 'Former President of Mexico',\n",
" 'kg_error': '',\n",
" 'kg_id': '/m/081f4',\n",
@@ -174,14 +154,15 @@
},
{
"cell_type": "code",
- "execution_count": 162,
+ "execution_count": 168,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "0.7714285714285716\n"
+ "0.7714285714285716\n",
+ "0.7142857142857143\n"
]
}
],
@@ -189,7 +170,8 @@
"#print(identity_utils.names_match('Andréss Iniestas', 'Andres Iniestalossas Jr.', as_float=True))\n",
"#print(identity_utils.names_match('Adoor Gopalakrishnan', 'Adoors Gopalakarishnan', as_float=True))\n",
"#print(identity_utils.names_match('Dave Letterman', 'David Letterman', as_float=True))\n",
- "print(identity_utils.names_match('Charles Dickens', 'Charles Boooker', as_float=True))\n",
+ "print(identity_utils.names_match('Charles Dickens', 'Charles Boooker', as_float=True, compound_score=True))\n",
+ "print(identity_utils.names_match('Charles Dickens', 'Charles Boooker', as_float=True, compound_score=False))\n",
"#print(identity_utils.names_match('Donald Trump', 'Donald J. Trump', as_float=True))\n",
"#print(identity_utils.names_match('Wang Fei', 'Fei Wang III', as_float=True))"
]