| field | value | date |
|---|---|---|
| author | adamhrv <adam@ahprojects.com> | 2019-02-14 14:45:18 +0100 |
| committer | adamhrv <adam@ahprojects.com> | 2019-02-14 14:45:18 +0100 |
| commit | 3a3a89f2c58eceee07b2cfcfb1700a61b34619e5 (patch) | |
| tree | 436347b8466422a1019209f9f04937ea1ce0e4eb /megapixels | |
| parent | 41247c08ea359d0a72a247992d2019ae2120536c (diff) | |
updates
Diffstat (limited to 'megapixels')
| mode | file | lines |
|---|---|---|
| -rw-r--r-- | megapixels/app/models/data_store.py | 46 |
| -rw-r--r-- | megapixels/app/settings/app_cfg.py | 7 |
| -rw-r--r-- | megapixels/app/settings/types.py | 5 |
| -rw-r--r-- | megapixels/app/utils/api_utils.py | 25 |
| -rw-r--r-- | megapixels/app/utils/identity_utils.py | 78 |
| -rw-r--r-- | megapixels/commands/datasets/citations_to_csv.py | 53 |
| -rw-r--r-- | megapixels/notebooks/datasets/identity/identity_master.ipynb | 633 |
| -rw-r--r-- | megapixels/notebooks/datasets/identity/identity_testing.ipynb | 50 |
8 files changed, 813 insertions, 84 deletions
diff --git a/megapixels/app/models/data_store.py b/megapixels/app/models/data_store.py
index a8d6916f..b4260b9c 100644
--- a/megapixels/app/models/data_store.py
+++ b/megapixels/app/models/data_store.py
@@ -2,6 +2,7 @@
 import os
 from os.path import join
 import logging
+from app.utils.logger_utils import Logger
 
 from app.settings import app_cfg as cfg
 from app.settings import types
@@ -11,41 +12,62 @@ from app.settings import types
 # -------------------------------------------------------------------------
 class DataStore:
 
+  # local data store
+  log = Logger.getLogger()
+
   def __init__(self, opt_data_store, opt_dataset):
     self.data_store = join(f'/data_store_{opt_data_store.name.lower()}')
-    self.dir_dataset = join(self.data_store, 'datasets', cfg.DIR_PEOPLE, opt_dataset.name.lower())
-    self.dir_media = join(self.dir_dataset, 'media')
-    self.dir_metadata = join(self.dir_dataset, 'metadata')
+    self._dir_dataset = join(self.data_store, 'datasets', cfg.DIR_PEOPLE, opt_dataset.name.lower())
+    self._dir_media = join(self._dir_dataset, 'media')
+    self._dir_metadata = join(self._dir_dataset, 'metadata')
 
   def metadata(self, enum_type):
-    return join(self.dir_metadata, f'{enum_type.name.lower()}.csv')
+    return join(self._dir_metadata, f'{enum_type.name.lower()}.csv')
+
+  @property
+  def dir_dataset(self):
+    return self._dir_dataset
+
+  @property
+  def dir_media(self):
+    return self._dir_media
+
+  @property
+  def dir_media_original(self):
+    return join(self._dir_media, 'original')
+
+  @property
+  def dir_metadata(self):
+    return self._dir_metadata
 
   def metadata_dir(self):
-    return join(self.dir_metadata)
+    self.log.warn('deprecated. use dir_metadata')
+    return self._dir_metadata
 
   def media_dir(self):
-    return join(self.dir_media)
+    self.log.warn('deprecated. use dir_media')
+    return self._dir_media
 
   def media_images_original(self):
-    return join(self.dir_media, 'original')
+    return join(self._dir_media, 'original')
 
   def face(self, subdir, fn, ext):
     if subdir == '' or subdir is None:
       subdir = '.'
-    return join(self.dir_media, 'original', subdir, f'{fn}.{ext}')
+    return join(self._dir_media, 'original', subdir, f'{fn}.{ext}')
 
   def face_crop(self, subdir, fn, ext):
-    return join(self.dir_media, 'cropped', subdir, f'{fn}.{ext}')
+    return join(self._dir_media, 'cropped', subdir, f'{fn}.{ext}')
 
   def face_uuid(self, uuid, ext):
-    return join(self.dir_media, 'uuid', f'{uuid}.{ext}')
+    return join(self._dir_media, 'uuid', f'{uuid}.{ext}')
 
   def face_crop_uuid(self, uuid, ext):
-    return join(self.dir_media, 'uuid', f'{uuid}.{ext}')
+    return join(self._dir_media, 'uuid', f'{uuid}.{ext}')
 
   def uuid_dir(self):
-    return join(self.dir_media, 'uuid')
+    return join(self._dir_media, 'uuid')
 
 
 class DataStoreS3:
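The data_store.py change makes the `dir_*` path attributes read-only properties backed by private `_dir_*` fields, and keeps `metadata_dir()`/`media_dir()` as deprecated shims that log a warning. A minimal usage sketch, not part of the commit; the `HDD`/`LFW` enum members and the `people` subdirectory are taken from elsewhere in this commit, the mount path is an assumption:

```python
# Sketch only: exercises the new DataStore properties.
from app.models.data_store import DataStore
from app.settings import types

ds = DataStore(types.DataStore.HDD, types.Dataset.LFW)
print(ds.dir_dataset)         # /data_store_hdd/datasets/people/lfw
print(ds.dir_media_original)  # new property: <dataset>/media/original
print(ds.media_dir())         # still works, but logs 'deprecated. use dir_media'
```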
diff --git a/megapixels/app/settings/app_cfg.py b/megapixels/app/settings/app_cfg.py
index 2b10f9f0..0b1fb69d 100644
--- a/megapixels/app/settings/app_cfg.py
+++ b/megapixels/app/settings/app_cfg.py
@@ -6,6 +6,7 @@ from dotenv import load_dotenv
 
 from app.settings import types
 from app.utils import click_utils
+from pathlib import Path
 
 import codecs
 codecs.register(lambda name: codecs.lookup('utf8') if name == 'utf8mb4' else None)
@@ -26,6 +27,10 @@ FaceLandmark2D_5Var = click_utils.ParamVar(types.FaceLandmark2D_5)
 FaceLandmark2D_68Var = click_utils.ParamVar(types.FaceLandmark2D_68)
 FaceLandmark3D_68Var = click_utils.ParamVar(types.FaceLandmark3D_68)
 
+# base path
+DIR_SELF = os.path.dirname(os.path.realpath(__file__))
+DIR_ROOT = Path(DIR_SELF).parent.parent.parent
+
 # data_store
 DATA_STORE = '/data_store_hdd/'
 DATA_STORE_NAS = '/data_store_nas/'
@@ -64,7 +69,7 @@ DIR_TEST_IMAGES = join(DIR_APP, 'test', 'images')
 # -----------------------------------------------------------------------------
 # .env config for keys
 # -----------------------------------------------------------------------------
-
+FP_KNOWLEDGE_GRAPH_ENV = join(DIR_ROOT, 'env/google_knowledge_graph_api.env')
 # DIR_DOTENV = join(DIR_APP, '.env')
 load_dotenv()  # dotenv_path=DIR_DOTENV)
diff --git a/megapixels/app/settings/types.py b/megapixels/app/settings/types.py
index 933d1932..3d7e96c0 100644
--- a/megapixels/app/settings/types.py
+++ b/megapixels/app/settings/types.py
@@ -47,8 +47,9 @@ class Metadata(Enum):
   FACE_ATTRIBUTES, IMAGE_COUNT = range(10)
 
 class Dataset(Enum):
-  LFW, VGG_FACE2, MSCELEB, UCCS, UMD_FACES, SCUT_FBP, UCF_SELFIE, UTK, \
-    CASIA_WEBFACE, AFW, PUBFIG83, HELEN, PIPA, MEGAFACE, BRAINWASH, IMDB_WIKI = range(16)
+  LFW, VGG_FACE, VGG_FACE2, MSCELEB, UCCS, UMD_FACES, SCUT_FBP, UCF_SELFIE, UTK, \
+    CASIA_WEBFACE, AFW, PUBFIG83, HELEN, PIPA, MEGAFACE, BRAINWASH, IMDB_WIKI, \
+    LARGE_AGE_GAP = range(18)
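Two things are worth noting here. First, `DIR_ROOT` walks three levels up from `app/settings/`, so the Knowledge Graph key file is expected at `<repo>/env/google_knowledge_graph_api.env`. Second, because `Dataset` values are auto-assigned with `range()`, inserting `VGG_FACE` and `LARGE_AGE_GAP` renumbers every member after `LFW`; any integer values serialized under the old enum will no longer line up. A sketch of the path resolution, assuming the repo is checked out at `/work/megapixels_dev` (the path used in the notebook later in this commit):

```python
# Sketch only: how app_cfg.py derives the key-file path.
# The checkout location is an assumption taken from the notebook's sys.path.
from os.path import join
from pathlib import Path

DIR_SELF = '/work/megapixels_dev/megapixels/app/settings'  # dirname(realpath(__file__))
DIR_ROOT = Path(DIR_SELF).parent.parent.parent             # /work/megapixels_dev
print(join(DIR_ROOT, 'env/google_knowledge_graph_api.env'))
# -> /work/megapixels_dev/env/google_knowledge_graph_api.env
```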
diff --git a/megapixels/app/utils/api_utils.py b/megapixels/app/utils/api_utils.py
index ec00113e..d9d67425 100644
--- a/megapixels/app/utils/api_utils.py
+++ b/megapixels/app/utils/api_utils.py
@@ -2,15 +2,21 @@ import json
 import urllib
 import urllib.request
 
+from app.settings import app_cfg
+from app.utils import file_utils, im_utils, logger_utils
+
 
 class WikipediaAPI:
 
   url_base = 'https://en.wikipedia.org/w/api.php'
-
+  log = logger_utils.Logger.getLogger()
+
+  # https://en.wikipedia.org/w/api.php?redirects=&
+  # ppprop=displaytitle&prop=pageprops|pageimages|description&generator=prefixsearch
+  # &action=query&format=json&piprop=thumbnail&pithumbsize=160&pilimit=6&gpssearch=Vicente+Fox&gpsnamespace=0&gpslimit=6
+
   def _url_builder(self, q):
     # https://www.mediawiki.org/wiki/API%3aProperties#Info%3a_Parameters
-
     params = {
       'redirects': '',
       'ppprop': 'displaytitle',
@@ -56,12 +62,16 @@ class WikipediaAPI:
       obj['wp_accessed'] = False
     return obj
 
-  def get_meta(self, query_obj):
+  def get_meta(self, query_obj, verbose=False):
     '''Searches Wikipedia API for query string'''
+
     if query_obj.get('wp_accessed', False):
       return query_obj
     else:
       url = self._url_builder(query_obj['query'])
+      if verbose:
+        self.log.debug(f'querying: {url}')
+        print(url)
       return self._api_search(url)
 
   def search(self, q):
@@ -73,9 +83,14 @@ class WikipediaAPI:
 
 class GoogleKnowledgeGraph:
 
   url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'
+  log = logger_utils.Logger.getLogger()
+  fp_api_key = app_cfg.FP_KNOWLEDGE_GRAPH_ENV
 
-  def __init__(self, key):
-    self._api_key = key
+  def __init__(self, api_key=None):
+    if api_key is not None:
+      self._api_key = api_key
+    else:
+      self._api_key = open(self.fp_api_key).read()
 
   def _get_kg_meta(self, result_obj, params):
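With the key-file path now in app_cfg, the Knowledge Graph client can be constructed without arguments; passing a key explicitly still works. A sketch of both call styles, not part of the commit:

```python
# Sketch only: the two construction paths after this change.
from app.utils import api_utils

kg_api = api_utils.GoogleKnowledgeGraph()  # reads key from app_cfg.FP_KNOWLEDGE_GRAPH_ENV
# kg_api = api_utils.GoogleKnowledgeGraph(api_key='<your-key>')  # explicit key still supported

wp_api = api_utils.WikipediaAPI()
meta = wp_api.get_meta({'query': 'Vicente Fox'}, verbose=True)  # verbose now logs the query URL
```

One caveat: `open(self.fp_api_key).read()` keeps any trailing newline from the .env file, so the key value may need a `.strip()`.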
diff --git a/megapixels/app/utils/identity_utils.py b/megapixels/app/utils/identity_utils.py
index e090d16e..f9ed009e 100644
--- a/megapixels/app/utils/identity_utils.py
+++ b/megapixels/app/utils/identity_utils.py
@@ -5,22 +5,82 @@ import unidecode
 import difflib
 
 from app.settings import types
+from app.models.data_store import DataStore
 from app.utils import logger_utils
 
 log = logger_utils.Logger.getLogger()
 
+'''
+class Dataset(Enum):
+  LFW, VGG_FACE, VGG_FACE2, MSCELEB, UCCS, UMD_FACES, SCUT_FBP, UCF_SELFIE, UTK, \
+    CASIA_WEBFACE, AFW, PUBFIG83, HELEN, PIPA, MEGAFACE, BRAINWASH, IMDB_WIKI, \
+    LARGE_AGE_GAP = range(18)
+'''
 
 # Get list of names based on Dataset type
-def get_names(enum_dataset):
-  if enum_dataset == types.Dataset.LFW:
-    dir_lfw = '/data_store_hdd/datasets/people/lfw/media/original/'
-    names_orig = [x for x in os.listdir(dir_lfw)]
+def get_names(opt_dataset, opt_data_store=types.DataStore.HDD):
+  data_store = DataStore(opt_data_store, opt_dataset)
+  dir_dataset = data_store.dir_dataset  # path to dataset root
+  dir_media_orig = data_store.dir_media_original
+  names_orig = []
+  names_query = []
+  if opt_dataset == types.Dataset.AFW:
+    # Annotated Faces in the Wild
+    pass
+  elif opt_dataset == types.Dataset.BRAINWASH:
+    # Brainwash IP Cam dataset
+    pass
+  elif opt_dataset == types.Dataset.CASIA_WEBFACE:
+    # CASIA WebFace
+    pass
+  elif opt_dataset == types.Dataset.HELEN:
+    # Helen
+    pass
+  elif opt_dataset == types.Dataset.IMDB_WIKI:
+    # IMDB-Wiki
+    pass
+  elif opt_dataset == types.Dataset.LARGE_AGE_GAP:
+    # Large Age Gap
+    pass
+  elif opt_dataset == types.Dataset.LFW:
+    # Labeled Faces in the Wild
+    names_orig = [x for x in os.listdir(dir_media_orig)]
     names_query = [x.replace('_', ' ') for x in names_orig]
-    result = {'names_orig': names_orig, 'names_query': names_query}
-  elif enum_dataset == types.Dataset.YOUTUBE_FACES:
-    names = [x for x in names if 'labeled faces.txt' not in x]
+  elif opt_dataset == types.Dataset.MEGAFACE:
+    # MegaFace
+    pass
+  elif opt_dataset == types.Dataset.MSCELEB:
+    # MS Celeb
+    pass
+  elif opt_dataset == types.Dataset.PIPA:
+    # People in Photo Albums
+    pass
+  elif opt_dataset == types.Dataset.PUBFIG83:
+    # PubFig83
+    names_orig = [x for x in os.listdir(dir_media_orig) if Path(x).suffix != '.txt']
+    names_query = [x.replace('_', ' ') for x in names_orig]
+  elif opt_dataset == types.Dataset.SCUT_FBP:
+    # SCUT Facial Beauty Perception
+    pass
+  elif opt_dataset == types.Dataset.UCCS:
+    # Unconstrained College Students
+    pass
+  elif opt_dataset == types.Dataset.UMD_FACES:
+    # University of Maryland Faces
+    pass
+  elif opt_dataset == types.Dataset.UTK:
+    # University of Tennessee Knoxville
+    pass
+  elif opt_dataset == types.Dataset.UCF_SELFIE:
+    # University of Central Florida Selfie
+    pass
+  elif opt_dataset == types.Dataset.VGG_FACE:
+    # Visual Geometry Group Face 1
+    pass
+  elif opt_dataset == types.Dataset.VGG_FACE2:
+    # Visual Geometry Group Face 2
+    pass
   else:
-    log.warn(f'{enum_dataset} not yet implemented')
-    result = {}
+    log.warn(f'{opt_dataset} not yet implemented')
+  result = {'names_orig': names_orig, 'names_query': names_query}
   return result
 
 def similarity(a, b):
diff --git a/megapixels/commands/datasets/citations_to_csv.py b/megapixels/commands/datasets/citations_to_csv.py
index d96748e5..e54d0dac 100644
--- a/megapixels/commands/datasets/citations_to_csv.py
+++ b/megapixels/commands/datasets/citations_to_csv.py
@@ -8,11 +8,11 @@ log = Logger.getLogger()
 
 @click.command()
 @click.option('-i', '--input', 'opt_fp_in', required=True,
-  help='Input license data CSV')
-@click.option('-o', '--output', 'opt_fp_out',
+  help='Input citation data file or folder')
+@click.option('-o', '--output', 'opt_dir_out',
   help='Output directory')
 @click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out):
+def cli(ctx, opt_fp_in, opt_dir_out):
   """Convert JSON to CSV"""
 
   import sys
@@ -30,27 +30,38 @@ def cli(ctx, opt_fp_in, opt_dir_out):
   log.info('Convert JSON to CSV')
 
   # load
-  with open(opt_fp_in, 'r') as fp:
-    json_data = json.load(fp)
+  if Path(opt_fp_in).is_dir():
+    fps_in = glob(join(opt_fp_in, '*.json'))
+  else:
+    fps_in = [opt_fp_in]
 
-  # parse
-  papers = []
-  dataset_key = json_data['paper']['key']
-  dataset_name = json_data['paper']['name']
-  papers_main = get_orig_paper(json_data)
-  papers += papers_main
-  papers_citations = get_citations(dataset_key, dataset_name, json_data)
-  papers += papers_citations
-  papers = [p.to_dict() for p in papers]
+  log.info(f'{fps_in}')
+
+  for fp_in in fps_in:
+    with open(fp_in, 'r') as fp:
+      json_data = json.load(fp)
 
-  # save
-  if not opt_fp_out:
-    fp_out = opt_fp_in.replace('.json','.csv')
-    log.info(fp_out)
+    # parse
+    papers = []
+    dataset_key = json_data['paper']['key']
+    dataset_name = json_data['paper']['name']
+    papers_main = get_orig_paper(json_data)
+    papers += papers_main
+    papers_citations = get_citations(dataset_key, dataset_name, json_data)
+    papers += papers_citations
+    papers = [p.to_dict() for p in papers]
+
+    # save
+    if not opt_dir_out:
+      # save to same directory replacing ext
+      fp_out = fp_in.replace('.json', '.csv')
+    else:
+      fp_out = join(opt_dir_out, Path(fp_in).name)
 
-  df_papers = pd.DataFrame.from_dict(papers)
-  df_papers.index.name = 'index'
-  df_papers.to_csv(fp_out)
+    df_papers = pd.DataFrame.from_dict(papers)
+    df_papers.index.name = 'id'
+    df_papers.to_csv(fp_out)
+    log.info(f'Wrote {len(df_papers):,} lines to {fp_out}')
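The `get_names` rework derives paths from `DataStore` instead of hard-coding `/data_store_hdd/...`, though only the LFW and PUBFIG83 branches are implemented so far (the PUBFIG83 branch also assumes `pathlib.Path` is imported at the top of the module). An example call under the new signature, not part of the commit; the printed names are illustrative LFW entries:

```python
# Sketch only: calling the reworked get_names.
from app.settings import types
from app.utils import identity_utils

names = identity_utils.get_names(types.Dataset.LFW, opt_data_store=types.DataStore.HDD)
print(names['names_orig'][0:3])   # directory names, e.g. ['Aaron_Eckhart', ...]
print(names['names_query'][0:3])  # underscores replaced, e.g. ['Aaron Eckhart', ...]
```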
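The citations_to_csv command now accepts either a single JSON file or a folder of them, and `-o` became an output directory rather than a file path. A hedged invocation sketch using click's built-in test runner; the input paths are hypothetical and the import path is inferred from the repo layout:

```python
# Sketch only: invoking the updated command on a folder of citation JSON files.
from click.testing import CliRunner
from megapixels.commands.datasets.citations_to_csv import cli  # path inferred from repo layout

runner = CliRunner()
# converts every *.json in the folder, writing CSVs next to the inputs
result = runner.invoke(cli, ['-i', '/data_store_hdd/citations/'])
# or collect the CSVs into a separate directory
result = runner.invoke(cli, ['-i', '/data_store_hdd/citations/', '-o', '/tmp/csv_out'])
print(result.output)
```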
"from pprint import pprint\n", + "from multiprocessing.pool import ThreadPool\n", + "import threading\n", + "import urllib.request\n", + "import difflib\n", + "import unidecode\n", + "\n", + "import slugify\n", + "from tqdm import tqdm_notebook as tqdm\n", + "import pandas as pd\n", + "from scipy.io import loadmat\n", + "import numpy as np\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import sys\n", + "sys.path.append('/work/megapixels_dev/megapixels')\n", + "from app.utils import api_utils, identity_utils\n", + "from app.settings import app_cfg\n", + "from app.settings import types" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "fp_master_identities = '/data_store_hdd/apps/megapixels/metadata/identities_master.csv'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## MS Celeb Top 1M\n", + "\n", + "- add column for each spelling of name\n", + "- convert kg id to standard google format" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "fp_msceleb_top1m = '/data_store_hdd/datasets/people/msceleb/downloads/Top1M_MidList.Name.tsv'\n", + "df_msceleb_top1m = pd.read_csv(fp_msceleb_top1m, delimiter='\\t', header=None, encoding='utf-8', names=['id_kg', 'name'])\n", + "df_msceleb_top1m_groups = df_msceleb_top1m.groupby('id_kg')\n", + "n_groups = df_msceleb_top1m_groups.ngroups\n", + "print(f'{n_groups} groups')\n", + "df_msceleb_top1m.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [], + "source": [ + "mseleb_top1m_records = df_msceleb_top1m.to_dict('records')" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [], + "source": [ + "#df_msceleb_top1m.head(100)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "abbrev_mappings = {\n", + " 'en-US': 'en',\n", + " 'en-GB': 'en',\n", + " 'es-419': 'es-419',\n", + " 'es'\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "msceleb_identities = {}" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "metadata": {}, + "outputs": [], + "source": [ + "def split_name_lang(name_lang):\n", + " '''Split name into name and language'''\n", + " if '@' in name_lang:\n", + " indexes = [i for i,x in enumerate(name_lang) if x == '@']\n", + " idx_max = (max(indexes))\n", + " lang = name_lang[(idx_max + 1):]\n", + " name = name_lang[:(idx_max)]\n", + " else:\n", + " name = name_lang\n", + " lang = ''\n", + " return {'name': name, 'lang': lang}" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'name': 'r@destiny', 'lang': 'en-417'}" + ] + }, + "execution_count": 122, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "split_name_lang('r@destiny@en')" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0120e006a7564f5c82729a7050ef0386", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=3481186), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "msceleb_identities = {}\n", + "for mseleb_top1m_record in 
tqdm(mseleb_top1m_records):\n", + " id_kg = mseleb_top1m_record['id_kg']\n", + " if not id_kg in msceleb_identities.keys():\n", + " msceleb_identities[id_kg] = {}\n", + " name_lang = split_name_lang(mseleb_top1m_record['name'])\n", + " name = name_lang['name']\n", + " lang = name_lang['lang']\n", + " msceleb_identities[id_kg][lang] = name" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "metadata": {}, + "outputs": [], + "source": [ + "import itertools\n", + "msceleb_identities_sm = dict(itertools.islice(msceleb_identities.items(), 0, 10))" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Patrick Cummins en\n", + "Patrick Cummins pt\n", + "Mohamed Guessous en\n", + "Mohamed Guessous fr\n", + "محمد جسوس ar\n", + "Tsvetta Kaleynska en\n", + "Tsvetta Kaleynska es\n", + "Tsvetta Kaleynska fr\n", + "Цвета Калейнска bg\n", + "Цвета Калейнска ru\n", + "Caio Henrique Siqueira Sanchez en\n", + "Кајо Санчез sr\n", + "Julio Ríos Gallego ca\n", + "Julio Ríos Gallego en\n", + "Julio Ríos Gallego es\n", + "Nilson Ricardo da Silva Júnior en\n", + "ニルソン・リカルド・ダ・シルバ・ジュニオール ja\n", + "니우송 히카르두 다 시우바 주니오르 ko\n", + "Aleksej Aleksandrovič Starobinski sl\n", + "Alexei Alexandrowitsch Starobinski de\n", + "Alexei Starobinski pt\n", + "Alexei Starobinsky en\n", + "Alexeï Starobinski fr\n", + "Алексей Александрович Старобинский ru\n", + "Старобінський Олексій Олександрович uk\n", + "アレクセイ・スタロビンスキー ja\n", + "Hilda Rix Nicholas en\n", + "هیلدا ریکس نیکولاس fa\n", + "Behrouz Makvandi en\n", + "Бехруз Макванди ru\n", + "بهروز مکوندی fa\n", + "Borislav Terzić en\n", + "Борислав Терзић sr\n" + ] + } + ], + "source": [ + "# de-duplicate names that use same spelling for multiple languages\n", + "for id_kg, name_langs in msceleb_identities_sm.items():\n", + " if 'en' in name_langs.keys():\n", + " name_en = name_langs['en']\n", + " for lang, name in name_langs.items():\n", + " print(name, lang)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "374a55f504084f14bd4d77fed0e2f4e4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "n2 split is long: zh-Hant\n", + "n2 split is long: es-419\n", + "n2 split is long: fil\n", + "n2 split is long: en-GB\n", + "n2 split is long: en-US\n", + "n2 split is long: zh-HK\n", + "n2 split is long: fr-CA\n", + "n2 split is long: pt-PT\n", + "n2 split is long: ceb\n", + "n2 split is long: zorbla.de\n", + "n2 split is long: N\n", + "n2 split is long: hu\n", + "m.03zytg\tΑστέριος\"\n", + "n2 split is long: destiny\n", + "n2 split is long: Teng Boon Soon\n", + "n2 split is long: Yong Khoon Seng\n", + "n2 split is long: Tiki Anak Lafe\n", + "n2 split is long: Marcus Mojigoh\n", + "n2 split is long: Nyallau Anak Badak\n", + "n2 split is long: Bousou P\n", + "n2 split is long: evleaks\n" + ] + } + ], + "source": [ + "messages = []\n", + "\n", + "for id_kg, msceleb_group in tqdm(df_msceleb_top1m_groups, total=n_groups):\n", + " id_kg = id_kg.replace('m.', '/m/')\n", + " for df_row in msceleb_group.itertuples():\n", + " if '@' in 
df_row.name:\n", + " splits = df_row.name.split('@')\n", + " if not len(splits) > 1:\n", + " msg = f'only one split: {df_row.name}'\n", + " if not msg in messages:\n", + " print(msg)\n", + " messages.append(msg)\n", + " elif len(splits) > 1:\n", + " if len(splits[1]) != 2:\n", + " msg = f'n2 split is long: {splits[1]}'\n", + " if not msg in messages:\n", + " print(msg)\n", + " messages.append(msg)\n", + " else:\n", + " print(df_row.name)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "475871ac6d08484cbec44d5ccf099bd8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# iterate groups and flatten language variations into named columns\n", + "identities = []\n", + "for id_kg, msceleb_group in tqdm(df_msceleb_top1m_groups, total=n_groups):\n", + " id_kg = id_kg.replace('m.', '/m/')\n", + " for df_row in msceleb_group.itertuples():\n", + " if '@' in df_row.name:\n", + " splits = df_row.name.split('@')\n", + " name = splits[0]\n", + " lang = splits[1] if len(splits) > 0 else 'en'\n", + " else:\n", + " # default to 'en'\n", + " lang = 'en'\n", + " name = df_row.name\n", + " col_name = f'ms_name_{lang}'\n", + " identities.append({'id_kg': id_kg, col_name: name})" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'id_kg': 'm/01008l47', 'ms_name_en': 'Patrick Cummins'}, {'id_kg': 'm/01008l47', 'ms_name_pt': 'Patrick Cummins'}]\n" + ] + } + ], + "source": [ + "print(identities[0:10])" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [], + "source": [ + "# temp save DataFrame to CSV\n", + "def save_identity_master(identities, fp_out=fp_master_identities):\n", + " df_identities_master = pd.DataFrame.from_dict(identities)\n", + " df_identities_master.index.name = 'id'\n", + " df_identities_master.to_csv(fp_master_identities)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Add image count data for MS Celeb" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [], + "source": [ + "# load lines\n", + "fp_msceleb_clean = '/data_store_hdd/datasets/people/msceleb/downloads/MS-Celeb-1M_clean_list.txt'\n", + "with open(fp_msceleb_clean,'r') as fp:\n", + " msceleb_lines = fp.readlines()\n", + "msceleb_files = {}\n", + "\n", + "# iterate lines and append all files\n", + "for filepath in msceleb_lines:\n", + " id_kg, fname = filepath.split('/')\n", + " id_kg = id_kg.replace('m.', '/m/')\n", + " if not id_kg in msceleb_files.keys():\n", + " msceleb_files[id_kg] = []\n", + " msceleb_files[id_kg].append(fname)\n", + "\n", + " # add count\n", + "for identity in identities:\n", + " id_kg = identity['id_kg']\n", + " if id_kg in msceleb_files.keys():\n", + " identity['msceleb_count'] = len(msceleb_files[identity['id_kg']])\n", + " else:\n", + " identity['msceleb_count'] = 0" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [], + "source": [ + "# save (takes 30 seconds)\n", + "save_identity_master(identities) # encoding='utf-16' ??" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['id_kg',\n", + " 'ms_name_ Marcus Mojigoh',\n", + " 'ms_name_ Nyallau Anak Badak',\n", + " 'ms_name_ Teng Boon Soon',\n", + " 'ms_name_ Tiki Anak Lafe',\n", + " 'ms_name_ Yong Khoon Seng',\n", + " 'ms_name_Bousou P',\n", + " 'ms_name_N',\n", + " 'ms_name_af',\n", + " 'ms_name_am',\n", + " 'ms_name_ar',\n", + " 'ms_name_az',\n", + " 'ms_name_be',\n", + " 'ms_name_bg',\n", + " 'ms_name_bm',\n", + " 'ms_name_bn',\n", + " 'ms_name_bo',\n", + " 'ms_name_br',\n", + " 'ms_name_bs',\n", + " 'ms_name_ca',\n", + " 'ms_name_ceb',\n", + " 'ms_name_ck',\n", + " 'ms_name_co',\n", + " 'ms_name_cr',\n", + " 'ms_name_cs',\n", + " 'ms_name_cy',\n", + " 'ms_name_da',\n", + " 'ms_name_de',\n", + " 'ms_name_destiny',\n", + " 'ms_name_dz',\n", + " 'ms_name_el',\n", + " 'ms_name_en',\n", + " 'ms_name_en-GB',\n", + " 'ms_name_en-US',\n", + " 'ms_name_eo',\n", + " 'ms_name_es',\n", + " 'ms_name_es-419',\n", + " 'ms_name_et',\n", + " 'ms_name_eu',\n", + " 'ms_name_evleaks',\n", + " 'ms_name_fa',\n", + " 'ms_name_fi',\n", + " 'ms_name_fil',\n", + " 'ms_name_fo',\n", + " 'ms_name_fr',\n", + " 'ms_name_fr-CA',\n", + " 'ms_name_fy',\n", + " 'ms_name_ga',\n", + " 'ms_name_gd',\n", + " 'ms_name_gl',\n", + " 'ms_name_gn',\n", + " 'ms_name_gu',\n", + " 'ms_name_ha',\n", + " 'ms_name_hi',\n", + " 'ms_name_hr',\n", + " 'ms_name_ht',\n", + " 'ms_name_hu',\n", + " 'ms_name_hu\\r\\nm.03zytg\\tΑστέριος\"',\n", + " 'ms_name_hy',\n", + " 'ms_name_id',\n", + " 'ms_name_ig',\n", + " 'ms_name_is',\n", + " 'ms_name_it',\n", + " 'ms_name_iw',\n", + " 'ms_name_ja',\n", + " 'ms_name_ka',\n", + " 'ms_name_kk',\n", + " 'ms_name_kl',\n", + " 'ms_name_km',\n", + " 'ms_name_kn',\n", + " 'ms_name_ko',\n", + " 'ms_name_ku',\n", + " 'ms_name_ky',\n", + " 'ms_name_la',\n", + " 'ms_name_lb',\n", + " 'ms_name_lo',\n", + " 'ms_name_lt',\n", + " 'ms_name_lv',\n", + " 'ms_name_mg',\n", + " 'ms_name_mi',\n", + " 'ms_name_mk',\n", + " 'ms_name_ml',\n", + " 'ms_name_mn',\n", + " 'ms_name_mr',\n", + " 'ms_name_ms',\n", + " 'ms_name_mt',\n", + " 'ms_name_my',\n", + " 'ms_name_ne',\n", + " 'ms_name_nl',\n", + " 'ms_name_nn',\n", + " 'ms_name_no',\n", + " 'ms_name_nv',\n", + " 'ms_name_ny',\n", + " 'ms_name_oc',\n", + " 'ms_name_or',\n", + " 'ms_name_pa',\n", + " 'ms_name_pl',\n", + " 'ms_name_ps',\n", + " 'ms_name_pt',\n", + " 'ms_name_pt-PT',\n", + " 'ms_name_ro',\n", + " 'ms_name_ru',\n", + " 'ms_name_rw',\n", + " 'ms_name_sa',\n", + " 'ms_name_sc',\n", + " 'ms_name_se',\n", + " 'ms_name_si',\n", + " 'ms_name_sk',\n", + " 'ms_name_sl',\n", + " 'ms_name_sn',\n", + " 'ms_name_so',\n", + " 'ms_name_sq',\n", + " 'ms_name_sr',\n", + " 'ms_name_st',\n", + " 'ms_name_su',\n", + " 'ms_name_sv',\n", + " 'ms_name_sw',\n", + " 'ms_name_ta',\n", + " 'ms_name_te',\n", + " 'ms_name_tg',\n", + " 'ms_name_th',\n", + " 'ms_name_tr',\n", + " 'ms_name_ug',\n", + " 'ms_name_uk',\n", + " 'ms_name_ur',\n", + " 'ms_name_uz',\n", + " 'ms_name_vi',\n", + " 'ms_name_xh',\n", + " 'ms_name_yi',\n", + " 'ms_name_yo',\n", + " 'ms_name_zh',\n", + " 'ms_name_zh-HK',\n", + " 'ms_name_zh-Hant',\n", + " 'ms_name_zorbla.de',\n", + " 'ms_name_zu']" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(df_identities_master.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + 
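One note on `split_name_lang` in the notebook above: the manual scan for the last `@` is equivalent to `str.rsplit` with `maxsplit=1`, and the recorded output `{'lang': 'en-417'}` does not match the function as defined, which returns `'en'` for that input (the cell output is likely from an earlier revision). A compact equivalent, shown for clarity:

```python
# Sketch only: compact equivalent of the notebook's split_name_lang.
def split_name_lang(name_lang):
    '''Split an MS-Celeb "name@lang" string on its last @.'''
    if '@' in name_lang:
        name, lang = name_lang.rsplit('@', 1)
        return {'name': name, 'lang': lang}
    return {'name': name_lang, 'lang': ''}

assert split_name_lang('r@destiny@en') == {'name': 'r@destiny', 'lang': 'en'}
assert split_name_lang('Patrick Cummins') == {'name': 'Patrick Cummins', 'lang': ''}
```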
"display_name": "megapixels", + "language": "python", + "name": "megapixels" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/megapixels/notebooks/datasets/identity/identity_testing.ipynb b/megapixels/notebooks/datasets/identity/identity_testing.ipynb index 384cca93..3975d0c6 100644 --- a/megapixels/notebooks/datasets/identity/identity_testing.ipynb +++ b/megapixels/notebooks/datasets/identity/identity_testing.ipynb @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 155, + "execution_count": 186, "metadata": {}, "outputs": [], "source": [ @@ -54,23 +54,6 @@ ] }, { - "cell_type": "code", - "execution_count": 159, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/media/adam/ah8tb/work/megapixels_dev/env/google_knowledge_graph_api.env\n" - ] - } - ], - "source": [ - "print(app_cfg.FP_KNOWLEDGE_GRAPH_ENV)" - ] - }, - { "cell_type": "markdown", "metadata": {}, "source": [ @@ -79,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 160, + "execution_count": 188, "metadata": {}, "outputs": [ { @@ -92,7 +75,8 @@ } ], "source": [ - "names = identity_utils.get_names(types.Dataset.LFW)\n", + "names = identity_utils.get_names(types.Dataset.\n", + " )\n", "print(names['names_query'][0:10])\n", "print(names['names_orig'][0:10])" ] @@ -108,14 +92,12 @@ }, { "cell_type": "code", - "execution_count": 161, + "execution_count": 164, "metadata": {}, "outputs": [], "source": [ "# read API key\n", - "\n", - "api_key = open(app_cfg.FP_KNOWLEDGE_GRAPH_ENV).read()\n", - "kg_api = api_utils.GoogleKnowledgeGraph(api_key)\n", + "kg_api = api_utils.GoogleKnowledgeGraph()\n", "wp_api = api_utils.WikipediaAPI()" ] }, @@ -128,25 +110,23 @@ }, { "cell_type": "code", - "execution_count": 128, + "execution_count": 165, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "wp\n", + "wp----\n", "https://en.wikipedia.org/w/api.php?redirects=&ppprop=displaytitle&prop=pageprops%7Cpageimages%7Cdescription&generator=prefixsearch&action=query&format=json&piprop=thumbnail&pilimit=1&gpssearch=Vicente+Fox&gpsnamespace=0&gpslimit=1\n", "{'wp_accessed': True,\n", " 'wp_description': 'President of Mexico',\n", " 'wp_name': 'Vicente Fox',\n", " 'wp_page_id': '32836'}\n", - "kg\n", + "kg----\n", "{'kg_accessed': True,\n", - " 'kg_bio': 'Vicente Fox Quesada, RSerafO is a Mexican businessman and '\n", - " 'politician who served as the 55th President of Mexico from 1 '\n", - " 'December 2000 to 30 November 2006.\\n',\n", - " 'kg_bio_url': 'https://en.wikipedia.org/wiki/Vicente_Fox',\n", + " 'kg_bio': '',\n", + " 'kg_bio_url': '',\n", " 'kg_description': 'Former President of Mexico',\n", " 'kg_error': '',\n", " 'kg_id': '/m/081f4',\n", @@ -174,14 +154,15 @@ }, { "cell_type": "code", - "execution_count": 162, + "execution_count": 168, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.7714285714285716\n" + "0.7714285714285716\n", + "0.7142857142857143\n" ] } ], @@ -189,7 +170,8 @@ "#print(identity_utils.names_match('Andréss Iniestas', 'Andres Iniestalossas Jr.', as_float=True))\n", "#print(identity_utils.names_match('Adoor Gopalakrishnan', 'Adoors Gopalakarishnan', as_float=True))\n", "#print(identity_utils.names_match('Dave Letterman', 
'David Letterman', as_float=True))\n", - "print(identity_utils.names_match('Charles Dickens', 'Charles Boooker', as_float=True))\n", + "print(identity_utils.names_match('Charles Dickens', 'Charles Boooker', as_float=True, compound_score=True))\n", + "print(identity_utils.names_match('Charles Dickens', 'Charles Boooker', as_float=True, compound_score=False))\n", "#print(identity_utils.names_match('Donald Trump', 'Donald J. Trump', as_float=True))\n", "#print(identity_utils.names_match('Wang Fei', 'Fei Wang III', as_float=True))" ] |
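The last cell compares the new `compound_score` flag on `names_match`. Since identity_utils imports `difflib`, a plausible primitive is `SequenceMatcher.ratio()`; the sketch below contrasts whole-string and per-token scoring to show why the two printed values differ, though the exact weighting behind `compound_score` is an assumption:

```python
# Sketch only: two ways to score name similarity with difflib. Illustrates
# the idea behind compound vs. whole-string scores; not the commit's code.
import difflib

def similarity(a, b):
    return difflib.SequenceMatcher(None, a.lower(), b.lower()).ratio()

a, b = 'Charles Dickens', 'Charles Boooker'
print(similarity(a, b))  # whole-string ratio
per_token = [similarity(x, y) for x, y in zip(a.split(), b.split())]
print(sum(per_token) / len(per_token))  # per-token average; 'Charles' matches exactly
```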
