From fe0dee2f8c8a7127d1ac2f01c5989f5011a2ee8a Mon Sep 17 00:00:00 2001 From: Adam Harvey Date: Tue, 19 Feb 2019 22:50:01 +0100 Subject: ...identity --- megapixels/app/utils/api_utils.py | 2 +- megapixels/app/utils/identity_utils.py | 22 +- .../datasets/identity/identity_master.ipynb | 1561 +++++++++++++++----- 3 files changed, 1248 insertions(+), 337 deletions(-) diff --git a/megapixels/app/utils/api_utils.py b/megapixels/app/utils/api_utils.py index d9d67425..a4dad501 100644 --- a/megapixels/app/utils/api_utils.py +++ b/megapixels/app/utils/api_utils.py @@ -3,7 +3,7 @@ import urllib import urllib.request from app.settings import app_cfg -from app.utils import file_utils, im_utils, logger_utils +from app.utils import logger_utils class WikipediaAPI: diff --git a/megapixels/app/utils/identity_utils.py b/megapixels/app/utils/identity_utils.py index f9ed009e..775652dc 100644 --- a/megapixels/app/utils/identity_utils.py +++ b/megapixels/app/utils/identity_utils.py @@ -10,6 +10,25 @@ from app.utils import logger_utils log = logger_utils.Logger.getLogger() +az = 'abcdefghijklmlopqrstuvwzxyz' +AZ = az.upper() +z9 = list(map(str, list(range(0,10)))) +aZ9 = list(az) + list(AZ) + z9 + +def letter_strip(a, b=aZ9): + # strip every letter from a that is not in b + return ''.join([x for x in a if x in b]) + +def letter_match(a, b): + # check if every letter (a-zA-Z0-9) exists in both + return sum([x in b for x in a]) == len(a) + +def names_match_strict(a, b): + clean_a = letter_strip(a) + clean_b = letter_strip(b) + return len(clean_a) == len(clean_b) and letter_match(clean_a, clean_b) and letter_match(clean_b, clean_a) + + ''' class Dataset(Enum): LFW, VGG_FACE, VGG_FACE2, MSCELEB, UCCS, UMD_FACES, SCUT_FBP, UCF_SELFIE, UTK, \ @@ -83,6 +102,7 @@ def get_names(opt_dataset, opt_data_store=types.DataStore.HDD): result = {'names_orig': names_orig, 'names_query': names_query} return result + def similarity(a, b): return difflib.SequenceMatcher(a=a.lower(), b=b.lower()).ratio() @@ -111,7 +131,7 @@ def names_match(name_a, name_b, threshold=0.9, as_float=False, compound_score=Fa scores.append(subscores) # return result - ratio_similar = sum(max(x) for x in scores) / len_min + ratio_similar = sum(max(x) for x in scores) / len(scores) if compound_score: # combine with any missing letters/words diff --git a/megapixels/notebooks/datasets/identity/identity_master.ipynb b/megapixels/notebooks/datasets/identity/identity_master.ipynb index a48a7ba1..e932a947 100644 --- a/megapixels/notebooks/datasets/identity/identity_master.ipynb +++ b/megapixels/notebooks/datasets/identity/identity_master.ipynb @@ -6,13 +6,30 @@ "source": [ "# Identity Master List\n", "\n", - "- start with MS Celeb Top1M\n", - "- then progressively add smaller datasets" + "- [x] MS Celeb 1M\n", + "- UMD Faces\n", + "- FaceScrub\n", + "- LFW\n", + "- PubFig\n", + "- PubFig83\n", + "- VGG Face\n", + "- VGG Face2\n", + "- IJB-C\n", + "- CASIA Webface\n", + "- IMDB-Face\n", + "- IMDB-Wiki" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 156, "metadata": {}, "outputs": [], "source": [ @@ -47,15 +64,6 @@ "from app.settings import types" ] }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": {}, - "outputs": [], - "source": [ - "fp_master_identities = '/data_store_hdd/apps/megapixels/metadata/identities_master.csv'" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -68,65 +76,185 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 157, + "metadata": {}, + "outputs": [], + "source": [ + "fp_master_identities = '/data_store_hdd/apps/megapixels/metadata/identities_master_02.csv'\n", + "dir_msceleb_dloads = '/data_store_hdd/datasets/people/msceleb/downloads/'\n", + "fp_msceleb_clean_txt = join(dir_msceleb_dloads,'MS-Celeb-1M_clean_list.txt')" + ] + }, + { + "cell_type": "code", + "execution_count": 158, "metadata": {}, "outputs": [], "source": [ "fp_msceleb_top1m = '/data_store_hdd/datasets/people/msceleb/downloads/Top1M_MidList.Name.tsv'\n", - "df_msceleb_top1m = pd.read_csv(fp_msceleb_top1m, delimiter='\\t', header=None, encoding='utf-8', names=['id_kg', 'name'])\n", + "df_msceleb_top1m = pd.read_csv(fp_msceleb_top1m, delimiter='\\t', header=None, encoding='utf-8', names=['id_kg', 'name_lang'])\n", "df_msceleb_top1m_groups = df_msceleb_top1m.groupby('id_kg')\n", - "n_groups = df_msceleb_top1m_groups.ngroups\n", - "print(f'{n_groups} groups')\n", - "df_msceleb_top1m.head(2)" + "n_groups = df_msceleb_top1m_groups.ngroups" ] }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 200, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "fbc706a8b9f34d958e478cdf584bf853", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=3481186), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "mseleb_top1m_records = df_msceleb_top1m.to_dict('records')" + "# create alphabetically sorted dict\n", + "msceleb_top1m_az = {}\n", + "a2z = 'abcdefghijklmnopqrstuvwxyz'\n", + "for c in a2z:\n", + " msceleb_top1m_az[c] = []\n", + "for msceleb_row in tqdm(df_msceleb_top1m.itertuples(), total=len(df_msceleb_top1m)):\n", + " name = msceleb_row.name_lang\n", + " try:\n", + " msceleb_top1m_az[name[0].lower()].append({'name': name, 'id_kg': msceleb_row.id_kg})\n", + " except Exception as e:\n", + " pass" ] }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 159, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
id_kgname_lang
0m.01008l47Patrick Cummins@en
1m.01008l47Patrick Cummins@pt
2m.01008l96Mohamed Guessous@en
3m.01008l96Mohamed Guessous@fr
4m.01008l96محمد جسوس@ar
\n", + "
" + ], + "text/plain": [ + " id_kg name_lang\n", + "0 m.01008l47 Patrick Cummins@en\n", + "1 m.01008l47 Patrick Cummins@pt\n", + "2 m.01008l96 Mohamed Guessous@en\n", + "3 m.01008l96 Mohamed Guessous@fr\n", + "4 m.01008l96 محمد جسوس@ar" + ] + }, + "execution_count": 159, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "#df_msceleb_top1m.head(100)" + "df_msceleb_top1m.head()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 160, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 3,481,186 total name variations\n", + "There are 1,000,000 unique identities\n" + ] + } + ], + "source": [ + "print(f'There are {len(df_msceleb_top1m):,} total name variations')\n", + "print(f'There are {n_groups:,} unique identities')" + ] + }, + { + "cell_type": "code", + "execution_count": 161, "metadata": {}, "outputs": [], "source": [ - "abbrev_mappings = {\n", - " 'en-US': 'en',\n", - " 'en-GB': 'en',\n", - " 'es-419': 'es-419',\n", - " 'es'\n", - "}" + "# convert DataFrame to dict\n", + "mseleb_top1m_records = df_msceleb_top1m.to_dict('records')" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 162, "metadata": {}, "outputs": [], "source": [ + "# store all identity info here, until creating dataframe\n", "msceleb_identities = {}" ] }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 163, "metadata": {}, "outputs": [], "source": [ + "# utility functions\n", "def split_name_lang(name_lang):\n", " '''Split name into name and language'''\n", " if '@' in name_lang:\n", @@ -137,141 +265,142 @@ " else:\n", " name = name_lang\n", " lang = ''\n", - " return {'name': name, 'lang': lang}" + " return {'name': name, 'lang': lang}\n", + "\n", + "# temp save DataFrame to CSV\n", + "def save_identity_master(identities, fp_out=fp_master_identities):\n", + " df_identities_master = pd.DataFrame.from_dict(identities)\n", + " df_identities_master.index.name = 'id'\n", + " df_identities_master.to_csv(fp_master_identities)" ] }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 164, "metadata": {}, "outputs": [ { "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "884edc099a404dfcb53e353d2abf6819", + "version_major": 2, + "version_minor": 0 + }, "text/plain": [ - "{'name': 'r@destiny', 'lang': 'en-417'}" + "HBox(children=(IntProgress(value=0, max=3481186), HTML(value='')))" ] }, - "execution_count": 122, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ - "split_name_lang('r@destiny@en')" + "# convert to \"name@lang\" to dict format\n", + "msceleb_identities = {}\n", + "for mseleb_top1m_record in tqdm(mseleb_top1m_records):\n", + " id_kg = mseleb_top1m_record['id_kg'].replace('m.','/m/')\n", + " if not id_kg in msceleb_identities.keys():\n", + " msceleb_identities[id_kg] = {'names': {}}\n", + " name_lang = split_name_lang(mseleb_top1m_record['name_lang'])\n", + " name = name_lang['name']\n", + " lang = name_lang['lang']\n", + " if lang == 'en':\n", + " msceleb_identities[id_kg]['names']['canonical'] = name\n", + " msceleb_identities[id_kg]['names'][lang] = name" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Patch @en names" ] }, { "cell_type": "code", - "execution_count": 141, + "execution_count": 165, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "0120e006a7564f5c82729a7050ef0386", + "model_id": "1cd2915f485b4cd299a929e1fb2d5926", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "HBox(children=(IntProgress(value=0, max=3481186), HTML(value='')))" + "HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "no english name for /m/017vbn\n", + "no english name for /m/026q0k_\n", + "no english name for /m/02k2kw\n", + "no english name for /m/0bwhrg1\n" + ] } ], "source": [ - "msceleb_identities = {}\n", - "for mseleb_top1m_record in tqdm(mseleb_top1m_records):\n", - " id_kg = mseleb_top1m_record['id_kg']\n", - " if not id_kg in msceleb_identities.keys():\n", - " msceleb_identities[id_kg] = {}\n", - " name_lang = split_name_lang(mseleb_top1m_record['name'])\n", - " name = name_lang['name']\n", - " lang = name_lang['lang']\n", - " msceleb_identities[id_kg][lang] = name" + "# check for missing english names\n", + "for id_kg, attrs in tqdm(msceleb_identities.items()):\n", + " lang_attrs = attrs['names']\n", + " name_en = lang_attrs.get('en', None)\n", + " if not name_en:\n", + " print(f'no english name for {id_kg}')" ] }, { "cell_type": "code", - "execution_count": 142, - "metadata": {}, - "outputs": [], - "source": [ - "import itertools\n", - "msceleb_identities_sm = dict(itertools.islice(msceleb_identities.items(), 0, 10))" - ] - }, - { - "cell_type": "code", - "execution_count": 145, + "execution_count": 166, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Patrick Cummins en\n", - "Patrick Cummins pt\n", - "Mohamed Guessous en\n", - "Mohamed Guessous fr\n", - "محمد جسوس ar\n", - "Tsvetta Kaleynska en\n", - "Tsvetta Kaleynska es\n", - "Tsvetta Kaleynska fr\n", - "Цвета Калейнска bg\n", - "Цвета Калейнска ru\n", - "Caio Henrique Siqueira Sanchez en\n", - "Кајо Санчез sr\n", - "Julio Ríos Gallego ca\n", - "Julio Ríos Gallego en\n", - "Julio Ríos Gallego es\n", - "Nilson Ricardo da Silva Júnior en\n", - "ニルソン・リカルド・ダ・シルバ・ジュニオール ja\n", - "니우송 히카르두 다 시우바 주니오르 ko\n", - "Aleksej Aleksandrovič Starobinski sl\n", - "Alexei Alexandrowitsch Starobinski de\n", - "Alexei Starobinski pt\n", - "Alexei Starobinsky en\n", - "Alexeï Starobinski fr\n", - "Алексей Александрович Старобинский ru\n", - "Старобінський Олексій Олександрович uk\n", - "アレクセイ・スタロビンスキー ja\n", - "Hilda Rix Nicholas en\n", - "هیلدا ریکس نیکولاس fa\n", - "Behrouz Makvandi en\n", - "Бехруз Макванди ru\n", - "بهروز مکوندی fa\n", - "Borislav Terzić en\n", - "Борислав Терзић sr\n" + "patched /m/017vbn de to en\n", + "patched /m/026q0k_ nl to en\n", + "patched /m/02k2kw de to en\n", + "patched /m/0bwhrg1 it to en\n" ] } ], "source": [ - "# de-duplicate names that use same spelling for multiple languages\n", - "for id_kg, name_langs in msceleb_identities_sm.items():\n", - " if 'en' in name_langs.keys():\n", - " name_en = name_langs['en']\n", - " for lang, name in name_langs.items():\n", - " print(name, lang)" + "# patch en name exception: 4 names missing english\n", + "en_exceptions = {\n", + " '/m/017vbn': 'de',\n", + " '/m/026q0k_': 'nl',\n", + " '/m/02k2kw': 'de',\n", + " '/m/0bwhrg1': 'it'\n", + "}\n", + "for id_kg, lang in en_exceptions.items():\n", + " msceleb_identities[id_kg]['names']['en'] = msceleb_identities[id_kg]['names'][lang]\n", + " msceleb_identities[id_kg]['names']['canonical'] = msceleb_identities[id_kg]['names']['en']\n", + " print(f'patched {id_kg} {lang} to en')" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "### Remove duplicate names" + ] }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 167, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "374a55f504084f14bd4d77fed0e2f4e4", + "model_id": "33ffa229c16d4a9088087c21210d421e", "version_major": 2, "version_minor": 0 }, @@ -286,62 +415,85 @@ "name": "stdout", "output_type": "stream", "text": [ - "n2 split is long: zh-Hant\n", - "n2 split is long: es-419\n", - "n2 split is long: fil\n", - "n2 split is long: en-GB\n", - "n2 split is long: en-US\n", - "n2 split is long: zh-HK\n", - "n2 split is long: fr-CA\n", - "n2 split is long: pt-PT\n", - "n2 split is long: ceb\n", - "n2 split is long: zorbla.de\n", - "n2 split is long: N\n", - "n2 split is long: hu\n", - "m.03zytg\tΑστέριος\"\n", - "n2 split is long: destiny\n", - "n2 split is long: Teng Boon Soon\n", - "n2 split is long: Yong Khoon Seng\n", - "n2 split is long: Tiki Anak Lafe\n", - "n2 split is long: Marcus Mojigoh\n", - "n2 split is long: Nyallau Anak Badak\n", - "n2 split is long: Bousou P\n", - "n2 split is long: evleaks\n" + "removed 1,485,336 duplicate names\n" ] } ], "source": [ - "messages = []\n", + "# de-duplicate names that use same spelling for multiple languages\n", + "items_removed = []\n", + "msceleb_identities_copy = msceleb_identities.copy()\n", + "\n", + "for id_kg, attrs in tqdm(msceleb_identities_copy.items()):\n", + " lang_attrs = attrs['names']\n", + " name_main = lang_attrs.get('canonical', None)\n", + " if not name_en:\n", + " print('error. all names need \"en\"')\n", + " break\n", + " lang_attrs_copy = attrs['names'].copy()\n", + " for lang, name in lang_attrs_copy.items():\n", + " if name == name_main and lang != 'en' and lang != 'canonical':\n", + " # remove it\n", + " items_removed.append(msceleb_identities[id_kg]['names'].pop(lang))\n", + " del lang_attrs_copy\n", "\n", - "for id_kg, msceleb_group in tqdm(df_msceleb_top1m_groups, total=n_groups):\n", + "del msceleb_identities_copy\n", + "print(f'removed {len(items_removed):,} duplicate names')\n", + "del items_removed" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Count images per person for ms celeb" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6e3a3f659fa6414b80d678d5b991ed0a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=5049824), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# calculate total images per id\n", + "msceleb_files = {}\n", + "# load text file\n", + "with open(fp_msceleb_clean_txt,'r') as fp:\n", + " msceleb_lines = fp.readlines()\n", + " \n", + "# iterate lines and append all files\n", + "for filepath in tqdm(msceleb_lines):\n", + " id_kg, fname = filepath.split('/')\n", " id_kg = id_kg.replace('m.', '/m/')\n", - " for df_row in msceleb_group.itertuples():\n", - " if '@' in df_row.name:\n", - " splits = df_row.name.split('@')\n", - " if not len(splits) > 1:\n", - " msg = f'only one split: {df_row.name}'\n", - " if not msg in messages:\n", - " print(msg)\n", - " messages.append(msg)\n", - " elif len(splits) > 1:\n", - " if len(splits[1]) != 2:\n", - " msg = f'n2 split is long: {splits[1]}'\n", - " if not msg in messages:\n", - " print(msg)\n", - " messages.append(msg)\n", - " else:\n", - " print(df_row.name)" + " if not id_kg in msceleb_files.keys():\n", + " msceleb_files[id_kg] = []\n", + " msceleb_files[id_kg].append(fname)" ] }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 171, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "475871ac6d08484cbec44d5ccf099bd8", + "model_id": "bd0530f0e4634a8dbae0308964cd6e2b", "version_major": 2, "version_minor": 0 }, @@ -354,251 +506,990 @@ } ], "source": [ - "# iterate groups and flatten language variations into named columns\n", - "identities = []\n", - "for id_kg, msceleb_group in tqdm(df_msceleb_top1m_groups, total=n_groups):\n", - " id_kg = id_kg.replace('m.', '/m/')\n", - " for df_row in msceleb_group.itertuples():\n", - " if '@' in df_row.name:\n", - " splits = df_row.name.split('@')\n", - " name = splits[0]\n", - " lang = splits[1] if len(splits) > 0 else 'en'\n", - " else:\n", - " # default to 'en'\n", - " lang = 'en'\n", - " name = df_row.name\n", - " col_name = f'ms_name_{lang}'\n", - " identities.append({'id_kg': id_kg, col_name: name})" + "# add count to \n", + "for id_kg, attrs in tqdm(msceleb_identities.items()):\n", + " if id_kg in msceleb_files.keys():\n", + " count = len(msceleb_files[id_kg])\n", + " else:\n", + " count = 0\n", + " msceleb_identities[id_kg]['count_msceleb'] = count" + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "metadata": {}, + "outputs": [], + "source": [ + "im_counts_idxs = [attrs['count_msceleb'] for id_kg, attrs in msceleb_identities.items()]\n", + "im_counts_id_kg = [id_kg for id_kg, _ in msceleb_identities.items()]" ] }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 173, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[{'id_kg': 'm/01008l47', 'ms_name_en': 'Patrick Cummins'}, {'id_kg': 'm/01008l47', 'ms_name_pt': 'Patrick Cummins'}]\n" + "Most images 130 for Leelee Sobieski\n", + "88,244 more than 10\n", + "78,027 more than 20\n", + "49,042 more than 50\n", + "5,025 more than 100\n" ] } ], "source": [ - "print(identities[0:10])" + "# print stats\n", + "idx_max = np.argmax(im_counts_idxs)\n", + "id_kg_max = im_counts_id_kg[idx_max]\n", + "count_max = im_counts_idxs[idx_max]\n", + "name_max = msceleb_identities[id_kg_max]['names']['canonical']\n", + "print(f'Most images {count_max:,} for {name_max}')\n", + "# distribution\n", + "im_counts_idxs = np.array(im_counts_idxs)\n", + "print(f'{len(im_counts_idxs[im_counts_idxs > 10]):,} more than 10')\n", + "print(f'{len(im_counts_idxs[im_counts_idxs > 20]):,} more than 20')\n", + "print(f'{len(im_counts_idxs[im_counts_idxs > 50]):,} more than 50')\n", + "print(f'{len(im_counts_idxs[im_counts_idxs > 100]):,} more than 100')" ] }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 174, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "420bc435f447454faa2dba73d7dff982", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# awkward conversion of msceleb_identities to a list of dicts\n", + "identities_flat = []\n", + "for id_kg, attrs in tqdm(msceleb_identities.items()):\n", + " obj = {'id_kg': id_kg}\n", + " for lang, name in attrs['names'].items():\n", + " if lang != 'canonical':\n", + " col_name = f'name_msceleb_{lang}'\n", + " elif lang == 'canonical':\n", + " col_name = 'name_msceleb'\n", + " obj[col_name] = name\n", + " obj['count_msceleb'] = attrs['count_msceleb']\n", + " identities_flat.append(obj)" + ] + }, + { + "cell_type": "code", + "execution_count": 175, "metadata": {}, "outputs": [], "source": [ - "# temp save DataFrame to CSV\n", - "def save_identity_master(identities, fp_out=fp_master_identities):\n", - " df_identities_master = pd.DataFrame.from_dict(identities)\n", - " df_identities_master.index.name = 'id'\n", - " df_identities_master.to_csv(fp_master_identities)" + "# convert to dataframe\n", + "df_identities = pd.DataFrame.from_dict(identities_flat)" + ] + }, + { + "cell_type": "code", + "execution_count": 176, + "metadata": {}, + "outputs": [], + "source": [ + "# save checkpoint CSV\n", + "save_identity_master(identities_flat) # encoding='utf-16' ??" + ] + }, + { + "cell_type": "code", + "execution_count": 177, + "metadata": {}, + "outputs": [], + "source": [ + "# copy to master and delete ref to msceleb\n", + "identities = msceleb_identities.copy()\n", + "del msceleb_identities" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Add image count data for MS Celeb" + "## LFW" ] }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 193, "metadata": {}, "outputs": [], "source": [ - "# load lines\n", - "fp_msceleb_clean = '/data_store_hdd/datasets/people/msceleb/downloads/MS-Celeb-1M_clean_list.txt'\n", - "with open(fp_msceleb_clean,'r') as fp:\n", - " msceleb_lines = fp.readlines()\n", - "msceleb_files = {}\n", + "# add LFW data\n", + "fp_lfw = '/data_store_hdd/datasets/people/lfw/downloads/lfw-names.txt'\n", + "with open(fp_lfw,'r') as fp:\n", + " lfw_lines = fp.readlines()\n", + "lfw_lines = [x.strip() for x in lfw_lines]\n", "\n", - "# iterate lines and append all files\n", - "for filepath in msceleb_lines:\n", - " id_kg, fname = filepath.split('/')\n", - " id_kg = id_kg.replace('m.', '/m/')\n", - " if not id_kg in msceleb_files.keys():\n", - " msceleb_files[id_kg] = []\n", - " msceleb_files[id_kg].append(fname)\n", - "\n", - " # add count\n", - "for identity in identities:\n", - " id_kg = identity['id_kg']\n", - " if id_kg in msceleb_files.keys():\n", - " identity['msceleb_count'] = len(msceleb_files[identity['id_kg']])\n", - " else:\n", - " identity['msceleb_count'] = 0" + "lfw_meta = []\n", + "for lfw_line in lfw_lines:\n", + " name_orig, count = lfw_line.split('\\t')\n", + " name_clean = name_orig.replace('_',' ')\n", + " obj = {'name_orig': name_orig, 'name': name_clean, 'count':count}\n", + " lfw_meta.append(obj)" ] }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 179, "metadata": {}, "outputs": [], "source": [ - "# save (takes 30 seconds)\n", - "save_identity_master(identities) # encoding='utf-16' ??" + "identities_tmp = identities.copy()" ] }, { "cell_type": "code", - "execution_count": 95, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# make exact name matches\n", + "lfw_name_matches_tmp = {}\n", + "for lfw_item in tqdm(lfw_meta):\n", + " lfw_name = lfw_item['name'] # name is transformed original name\n", + " lfwnl = lfw_name.lower()\n", + " splits = lfw_name.split(' ')\n", + " matches_tmp = {}\n", + " for word in splits:\n", + " # for each word in names, check if exact word is in master name list\n", + " c = word[0].lower()\n", + " matches_tmp = []\n", + " for name_id_kg in msceleb_top1m_az[c]:\n", + " name = name_id_kg['name']\n", + " id_kg = name_id_kg['id_kg']\n", + " if lfwnl in name.lower():\n", + " lfw_name_matches_tmp[lfw_name] = id_kg\n", + " break\n", + "print(f'found {len(lfw_name_matches_exact)} of {len(lfw_meta)} names using exact matches')" + ] + }, + { + "cell_type": "code", + "execution_count": 212, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8c0f2dbf032145fea3ad5759a97abc44", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlfw_name\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mlfwnl\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlfw_name\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mname_id_kg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmsceleb_top1m_az\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mc\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0mname\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mname_id_kg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'name'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mid_kg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mname_id_kg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'id_kg'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "# make exact name matches\n", + "lfw_name_matches_exact = {}\n", + "for lfw_item in tqdm(lfw_meta):\n", + " lfw_name = lfw_item['name'] # name is transformed original name\n", + " # quickly check if it's in the alphabetized list\n", + " c = lfw_name[0].lower()\n", + " lfwnl = lfw_name.lower()\n", + " for name_id_kg in msceleb_top1m_az[c]:\n", + " name = name_id_kg['name']\n", + " id_kg = name_id_kg['id_kg']\n", + " if lfwnl in name.lower():\n", + " lfw_name_matches_exact[lfw_name] = id_kg\n", + " break\n", + "print(f'found {len(lfw_name_matches_exact)} of {len(lfw_meta)} names using exact matches')" + ] + }, + { + "cell_type": "code", + "execution_count": 217, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1949065d12b349ce8bbf28ebd09f1e29", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "matched AJ Cook to A. J. Cook in canonical. Add to matched ids\n", + "matched AJ Lamas to A.J. Lamas in canonical. Add to matched ids\n", + "could not find: Aaron Patterson\n", + "matched Aaron Pena to Aaron Peña in canonical. Add to matched ids\n", + "could not find: Abdel Aziz Al-Hakim\n", + "could not find: Abdel Madi Shabneh\n", + "could not find: Abdel Nasser Assidi\n", + "could not find: Abdul Majeed Shobokshi\n", + "matched Abdulaziz Kamilov to Abdulaziz Komilov in canonical. Add to matched ids\n", + "could not find: Abdullah Nasseef\n", + "could not find: Abdullah al-Attiyah\n", + "could not find: Abdullatif Sener\n", + "could not find: Abner Martinez\n", + "could not find: Aby Har-Even\n", + "could not find: Adam Kennedy\n", + "could not find: Adelina Avila\n", + "could not find: Adisai Bodharamik\n", + "could not find: Adolfo Aguilar Zinser\n", + "could not find: Adoor Gopalakarishnan\n", + "could not find: Adrian Annus\n", + "matched Adrian Fernandez to Adriana Fernández in canonical. Add to matched ids\n", + "could not find: Adrian Nastase\n", + "could not find: Adriana Perez Navarro\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mlang\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0midentity\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'names'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;31m# for each name's language variation, look for match\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 13\u001b[0;31m \u001b[0mstrict_match\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0midentity_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnames_match_strict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlfw_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 14\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mstrict_match\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mmatched_id_kg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mid_kg\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/work/megapixels_dev/megapixels/app/utils/identity_utils.py\u001b[0m in \u001b[0;36mnames_match_strict\u001b[0;34m(a, b)\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mnames_match_strict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[0mclean_a\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mletter_strip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 28\u001b[0;31m \u001b[0mclean_b\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mletter_strip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 29\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclean_a\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclean_b\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mletter_match\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclean_a\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclean_b\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mletter_match\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclean_b\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclean_a\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/work/megapixels_dev/megapixels/app/utils/identity_utils.py\u001b[0m in \u001b[0;36mletter_strip\u001b[0;34m(a, b)\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mletter_strip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maZ9\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;31m# strip every letter from a that is not in b\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 20\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0ma\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 21\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mletter_match\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/work/megapixels_dev/megapixels/app/utils/identity_utils.py\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mletter_strip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maZ9\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;31m# strip every letter from a that is not in b\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 20\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0ma\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 21\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mletter_match\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "# make strict name-letter matches\n", + "lfw_name_matches_strict = {}\n", + "for lfw_item in tqdm(lfw_meta):\n", + " lfw_name = lfw_item['name'] # name is transformed original name\n", + " if lfw_name in lfw_name_matches_exact.keys():\n", + " continue\n", + " \n", + " matched_id_kg = None\n", + " for id_kg, identity in identities_tmp.items():\n", + " # for each msceleb identity, look for match\n", + " for lang, name in identity['names'].items():\n", + " # for each name's language variation, look for match\n", + " strict_match = identity_utils.names_match_strict(lfw_name, name)\n", + " if strict_match:\n", + " matched_id_kg = id_kg\n", + " matched_lang = lang\n", + " matched_name = name\n", + " break\n", + " if matched_id_kg:\n", + " print(f'matched {lfw_name} to {matched_name} in {matched_lang}. Add to matched ids')\n", + " lfw_name_matches_strict[lfw_name] = matched_id_kg\n", + " break\n", + " if not matched_id_kg:\n", + " print(f'could not find: {lfw_name}')\n", + "print(f'found {len(lfw_name_matches_strict)} of {len(lfw_meta)} names using exact matches')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# make fuzzy name matches\n" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d822a21cc63e4c5c9fe9bb637f5455dd", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, description='1st loop', max=5749, style=ProgressStyle(description_width='i…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found: Aaron Eckhart@ca\n", + "Found: Aaron Guiel@en\n", + "Found: Aaron Peirsol@ca\n", + "Found: Aaron Sorkin@ca\n", + "Found: Aaron Tippin@de\n", + "Found: Abba Eban@cs\n", + "Found: Abbas Kiarostami@ca\n", + "Found: Abdoulaye Wade@ca\n", + "Found: Abdul Rahman Lestaluhu@id\n", + "Found: Abdullah Cabir@tr\n", + "Found: Abdullah Ahmad Badawi@da\n", + "Found: Abdullah Gulam Rasoul@en\n", + "Found: Abel Aguilar@cs\n", + "Found: Abel Pacheco de la Espriella@es\n", + "Found: Abid Hamid Mahmud al-Tikriti@nl\n", + "Found: Abraham Foxman@cs\n", + "Found: Adam Ant@cs\n", + "Found: Adam Freier@en\n", + "Found: Adam Herbert@en\n", + "Found: Adam Mair@de\n", + "Found: Adam Richards@en\n", + "Found: Adam Sandler@ca\n", + "Found: George Adam Scott@en\n", + "Found: Adel Al-Jubeir@fr\n", + "Found: Adolfo Rodriguez Saa@id\n", + "Found: Adrian McPherson@en\n", + "Found: Adrian Murrell@en\n", + "Found: Adriana Lima@ca\n", + "Found: Adrien Brody@ca\n", + "Found: Afton Smith@cs\n", + "Found: Agbani Darego@de\n", + "Found: Agnelo Queiroz@en\n", + "Found: Agnes Bruckner@de\n", + "Found: Ahmed Ahmedou@de\n", + "Found: Ahmed Chalabi@en\n", + "Found: Mahmood Ahmed Ghazi@en\n", + "Found: Ahmet Necdet Sezer@ca\n", + "Found: Ai Sugiyama@da\n", + "Found: Aidan Quinn@ca\n", + "Found: Aileen Riggin Soule@fr\n", + "Found: Aishwarya Rai Bachchan@en\n", + "Found: Ajit Agarkar@en\n", + "Found: Akbar Al Baker@en\n", + "Found: Akbar Hashemi Rafsanjani@da\n", + "Found: Akhmed Zakayev@en\n", + "Found: Akiko Morigami@da\n", + "Found: Al Cardenas@en\n", + "Found: Vidal Davis@en\n", + "Found: Al Gore III@en\n", + "Found: Al Leiter@de\n", + "Found: Al Pacino@ca\n", + "Found: Al Sharpton@de\n", + "Found: Alain Cervantes@en\n", + "Found: Alain Ducasse@de\n", + "Found: Alan Ball jr.@nl\n", + "Found: Alan Dershowitz@da\n", + "Found: Alan Greenspan@de\n", + "Found: Alan Mulally@de\n", + "Found: Alan Trammell@de\n", + "Found: Alan Zemaitis@en\n", + "Found: Alanis Morissette@ca\n", + "Found: Alanna Ubach@de\n", + "Found: Alastair Campbell@de\n", + "Found: Alastair Johnston@en\n", + "Found: Albert Costa Balboa@es\n", + "Found: Albert Pujols@da\n", + "Found: Alberto Acosta@ca\n", + "Found: Alberto Fujimori@ca\n", + "Found: Alberto Sordi@ca\n", + "Found: Aldo Paredes@en\n", + "Found: Alec Baldwin@ca\n", + "Found: Alejandro Atchugarry@de\n", + "Found: Alejandro Fernandez Almendras@sl\n", + "Found: Alejandro Lembo@de\n", + "Found: Alejandro Lerner@en\n", + "Found: Alejandro Toledo@en\n", + "Found: Alek Wek@de\n", + "Found: Alessandro Nesta@ca\n", + "Found: Alex Barros@de\n", + "Found: Alex Cabrera@en\n", + "Found: Alex Ferguson@en\n", + "Found: Alex Holmes@en\n", + "Found: Alex Kingston@cs\n", + "Found: Alex Penelas@en\n", + "Found: Alex Popovici@es\n", + "Found: Alex Sink@en\n", + "Found: Alex Wallau@en\n", + "Found: Alex Zanardi@ca\n", + "Found: Alexa Vega@da\n", + "Found: Alexander Downer@de\n", + "Found: Alexander Losyukov@en\n", + "Found: Alexander Lukashenko@en\n", + "Found: Alexander Payne@cs\n", + "Found: Alexandra Pelosi@en\n", + "Found: Alexandra Stevenson@de\n", + "Found: Alexandre Daigle@cs\n", + "Found: Alexandre Despatie@ca\n", + "Found: Alexandre Herchcovitch@en\n", + "Found: Alexandre Vinokourov@fr\n", + "Found: Alexis Bledel@ca\n", + "Found: Alfonso Portillo@en\n", + "Found: Alfonso Soriano@en\n", + "Found: James Alfred Ford@en\n", + "Found: Alfred Santell@en\n", + "Found: Alfredo Moreno@en\n", + "Found: Ali Abbas Al-Hilfi@en\n", + "Found: Ali Abdullah Saleh@da\n", + "Found: Ali Ahmeti@de\n", + "Found: Prince Ali bin Hussein@en\n", + "Found: Ali Fallahian@de\n", + "Found: Ali Hammoud@en\n", + "Found: Ali Khamenei@ca\n", + "Found: Alicia Hollowell@en\n", + "Found: Alicia Keys@ca\n", + "Found: Alicia Molik@de\n", + "Found: Alicia Silverstone@ca\n", + "Found: Alicia Witt@ca\n", + "Found: Alimzhan Tokhtakhounov@pt\n", + "Found: Alina Kabaeva@en\n", + "Found: Alison Krauss@ca\n", + "Found: Alison Lohman@de\n", + "Found: Alistair Macdonald@en\n", + "Found: Allan Houston@ca\n", + "Found: Allan Kemakeza@de\n", + "Found: Allan Wagner Tizón@de\n", + "Found: Allen Iverson@ca\n", + "Found: Allison Janney@da\n", + "Found: Ally Sheedy@ca\n", + "Found: Allyson Felix@ca\n", + "Found: Alma Powell@de\n", + "Found: Alonzo Mourning@ca\n", + "Found: Aly Wagner@de\n", + "Found: Alyson Hannigan@ca\n", + "Found: Amanda Beard@de\n", + "Found: Amanda Bynes@ca\n", + "Found: Amanda Coetzer@de\n", + "Found: Amanda Marshall@de\n", + "Found: Amber Frey@en\n", + "Found: Amber Tamblyn@de\n", + "Found: Ambrose Lee@en\n", + "Found: Amelia Vega@en\n", + "Found: Amelie Mauresmo@ms\n", + "Found: Amr Moussa@ca\n", + "Found: Amram Mitzna@de\n", + "Found: Amy Brenneman@da\n", + "Found: Amy Cotton@en\n", + "Found: Amy Pascal@de\n", + "Found: Amy Redford@de\n", + "Found: Amy Smart@da\n", + "Found: Amy Yasbeck@de\n", + "Found: Ana Guevara@de\n", + "Found: Ananías Maidana Palacios@es\n", + "Found: Anastasia Kelesidou@de\n", + "Found: Anastasia Myskina@en\n", + "Found: Anatoliy Kinakh@en\n", + "Found: Anders Fogh Rasmussen@ca\n", + "Found: Andre Agassi@ca\n", + "Found: Andre Lange@et\n", + "Found: J. Andre Smith@en\n", + "Found: Andrea Bocelli@ca\n", + "Found: Andrea De Cruz@en\n", + "Found: Andrea Yates@en\n", + "Found: Andreas Vinciguerra@de\n", + "Found: Andrei Konchalovsky@en\n", + "Found: Andrei Mikhnevich@en\n", + "Found: Andrei Nikolishin@en\n", + "Found: Andrew Bernard@en\n", + "Found: Andrew Caldecott@en\n", + "Found: Andrew Cuomo@ca\n", + "Found: Andrew Fastow@de\n", + "Found: Andrew Firestone@en\n", + "Found: Andrew Gilligan@en\n", + "Found: Andrew Jarecki@de\n", + "Found: Andrew Luster@de\n", + "Found: Andrew Niccol@cs\n", + "Found: Andy Benes@en\n", + "Found: Andy Dickens@en\n", + "Found: DJ Andy Garcia@en\n", + "Found: Andy Griffith@ca\n", + "Found: Andy Griggs@en\n", + "Found: Andy Lau@cs\n", + "Found: Andy Northey@en\n", + "Found: Sandy Perez Aguila@en\n", + "Found: Andy Roddick@ca\n", + "Found: Andy Rooney@da\n", + "Found: Andy Warhol@ca\n", + "Found: Angela Bassett@ca\n", + "Found: Angela Lansbury@ca\n", + "Found: Angela Merkel@ca\n", + "Found: Angelina Jolie@ca\n", + "Found: Angie Martinez@en\n", + "Found: Anita DeFrantz@de\n", + "Found: Ann Landers@da\n", + "Found: Ann Morgan Guilbert@en\n", + "Found: Ann Veneman@de\n", + "Found: Anna Chicherova@en\n", + "Found: Anna Faris@ca\n", + "Found: Susanna Jones@en\n", + "Found: Anna Kournikova@da\n", + "Found: Anna Nicole Smith@ca\n", + "Found: Anne Donovan@de\n", + "Found: Anne Heche@ca\n", + "Found: Anne Krueger@fr\n", + "Found: Anne McLellan@en\n", + "Found: Annette Bening@ca\n", + "Found: Annette Lu@de\n", + "Found: Annie Machon@de\n", + "Found: Antanas Valionis@de\n", + "Found: Anthony Fauci@de\n", + "Found: Anthony Garotinho@en\n", + "Found: Anthony Hopkins@ca\n", + "Found: Anthony LaPaglia@da\n", + "Found: Anthony Principi@de\n", + "Found: Antje Buschschulte@de\n", + "Found: Anton Balasingham@en\n", + "Found: Antonio Banderas@ca\n", + "Found: Antonio Cassano@ca\n", + "Found: Antonio Catania@de\n", + "Found: Antonio Palocci@de\n", + "Found: Antonio Trillanes IV@fil\n", + "Found: Antony Leung@en\n", + "Found: Antwun Echols@en\n", + "Found: Anwar Ibrahim@da\n", + "Found: Aretha Franklin@ca\n", + "Found: Ari Bousbib@en\n", + "Found: Ari Fleischer@de\n", + "Found: Arianna Huffington@ca\n", + "Found: Arie Haan@de\n", + "Found: Ariel Sharon@ca\n", + "Found: Arif Mardin@de\n", + "Found: Arlen Specter@ca\n", + "Found: Armando Carrillo@en\n", + "Found: Arminio Fraga@en\n", + "Found: Arnold Palmer@da\n", + "Found: Arnold Schwarzenegger@ca\n", + "Found: Rolfe Arnold Scott-James@en\n", + "Found: Aron Ralston@cs\n", + "Found: Stuart Cooper@en\n", + "Found: Stuart Howe@en\n", + "Found: Arthur Johnson@it\n", + "Found: John Arthur Martinez@en\n", + "Found: Arturo Gatti@ca\n", + "Found: Asa Hutchinson@de\n", + "Found: Ashanti Douglas@nl\n", + "Found: Ashley Judd@ca\n", + "Found: Ashley Olsen@ca\n", + "Found: Ashley Postell@en\n", + "Found: Ashraf Ghani Ahmadzai@es\n", + "Found: Ashton Kutcher@ca\n", + "Found: Asif Ali Zardari@ca\n", + "Found: Askar Akayev@en\n", + "Found: Astou Ndiaye-Diatta@en\n", + "Found: Premiership of Atal Bihari Vajpayee@en\n", + "Found: Atom Egoyan@da\n", + "Found: Atsushi Satou@id\n", + "Found: Audrey Lacroix@en\n", + "Found: Audrey Sauret@en\n", + "Found: Augusto Pinochet Ugarte@ca\n", + "Found: Augusto Roa Bastos@de\n", + "Found: Aung San Suu Kyi@ca\n", + "Found: Austin Kearns@en\n", + "Found: Avril Lavigne@ca\n", + "Found: Azmi Bishara@ca\n", + "Found: Azra Akin@id\n", + "Found: Babe Ruth@ca\n", + "Found: Barbara Bach@cs\n", + "Found: Barbara Becker-Cantarino@en\n", + "Found: Barbara Bodine@en\n", + "Found: Barbara Boxer@cs\n", + "Found: Barbara Brezigar@cs\n", + "Found: Barbara Robertson@en\n", + "Found: Barbara Walters@de\n", + "Found: Barbra Streisand@ca\n", + "Found: Barry Alvarez@en\n", + "Found: Barry Bonds@da\n", + "Found: Barry Collier@en\n", + "Found: Barry Diller@de\n", + "Found: Barry Forde@ca\n", + "Found: Barry Hinson@en\n", + "Found: Barry Switzer@de\n", + "Found: Barry Williamson@en\n", + "Found: Barry Zito@de\n", + "Found: Bart Freundlich@de\n", + "Found: Bart Hendricks@en\n", + "Found: Bartosz Kizierowski@de\n", + "Found: Barzan Al-Tikriti@fr\n", + "Found: Basdeo Panday@de\n", + "Found: Baz Luhrmann@ca\n", + "Found: Beatriz Merino Lucero@pl\n", + "Found: Bela Karolyi@ms\n", + "Found: Ben Affleck@ca\n", + "Found: Torben Betts@en\n", + "Found: Ben Braun@en\n", + "Found: Ben Broussard@en\n", + "Found: Ben Cahoon@en\n", + "Found: Reuben Davis@en\n", + "Found: Ben Kingsley@ca\n", + "Found: Ben Lee Tyler@en\n", + "Found: Ben Steinbauer@en\n", + "Found: Benazir Bhutto@ca\n", + "Found: Benedita da Silva@en\n", + "Found: Benicio Del Toro@fi\n", + "Found: Benito Santiago@en\n", + "Found: Benjamin Bratt@cs\n", + "Found: Benjamin Franklin Bailey@en\n", + "Found: Benjamin McKenzie@ca\n", + "Found: Benjamin Netanyahu@da\n", + "Found: Bernadette Peters@ca\n", + "Found: Bernard Ebbers@de\n", + "Found: Bernard Giraudeau@de\n", + "Found: Bernard Kerik@en\n", + "Found: Bernard Landry@de\n", + "Found: Bernard Law@fr\n", + "Found: Bernard Lord@en\n", + "Found: Bernardo Segura@de\n", + "Found: Bertie Ahern@ca\n", + "Found: Bertrand Bonello@de\n", + "Found: A. Elizabeth Jones@en\n", + "Found: Bettina Rheims@cs\n", + "Found: Betty Williams@en\n", + "Found: Bianca Jagger@da\n", + "Found: Bijan Namdar Zangeneh@de\n", + "Found: Bill Belichick@da\n", + "Found: Bill Butler@ca\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found: Bill Callahan@en\n", + "Found: Bill Cartwright@en\n", + "Found: Bill Clancy@en\n", + "Found: Bill Clinton@ca\n", + "Found: Bill Curry@en\n", + "Found: Bill Doba@en\n", + "Found: Bill Elliott@pt\n", + "Found: Bill Fennelly@en\n", + "Found: Bill Frist@de\n", + "Found: Bill Gates@ca\n", + "Found: Bill Grahame@en\n", + "Found: Bill Guerin@de\n", + "Found: Bill Herrion@en\n", + "Found: Bill Hughes@en\n", + "Found: Bill Kollar@en\n", + "Found: Bill Kong@es\n", + "Found: Bill Mauldin@de\n", + "Found: Bill McBride@en\n", + "Found: Bill Nelson@da\n", + "Found: Bill Parcells@de\n", + "Found: Bill Parsons@en\n", + "Found: Bill Paxton@ca\n", + "Found: Bill Self@de\n", + "Found: Bill Sizemore@en\n", + "Found: Bill Stapleton@en\n", + "Found: Bill Steinke@en\n", + "Found: Bill Walton@de\n", + "Found: Billy Andrade@da\n", + "Found: Billy Beane@de\n", + "Found: Billy Bob Thornton@ca\n", + "Found: Billy Boyd@en\n", + "Found: Billy Crawford@de\n", + "Found: Billy Crystal@ca\n", + "Found: Billy Donovan@en\n", + "Found: Billy Gilman@en\n", + "Found: Billy Joel@ca\n", + "Found: Bing Crosby@ca\n", + "Found: Binyamin Ben-Eliezer@en\n", + "Found: Bison Dele@de\n", + "Found: Bixente Lizarazu@ca\n", + "Found: Blas Ople@de\n", + "Found: Blythe Danner@ca\n", + "Found: Blythe Hartley@de\n", + "Found: Bo Pelini@en\n", + "Found: Bo Ryan@en\n", + "Found: Bob Alper@en\n", + "Found: Bob Beauprez@de\n", + "Found: Bob Bowlsby@en\n", + "Found: Bob Dole@ca\n", + "Found: Bob Ferguson@da\n", + "Found: Bob Geldof@ca\n", + "Found: Bob Graham@en\n", + "Found: Bob Guccione@cs\n", + "Found: Bob Hayes@cs\n", + "Found: Bob Holden@de\n", + "Found: Bob Hope@ca\n", + "Found: Bob Huggins@en\n", + "Found: Bob Iger@en\n", + "Found: Bob Krueger@en\n", + "Found: Bob Menendez@da\n", + "Found: Bob Newhart@de\n", + "Found: Bob Stoops@en\n", + "Found: Bob Taft@de\n", + "Found: Bobby Bowden@de\n", + "Found: Bobby Kielty@en\n", + "Found: Bobby Robson@ca\n", + "Found: Bode Miller@ca\n", + "Found: Bonnie Fuller@en\n", + "Found: Bonnie Hunt@ca\n", + "Found: Nella Maria Bonora@de\n", + "Found: Boris Berezovsky@en\n", + "Found: Boris Henry@cs\n", + "Found: Boris Jordan@en\n", + "Found: Boris Trajkovski@ca\n", + "Found: Boris Yeltsin@en\n", + "Found: Brad Banks@en\n", + "Found: Brad Brownell@en\n", + "Found: Brad Garrett@da\n", + "Found: Brad Gushue@de\n", + "Found: Brad Miller@en\n", + "Found: Brad Pitt@ca\n", + "Found: Brad Wilk@cs\n", + "Found: Brajesh Mishra@en\n", + "Found: Brandon Boyd@da\n", + "Found: Brandon Hammond@en\n", + "Found: Brandon Inge@de\n", + "Found: Brandon Jones@en\n", + "Found: Brandon Knight@de\n", + "Found: Brandon Larson@en\n", + "Found: Brandon Lloyd@en\n", + "Found: Brandon Webb@pl\n", + "Found: Branko Crvenkovski@ca\n", + "Found: Brendan Fraser@ca\n", + "Found: Brendan Gaughan@en\n", + "Found: Brendan Hansen@en\n", + "Found: H. Brent Coles@en\n", + "Found: Brett Hawke@en\n", + "Found: Brett Hull@cs\n", + "Found: Brian Billick@de\n", + "Found: Brian Campbell Vickery@de\n", + "Found: Brian Cashman@en\n", + "Found: Brian Clemens@de\n", + "Found: Brian Cook@en\n", + "Found: Brian Cowen@ca\n", + "Found: Brian De Palma@ca\n", + "Found: Brian Gregory@en\n", + "Found: Brian Griese@en\n", + "Found: Brian Heidik@en\n", + "Found: Brian Henson@en\n", + "Found: Brian Kerr@de\n", + "Found: Brian Lara@de\n", + "Found: Brian Mulroney@ca\n", + "Found: Brian Olson@en\n", + "Found: Brian Scalabrine@ca\n", + "Found: Brian Schneider@en\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;31m# first, grep all rows of the original TSV file\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mlfw_name_clean\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlfw_name\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mmsceleb_row\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdf_msceleb_top1m\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitertuples\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlfw_name_clean\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmsceleb_row\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname_lang\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf'Found: {msceleb_row.name_lang}'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "# compare this this to master identity\n", + "for lfw_item in tqdm(lfw_meta, desc='1st loop'):\n", + " \n", + " # for each LFW name, look for match\n", + " lfw_name = lfw_item['name']\n", + " matched_id = None\n", + " \n", + " for id_kg, identity in identities_tmp.items():\n", + " # for each msceleb identity, look for match\n", + " for lang, name in identity['names'].items():\n", + " # for each name's language variation, look for match\n", + " if not len(name) > 0:\n", + " print('no name')\n", + " continue\n", + " strict_match = identity_utils.names_match_strict(lfw_name, name)\n", + " if strict_match:\n", + " #print(f'Strict matched \"{lfw_name}\" to \"{name}\"')\n", + " matched_id = id_kg\n", + " matched_lang = lang\n", + " matched_name = name\n", + " break\n", + " if matched_id:\n", + " matched_lang = lang\n", + " matched_name = name\n", + " print(f'OK. Found match: {lfw_name} == {matched_name} in lang: {matched_lang}')\n", + " pbar_ids.clear()\n", + " pbar_ids.close()\n", + " break\n", + " if not matched_id:\n", + " print(f'ERROR: could not find {lfw_name}')\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 103, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['id_kg',\n", - " 'ms_name_ Marcus Mojigoh',\n", - " 'ms_name_ Nyallau Anak Badak',\n", - " 'ms_name_ Teng Boon Soon',\n", - " 'ms_name_ Tiki Anak Lafe',\n", - " 'ms_name_ Yong Khoon Seng',\n", - " 'ms_name_Bousou P',\n", - " 'ms_name_N',\n", - " 'ms_name_af',\n", - " 'ms_name_am',\n", - " 'ms_name_ar',\n", - " 'ms_name_az',\n", - " 'ms_name_be',\n", - " 'ms_name_bg',\n", - " 'ms_name_bm',\n", - " 'ms_name_bn',\n", - " 'ms_name_bo',\n", - " 'ms_name_br',\n", - " 'ms_name_bs',\n", - " 'ms_name_ca',\n", - " 'ms_name_ceb',\n", - " 'ms_name_ck',\n", - " 'ms_name_co',\n", - " 'ms_name_cr',\n", - " 'ms_name_cs',\n", - " 'ms_name_cy',\n", - " 'ms_name_da',\n", - " 'ms_name_de',\n", - " 'ms_name_destiny',\n", - " 'ms_name_dz',\n", - " 'ms_name_el',\n", - " 'ms_name_en',\n", - " 'ms_name_en-GB',\n", - " 'ms_name_en-US',\n", - " 'ms_name_eo',\n", - " 'ms_name_es',\n", - " 'ms_name_es-419',\n", - " 'ms_name_et',\n", - " 'ms_name_eu',\n", - " 'ms_name_evleaks',\n", - " 'ms_name_fa',\n", - " 'ms_name_fi',\n", - " 'ms_name_fil',\n", - " 'ms_name_fo',\n", - " 'ms_name_fr',\n", - " 'ms_name_fr-CA',\n", - " 'ms_name_fy',\n", - " 'ms_name_ga',\n", - " 'ms_name_gd',\n", - " 'ms_name_gl',\n", - " 'ms_name_gn',\n", - " 'ms_name_gu',\n", - " 'ms_name_ha',\n", - " 'ms_name_hi',\n", - " 'ms_name_hr',\n", - " 'ms_name_ht',\n", - " 'ms_name_hu',\n", - " 'ms_name_hu\\r\\nm.03zytg\\tΑστέριος\"',\n", - " 'ms_name_hy',\n", - " 'ms_name_id',\n", - " 'ms_name_ig',\n", - " 'ms_name_is',\n", - " 'ms_name_it',\n", - " 'ms_name_iw',\n", - " 'ms_name_ja',\n", - " 'ms_name_ka',\n", - " 'ms_name_kk',\n", - " 'ms_name_kl',\n", - " 'ms_name_km',\n", - " 'ms_name_kn',\n", - " 'ms_name_ko',\n", - " 'ms_name_ku',\n", - " 'ms_name_ky',\n", - " 'ms_name_la',\n", - " 'ms_name_lb',\n", - " 'ms_name_lo',\n", - " 'ms_name_lt',\n", - " 'ms_name_lv',\n", - " 'ms_name_mg',\n", - " 'ms_name_mi',\n", - " 'ms_name_mk',\n", - " 'ms_name_ml',\n", - " 'ms_name_mn',\n", - " 'ms_name_mr',\n", - " 'ms_name_ms',\n", - " 'ms_name_mt',\n", - " 'ms_name_my',\n", - " 'ms_name_ne',\n", - " 'ms_name_nl',\n", - " 'ms_name_nn',\n", - " 'ms_name_no',\n", - " 'ms_name_nv',\n", - " 'ms_name_ny',\n", - " 'ms_name_oc',\n", - " 'ms_name_or',\n", - " 'ms_name_pa',\n", - " 'ms_name_pl',\n", - " 'ms_name_ps',\n", - " 'ms_name_pt',\n", - " 'ms_name_pt-PT',\n", - " 'ms_name_ro',\n", - " 'ms_name_ru',\n", - " 'ms_name_rw',\n", - " 'ms_name_sa',\n", - " 'ms_name_sc',\n", - " 'ms_name_se',\n", - " 'ms_name_si',\n", - " 'ms_name_sk',\n", - " 'ms_name_sl',\n", - " 'ms_name_sn',\n", - " 'ms_name_so',\n", - " 'ms_name_sq',\n", - " 'ms_name_sr',\n", - " 'ms_name_st',\n", - " 'ms_name_su',\n", - " 'ms_name_sv',\n", - " 'ms_name_sw',\n", - " 'ms_name_ta',\n", - " 'ms_name_te',\n", - " 'ms_name_tg',\n", - " 'ms_name_th',\n", - " 'ms_name_tr',\n", - " 'ms_name_ug',\n", - " 'ms_name_uk',\n", - " 'ms_name_ur',\n", - " 'ms_name_uz',\n", - " 'ms_name_vi',\n", - " 'ms_name_xh',\n", - " 'ms_name_yi',\n", - " 'ms_name_yo',\n", - " 'ms_name_zh',\n", - " 'ms_name_zh-HK',\n", - " 'ms_name_zh-Hant',\n", - " 'ms_name_zorbla.de',\n", - " 'ms_name_zu']" + "True" ] }, - "execution_count": 95, + "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "list(df_identities_master.keys())" + "identity_utils.names_match_strict('AJ Cook', 'A.J. Cook')" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.0" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "names_match('A.J. Cook', 'cook Aj', as_float=True, compound_score=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## PubFig" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add pubfig data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Face Scrub" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add facescrub" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## UMD Faces" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add umd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## CASIA Webface" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add CASIA Webface" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# IMDB Wiki" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add imdb-wiki" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## IMDB-Face" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add imdb face" ] }, { @@ -625,7 +1516,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.7.1" } }, "nbformat": 4, -- cgit v1.2.3-70-g09d2 From 1bbee45bcb178fa47e60910c76bd914c43e7fd90 Mon Sep 17 00:00:00 2001 From: Jules Laplace Date: Wed, 20 Feb 2019 12:47:44 +0100 Subject: commenting --- scraper/util.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/scraper/util.py b/scraper/util.py index 0c3e2169..fa9f6a22 100644 --- a/scraper/util.py +++ b/scraper/util.py @@ -400,6 +400,7 @@ def fetch_paper(s2, paper_id): return paper def fetch_spreadsheet(): + """Open the Google Spreadsheet, which contains the individual worksheets""" scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive'] path = os.path.dirname(os.path.abspath(__file__)) credentials = ServiceAccountCredentials.from_json_keyfile_name(os.path.join(path, '.creds/Megapixels-ef28f91112a9.json'), scope) @@ -409,16 +410,22 @@ def fetch_spreadsheet(): return spreadsheet def fetch_worksheet(name="institutions"): + """Get a reference to a particular "worksheet" from the Google Spreadsheet""" spreadsheet = fetch_spreadsheet() return spreadsheet.worksheet(name) def fetch_google_sheet(name="institutions"): + """Get all the values from a particular worksheet as a list of lists. + Returns: + :keys - the first row of the document + :lines - a list of lists with the rest of the rows""" rows = fetch_worksheet(name).get_all_values() keys = rows[0] lines = rows[1:] return keys, lines def fetch_google_sheet_objects(name): + """Get all the values from a worksheet as a list of dictionaries""" keys, rows = fetch_google_sheet(name) recs = [] for row in rows: @@ -429,6 +436,8 @@ def fetch_google_sheet_objects(name): return recs def fetch_google_lookup(name, item_key='key'): + """Get all the values from a worksheet as a dictionary of dictionaries. + Specify which field you want to use as the dictionary key.""" keys, rows = fetch_google_sheet(name) lookup = {} for row in rows: -- cgit v1.2.3-70-g09d2 From 7885a180e1b3ddc37ef2192c74a897b911e48a14 Mon Sep 17 00:00:00 2001 From: Jules Laplace Date: Wed, 20 Feb 2019 16:05:25 +0100 Subject: adding countries to citation feed / geocode step --- megapixels/commands/datasets/citations_to_csv.py | 4 +- scraper/README.md | 34 +- scraper/countries.json | 79 ++--- scraper/pdf_dump_all.sh | 20 -- scraper/pdf_dump_first_page.sh | 17 - scraper/reports/doi_institutions_geocoded.csv | 391 +++++++++++++++++++++++ scraper/reports/doi_institutions_not_found.csv | 27 ++ scraper/s2-geocode-spreadsheet.py | 160 ++++++---- scraper/s2-geocode.py | 5 +- scraper/util.py | 18 +- 10 files changed, 602 insertions(+), 153 deletions(-) delete mode 100644 scraper/pdf_dump_all.sh delete mode 100644 scraper/pdf_dump_first_page.sh diff --git a/megapixels/commands/datasets/citations_to_csv.py b/megapixels/commands/datasets/citations_to_csv.py index 8959cdaf..cda879aa 100644 --- a/megapixels/commands/datasets/citations_to_csv.py +++ b/megapixels/commands/datasets/citations_to_csv.py @@ -83,7 +83,7 @@ def get_citations(dataset_key, dataset_name, json_data): for a in addresses: paper = Paper(dataset_key, dataset_name, p['id'], p['title'], d_type, year, p['pdf'], - a['address'], a['type'], a['lat'], a['lng']) + a['name'], a['type'], a['lat'], a['lng'], a['country']) papers.append(paper) else: paper = Paper(p['key'], p['name'], d['id'], p['title'], 'main', year, p['pdf']) @@ -103,7 +103,7 @@ def get_orig_paper(json_data): continue paper = Paper(p['key'], p['name'], p['paper_id'], p['title'], d_type, year, p['pdf'], - a['address'], a['type'], a['lat'], a['lng']) + a['name'], a['type'], a['lat'], a['lng'], a['country']) papers.append(paper) else: paper = Paper(p['key'], p['name'], p['paper_id'], p['title'], d_type, year, p['pdf']) diff --git a/scraper/README.md b/scraper/README.md index 993dbfa2..e19a6920 100644 --- a/scraper/README.md +++ b/scraper/README.md @@ -74,7 +74,7 @@ Included in the content-script folder is a Chrome extension which scrapes Google --- -## Scraping Institutions +## Mapping papers to locations Once you have the data from S2, you can scrape all the PDFs (and other URLs) you find, and then extract institutions from those and geocode them. @@ -98,22 +98,34 @@ Use pdfminer.six to extract the first page from the PDFs. Perform initial extraction of university-like terms, to be geocoded. +### s2-pdf-report.py + +Generates reports of things from the PDFs that were not found. + ### s2-doi-report.py -Extract named entities from the scraped DOI links (IEEE, ACM, etc). +Extract named entities from the scraped DOI links (IEEE, ACM, etc), as well as unknown entities. This is technically the cleanest data, since we know 99% of it is institutions, but it's also quite noisy. ### s2-geocode.py -Geocode lists of entities using Nominativ. +Geocode lists of unknown entities using Google. By default, it tries to geocode everything that was not recognized by the DOI report. + +### s2-geocode-spreadsheet.py + +To add new institutions, simply list them in the spreadsheet with the lat/lng fields empty. Then run this script and anything missing a lat/lng will get one. ### s2-citation-report.py For each paper in the citations CSV, find the corresponding paper in the database, and get all the citations. For each of the citations, try to find an address for each one. Embed the appropriate entries from institutions list and then render them on a leaflet map. +### s2-final-report.py + +Generate the final JSON files containing the final, raw Megapixels dataset. Includes data on the papers, with merged citations, as well as the corresponding data from the spreadsheets. Suitable for making custom builds for other people. + --- -## Cleaning the Data +## Notes on the geocoding process After scraping these universities, we got up to 47% match rate on papers from the dataset. However there is still more to solve: @@ -123,23 +135,25 @@ After scraping these universities, we got up to 47% match rate on papers from th - Empty addresses - some papers need to be gone through by hand? Maybe we can do digram/trigram analysis on the headings. Just finding common words would help. - Make a list of bogus papers - ones where PDFminer returned empty results, or which did not contain the word ABSTRACT, or were too long. +These scripts are files for getting an initial set of universities to dedupe cleanly. + ### expand-uni-lookup.py By now I had a list of institutions in `reports/all_institutions.csv` (done by merging the results of the geocoding, as I had done this on 4 computers and thus had 4 files of institutions). This file must be gone through manually. This technique geocoded around 47% of papers. At this point I moved `reports/all_institutions.csv` into the Google Sheets. All further results use the CSV on Google Sheets. -### s2-pdf-report.py +--- -Generates reports of things from the PDFs that were not found. +## Dumping all the PDF text -### s2-geocode-spreadsheet.py +### s2-extract-full-pdf-txt.py -To add new institutions, simply list them in the spreadsheet with the lat/lng fields empty. Then run this script and anything missing a lat/lng will get one. +Dumps all the PDF text and images to `datasets/s2/txt/*/*/paper.txt` using PDFMiner. -### s2-citation-report.py +### rm-txt-images.sh -Generate the main report with maps and citation lists. +The images dumped by PDFMiner include `*.img` files which seem to be some sort of raw image file. ImageMagick doesn't recognize them and they take a lot of space so I just delete them and leave the `jpg`/`bmp` files. --- diff --git a/scraper/countries.json b/scraper/countries.json index 4844aa14..d3dd213d 100644 --- a/scraper/countries.json +++ b/scraper/countries.json @@ -13,14 +13,14 @@ {"name": "Armenia", "code": "AM"}, {"name": "Aruba", "code": "AW"}, {"name": "Australia", "code": "AU"}, -{"name": "Austria", "code": "AT"}, +{"name": "Austria", "code": "AT", "alt": ["Österreich"]}, {"name": "Azerbaijan", "code": "AZ"}, {"name": "Bahamas", "code": "BS"}, {"name": "Bahrain", "code": "BH"}, -{"name": "Bangladesh", "code": "BD"}, +{"name": "Bangladesh", "code": "BD", "alt": ["বাংলাদেশ"]}, {"name": "Barbados", "code": "BB"}, {"name": "Belarus", "code": "BY"}, -{"name": "Belgium", "code": "BE"}, +{"name": "Belgium", "code": "BE", "alt": ["België / Belgique / Belgien"]}, {"name": "Belize", "code": "BZ"}, {"name": "Benin", "code": "BJ"}, {"name": "Bermuda", "code": "BM"}, @@ -29,21 +29,21 @@ {"name": "Bosnia and Herzegovina", "code": "BA"}, {"name": "Botswana", "code": "BW"}, {"name": "Bouvet Island", "code": "BV"}, -{"name": "Brazil", "code": "BR"}, +{"name": "Brazil", "code": "BR", "alt": ["Brasil"]}, {"name": "British Indian Ocean Territory", "code": "IO"}, {"name": "Brunei Darussalam", "code": "BN"}, {"name": "Bulgaria", "code": "BG"}, {"name": "Burkina Faso", "code": "BF"}, {"name": "Burundi", "code": "BI"}, {"name": "Cambodia", "code": "KH"}, -{"name": "Cameroon", "code": "CM"}, +{"name": "Cameroon", "code": "CM", "alt": ["Cameroun"]}, {"name": "Canada", "code": "CA"}, {"name": "Cape Verde", "code": "CV"}, {"name": "Cayman Islands", "code": "KY"}, {"name": "Central African Republic", "code": "CF"}, {"name": "Chad", "code": "TD"}, {"name": "Chile", "code": "CL"}, -{"name": "China", "code": "CN"}, +{"name": "China", "code": "CN", "alt": ["中国"]}, {"name": "Christmas Island", "code": "CX"}, {"name": "Cocos (Keeling) Islands", "code": "CC"}, {"name": "Colombia", "code": "CO"}, @@ -55,23 +55,23 @@ {"name": "Cote D'Ivoire", "code": "CI"}, {"name": "Croatia", "code": "HR"}, {"name": "Cuba", "code": "CU"}, -{"name": "Cyprus", "code": "CY"}, -{"name": "Czech Republic", "code": "CZ"}, -{"name": "Denmark", "code": "DK"}, +{"name": "Cyprus", "code": "CY", "alt": ["Κύπρος - Kıbrıs"]}, +{"name": "Czech Republic", "code": "CZ", "alt": ["Czechia", "Česko"]}, +{"name": "Denmark", "code": "DK", "alt": ["Danmark"]}, {"name": "Djibouti", "code": "DJ"}, {"name": "Dominica", "code": "DM"}, {"name": "Dominican Republic", "code": "DO"}, {"name": "Ecuador", "code": "EC"}, -{"name": "Egypt", "code": "EG"}, +{"name": "Egypt", "code": "EG", "alt": ["مصر"]}, {"name": "El Salvador", "code": "SV"}, {"name": "Equatorial Guinea", "code": "GQ"}, {"name": "Eritrea", "code": "ER"}, -{"name": "Estonia", "code": "EE"}, +{"name": "Estonia", "code": "EE", "alt": ["Eesti"]}, {"name": "Ethiopia", "code": "ET"}, {"name": "Falkland Islands (Malvinas)", "code": "FK"}, {"name": "Faroe Islands", "code": "FO"}, {"name": "Fiji", "code": "FJ"}, -{"name": "Finland", "code": "FI"}, +{"name": "Finland", "code": "FI", "alt": ["Suomi"]}, {"name": "France", "code": "FR"}, {"name": "French Guiana", "code": "GF"}, {"name": "French Polynesia", "code": "PF"}, @@ -79,10 +79,10 @@ {"name": "Gabon", "code": "GA"}, {"name": "Gambia", "code": "GM"}, {"name": "Georgia", "code": "GE"}, -{"name": "Germany", "code": "DE"}, +{"name": "Germany", "code": "DE", "alt": ["Deutschland"]}, {"name": "Ghana", "code": "GH"}, {"name": "Gibraltar", "code": "GI"}, -{"name": "Greece", "code": "GR"}, +{"name": "Greece", "code": "GR", "alt": ["Ελλάδα"]}, {"name": "Greenland", "code": "GL"}, {"name": "Grenada", "code": "GD"}, {"name": "Guadeloupe", "code": "GP"}, @@ -98,24 +98,24 @@ {"name": "Honduras", "code": "HN"}, {"name": "Hong Kong", "code": "HK"}, {"name": "Hungary", "code": "HU"}, -{"name": "Iceland", "code": "IS"}, +{"name": "Iceland", "code": "IS", "alt": ["Ísland"]}, {"name": "India", "code": "IN"}, {"name": "Indonesia", "code": "ID"}, -{"name": "Iran, Islamic Republic Of", "code": "IR"}, +{"name": "Iran", "code": "IR", "alt": ["ایران", "ایران"]}, {"name": "Iraq", "code": "IQ"}, {"name": "Ireland", "code": "IE"}, {"name": "Isle of Man", "code": "IM"}, -{"name": "Israel", "code": "IL"}, -{"name": "Italy", "code": "IT"}, +{"name": "Israel", "code": "IL", "alt": ["ישראל"]}, +{"name": "Italy", "code": "IT", "alt": ["Italia"]}, {"name": "Jamaica", "code": "JM"}, -{"name": "Japan", "code": "JP"}, +{"name": "Japan", "code": "JP", "alt": ["日本"]}, {"name": "Jersey", "code": "JE"}, {"name": "Jordan", "code": "JO"}, -{"name": "Kazakhstan", "code": "KZ"}, +{"name": "Kazakhstan", "code": "KZ", "alt": ["Казахстан"]}, {"name": "Kenya", "code": "KE"}, {"name": "Kiribati", "code": "KI"}, {"name": "North Korea", "code": "KP"}, -{"name": "Korea", "code": "KR", "alt": ["대한민국"]}, +{"name": "South Korea", "code": "KR", "alt": ["대한민국", "Korea"]}, {"name": "Kuwait", "code": "KW"}, {"name": "Kyrgyzstan", "code": "KG"}, {"name": "Laos", "code": "LA"}, @@ -144,7 +144,7 @@ {"name": "Micronesia", "code": "FM"}, {"name": "Moldova", "code": "MD"}, {"name": "Monaco", "code": "MC"}, -{"name": "Mongolia", "code": "MN"}, +{"name": "Mongolia", "code": "MN", "alt": ["Монгол улс"]}, {"name": "Montserrat", "code": "MS"}, {"name": "Morocco", "code": "MA"}, {"name": "Mozambique", "code": "MZ"}, @@ -152,19 +152,19 @@ {"name": "Namibia", "code": "NA"}, {"name": "Nauru", "code": "NR"}, {"name": "Nepal", "code": "NP"}, -{"name": "Netherlands", "code": "NL"}, +{"name": "Netherlands", "code": "NL", "alt": ["Nederland"]}, {"name": "Netherlands Antilles", "code": "AN"}, {"name": "New Caledonia", "code": "NC"}, -{"name": "New Zealand", "code": "NZ"}, +{"name": "New Zealand", "code": "NZ", "alt": ["New Zealand/Aotearoa"]}, {"name": "Nicaragua", "code": "NI"}, {"name": "Niger", "code": "NE"}, {"name": "Nigeria", "code": "NG"}, {"name": "Niue", "code": "NU"}, {"name": "Norfolk Island", "code": "NF"}, {"name": "Northern Mariana Islands", "code": "MP"}, -{"name": "Norway", "code": "NO"}, +{"name": "Norway", "code": "NO", "alt": ["Norge"]}, {"name": "Oman", "code": "OM"}, -{"name": "Pakistan", "code": "PK"}, +{"name": "Pakistan", "code": "PK", "alt": ["پاکستان"]}, {"name": "Palau", "code": "PW"}, {"name": "Palestine", "code": "PS"}, {"name": "Panama", "code": "PA"}, @@ -173,13 +173,13 @@ {"name": "Peru", "code": "PE"}, {"name": "Philippines", "code": "PH"}, {"name": "Pitcairn", "code": "PN"}, -{"name": "Poland", "code": "PL"}, +{"name": "Poland", "code": "PL", "alt": ["RP"]}, {"name": "Portugal", "code": "PT"}, {"name": "Puerto Rico", "code": "PR"}, {"name": "Qatar", "code": "QA"}, {"name": "Reunion", "code": "RE"}, -{"name": "Romania", "code": "RO"}, -{"name": "Russian Federation", "code": "RU"}, +{"name": "Romania", "code": "RO", "alt": ["România"]}, +{"name": "Russia", "code": "RU", "alt": ["Russian Federation", "РФ"]}, {"name": "Rwanda", "code": "RW"}, {"name": "Saint Helena", "code": "SH"}, {"name": "Saint Kitts and Nevis", "code": "KN"}, @@ -191,42 +191,43 @@ {"name": "Sao Tome and Principe", "code": "ST"}, {"name": "Saudi Arabia", "code": "SA"}, {"name": "Senegal", "code": "SN"}, -{"name": "Serbia and Montenegro", "code": "CS"}, +{"name": "Serbia", "code": "CS", "alt": ["Serbia"]}, +{"name": "Montenegro", "code": "ME"}, {"name": "Seychelles", "code": "SC"}, {"name": "Sierra Leone", "code": "SL"}, {"name": "Singapore", "code": "SG"}, {"name": "Slovakia", "code": "SK"}, -{"name": "Slovenia", "code": "SI"}, +{"name": "Slovenia", "code": "SI", "alt": ["Slovenija"]}, {"name": "Solomon Islands", "code": "SB"}, {"name": "Somalia", "code": "SO"}, {"name": "South Africa", "code": "ZA"}, {"name": "South Georgia and the South Sandwich Islands", "code": "GS"}, -{"name": "Spain", "code": "ES"}, +{"name": "Spain", "code": "ES", "alt": ["España"]}, {"name": "Sri Lanka", "code": "LK"}, {"name": "Sudan", "code": "SD"}, {"name": "Suriname", "code": "SR"}, {"name": "Svalbard and Jan Mayen", "code": "SJ"}, {"name": "Swaziland", "code": "SZ"}, -{"name": "Sweden", "code": "SE"}, -{"name": "Switzerland", "code": "CH"}, +{"name": "Sweden", "code": "SE", "alt": ["Sverige"]}, +{"name": "Switzerland", "code": "CH", "alt": ["Schweiz/Suisse/Svizzera/Svizra"]}, {"name": "Syrian Arab Republic", "code": "SY"}, -{"name": "Taiwan", "code": "TW"}, +{"name": "Taiwan", "code": "TW", "alt": ["臺灣"]}, {"name": "Tajikistan", "code": "TJ"}, {"name": "Tanzania", "code": "TZ"}, -{"name": "Thailand", "code": "TH"}, +{"name": "Thailand", "code": "TH", "alt": ["ประเทศไทย"]}, {"name": "Timor-Leste", "code": "TL"}, {"name": "Togo", "code": "TG"}, {"name": "Tokelau", "code": "TK"}, {"name": "Tonga", "code": "TO"}, {"name": "Trinidad and Tobago", "code": "TT"}, {"name": "Tunisia", "code": "TN"}, -{"name": "Turkey", "code": "TR"}, +{"name": "Turkey", "code": "TR", "alt": ["Türkiye"]}, {"name": "Turkmenistan", "code": "TM"}, {"name": "Turks and Caicos Islands", "code": "TC"}, {"name": "Tuvalu", "code": "TV"}, {"name": "Uganda", "code": "UG"}, {"name": "Ukraine", "code": "UA"}, -{"name": "United Arab Emirates", "code": "AE"}, +{"name": "United Arab Emirates", "code": "AE", "alt": ["Abu Dhabi - United Arab Emirates"]}, {"name": "United Kingdom", "code": "GB", "alt": ["UK"]}, {"name": "United States", "code": "US", "alt": ["USA"]}, {"name": "United States Minor Outlying Islands", "code": "UM"}, @@ -234,7 +235,7 @@ {"name": "Uzbekistan", "code": "UZ"}, {"name": "Vanuatu", "code": "VU"}, {"name": "Venezuela", "code": "VE"}, -{"name": "Vietnam", "code": "VN"}, +{"name": "Vietnam", "code": "VN", "alt": ["Việt Nam"]}, {"name": "British Virgin Islands", "code": "VG"}, {"name": "US Virgin Islands", "code": "VI"}, {"name": "Wallis and Futuna", "code": "WF"}, diff --git a/scraper/pdf_dump_all.sh b/scraper/pdf_dump_all.sh deleted file mode 100644 index a17c8d44..00000000 --- a/scraper/pdf_dump_all.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -for i in datasets/s2/pdf/*/*/*.pdf - do - OUTPUT="${i%.*}.txt" - OUTPUT="${OUTPUT/pdf/txt}" - IMDIR=`dirname ${OUTPUT}` - if [[ ! -e $OUTPUT ]] - then - pdf2txt.py -o "${OUTPUT}" -O "${IMDIR}" "${i}" - if [ -s $OUTPUT ] - then - echo "found $OUTPUT" - else - echo "rm empty $OUTPUT" - rm -f $OUTPUT - fi - fi - done - diff --git a/scraper/pdf_dump_first_page.sh b/scraper/pdf_dump_first_page.sh deleted file mode 100644 index 20e5182d..00000000 --- a/scraper/pdf_dump_first_page.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -for i in datasets/s2/pdf/*/*/*.pdf - do - OUTPUT="${i%.*}.txt" - if [[ ! -e $OUTPUT ]] - then - pdf2txt.py -p 1 $i > $OUTPUT - if [ -s $OUTPUT ] - then - echo "found $OUTPUT" - else - echo "rm empty $OUTPUT" - rm -f $OUTPUT - fi - fi - done diff --git a/scraper/reports/doi_institutions_geocoded.csv b/scraper/reports/doi_institutions_geocoded.csv index bb69984b..322af9ac 100644 --- a/scraper/reports/doi_institutions_geocoded.csv +++ b/scraper/reports/doi_institutions_geocoded.csv @@ -7888,3 +7888,394 @@ ACL Laboratory at Sharif University of Technology,35.7036366,51.351593,خیاب "Bogaziçi University, Bebek, Istanbul, Turkey hamdi.dibeklioglu@cmpe.boun.edu.tr",41.0847571,29.0510399,"Bebek Mh., 34342 Beşiktaş/İstanbul, Turkey" "BioCom - Bioinspired Computing Laboratory, University of Sao Paulo, Brazil",-23.5613991,-46.7307891,"São Paulo - State of São Paulo, Brazil" "Rose-Hulman Inst. of Technol., Terre Haute, IN, USA",39.4828386,-87.3240403,"5500 Wabash Ave, Terre Haute, IN 47803, USA" +"University of Buckingham, Buckingham, MK18 1EG, UK. harin.sellahewa@buckingham.ac.uk",51.9961738,-0.9918690000000001,"Yeomanry House, Hunter St, Buckingham MK18 1EG, UK" +"Institut TELECOM; TELECOM Lille 1, France",50.6103822,3.1349625,"Rue Guglielmo Marconi, 59650 Villeneuve-d'Ascq, France" +"RK University, Rajkot, India",22.2403032,70.90085460000002,"Bhavnagar Highway Kasturbadham, Rajkot, Gujarat 360020, India" +"Kyungpook National University, Buk-gu, Daegu, The Republic of Korea",35.888836,128.6102997,"80 Daehak-ro, Sangyeok 3(sam)-dong, Buk-gu, Daegu, South Korea" +"Union Visual Innovation Technology Co., Ltd., Shenzhen, China",22.543096,114.057865,"Shenzhen, Guangdong, China" +"E.T.S. Ingenieros Industriales, Universidad de Castilla-La Mancha Campus Universitario, Ciudad Real, Spain",38.99404390000001,-3.9204979,"Avda. Camilo José Cela, s/n, 13071 Ciudad Real, Cdad. Real, Spain" +"Centre de Développement des Technologies Avancées, Algeria",36.689487,2.981877,"CDTA Cité, 20 Août 1956, Baba Hassen، 16081, Algeria" +"INRIA/Ecole Normale Supérieure, Paris",48.8422058,2.3451689,"45 Rue d'Ulm, 75005 Paris, France" +"Algılayıcılar, Görüntü ve Sinyal İşleme Grubu, HAVELSAN A.Ş. Ankara, Türkiye",39.9333635,32.8597419,"Ankara, Turkey" +"Brigham and Women's Hospital and Harvard Medical School, Boston, MA, USA",42.3362325,-71.1065443,"75 Francis St, Boston, MA 02115, USA" +"Wrocław University of Technology, Wrocław, Poland",51.1073907,17.0619712,"wybrzeże Stanisława Wyspiańskiego 27, 50-370 Wrocław, Poland" +"State Key Laboratory of Smart Grid Protection and Control, Nanjing, China",32.060255,118.796877,"Nanjing, Jiangsu, China" +"Nanjing Forestry University and Shandong University, Jinan, China",32.060255,118.796877,"Nanjing, Jiangsu, China" +"Shanghai Jiao Tong University & Alibaba Group, Shanghai, China",31.0252201,121.4337784,"China, Shanghai, Minhang, 东川路 邮政编码: 200240" +"YCCE Nagpur, India",21.0944477,78.9783831,"Wanadongri ct, Wanadongri, Maharashtra 441110, India" +"Research Center for Smart Tourism Technology Application and Innovation, Sichuan Tourism University, Chengdu, China",30.578908,104.27712,"459 Hongling Rd, Longquanyi Qu, Chengdu Shi, Sichuan Sheng, China" +"University of Mysore, Mysore, India",12.307992,76.638921,"Krishnaraja Boulevard Road, K.G Koppal, Mysuru, Karnataka 570006, India" +"Poojappura Trivandrum, India",8.4916137,76.97870759999999,"Poojapura, Thiruvananthapuram, Kerala, India" +"M.S. Ramaiah Inst. of Tech., Bangalore-560054, India",13.0304619,77.5646862,"MSRIT Post, M S Ramaiah Nagar, MSR Nagar, Bengaluru, Karnataka 560054, India" +"Sichuan University Chengdu, China",30.663964,104.071022,"China, Sichuan, Chengdu, Qingyang, Shuncheng St, 252号顺吉大厦10层B1-B2 邮政编码: 610017" +"University of Brasilia, Brasília, Brazil",-15.7631573,-47.8706311,"Brasilia - Federal District, 70910-900, Brazil" +"Rose-Hulman Inst. of Technol., Terre Haute, IN, USA",39.4828386,-87.3240403,"5500 Wabash Ave, Terre Haute, IN 47803, USA" +"Tianjin University of Technology (TJUT), Tianjin, 300384, China",39.061004,117.142023,"Xiqing, China" +"Folkhalsan Research Center Helsinki, Finland",60.1871945,24.9138332,"Topeliuksenkatu 20, 00250 Helsinki, Finland" +"CSE, SUNY at Buffalo, USA and Southeast University, China",43.0027625,-78.7874198,"Davis Hall, #338, Buffalo, NY 14260, USA" +"Beijing Institute of Technology, Beijing 100081, CHINA. xushuang@bit.edu.cn",39.964431,116.310319,"Side Rd of N. 3rd Ring Rd W, Haidian Qu, Beijing Shi, China" +"Engineering Research Center of Wideband Wireless Communication Technique, Ministry of Education, Nanjing University of Posts and Telecommunications, Nanjing 210003, China",32.08025800000001,118.768159,"38 Guangdong Rd, Gulou Qu, Nanjing Shi, Jiangsu Sheng, China, 210028" +"Eastman Kodak Company, Rochester, NY, USA",43.1608057,-77.61967109999999,"343 State St, Rochester, NY 14650, USA" +"Cisco Systems Inc., Bengaluru, India",12.930671,77.6913747,"Salarpuria Hallmark Bldg A 133, Kadubeesanhalli, Panatoor Gram Panchayat, Outer Ring Road, Devarabisanahalli, Bellandur, Bengaluru, Karnataka 560103, India" +"Key Laboratory of Educational Informatization for Nationalities, Yunnan Normal University, Kunming, China",25.055125,102.696888,"298 Yi'eryi St, Wuhua Qu, Kunming Shi, Yunnan Sheng, China, 650031" +"Tata Consultancy Services Ltd., TCS Research & Innovation, Kolkata, India",22.5861287,88.4887248,"1A, Action Area II, Newtown, Kolkata, West Bengal 700156, India" +"Yunnan Minzu University, Kunming, People’s Republic of China",24.840109,102.849645,"Chenggong, Kunming, China" +"Indian Institute of Technology (B.H.U.), Varanasi-221005, India",25.2623247,82.98937350000001,"IIT-BHU, Banaras Hindu University Campus, Uttar Pradesh 221005, India" +"Library, Zhejiang Wanli University, Ningbo, China",29.819014,121.567337,"8 Qianhu S Rd, Yinzhou Qu, Ningbo Shi, Zhejiang Sheng, China, 315000" +"Umeå University, SE-901 87 Umeå, Sweden",63.82022240000001,20.3054461,"Postadress:, 901 87 Umeå, Sweden" +"Central China Normal University, Wuhan, China",30.518801,114.358401,"152 Luoyu Rd, Hongshan Qu, Wuhan Shi, Hubei Sheng, China, 430072" +"Manipal Institute of Technology, Manipal University, Karnataka-576104, India",13.3525321,74.79282239999999,"Udupi - Karkala Rd, Eshwar Nagar, Manipal, Karnataka 576104, India" +"École Polytechnique Fédérale de Lausanne (EPFL), Computer Vision Laboratory, Switzerland",46.5190557,6.5667576,"Route Cantonale, 1015 Lausanne, Switzerland" +"PRIS Lab of Beijing University of Posts and Telecommunications, Beijing 100876; Chongqing Three Gorges University, Chongqing 404000. E-mail: niexf@tom.com",39.962796,116.358103,"10 Xitucheng Rd, BeiTaiPingZhuang, Haidian Qu, Beijing Shi, China, 100876" +"Telecommunications and IT, University Politehnica Bucharest, Romania",44.4337844,26.0575959,"Complex studențesc Leu, corp A, B, Bulevardul Iuliu Maniu 1-3, București 061071, Romania" +"Institut TELECOM ; Telecom Lille 1, LIFL (UMR USTL/CNRS 8022), France",50.6103822,3.1349625,"Rue Guglielmo Marconi, 59650 Villeneuve-d'Ascq, France" +"Jiangsu Province Traditional Chinese Medicine Hospital, Nanjing, China",32.041842,118.774902,"155 Hanzhong Rd, Bai Xia Qu, Nanjing Shi, Jiangsu Sheng, China, 210004" +"Rajeev Gandhi Memorial College of Engineering & Technology, Nandyal, AP, India, 518001",15.5042936,78.37750249999999,"Nerawada 'X' Roads, Kurnool District, Nandyal, Andhra Pradesh 518501, India" +"University of Sharjah, Sharjah, United Arab Emirates",25.2867708,55.4783714,University City Rd - Sharjah - United Arab Emirates +"Gazi University Ankara, Turkey",39.94202190000001,32.823005,"Emniyet mah, Gazi Üniversitesi Rektörlüğü Teknik okulları, 06560 Yenimahalle/Ankara, Turkey" +"FEECS, Technical University of Ostrava, Ostrava-Poruba, Czech Republic",49.8318228,18.1609442,"Studentská 6184/15, 708 00 Ostrava-Poruba, Czechia" +"JNTU College of Engineering, Kakinada, India",16.9814788,82.2407284,"Jawaharlal Nehru Technological University, Kakinada, Andhra Pradesh 533003, India" +"Telecommunications and IT University “Politehnica” of Bucharest, Romania",44.4386064,26.0494925,"Splaiul Independenței 313, București 060042, Romania" +"The State Key Laboratory of High-end Server and Storage Technology, Jinan, China",36.6512,117.120095,"Jinan, Shandong, China" +"Universitat Politcnica de Catalunya, EU, Spain",41.388004,2.1132804,"Campus Nord, Carrer de Jordi Girona, 1, 3, 08034 Barcelona, Spain" +"University of Rhode Island, Kingston, RI, 02881",41.4860647,-71.5308537,"45 Upper College Rd, Kingston, RI 02881, USA" +"Hanshin University, Osan, Republic of Korea",37.1933281,127.0225987,"137 Hanshindae-gil, Sema-dong, Osan, Gyeonggi-do, South Korea" +"S.J.B. Institute of Technology, Bangalore, India",12.8998841,77.49578819999999,"No.67, Uttarahalli Main Road, Kengeri, BGS Health & Education City, Bengaluru, Karnataka 560060, India" +"Xi'an Institute of Optics and Precision Mechanics &Chinese Academy of Sciences, Xi'an, China",34.16673110000001,108.8582546,"17 Xinxi Ave, Changan Qu, Xian Shi, Shaanxi Sheng, China" +"Rayat & Bahra College of Engg. & Bio-Tech, Kharar, India",30.78094239999999,76.6192274,"V.P.O. Sahauran, Tehsil Kharar, Distt. Mohali, Kharar, Punjab 140104, India" +"Beijing Jiatong University, China",39.952371,116.347005,"China, Beijing, Haidian, 交大东路" +"Dynamixyz, Rennes, France",48.12746,-1.6260142,"3 Av. de Belle Fontaine, 35510 Cesson-Sévigné, France" +"University of Calcutta, Kolkata, India",22.5750862,88.3629188,"87, 1, College St, Calcutta University, College Square, Kolkata, West Bengal 700073, India" +"Informatics and Telematics Institute, Centre for Research and Technology Hellas, 1st Km Thermi-Panorama Rd, Thessaloniki 57001, Greece. filareti@iti.gr",40.5667611,22.9980147,"Thermi 570 01, Greece" +"College of Information and Control Engineering, China University of Petroleum(East China), Qingdao, China",36.067108,120.382609,"Qingdao, Shandong, China" +"LBS Institute of Technology for Women, Poojapura, Trivandrum, Kerala",8.4909689,76.9724786,"Vazhuthacaud - Poojappura Rd, Near Central Jail, Poojapura, Thiruvananthapuram, Kerala 695012, India" +"Centre for Quantum Computation & Intelligent Systems, and the Faculty of Engineering and Information Technology, University of Technology, Sydney, 235 Jones Street, Ultimo, NSW 2007, Australia",-33.8840299,151.199234,"Building 11 University of Technology Sydney 81, Broadway, Ultimo NSW 2007, Australia" +"Electronics and Information, Guangxi University",22.8376,108.289839,"Chongwen Rd, Xixiangtang Qu, Nanning Shi, Guangxi Zhuangzuzizhiqu, China, 530001" +"University of Alabama at Birmingham, USA",33.5021227,-86.8064447,"1720 2nd Ave S, Birmingham, AL 35294, USA" +"Indian Institute of Technology Kharagpur, Kharagpur, West Bengal, India",22.3149274,87.31053109999999,"Kharagpur, West Bengal 721302, India" +"Networking Biomedical Research Center on Bioengineering, Biomaterials and Nanomedicine (CIBER-BBN), Barcelona, Spain",41.3850639,2.1734035,"Barcelona, Spain" +"Infotech Oulu, Oulu Univ., Finland",65.0593177,25.4662935,"Pentti Kaiteran katu 1, 90014 Oulu, Finland" +"Faculty of Mathematics, Complutense University of Madrid, Madrid, Spain",40.4494588,-3.7258568,"Plaza Ciencias, 3, 28040 Madrid, Spain" +"PEC University of Technology, Chandigarh-160012, India",30.7672804,76.78698349999999,"Vidya Path, Sector 12, Chandigarh, 160012, India" +"INRIA Rhone Alpes, Grenoble 38334, France",45.217886,5.807369,"655 Avenue de l'Europe, 38330 Montbonnot-Saint-Martin, France" +"The University of Texas Southwestern Medical Center, Dallas, TX, USA",32.812093,-96.8401694,"5323 Harry Hines Blvd, Dallas, TX 75390, USA" +"University of Science & Technology, Bannu, Pakistan",33.0282503,70.70049139999999,"Bannu, Khyber Pakhtunkhwa, Pakistan" +DELPHI Automotive,42.6014462,-83.1619226,"5725 Delphi Dr, Troy, MI 48098, USA" +"University of California at Irvine, Irvine, CA, USA",33.6404952,-117.8442962,"Irvine, CA 92697, USA" +"INRIA Sophia Antipolis - Méditerranée. 2004 Route des Lucioles - BP93, Sohpia Antipolis, 06902, France",43.6158131,7.068379999999999,"2004 Route des Lucioles, 06902 Valbonne, France" +"DIEI, University of Cassino and of Southern Lazio, Via G. Di Biasio 43, (FR) - Italy",41.471875,13.828328,"Viale dell'Università, 03043 Cassino FR, Italy" +"The Johns Hopkins University, Baltimore, MD 21218, USA",39.3299013,-76.6205177,"Baltimore, MD 21218, USA" +"Netflix, Los Gatos, CA, USA",37.2571032,-121.9641779,"121 Albright Way, Los Gatos, CA 95032, USA" +"Joondalup WA 6027, Australia",-31.745,115.766111,"Joondalup WA 6027, Australia" +"Macquarie University, NSW, Australia",-33.77382370000001,151.1126498,"Balaclava Rd, Macquarie Park NSW 2109, Australia" +"Research Laboratories, Eastman Kodak Company, Rochester, New York 14650-1816",43.1608057,-77.61967109999999,"343 State St, Rochester, NY 14650, USA" +"College of Information Science and Technology, Nanjing Agricultural University, Nanjing, China",32.03341,118.842577,"Jinling Rd, Xuanwu Qu, Nanjing Shi, Jiangsu Sheng, China, 210014" +"Carlos III University of Madrid (UC3M), Getafe, Spain",40.31705720000001,-3.7274678,"Calle Madrid, 126, 28903 Getafe, Madrid, Spain" +"Dalian Key Laboratory of Digital Technology for National Culture, Dalian Minzu University, Dalian, China",38.91400300000001,121.614682,"Dalian, Liaoning, China" +"Sarvajanik College of Engineering and Technology, Surat, India",21.1821465,72.80875979999999,"Dr, R K Desai Marg, Opp. Mission Hospital, Athwalines, Athwa, Surat, Gujarat 395001, India" +"Tianjin University of Commerce, Tianjin, China",39.1821223,117.1303827,"China, Tianjin, Beichen, Chuiliu Rd, 商业大学" +"University of Ulsan, 680-749, Republic of Korea",35.5437411,129.2562843,"93 Daehak-ro, Mugeo-dong, Nam-gu, Ulsan, South Korea" +"University of Rajshahi, Rajshahi, Bangladesh",24.3683017,88.6376927,"Rajshahi University, Administration Building 1, Rajshahi, Bangladesh" +"University of Jaffna, Sri Lanka",9.684855599999999,80.0220413,"Sir. Pon Ramanathan Road, Thirunelvelly,, Jaffna 40000, Sri Lanka" +"Kuwait University, Safat, Kuwait",29.3406199,47.9212797,"Jamal Abdul Nasser St, Kuwait" +"Microsoft Live Labs Research, Redmond",47.64183920000001,-122.1407465,"14865 NE 36th St, Redmond, WA 98052, USA" +"Anyang Normal University, Anyang, China",36.058736,114.370738,"436 Xiange Ave, Wenfeng Qu, Anyang Shi, Henan Sheng, China" +"Curtin University GPO Box U1987, Perth, WA 6845",-32.0061951,115.8944182,"Kent St, Bentley WA 6102, Australia" +"Temple Univ., Philadelphia",39.9811935,-75.1553512,"1801 N Broad St, Philadelphia, PA 19122, USA" +"College of Information Engineering, Shanghai Maritime University, Shanghai, People’s Republic of China",30.874115,121.905446,"1550 Haigang Ave, Pudong Xinqu, Shanghai Shi, China" +"Minjiang University, Fuzhou, People’s Republic of China",26.064704,119.169116,"1 Wenxian Rd, Minhou Xian, Fuzhou Shi, Fujian Sheng, China" +"Robotic Surgery, Kaleida Health Western New York, Buffalo NY 14214 USA",42.9571394,-78.7159126,"2100 Wehrle Dr, Buffalo, NY 14221, USA" +TX 76201,33.2167226,-97.14132219999999,"Denton, TX 76201, USA" +"China University of Geosciences, Wuhan, China, 430074",30.516496,114.401994,"China, Hubei, Wuhan, Hongshan, Lumo Rd, 388号中国地质大学东区" +"Digital Image Processing Laboratory, Islamia College, Peshawar, Pakistan",34.0000461,71.4759682,"Grand Trunk Rd, Rahat Abad, Peshawar, Khyber Pakhtunkhwa, Pakistan" +"Army Research Laboratory, Adelphi, Maryland, United States of America",39.0298587,-76.9638027,"2800 Powder Mill Rd, Adelphi, MD 20783, USA" +"Statistical Machine Intelligence & LEarning, 611731, China",30.7598454,103.9705512,"Pixian, Chengdu, Sichuan, China, 611731" +"University at Buffalo, The State University of New York, Buffalo, NY, USA",43.0008093,-78.7889697,"Buffalo, NY 14260, USA" +"Dharmsinh Desai University, Nadiad, India",22.6811771,72.87949139999999,"College Rd, Akshar Twp, Nadiad, Gujarat 387001, India" +"University of New Orleans, LA 70148, U.S.A",30.0285959,-90.0660365,"2000 Lakeshore Dr, New Orleans, LA 70148, USA" +"University of Califonia, San Diego",32.8800604,-117.2340135,"9500 Gilman Dr, La Jolla, CA 92093, USA" +"Electronics and Instrumentation Engineering, Seemanta Engineering College, Mayurbhanj, Odisha, India, 757086",22.17084,86.60754910000001,"Mayurvihar, P.O. - Jharpokharia, Dist: Mayurbhanj, Kandalia, Odisha 757086, India" +"Perth, Western Australia 6012",-32.0135412,115.7634366,"Mosman Park WA 6012, Australia" +"Xinjiang University, China",43.765047,87.616153,"666 Shengli Rd, Tianshan Qu, Wulumuqi Shi, Xinjiang Weiwuerzizhiqu, China" +"Central China Normal University, Wuhan, China",30.518801,114.358401,"152 Luoyu Rd, Hongshan Qu, Wuhan Shi, Hubei Sheng, China, 430072" +"University of Electronic Science and Technology of China (UESTC), Chengdu, 610054, China P.R.C",30.672721,104.098806,"2 Jianshe North Rd 2nd Section, Jianshe Road, Chenghua Qu, Chengdu Shi, Sichuan Sheng, China, 610054" +"S.J.B. Inst. of Tech., Bangalore-560060, INDIA",12.8998841,77.49578819999999,"No.67, Uttarahalli Main Road, Kengeri, BGS Health & Education City, Bengaluru, Karnataka 560060, India" +"Aveiro, Portugal",40.6405055,-8.6537539,"Aveiro, Portugal" +"Beijing Institute of Technology, Beijing, 100081, P.R. China",39.964431,116.310319,"Side Rd of N. 3rd Ring Rd W, Haidian Qu, Beijing Shi, China" +"Trinity College, Dublin, Ireland",53.3437935,-6.254571599999999,"College Green, Dublin 2, Co. Dublin, Ireland" +"Beijing Innovisgroup Tec Co. LTD, Beijing, China",39.90419989999999,116.4073963,"Beijing, China" +"Southern Illinois University Carbondale, Carbondale, IL, USA",37.7100209,-89.2225941,"1263 Lincoln Dr, Carbondale, IL 62901, USA" +"Misrimal Navajee Munoth Jain Engineering College, Chennai, India",12.9460255,80.2450236,"Guru Marudhar Kesari Building, Subramanya Nagar St Rd, Thorapakkam, Jyothi Nagar, Chennai, Tamil Nadu 600097, India" +"IIT, Roorkee, India",29.8648599,77.89657869999999,"Roorkee - Haridwar Highway, Roorkee, Uttarakhand 247667, India" +"Bangalore University, India",12.9527314,77.5157387,"Gnana Bharathi Campus, Gnana Bharathi Main Rd, Teachers Colony, Nagarbhavi, Bengaluru, Karnataka 560056, India" +"International Islamic University, Islamabad, Pakistan",33.6593237,73.023753,"H-10, Islamabad, Islamabad Capital Territory 44000, Pakistan" +"Computational Biomedicine Lab, Univ. of Houston, Houston, TX",29.7199489,-95.3422334,"4800 Calhoun Rd, Houston, TX 77004, USA" +"University of Nîmes, France",46.227638,2.213749,France +"College of Engineering, Huazhong Agricultural University, Wuhan P.R. China 430070",30.475126,114.353035,"China, Hubei, Wuhan, Hongshan, 南湖路" +"Kodak Alaris, Inc., Rochester, NY, USA",43.1494914,-77.7355832,"336 Initiative Dr, Rochester, NY 14624, USA" +"Center for Information Technology, UPES, Dehradun, India",30.317269,78.0277816,"47, 2nd Floor Meedo Complex, Opposite Shivaji Dharamshala, Saharanpur Rd, Saharanpur Chowk, Dehradun, Uttarakhand 248001, India" +"Hebei University of Technology, Tianjin, China",39.179635,117.165882,"8 Dingzigu 1st Rd, Hongqiao Qu, China, 300131" +"Siirt University, Siirt, Turkey",37.9360072,41.9403985,"Yeni Mahallesi, Güreş Cad., 56100 Siirt Merkez/Siirt, Turkey" +"Laboratory of LESIA, University of Biskra, Algeria",34.8455802,5.7481915,"BP 145 RP، بسكرة 07000, Algeria" +Graduate University of Chinese Academy of Sciences,39.98177,116.330086,"China, Beijing, Haidian, Zhongguancun South 1st Alley, 中关村南一条" +"Harbin Institute of Technology Shenzhen Graduate School, Shenzhen, China",22.586752,113.96878,"China, Guangdong, Shenzhen, Nanshan, 平山一路" +"Guangdong Pharmaceutical University, Guangzhou, China",23.055449,113.411846,"280 Daxuecheng Outer Ring E Rd, Panyu Qu, Guangzhou Shi, Guangdong Sheng, China" +"Shenzhen Academy of Robotics, Shenzhen, China",22.543096,114.057865,"Shenzhen, Guangdong, China" +"Telecom-Bretagne, Brest - France",48.35820649999999,-4.5703774,"655 Avenue du Technopôle, 29280 Plouzané, France" +"Srinakharinwirot University, Ongkharak, Thailand",14.1074545,100.9820801,"63 หมู่ที่ 7 Rangsit - Nakhon Nayok Rd, Ongkharak, Ongkharak District, Nakhon Nayok 26120, Thailand" +"More-Than-One Robotics Laboratory, University of Brunei Darussalam, Brunei Darussalam",4.9754274,114.8960247,"Universiti Brunei Darussalam, Brunei" +"Faculty of Information Technology, VUT — Brno University of Technology, Brno, Czech Republic",49.226616,16.5966401,"Božetěchova 1/2, 612 00 Brno-Královo Pole, Czechia" +"Hubei Province Key Laboratory of Intelligent Robot, Wuhan Institute of Technology, 430073, China",30.5020376,114.3901822,"693 Xiongchu Ave, Hongshan Qu, Wuhan Shi, Hubei Sheng, China, 430073" +"Shanghai Institute of Technology, Shanghai, China",30.838077,121.507817,"100 Haiquan Rd, Fengxian Qu, Shanghai Shi, China" +"Changzhou University, Changzhou, China",31.684237,119.955141,"1 Gehu Middle Rd, Wujin Qu, Changzhou Shi, Jiangsu Sheng, China" +"Engineering and Technology College, Sichuan Open University, Chengdu, China",30.728862,103.966381,"一 Baiye Rd, Pixian, Chengdu Shi, Sichuan Sheng, China" +"Fujian Provincial Key Laboratory of Information Processing and Intelligent Control, Fuzhou, China",26.074478,119.296482,"Fuzhou, Fujian, China" +"College of Computer Science, Inner Mongolia University, Hohhot, China",40.81426099999999,111.689298,"Saihan, Hohhot, Inner Mongolia, China, 010000" +"BoHai University, JinZhou, China",41.086564,121.118854,"19 Keji Rd, Taihe Qu, Jinzhou Shi, Liaoning Sheng, China, 121000" +"Kalasalingam University, Krishnankoil, India",9.5747052,77.6798137,"MDR 194, Krishnan Kovil, Tamil Nadu 626126, India" +"Escuela Politécnica Superior, University of Alcala, Alcalá de Henares, Madrid, Spain",40.5130335,-3.3487276,"Campus Universitario, Ctra. Madrid-Barcelona km, 33, 600, 28805 Alcalá de Henares, Spain" +"Beijing University of Posts and Telecom., China",39.962796,116.358103,"10 Xitucheng Rd, BeiTaiPingZhuang, Haidian Qu, Beijing Shi, China, 100876" +3,37.0375471,-95.6372785,"1314 W 7th St, Coffeyville, KS 67337, USA" +"Bogaziçi University, Bebek, Istanbul, Turkey hamdi.dibeklioglu@cmpe.boun.edu.tr",41.0847571,29.0510399,"Bebek Mh., 34342 Beşiktaş/İstanbul, Turkey" +"University of Missouri Kansas City, Kansas City, USA",39.0335539,-94.57602589999999,"5100 Rockhill Rd, Kansas City, MO 64110, USA" +"Signal Processing of Ministry of Education Southeast University, Key Laboratory of Underwater Acoustic, China",35.86166,104.195397,China +"Fujian Provincial Key Laboratory of Information Processing and Intelligent Control, Minjiang University, Fuzhou, China",26.064704,119.169116,"1 Wenxian Rd, Minhou Xian, Fuzhou Shi, Fujian Sheng, China" +"Universitat Politècnica de Catalunya, Vilanova i la Geltrú, Spain",41.22311029999999,1.7356471,"Carrer Metaŀlúrgia, 11, 08800 Villanueva y Geltrú, Barcelona, Spain" +"Faculty of Sciences, University of Novi Sad, Novi Sad, Serbia",45.245411,19.852777,"Trg Dositeja Obradovića 3, Novi Sad 21000, Serbia" +"Institute of Informatics, UFRGS, Porto Alegre, Brazil",-30.0688963,-51.1206198,"Avenida Bento Gonçalves, 9500 - Agronomia, Porto Alegre - RS, 91509-900, Brazil" +"Faculty of the Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University, Xi’an, China",34.250803,108.983693,"28 Xianning W Rd, JiaoDa ShangYe JieQu, Beilin Qu, Xian Shi, Shaanxi Sheng, China" +"Guangdong Polytechnic Normal University, Guangzhou, China",23.131707,113.371643,"Tianhe Park, Tianhe, Guangzhou, China, 510640" +"S.J.B. Inst. of Tech., Bangalore - 560060, India",12.8998841,77.49578819999999,"No.67, Uttarahalli Main Road, Kengeri, BGS Health & Education City, Bengaluru, Karnataka 560060, India" +"Vellore Institute of Technology, Vellore, India",12.972067,79.1595619,"Gorbachev Rd, Vellore, Tamil Nadu 632014, India" +"National Engineering Research Center for Multimedia Software, China",35.86166,104.195397,China +"K.U. Leuven, Faculty of Engineering, Center of Processing Speech and Images at the Medical Imaging Research Center, University Hospitals Gasthuisberg, Herestraat 49 - bus 7003, B-3000 Leuven, Belgium",50.8786385,4.671852900000001,"Herestraat 49 - bus 7003, 3000 Leuven, Belgium" +"University of Nebraska-Lincoln, Lincoln 68503, NE, United States",40.8201966,-96.70047629999999,"1400 R St, Lincoln, NE 68588, USA" +"University of Mossouri, 65211, USA",38.9331391,-92.3738037,"E2509 Lafferre Hall, Columbia, MO 65211, USA" +"IBM Research Lab, China",37.211053,-121.8069487,"650 Harry Rd, San Jose, CA 95120, USA" +"Ames HCI Group, Moffett Field",30.2583281,-81.60224819999999,"6440 Southpoint Pkwy #300, Jacksonville, FL 32216, USA" +"Computer Center, Russian Academy of Sciences, Moscow, Russia",55.70873899999999,37.5863299,"Leninsky Ave, 37А, Moskva, Russia, 119334" +"Gazi University, Engineering and Architecture Faculty, 06570 Ankara, Turkey",39.9313785,32.845961,"Eti Mh., Maltepe Mahallesi, Yükseliş Sokak, No:5, Ankara, Turkey" +"University at Buffalo (SUNY), NY 14260, USA",42.953204,-78.81835149999999,"Buffalo, NY 14260, USA" +"Engineering, U.E.T Peshawar Mardan Campus, Charsaddah Road Mardan",34.1856024,72.0273113,"Bypass Rd, Mardan, Khyber Pakhtunkhwa 23200, Pakistan" +"Electronic and Computer Engineering, University of Pretoria, Pretoria, South Africa",-25.7545492,28.2314476,"Lynnwood Rd, Hatfield, Pretoria, 0002, South Africa" +"Faculty of Computer Science, Benemérita Universidad Autónoma de Puebla, Puebla, Mexico",19.0051844,-98.20441869999999,"Av San Claudio 14 Sur, Cd Universitaria, Cdad. Universitaria, 72592 Puebla, Pue., Mexico" +1,37.220166,-95.6990574,"612 E Walnut St, Independence, KS 67301, USA" +"Image Processing and Computer Vision Group, Universidad de Buenos Aires, Argentina",-34.5419202,-58.44227009999999,"Av. Int. Cantilo, Buenos Aires, Argentina" +"Innopolis University, Kazan, Russia",55.753449,48.74341099999999,"Universitetskaya St, 1, Innopolis, Respublika Tatarstan, Russia, 420500" +"Speech and Image Processing Research Lab, National Institute of Technology Silchar, 788010, Assam, India",24.7577144,92.79229289999999,"NIT Road, Fakiratilla, Silchar, Assam 788010, India" +University of São Paulo,-23.5613991,-46.7307891,"São Paulo - State of São Paulo, Brazil" +"Brunel University London, UB8 3PH, UK",51.5328677,-0.471229,"Kingston Ln, London, Uxbridge UB8 3PH, UK" +"Anhui University of Technology, Maanshan, China",31.692956,118.510711,"59 Hudong N Rd, Huashan Qu, Maanshan Shi, Anhui Sheng, China, 243000" +"Computer Science, King Mongkut's University of Technology Thonburi, Bangkok, Thailand",13.2311474,100.9584579,"Bang Phra, Si Racha District, Chon Buri 20110, Thailand" +"National Institute of Technology Silchar, India",24.7577144,92.79229289999999,"NIT Road, Fakiratilla, Silchar, Assam 788010, India" +"Guizhou Normal University, Guiyang, China",26.5869154,106.7227525,"116 Baoshan N Rd, Yunyan Qu, Guiyang Shi, Guizhou Sheng, China, 550000" +"Treelogic, Technological Scientific Park of Asturias, Llanera, Spain",43.421881,-5.820168,"Parque Tecnológico de Asturias, parcela 30, 33428, Asturias, Spain" +"Information Technology and Engineering, University of Ballarat, Australia",-37.626441,143.8910952,"University Dr, Mount Helen VIC 3350, Australia" +University of Sao Paulo,-23.5613991,-46.7307891,"São Paulo - State of São Paulo, Brazil" +"Electrical Engineering and Computer Science, Queens University Belfast, BT7 1NN, UK",54.581827,-5.9374652,"18 Malone Rd, Belfast BT9 6RT, UK" +"Indian Institute of Information Technology Chittoor, Sri City, A.P., India",13.5568171,80.0261283,"630 Gnan Marg, Sri City, Andhra Pradesh 517646, India" +"Southwest Petroleum University, Chengdu, China",30.821792,104.183563,"8 Xindu Ave, Xindu Qu, Chengdu Shi, Sichuan Sheng, China" +"Institute of Automation, Chinese Academy of Sciences & Boomhope Information and Technology Co., Ltd, Beijing, China",39.979456,116.3284567,"China, Beijing, Haidian, Zhongguancun South 3rd St, 中关村" +"Air Force Engineering University, Xi’an, China",34.273797,109.036171,"Baqiao, Xi'an, Shaanxi, China" +"Khalifa University of Science Technology & Research, Sharjah, United Arab Emirates",24.4473556,54.3949088,Abu Dhabi - United Arab Emirates +"University of Pannonia, H-8200 Veszprém Egyetem u. 10., czuni@almos.vein.hu",47.0879055,17.9081616,"Veszprém, Egyetem u. 10, 8200 Hungary" +"Institute of Image & Graphic, Chengdu, 610064, China",30.6360596,104.0909334,"Wuhou, Chengdu, Sichuan, China, 610064" +"Chongqing Vocational College of Transportation, Chongqing, 402247, China",29.407238,106.276214,"Jiangjin, Chongqing, China" +"Boğaziçi University, Istanbul, Turkey",41.0847571,29.0510399,"Bebek Mh., 34342 Beşiktaş/İstanbul, Turkey" +"Alibaba Group R&D, Hangzhou, P.R. China",30.274084,120.15507,"Hangzhou, Zhejiang, China" +"University of Electro-Communications, 1-5-1 Chofugaoka, Chofu-shi, Tokyo, 182-8585 Japan",35.6556498,139.5442039,"1 Chome-5-1 Chofugaoka, Chofu, Tokyo 182-8585, Japan" +"Xi¿an University of Techonology, China",34.251836,108.993218,"China, Shaanxi, Xi'an, Beilin, Jinhua S Rd, 金花南路5号 邮编 710048" +"Shanghai University of Engineering Science, Shanghai 201620, P. R. China",31.05723,121.208437,"333 Longteng Rd, Songjiang Qu, Shanghai Shi, China" +"North-China University of Technology, Beijing, China",39.927951,116.205525,"5 Jinyuanzhuang Rd, Shijingshan Qu, China, 100144" +"Universidad Tecnica Federico Santa Maria, Valparaiso, Chile",-33.0352386,-71.5967794,"1680 - Av. España, Valparaíso, Región de Valparaíso, Chile" +"Intelligent Systems Lab (LSI) from Universidad Carlos III de Madrid, Leganés, Spain",40.31705720000001,-3.7274678,"Calle Madrid, 126, 28903 Getafe, Madrid, Spain" +"College of Applied Science and Technology, Hainan University, China",20.058432,110.330623,"Qunxian N Rd, Meilan Qu, Haikou Shi, Hainan Sheng, China, 570208" +"Univ. of Minnesota, USA",44.97399,-93.2277285,"Minneapolis, MN 55455, USA" +"IIIT Chittoor, SriCity, Andhra Pradesh, India",13.5568171,80.0261283,"630 Gnan Marg, Sri City, Andhra Pradesh 517646, India" +"Xi'an Jiaotong-Liverpool University, Suzhou, 215123, China",31.274822,120.738094,"Wuzhong, Suzhou, China, 215123" +"Industry Centre, Shenzhen polytechnic, Shenzhen, China",22.580973,113.940407,"China, Guangdong, Shenzhen, Nanshan, 留仙大道" +"Maharashtra Institute of Technology, Pune, India",18.5183671,73.8147226,"Survey No. 124, MIT College Campus, Paud Rd, Kothrud, Pune, Maharashtra 411038, India" +"GREYC Laboratory, ENSICAEN - University of Caen - CNRS, 6 Boulevard Maréchal Juin, 14000, France",49.2147766,-0.3671692,"Boulevard Maréchal Juin, 14000 Caen, France" +m,37.09024,-95.712891,"Independence, KS 67301, USA" +"College of Information Science and Technology Beijing Normal University, Beijing Beijing, China",39.9614831,116.3675134,"Bei Jing Shi Fan Da Xue Xin Xi Ke Xue Yu Ji Shu Xue Yuan, BeiTaiPingZhuang, Haidian Qu, Beijing Shi, China" +"University of Rajshahi, Rajshahi, Bangladesh",24.3683017,88.6376927,"Rajshahi University, Administration Building 1, Rajshahi, Bangladesh" +"China National Computer Network Emergency Response Technical Team/Coordination Center of China, Beijing, China",39.9743696,116.3873427,"Guo Jia Ji Suan Ji Wang Luo Ying Ji Ji Shu Chu Li Xie Tiao Zhong Xin, Chaoyang Qu, Beijing Shi, China, 100029" +"Northumbria University Newcastle upon Tyne, NE1 8ST, UK",54.9767623,-1.6074983,"Sutherland Building, 2 Ellison Pl, Newcastle upon Tyne NE1 8ST, UK" +"Peking University & Alibaba Group, Beijing, China",39.986913,116.3058739,"5 Yiheyuan Rd, Haidian Qu, Beijing Shi, China, 100080" +"University of Kansas, Lawrence, KS",38.9543439,-95.2557961,"1450 Jayhawk Blvd, Lawrence, KS 66045, USA" +MIT World Peace University,18.5183671,73.8147226,"Survey No. 124, MIT College Campus, Paud Rd, Kothrud, Pune, Maharashtra 411038, India" +"Ideal Institute of Information and Technology, Northeast Normal University, NENU, Changchun, China",43.817071,125.323544,"Changchun, Jilin, China" +"TOBB University of Economics and Technology, Ankara, Turkey",39.92130969999999,32.7988233,"Söğütözü Mahallesi, Söğütözü Cd. No:43, 06510 Çankaya/Ankara, Turkey" +"Central South University, Changsha, People’s Republic of China",28.16437,112.93251,"932 Lushan S Rd, Yuelu Qu, Changsha Shi, Hunan Sheng, China" +"College of Computer and Communication Engineering, China University of Petroleum, Dongying, China",37.468675,118.541767,"Chenguang W Rd, Dongying Qu, Dongying Shi, Shandong Sheng, China, 257000" +"Royal Melbourne Institute of Technology University , Melbourne, Australia",-37.809286,144.9644092,"124 La Trobe St, Melbourne VIC 3000, Australia" +"Engineering Research Institute of Aragon (I3A), University of Zaragoza, Zaragoza, Spain",41.6850158,-0.8874757999999999,"C/ Mariano Esquillor s/n, 50018 Zaragoza, Spain" +"Computer Vision and Machine Learning Research Group, University of Central Lancashire, Preston, UK",53.7641378,-2.7092453,"Fylde Rd, Preston PR1 2HE, UK" +"NJ, USA",40.0583238,-74.4056612,"New Jersey, USA" +"Center for Automation Research, UMIACS University of Maryland, College Park, MD 20742",38.9907524,-76.9362708,"115 Paint Branch Dr, College Park, MD 20742, USA" +"Hebei College of Industry and Technology, Shijiazhuang 050091, China",37.980386,114.462042,"626 Hongqi St, Qiaoxi Qu, Shijiazhuang Shi, Hebei Sheng, China" +"Institute of Intelligent System and Decision, Zhejiang, China",29.1416432,119.7889248,"Zhejiang, China" +"ECE, Mepco Schlenk Engineering College, Sivakasi, India",9.524678699999999,77.8553089,"Mepco Nagar, Mepco Engineering College Post, Sivakasi, Virudhunagar, Tamil Nadu 626005, India" +"VIT University, Vellore, Tamilnadu, India",12.972067,79.1595619,"Gorbachev Rd, Vellore, Tamil Nadu 632014, India" +"National University of Ireland, Maynooth, Ireland",53.38447799999999,-6.6011155,"Mariavilla, Maynooth, Co. Kildare, Ireland" +"Shri Ram Murti Smarak College of Engineering & Technology, Bareilly, India",28.4760187,79.4346714,"Ghanghoua Ghanghori, Uttar Pradesh 243202, India" +"SVNIT, Surat, India",21.167353,72.7850947,"Ichchhanath Surat- Dumas Road, Keval Chowk, Surat, Gujarat 395007, India" +"Centre for Quantum Computation and Information Systems, University of Technology, Sydney, Sydney, Australia",-33.918015,151.2292294,"2, Newton Building, UNSW, Kensington NSW 2033, Australia" +"Shanghai University of Electric Power, Shanghai, China 200090",31.274773,121.547575,"1193 Hejian Rd, Yangpu Qu, Shanghai Shi, China, 200093" +"École centrale de Lyon, Écully, France",45.7838596,4.7691899,"36 Avenue Guy de Collongue, 69134 Écully, France" +"Beijing Laboratory of Intelligent Information Technology, Beijing Institute of Technology, Beijing, 100081, P.R. China",39.9924551,116.3499005,"Chengfu Rd, WuDaoKou, Haidian Qu, Beijing Shi, China, 100083" +"Wuhan University Wuhan, China",30.5360485,114.3643219,"Wuchang, Wuhan, Hubei, China, 430072" +"S.V. National Institute of Technology, Surat, 329507, India",21.167353,72.7850947,"Ichchhanath Surat- Dumas Road, Keval Chowk, Surat, Gujarat 395007, India" +"University of Bucharest, 14 Academiei Street, Bucharest, Romania",44.4354633,26.0996883,"Strada Academiei 14, București 010014, Romania" +"University of California at Riverside, Riverside, CA, USA",33.9737055,-117.3280644,"900 University Ave, Riverside, CA 92521, USA" +"Visualization and Perception Lab, Chennai, India",12.9890989,80.2288467,"CSB, Indian Institute Of Technology, Chennai, Tamil Nadu 600036, India" +"IBM, Yorktown Heights, NY, USA",41.2105554,-73.8034271,"1101 Kitchawan Rd, Yorktown Heights, NY 10598, USA" +"Centre de Developpement des Technologies Avancees, Algeria",36.689487,2.981877,"CDTA Cité, 20 Août 1956, Baba Hassen، 16081, Algeria" +"Central China Normal University, Wuhan, China",30.518801,114.358401,"152 Luoyu Rd, Hongshan Qu, Wuhan Shi, Hubei Sheng, China, 430072" +"Birla Institute of Technology and Science (BITS), Pilani, India",28.3588163,75.58802039999999,"Pilani, Rajasthan 333031, India" +"National University of Ireland, Galway",53.2770243,-9.0614864,"University Rd, Galway, Ireland" +"Industrial Technology Research Institute, China",31.4032991,120.8948911,"Boshi Rd, Kunshan Shi, Suzhou Shi, Jiangsu Sheng, China" +"WaveLab - The Multimedia Signal Processing and Security Lab, Universität Salzburg, Austria",47.79722959999999,13.0479826,"Kapitelgasse 4/6, 5020 Salzburg, Austria" +"Université de Biskra, Biskra, Algeria",34.8455802,5.7481915,"BP 145 RP، بسكرة 07000, Algeria" +Zürich University Hospital,47.3766843,8.5491457,"Rämistrasse 100, 8091 Zürich, Switzerland" +"Autonomous University of Madrid, Madrid, Spain",40.5466983,-3.6943619,"Ciudad Universitaria de Cantoblanco, 28049 Madrid, Spain" +"Indian Institute of Technology Kharagpur, Kharagpur, West Bengal, India",22.3149274,87.31053109999999,"Kharagpur, West Bengal 721302, India" +"Pontifícia Universidade Católica do Rio Grande do Sul, Brazil",-30.0346316,-51.2176986,"State of Rio Grande do Sul, Brazil" +"Sichuan Film and Television University, Chengdu, China",30.724051,104.026606,"China, Sichuan, Chengdu, Jinniu, 金鑫大道沙西线9 邮政编码: 610036" +"Center for Research in Intelligent Systems, University of California Riverside, Riverside, CA, 92521, USA",33.9737055,-117.3280644,"900 University Ave, Riverside, CA 92521, USA" +"ISIR laboratory, Pierre and Marie Curie university, Paris Cedex 05, France",48.8471036,2.357499,"4 Place Jussieu, 75005 Paris, France" +"Virtual Humans Simulation Lab, Pontifical Catholic University of Rio Grande do Sul, Porto Alegre, Brazil",-30.0593446,-51.1734912,"Av. Ipiranga, 6681 - Partenon, Porto Alegre - RS, 90619-900, Brazil" +"Faculty of Electrical Engineering and Information Technology, Slovak University of Technology, Bratislava",48.15185320000001,17.0733446,"Ilkovičova 2961/3, 841 04 Karlova Ves, Slovakia" +"University of North Dakota, Grand Forks, North Dakota 58202",47.922891,-97.0768014,"Grand Forks, ND 58202, USA" +"Universitat Jaume I, Castelló de la Plana, Spain",39.9945711,-0.0689003,"Avenida de Vicent Sos Baynat, s/n, 12071 Castelló de la Plana, Castelló, Spain" +"Audio-Visual Information Processing Laboratory (VIPLAB), Pontifical Catholic University of Minas Gerais (PUC Minas)",-18.512178,-44.5550308,"State of Minas Gerais, Brazil" +"Veermata Jijabai Technological Institute, Mumbai, India",19.0222181,72.85612119999999,"H R Mahajani Rd, Matunga, Mumbai, Maharashtra 400019, India" +"Face Recognition and Artificial Vision Group, Universidad Rey Juan Carlos, C/ Tulipán, s/n, Móstoles E-28933 Madrid (Spain). cristina.conde@urjc.es",40.3358661,-3.8769432,"Calle Tulipán, s/n, 28933 Móstoles, Madrid, Spain" +"Hennan Polytechnic University, Jiaozuo, China",35.241085,113.234217,"Heping St, Jiefang Qu, Jiaozuo Shi, Henan Sheng, China, 454000" +"Atlanta, GA 30302, USA",33.75,-84.39999999999999,"Atlanta, GA 30302, USA" +"Engineering in Wuhan University of Technology, Wuhan, China",30.542755,114.292403,"China, Hubei, Wuhan, Wuchang, 临江大道" +"Luohu Branch, Shenzhen Municipal Public Security Bureau, Guangdong, China",22.542965,114.108743,"Caiwuwei, Luohu, Shenzhen, China, 518016" +"Telecommunications and Informatics Gdansk University of Technology, Gdansk, Poland",54.3716751,18.6163277,"Gabriela Narutowicza 11/12, 80-233 Gdańsk, Poland" +"M.S. Ramaiah Inst. of Tech., Bangalore - 560054, India",13.0304619,77.5646862,"MSRIT Post, M S Ramaiah Nagar, MSR Nagar, Bengaluru, Karnataka 560054, India" +"Research and Development Center «ELVEES», Zelenograd, Russia",55.984218,37.2180741,"проезд 4922, дом 4 строение 2, Зеленоград, gorod Moskva, Russia, 124498" +"Solapur University, N.B.N.Sinhgad College of Engineering, Kegoan, Solpaur, India",17.727823,75.8503355,"Gat No. 38/1 B, Solapur - Pune Hwy, Kegaon, Maharashtra 413255, India" +"Visvesvaraya National Institute of Technology, Nagpur, India",21.1263,79.0515878,"S Ambazari Rd, Ambazari, Nagpur, Maharashtra 440010, India" +"College of Science, Diyala University, Diyala, 32001, Iraq",33.757893,44.6064003,"Baqubah 32001, Iraq" +"Guangxi Key Laboratory of Trusted Software, Guilin University of Electronic Technoloy, Guilin, China",25.28164,110.337304,"1 Jinji Rd, Qixing Qu, Guilin Shi, Guangxi Zhuangzuzizhiqu, China" +Toyohashi University of Technology,34.7017563,137.4086346,"Hibarigaoka-1−1 Tenpakucho, Toyohashi, Aichi Prefecture 441-8580, Japan" +"Beckman Institute Advanced Science and Technology, University of Illinois at Urbana–Champaign, Urbana, IL, USA",40.1157707,-88.2272043,"405 N Mathews Ave, Urbana, IL 61801, USA" +"SRO Satellite Centre (ISAC), Bengaluru, India",12.9601126,77.6549828,"HAL Old Airport Rd, HAL Airport Area, Kodihalli, Bengaluru, Karnataka 560017, India" +"The Guangzhou Key Laboratory of Digital Content Processing and Security Technologies, Guangzhou, China",23.12911,113.264385,"Guangzhou, Guangdong, China" +"CGPIT, Uka Tarsadia University, Bardoli, Surat, India",21.0676868,73.13155429999999,"Maliba Campus, Gopal Vidyanagar, Bardoli - Mahuva Road, Tarsadi, Surat, Gujarat 394350, India" +"France Telecom R&D division, 28, Chemin du Vieux Chêne 38243 Meylan, France. e-mail: sebastien.roux@orange-ftgroup.com",45.2095831,5.793992299999999,"28 Chemin du Vieux Chêne, 38240 Meylan, France" +"UFMG & Kunumi, Brazil",-19.8690878,-43.9663841,"Av. Pres. Antônio Carlos, 6627 - Pampulha, Belo Horizonte - MG, 31270-901, Brazil" +"Indian Institute of Technology Guwahati, Guwahati, Assam, India",26.187859,91.6915834,"Surjyamukhi Road, North, Amingaon, Guwahati, Assam 781039, India" +"Institut TELECOM, T&M SudParis, 9 Rue Charles Fourrier, Evry, FRANCE; mohamed.anouar_mellakh@it-sudparis.eu",48.6246883,2.4432511,"Courcouronnes, 9 Rue Charles Fourier, 91000 Évry, France" +"Shanghai, China",31.2303904,121.4737021,"Shanghai, China" +"College of Forestry, Nanjing Forestry University, Nanjing, China",32.080904,118.812736,"159 Longpan Rd, Xuanwu Qu, Nanjing Shi, Jiangsu Sheng, China, 210037" +"Faculty of Science and Technology University of Macau Macau SAR, China",22.1524575,113.5650208,"Av. Wai Long, Macau" +"Centre for Development of Advanced Computing (CDAC), Kolkata, India",22.5687555,88.4354717,"Plot - E-2/1, Block-GP, Sector-V, Salt Lake Electronics Complex, Bidhannagar, Kolkata, West Bengal 700091, India" +"Institute of Image Processing and Pattern Recognition, Xi’an Jiaotong University, Xi’an, Shaanxi, China",34.250803,108.983693,"28 Xianning W Rd, JiaoDa ShangYe JieQu, Beilin Qu, Xian Shi, Shaanxi Sheng, China" +"B. Tech Graduate, ECE, MSIT, C-4 Janakpuri, New Delhi, India",28.6210152,77.0926258,"C-4 MARKET, Janakpuri, New Delhi, Delhi 110058, India" +"Mireo d.d., Buzinski prilaz 32, HR-10000 Zagreb, Croatia",45.746113,15.996763,"Buzinski prilaz 32, 10000, Zagreb, Croatia" +"King Fahd University of Petroleum & Minerals, Dhahran, KSA",26.3071046,50.14594229999999,"University Blvd, King Fahd University of Petroleum and Minerals, Az Zahran Saudi Arabia" +"Gulbarga University, Kalaburgi, India",17.3103594,76.8729891,"Sedam Road, Jnana Ganga, Kalaburagi, Karnataka 585106, India" +"NorthWest University Xi'an, China",34.14552,108.875136,"Chang'an Western University Town Shangquan, Chang'an, Xi'an, China" +India,20.593684,78.96288,India +"Telecommunications and Information Technology, Polytechnic University of Bucharest, Romania",44.4337844,26.0575959,"Complex studențesc Leu, corp A, B, Bulevardul Iuliu Maniu 1-3, București 061071, Romania" +"The City College of New York, 10031, USA",40.8200471,-73.9492724,"160 Convent Ave, New York, NY 10031, USA" +"Aalborg University Copenhagen, Denmark",55.6503358,12.5432553,"A. C. Meyers Vænge 15, 2450 København, Denmark" +"The University of the West Indies, Mona Campus, Jamaica",18.002929,-76.7499366,"Mona, Kingston, Jamaica" +"Batman University, Batman, Turkey",37.7873005,41.06279139999999,"Yenişehir Mahallesi, Batman Üniversitesi Merkez Kampüsü, 72000 Merkez/Batman Merkez/Batman, Turkey" +"Research Institution of Intelligent Control and Testing, 518055, China",22.581808,113.9656548,"Nanshan, Shenzhen, Guangdong, China, 518055" +"IIT, Guwathi, India",26.187859,91.6915834,"Surjyamukhi Road, North, Amingaon, Guwahati, Assam 781039, India" +University of of Modena and Reggio Emilia,44.6451046,10.9279268,"Via Università, 4, 41121 Modena MO, Italy" +"Institute of Computer Science and Technology, Beijing, China",39.9906583,116.3149389,"China, Beijing, Haidian, Zhongguancun N St, 逸夫苑" +"Avinashilingam Institute for Home Science and Higher Education for Women, Coimbatore-46",11.0198317,76.95160059999999,"Bharathi Park Road Tatabad, Forest College Campus, Saibaba Colony, Coimbatore, Tamil Nadu 641043, India" +"Northern University. Dhaka, Bangladesh",23.7410196,90.3749701,"Rd No. 4A, Dhaka 1205, Bangladesh" +"Jinling Institute of Technology, Nanjing, China",32.025815,118.793999,"Qinhuai, Nanjing, China" +"Southwest China University, Chongqing, China",29.8195959,106.4266239,"Bayi Rd, Beibei Qu, Chongqing Shi, China, 400716" +"University Medical Center (CMU), Geneva",40.4428081,-79.94301279999999,"5000 Forbes Ave, Pittsburgh, PA 15213, USA" +"Indiana University-Purdue University Indianapolis, Indianapolis, IN 46202 USA",39.7738832,-86.1763393,"420 University Blvd, Indianapolis, IN 46202, USA" +"Southwest Petroleum University, Chengdu, China",30.821792,104.183563,"8 Xindu Ave, Xindu Qu, Chengdu Shi, Sichuan Sheng, China" +"CSE, Park College of Engineering and Technology, Kaniyur, Coimbatore, Tamilnadu, India",11.0921383,77.1633161,"NH 47, Avinashi - Coimbatore Road, Kaniyur, Tamil Nadu 641659, India" +"Xi’an Jiaotong University, Xi’an, Shaanxi, China",34.250803,108.983693,"28 Xianning W Rd, JiaoDa ShangYe JieQu, Beilin Qu, Xian Shi, Shaanxi Sheng, China" +"BV-Tech Ricerca, Rovereto (Italy)",45.8896497,11.0318828,"Via Zeni Fortunato, 8, 38068 Rovereto TN, Italy" +"University of Pernambuco - UPE, Pernambuco - Brazil",-8.0440603,-34.8861167,"Av. Gov. Agamenon Magalhães - Santo Amaro, Recife - PE, 50100-010, Brazil" +"Information Research Institute of Shandong Academy of Sciences, Jinan, China",36.7084734,117.0773869,"202 Gongye N Rd, Licheng Qu, Jinan Shi, Shandong Sheng, China, 250032" +"Circuits, and Architectures, Faculty of Electronics, Telecommunication and Information Technology, University “Politehnica” of Bucharest, Romania",44.4337844,26.0575959,"Complex studențesc Leu, corp A, B, Bulevardul Iuliu Maniu 1-3, București 061071, Romania" +"University of Melbourne, Melbourne, Australia",-37.7963689,144.9611738,"Parkville VIC 3010, Australia" +"Civil Aviation University of China, Tianjin, China",39.1118774,117.3497451,"2898 Jinbei Hwy, Dongli Qu, Tianjin Shi, China, 300300" +"Zhongyuan University of Technology, Zhengzhou, China",34.58603,113.688309,"China, Henan, Zhengzhou, Xinzheng, Bohai Rd, 淮河路1号" +"Cyprus International University, Nicosia, Northern Cyprus, Mersin 10, Turkey",35.220923,33.416627,"Uluslarararası Kıbrıs Üniversitesi Kampüsü, Haspolat" +"Motilal Nehru National Institute of Technology, Allahabad, India",25.4920102,81.8639163,"Barrister Mullah Colony, MNNIT Allahabad Campus, Teliarganj, Prayagraj, Uttar Pradesh 211004, India" +"Shandong University at Weihai, China",37.529734,122.060834,"180 Wenhua W Rd, Huancui Qu, Weihai Shi, Shandong Sheng, China, 264209" +"Low Speed Aerodynamics Institute of China Aerodynamics Research and Development Center, Mianyang, China",31.46746,104.679004,"Mianyang, Sichuan, China" +"Nanjing Forest University, Nanjing, China",32.080904,118.812736,"159 Longpan Rd, Xuanwu Qu, Nanjing Shi, Jiangsu Sheng, China, 210037" +"Division of System Integration and IC Design, Institute of Nano-tech and Nano-bionics, CAS, Suzhou, China",31.298974,120.585289,"Suzhou, Jiangsu, China" +"Xi'an Jiaotong-Liverpool University, Suzhou, 215123, China",31.274822,120.738094,"Wuzhong, Suzhou, China, 215123" +India,20.593684,78.96288,India +c,37.09024,-95.712891,"Independence, KS 67301, USA" +"Oxford, UK",51.7520209,-1.2577263,"Oxford, UK" +"Computer Science at the University of Maryland, College Park, MD 20742",38.9892032,-76.9361955,"College Park, MD 20740, USA" +"Vrije Universiteit Amsterdam, Amsterdam, The Netherlands",52.3337568,4.8657199,"De Boelelaan 1105, 1081 HV Amsterdam, Netherlands" +"Fujifilm Software, San Jose, USA",37.3676203,-121.9642857,"2250 Martin Ave, Santa Clara, CA 95050, USA" +"Nanjing Forestry University, Nanjing, China",32.080904,118.812736,"159 Longpan Rd, Xuanwu Qu, Nanjing Shi, Jiangsu Sheng, China, 210037" +"Anhui Keli Information Industry Co. Ltd., Hefei, China",31.820591,117.227219,"Hefei, Anhui, China" +"VIT University, Vellore, India",12.972067,79.1595619,"Gorbachev Rd, Vellore, Tamil Nadu 632014, India" +"Xiamen University of Technology, Xiamen 361024, China",24.43944,118.090169,"394 Siming S Rd, Siming Qu, Xiamen Shi, Fujian Sheng, China, 361005" +"Bangladesh University of Engineering and Technology, Dhaka-1205, Bangladesh",23.7276638,90.3928418,"BUET Central Road, Dhaka 1000, Bangladesh" +"Computational Vision Group, University of Reading, UK",51.4414205,-0.9418157,"Reading, UK" +"University of Brasilia, DF 70910-900 Brazil",-15.7631573,-47.8706311,"Brasilia - Federal District, 70910-900, Brazil" +"Image Processing and Analysis Laboratory, University “Politehnica” of Bucharest, Romania",44.4386064,26.0494925,"Splaiul Independenței 313, București 060042, Romania" +"University of Leicester, Leicester, Unithed Kingdom",52.6211393,-1.1246325,"University Rd, Leicester LE1 7RH, UK" +"Asian Institute of Technology (AIT), Pathum Thani 12120, Thailand",14.0785,100.6140362,"58 หมู่ที่ 9 Phahonyothin Rd, Khlong Nueng, Khlong Luang District, Pathum Thani 12120, Thailand" +"Intelligent Systems Laboratory (LSI), Universidad Carlos III de Madrid, Leganés, Spain",40.31705720000001,-3.7274678,"Calle Madrid, 126, 28903 Getafe, Madrid, Spain" +"Instituto Nacional de Astrofísica, Óptica y Electrónica, Luis Enrique Erro No.1, Tonantzintla, Puebla, México. CP 72840",19.0323107,-98.31537019999999,"Luis Enrique Erro 1, Sta María Tonanzintla, 72840 Puebla, Pue., Mexico" +"Hebei University of Technology, Tianjin, China",39.179635,117.165882,"8 Dingzigu 1st Rd, Hongqiao Qu, China, 300131" +"Nanjing University of Information Science & Technology, Nanjing, China",32.20541,118.726956,"Pukou, Nanjing, China, 210044" +"Northwestern Polytechnic University, Xi’an, China",34.2416608,108.9367387,"127 Youyi W Rd, Beilin Qu, Xian Shi, Shaanxi Sheng, China, 710065" +"Central University of Finance and Economics (CUFE), Beijing, China",40.169887,116.283102,"Changping, Beijing, China, 102202" +"National Research University of Electronic Technology (MIET), Zelenograd, Russia",55.9829727,37.20930550000001,"Ploshchad' Shokina, 1, Zelenograd, Moskva, Russia, 124498" +"Chengdu, Sichuan, China",30.572815,104.066801,"Chengdu, Sichuan, China" +"IIT Madras, India",12.9914929,80.2336907,"Indian Institute Of Technology, Chennai, Tamil Nadu 600036, India" +"Batman University, Batman, Turkey",37.7873005,41.06279139999999,"Yenişehir Mahallesi, Batman Üniversitesi Merkez Kampüsü, 72000 Merkez/Batman Merkez/Batman, Turkey" +"Vilnius Gediminas Technical University, Naugarduko g. 41-413, Vilnius LT-03227, Lithuania",54.6737777,25.2662225,"Naugarduko g. 41, Vilnius 03227, Lithuania" +"NPU-VUB Joint AVSP Research Lab, Northwestern Polytechnical, University (NPU) Shaanxi Key Lab on Speech and Image Information Processing 127 Youyi Xilu, Xi’an, 710072, China",34.2416608,108.9367387,"127 Youyi W Rd, Beilin Qu, Xian Shi, Shaanxi Sheng, China, 710065" +"HeNan Radio and Television University, Zhengzhou, People’s Republic of China",34.80371,113.802319,"Jinshui, Zhengzhou, Henan, China, 450046" +"Changchun University of Technology, Changchun China",43.834677,125.303135,"7989 Weixing Rd, Chaoyang Qu, Changchun Shi, Jilin Sheng, China, 130012" +"University of Science & Technology, Beijing, China",39.989458,116.356945,"30 Xueyuan Rd, Haidian Qu, Beijing Shi, China, 100083" +"University of Luxembourg, Luxembourg",49.504254,5.948386,"2, avenue de l'Université, 4365 Esch-sur-Alzette, Luxembourg" +"China University of Geosciences, Wuhan, 430074, China",30.51735439999999,114.4007303,"388 Lumo Rd, Hongshan Qu, Wuhan Shi, Hubei Sheng, China, 430073" +"Ecole Centrale de Lyon, Lyon, 69134, France",45.7838596,4.7691899,"36 Avenue Guy de Collongue, 69134 Écully, France" +"GM Advanced Technical Center Israel, Hamada 7, Herliya, Isarel",32.1665923,34.8127468,"46733, HaMada St 7, Herzliya, Israel" +"Wuhan University of Science and Technology, Wuhan, P. R. China, 430081",30.621883,114.370852,"China, Hubei, Wuhan, Qingshan, 和平大道947号" +"Professor, Visualization and Perception Lab, Chennai, India",13.0826802,80.2707184,"Chennai, Tamil Nadu, India" +"Beijing University of Civil Engineering and Architecture, Beijing, China",39.93532099999999,116.342721,"1 Zhanlanguan Rd, DongWuYuan, Xicheng Qu, China, 100044" +"IIT Guwahati, 781039, Assam, India",26.187859,91.6915834,"Surjyamukhi Road, North, Amingaon, Guwahati, Assam 781039, India" +"Elektrik-Elektronik Mühendisliği Bölümü, Işık Üniversitesi, İstanbul, TÜRKİYE",41.16889949999999,29.5639972,"Meşrutiyet Mahallesi, Işık Üniversitesi, 34980 Şile/İstanbul, Turkey" +"Université Paris-Saclay, Orsay, France",48.6976847,2.1764839,"15 Rue Georges Clemenceau, 91400 Orsay, France" +"Fuzhou University, Fuzhou, Fujian, China",26.060092,119.202892,"Fuzhou University, Minhou Xian, Fuzhou Shi, China" +"Centre for Vision, Speech and Signal Processing, University of Surrey Guildford, GU2 7XH Surrey, UK. N.Poh@surrey.ac.uk",51.2435451,-0.5885743999999999,"388 Stag Hill, Guildford GU2 7XH Stag Hill, Guildford GU2 7XH, UK" +"University of N. British Columbia, Prince George, Canada V2N 4Z9",53.8922034,-122.8133607,"3333 University Way, Prince George, BC V2N 4Z9, Canada" +"University of Bucharest, Bucharest, Romania",44.4355381,26.1011433,"Bulevardul Regina Elisabeta 4-12, București 030018, Romania" +"Beijing Institute of Technology, Beijing 100081, CHINA. jiayunde@bit.edu.cn",39.964431,116.310319,"Side Rd of N. 3rd Ring Rd W, Haidian Qu, Beijing Shi, China" +"MM University, Mullana, India",30.251223,77.047538,"Mullana University Road, Mullana, Ambala, Haryana 133207, India" +"Institute of Road and Transport Technology, Erode-638 316, India",11.4151249,77.66557019999999,"Near Vasavi College PO, Erode, Tamil Nadu 638316, India" +"Universidad Politécnica de Madrid, Madrid, Spain",40.4486372,-3.7192798,"Calle Ramiro de Maeztu, 7, 28040 Madrid, Spain" +"Anhui University of Technology, Maanshan, Anhui, China",31.692956,118.510711,"59 Hudong N Rd, Huashan Qu, Maanshan Shi, Anhui Sheng, China, 243000" +"P A College of Engineering, Mangalore, India",12.806886,74.932597,"Near Mangalore University, Kudla, Konaje, Karnataka 574153, India" +"Faculty of Engineering, Kasetsart University, Bangkok, Thailand",13.845747,100.5701901,"50 Phahonyothin Rd, Khwaeng Lat Yao, Khet Chatuchak, Krung Thep Maha Nakhon 10903, Thailand" +"JNTUACE, Ananthapuramu, India",14.6515242,77.6080818,"Sir Mokshagundam Visvesvaraya Road, Saradha Nagar, Anantapur, Andhra Pradesh 515002, India" +"Qihoo 360 AI Institute, Beijing, China",39.90419989999999,116.4073963,"Beijing, China" +"Weill Cornell Medicine, Center for Autism and the Developing Brain, White Plains, NY",41.0247309,-73.75380000000001,"21 Bloomingdale Rd, White Plains, NY 10605, USA" +"Guangxi University, Electronics and Information, Nanning, China",22.8376,108.289839,"Chongwen Rd, Xixiangtang Qu, Nanning Shi, Guangxi Zhuangzuzizhiqu, China, 530001" +"INRIA Grenoble-Rhone-Alpes, Cedex",45.217886,5.807369,"655 Avenue de l'Europe, 38330 Montbonnot-Saint-Martin, France" +"Hubei University of Science and Technology, P. R. China",37.9763587,114.5212977,"26 Yuxiang St, Yuhua Qu, Shijiazhuang Shi, Hebei Sheng, China" +"Minjiang University, Fuzhou, Fujian, China, 430073",26.064704,119.169116,"1 Wenxian Rd, Minhou Xian, Fuzhou Shi, Fujian Sheng, China" +"Coimbra, Portugal",40.2033145,-8.4102573,"Coimbra, Portugal" +"Instituto Politécnico Nacional, ESIME, Culhuacan, Mexico D.F.",19.3302206,-99.111921,"Avenida Santa Ana, San Francisco Culhuacan, 04260 Coyoacán, CDMX, Mexico" +"MBRDNA, Sunnyvale, US",37.3862972,-122.0361744,"309 N Pastoria Ave, Sunnyvale, CA 94085, USA" +"Informatics institute of Technology, Colombo, Sri Lanka",6.865139099999999,79.8598574,"57 Ramakrishna Rd, Colombo 00600, Sri Lanka" +"Curtin University of Technology, GPO Box U1987, Perth, WA 6845, Australia. senjian@cs.curtin.edu.au",-32.0061951,115.8944182,"Kent St, Bentley WA 6102, Australia" +"Yanshan University, Qinhuangdao, China",39.904508,119.539719,"438 Hebei Street West Section, Haigang Qu, Qinhuangdao Shi, Hebei Sheng, China" diff --git a/scraper/reports/doi_institutions_not_found.csv b/scraper/reports/doi_institutions_not_found.csv index 2bb500ff..ba885c0b 100644 --- a/scraper/reports/doi_institutions_not_found.csv +++ b/scraper/reports/doi_institutions_not_found.csv @@ -433,3 +433,30 @@ b099df0a0c5abeff5586a1389e9278613e6c0c64,Flame detection using deep learning,"Au 8ac2d704f27a2ddf19b40c8e4695da629aa52a54,Expressions Recognition of North-East Indian (NEI) Faces,"Department of Physics, Tripura University (A Central University), Suryamaninagar, India" 7e5fc58d742ac5fbc16c3e33284c08fe9d9556ec,Meerkat: A framework for developing presence monitoring software based on face recognition,IP-based Communication Laboratory dd0258367fadb632b612ccd84fbc1ef892e70aeb,Face Recognition Method Combining 3D Face Model with 2D Recognition,"Xi'an Shiyou University, China" +095251493ac774df0a737bb8023cffd036edccd0,Person Re-identification using prototype formation,"Department of CSE, National Institute of Technology, Rourkela, India, 769008" +4820e34baf57e7b3d8d70df915c1710e6e93d631,A real-time multi-class multi-object tracker using YOLOv2,Poongsan Defense R&D Institute +38f1d8d25c0332798e0929594af2c43092d2c5c8,Face recognition via fast dense correspondence,"Key Lab of Ministry of Education for Broad Band Communication and Sensor Network Technology, Nanjing University of Posts and Telecommunications, Nanjing, China" +d93fd354b469190ab73826f416d58840e8efa13c,Face recognition using Multi-modal Binary Patterns,"Gipsa-lab, Grenoble INP, France" +673f17802bb99766407c70804c17357c3c448b6f,CAS(ME)$^2$ : A Database for Spontaneous Macro-Expression and Micro-Expression Spotting and Recognition,"CAS Key Laboratory of Behavior Sciences, Institute of Psychology, Beijing, China" +e96540252f2f83e394012d653452411efb9f744f,Face expression recognition system based on ripplet transform type II and least square SVM,"Department of Computer Science and Engineering, National Institute of Technology, Rourkela, India" +9c54c038664ec0c167211e9855a2275a97101708,Tracking Human Pose Using Max-Margin Markov Models,"Centre for Quantum Computation & Intelligent Systems and the Faculty of Engineering and Information Technology, University of Technology, Sydney, 81 Broadway Street, Ultimo, NSW, Australia" +0185bbb0f45bea11210689543fec253e87abde82,Neighborhood repulsed metric learning for kinship verification,"Captial Normal University, Beijing" +43eee49e372e5299608a79f8491fcf40998028df,"Complex event processing for content-based text, image, and video retrieval","Army Research Laboratory, Computational and Information Sciences Directorate" +6681006eaa13e03d8cf87cf44797e87ffbe0826b,Assistive Intelligent Transportation Systems: The Need for User Localization and Anonymous Disability Identification,"Computer Engineering Department, Universidad de Alcala de Henares, Madrid, Madrid, Spain" +bb93c56a44a244601daad96a7bb94e41609afc4d,Towards Improved Design and Evaluation of Epileptic Seizure Predictors,"Mayo Systems Electrophysiology Laboratory, Departments of Neurology and Biomedical EngineeringMayo Clinic" +4555cee0e8d34cce2498f547c0631955318da952,A deep neural network approach to fusing vision and heteroscedastic motion estimates for low-SWaP robotic applications,"Micro and Nano Devices and Materials Branch in the Sensors and Electron Devices Directorate at the US Army Research Laboratory, Adelphi, MD 20783" +685fcf13c5e261bf4851ddd1273e048869124ac2,Joint label-interaction learning for human action recognition,"Zhejiang University of Technology 288 Liuhe Rd. Hangzhou, P. R. China, 310023" +19d4b3679294563247c126148912d44cbf03e40e,Value-Aware Resampling and Loss for Imbalanced Classification,"Information Science School, Guangdong University of Finance and Economics, China" +0f7a5e5cd62066d2207a3b51f2cf26dbec1f134f,Face recognition through mismatch driven representations of the face,"Dept. of Electr. & Comput. Eng., Carnegie Mellon Univ., Pittsburgh, PA, USA" +bb2f61a057bbf176e402d171d79df2635ccda9f6,Multi-modal joint embedding for fashion product retrieval,Wide Eyes Technologies +e860db656f39d738050b5f3e0bf72724e6a4ad5c,Analysis of landmarks in recognition of face expressions,"Department of Statistics, University of Nebraska-Lincoln, Lincoln, USA" +a432ee5977443b5c29001f4bd10c6303cc364d4d,Polysemious visual representation based on feature aggregation for large scale image applications,"Key Laboratory of Intelligent Information Processing of Chinese Academy of Sciences (CAS), Institute of Computing Technology(ICT), Beijing, China" +3d38022d7ba71e865ca406d28acd3fe547024319,Unsupervised Local Facial Attributes Transfer Using Dual Discriminative Adversarial Networks,"Academy of Broadcasting Science, China" +cb8382f43ce073322eba82809f02d3084dad7969,Facial Expression Recognition using 2D Stationary Wavelet Transform and Gray-Level Co-occurrence MatrixP@13-17,"Department of Computer Science and Engineering, National Institute of Technology, Rourkela, Odisha, India" +eb3436a52fac7dd498efeaad0861c39d5361f7f2,Convolutional neural network for motorbike detection in dense traffic,"Faculty of Computer Science and Engineering, Ho Chi Minh City University of Technology, Ho Chi Minh City, Vietnam" +5e53f530871b5167be0f224993be8a38e85796e8,Automatic facial expression recognition: A survey based on feature extraction and classification techniques,"Shree Vaishnav Institute of Technology and Science, Indore (M.P) India" +eefc06f6e6af1c2b9b33fb12bdaff73e19a31d6c,The IST-EURECOM Light Field Face Database,"Instituto de Telecomunicações, Instituto Superior Técnico - Universidade de Lisboa, Lisbon, Portugal" +fac5a9a18157962cff38df6d4ae69f8a7da1cfa8,Face recognition from a single training image under arbitrary unknown lighting using spherical harmonics,"Dept. of Comput. Sci., State Univ. of New York, Stony Brook, NY, USA" +bf8fcb7860bc89c332d87ebacd260bdc5a30b4ce,"Face recognition using Symlet, PCA and cosine angle distance measure","University school of Information and Communication technology, Guru Gobind Singh Indraprastha University, Dwarka, New-Delhi, India" +ed779cc4f026f6ac22f5ef0c34126138e1ebc8b2,Audio Visual Recognition of Spontaneous Emotions In-the-Wild,"NPU-VUB Joint AVSP Lab, Department ETRO, Vrije Universiteit Brussel (VUB), Brussels, Belgium" +04e2b2eab1966ffb0079685baf169d4d7ad4ec8c,Detecting Sex From Handwritten Examples,"Department of Computer Science, Independent University Bangladesh, Dhaka, Bangladesh" diff --git a/scraper/s2-geocode-spreadsheet.py b/scraper/s2-geocode-spreadsheet.py index 66607562..b21a8453 100644 --- a/scraper/s2-geocode-spreadsheet.py +++ b/scraper/s2-geocode-spreadsheet.py @@ -7,77 +7,115 @@ from dotenv import load_dotenv from util import * load_dotenv() -@click.command() -def s2_geocode_spreadsheet(): - geolocator = geocoders.GoogleV3(os.getenv('MAPS_API_KEY')) +geolocator = geocoders.GoogleV3(os.getenv('MAPS_API_KEY')) + +print('loading institutions...') +worksheet = fetch_worksheet("institutions") +keys, rows = fetch_google_sheet("institutions") - worksheet = fetch_worksheet() - keys, rows = fetch_google_sheet() - valid_count = 0 - invalid_count = 0 +print("got {} rows".format(len(rows))) - print("got {} rows".format(len(rows))) +countries = load_countries() +print('loaded countries...') - cname_lookup = {} +cname_lookup = {} + +@click.command() +def s2_geocode_spreadsheet(): + # row_tuples = [] for i, row in enumerate(rows): - if len(row) == 6: - cname, name, address, lat, lng, org_type = row - elif len(row) == 7: - cname, name, address, lat, lng, org_type, extra_address = row - else: - print("Weirdly formatted row {}".format(i)) - continue + # row_tuples.append((i, row,)) + cname, name, address, lat, lng, org_type, extra_address, country = row if cname == name or cname not in cname_lookup: cname_lookup[cname] = i - - # 0 cname 1 name 2 address 3 lat 4 lng 5 org_type + print("built lookup") + print("processing sheet...") for i, row in enumerate(rows): - if len(row) == 6: - cname, name, address, lat, lng, org_type = row - elif len(row) == 7: - cname, name, address, lat, lng, org_type, extra_address = row - else: - print("Weirdly formatted row {}: {} entries".format(i, len(row))) - continue - if lat and lng: - continue - c_row = rows[cname_lookup[cname]] - if c_row[3] and c_row[4]: - print("name {}, found cname: {}".format(name, cname)) - worksheet.update_cell(i+2, 3, c_row[2]) - worksheet.update_cell(i+2, 4, c_row[3]) - worksheet.update_cell(i+2, 5, c_row[4]) - continue - if address: - address_to_geocode = address - elif name: - address_to_geocode = name - elif cname: - address_to_geocode = cname + hit_api = s2_geocode_row(i, row) + if hit_api: + time.sleep(1) + # parallelize(s2_geocode_row, row_tuples) + +def s2_geocode_row(i, row): + # 0 cname 1 name 2 address 3 lat 4 lng 5 org_type 6 notes 7 country + cname, name, address, lat, lng, org_type, extra_address, country = row + if lat and lng: + if not country: + update_country_from_address(address, i, countries, worksheet) + return True + return False + + c_row = rows[cname_lookup[cname]] + if c_row[3] and c_row[4]: + print("name {}, found cname: {}".format(name, cname)) + worksheet.update_cell(i+2, 2+1, c_row[2]) + worksheet.update_cell(i+2, 3+1, c_row[3]) + worksheet.update_cell(i+2, 4+1, c_row[4]) + worksheet.update_cell(i+2, 7+1, c_row[7]) + return True + if address: + address_to_geocode = address + elif name: + address_to_geocode = name + elif cname: + address_to_geocode = cname + + if not address_to_geocode: + return False - if not address_to_geocode: - continue + print(address_to_geocode) + location = geolocator.geocode(address_to_geocode) + if location: + print("{} found: {}".format(i+1, name)) + print(location.raw) + worksheet.update_cell(i+2, 2+1, location.address) + worksheet.update_cell(i+2, 3+1, location.latitude) + worksheet.update_cell(i+2, 4+1, location.longitude) + if address and address != location.address: + worksheet.update_cell(i+2, 6+1, address) # store alt address in "notes" field + valid_count += 1 + country = update_country_from_address(location.address, i, countries, worksheet) + row[2] = location.address + row[3] = location.latitude + row[4] = location.longitude + row[7] = country + return True + else: + print("{} not found: {}".format(i+1, address_to_geocode)) + invalid_count += 1 + return False - print(address_to_geocode) - location = geolocator.geocode(address_to_geocode) - if location: - print("{} found: {}".format(i+1, name)) - print(location.raw) - worksheet.update_cell(i+2, 3, location.address) - worksheet.update_cell(i+2, 4, location.latitude) - worksheet.update_cell(i+2, 5, location.longitude) - if address and address != location.address: - worksheet.update_cell(i+2, 7, address) - valid_count += 1 - row[2] = location.address - row[3] = location.latitude - row[4] = location.longitude - else: - print("{} not found: {}".format(i+1, address_to_geocode)) - invalid_count += 1 - time.sleep(2) +def update_country_from_address(address, i, countries, worksheet): + address_partz = address.split(', ') + possible_country = address_partz[-1] + country = None + if possible_country in countries: + country = countries[possible_country] + elif "China" in address: + country = "China" + elif "Singapore" in address: + country = "Singapore" + elif "Taiwan" in address: + country = "Taiwan" + elif "Russia" in address: + country = "Russia" + elif "Japan" in address: + country = "Japan" + elif "Iran" in address: + country = "Iran" + elif "Egypt" in address: + country = "Egypt" + elif "پاکستان" in address: + country = "Pakistan" + elif "السعودية" in address: + country = "Saudi Arabia" + else: + print("unknown country: {}".format(possible_country)) + return "" - print("geocoded {} addresses, {} found, {} not found".format(len(rows), valid_count, invalid_count)) + worksheet.update_cell(i+2, 7+1, country) + # print(country) + return country if __name__ == '__main__': s2_geocode_spreadsheet() diff --git a/scraper/s2-geocode.py b/scraper/s2-geocode.py index 25eb6f8a..989c17bf 100644 --- a/scraper/s2-geocode.py +++ b/scraper/s2-geocode.py @@ -40,10 +40,10 @@ def s2_geocode(fn): print("found: {}".format(name)) cname = name for word in name.split(', '): - if "university" in word.lower(): + if "university" in word.lower() and 'california' not in word.lower(): cname = word worksheet.append_row([ - cname, name, location.address, location.latitude, location.longitude, 'edu' + cname, name, location.address, location.latitude, location.longitude, 'edu', '', ]) valid.append([ name, @@ -77,3 +77,4 @@ def remove_department_name(name): if __name__ == '__main__': s2_geocode() + diff --git a/scraper/util.py b/scraper/util.py index fa9f6a22..0401b342 100644 --- a/scraper/util.py +++ b/scraper/util.py @@ -331,10 +331,13 @@ class AddressBook (object): row = self.find(address) if row is not None: return { - 'address': row[0], + 'name': row[0], + 'source_name': row[1], + 'street_adddress': row[2], 'lat': row[3], 'lng': row[4], 'type': row[5], + 'country': row[7], } return None @@ -372,7 +375,7 @@ def file_path(key, paper_id, fn): return os.path.join(data_path(key, paper_id), fn) def parallelize(func, rows): - print("Fetching {} items".format(len(rows))) + print("Processing {} items".format(len(rows))) if hasattr(os, 'sched_getaffinity'): processCount = len(os.sched_getaffinity(0)) else: @@ -447,3 +450,14 @@ def fetch_google_lookup(name, item_key='key'): lookup[rec[item_key]] = rec return lookup +def load_countries(): + countries = read_json('countries.json') + lookup = {} + for country in countries: + name = country['name'] + lookup[name] = name + if 'alt' in country: + for alt_name in country['alt']: + lookup[alt_name] = name + return lookup + -- cgit v1.2.3-70-g09d2 From 881d559cb0491c532264b151ed922c401f30db96 Mon Sep 17 00:00:00 2001 From: Jules Laplace Date: Wed, 20 Feb 2019 16:19:08 +0100 Subject: avoid adding very short cnames --- scraper/s2-geocode-spreadsheet.py | 2 ++ scraper/s2-geocode.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/scraper/s2-geocode-spreadsheet.py b/scraper/s2-geocode-spreadsheet.py index b21a8453..98baf4b5 100644 --- a/scraper/s2-geocode-spreadsheet.py +++ b/scraper/s2-geocode-spreadsheet.py @@ -26,6 +26,8 @@ def s2_geocode_spreadsheet(): for i, row in enumerate(rows): # row_tuples.append((i, row,)) cname, name, address, lat, lng, org_type, extra_address, country = row + if len(cname) < 3: + print("very short cname: {}".format(cname)) if cname == name or cname not in cname_lookup: cname_lookup[cname] = i print("built lookup") diff --git a/scraper/s2-geocode.py b/scraper/s2-geocode.py index 989c17bf..705f3a17 100644 --- a/scraper/s2-geocode.py +++ b/scraper/s2-geocode.py @@ -30,7 +30,7 @@ def s2_geocode(fn): for i, row in enumerate(rows): name = row[2] name = remove_department_name(name) - if not name: + if not name or len(name) < 2: continue try: location = geolocator.geocode(name) -- cgit v1.2.3-70-g09d2 From 9b97ddf7e1bc1febc4066cd5e083cee688d77027 Mon Sep 17 00:00:00 2001 From: Jules Laplace Date: Wed, 20 Feb 2019 16:20:20 +0100 Subject: also avoid adding countries --- scraper/countries.json | 2 +- scraper/s2-geocode.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/scraper/countries.json b/scraper/countries.json index d3dd213d..87b3c997 100644 --- a/scraper/countries.json +++ b/scraper/countries.json @@ -229,7 +229,7 @@ {"name": "Ukraine", "code": "UA"}, {"name": "United Arab Emirates", "code": "AE", "alt": ["Abu Dhabi - United Arab Emirates"]}, {"name": "United Kingdom", "code": "GB", "alt": ["UK"]}, -{"name": "United States", "code": "US", "alt": ["USA"]}, +{"name": "United States", "code": "US", "alt": ["USA", "United States of America"]}, {"name": "United States Minor Outlying Islands", "code": "UM"}, {"name": "Uruguay", "code": "UY"}, {"name": "Uzbekistan", "code": "UZ"}, diff --git a/scraper/s2-geocode.py b/scraper/s2-geocode.py index 705f3a17..1fcc690d 100644 --- a/scraper/s2-geocode.py +++ b/scraper/s2-geocode.py @@ -18,6 +18,7 @@ def s2_geocode(fn): # geolocator = geocoders.Nominatim(user_agent="cool geocoding service") geolocator = geocoders.GoogleV3(os.getenv('MAPS_API_KEY')) worksheet = fetch_worksheet('institutions') + countries = load_countries() # print(fn) @@ -31,6 +32,8 @@ def s2_geocode(fn): name = row[2] name = remove_department_name(name) if not name or len(name) < 2: + if cname in countries: + print("cname is a country: {}".format(cname)) continue try: location = geolocator.geocode(name) -- cgit v1.2.3-70-g09d2