summaryrefslogtreecommitdiff
path: root/megapixels/notebooks/datasets/identity/msceleb_identity.ipynb
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/notebooks/datasets/identity/msceleb_identity.ipynb')
-rw-r--r--megapixels/notebooks/datasets/identity/msceleb_identity.ipynb378
1 files changed, 378 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/identity/msceleb_identity.ipynb b/megapixels/notebooks/datasets/identity/msceleb_identity.ipynb
new file mode 100644
index 00000000..d330badb
--- /dev/null
+++ b/megapixels/notebooks/datasets/identity/msceleb_identity.ipynb
@@ -0,0 +1,378 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Knowledge Graph MS Celeb"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import os.path as osp\n",
+ "from os.path import join\n",
+ "from glob import glob\n",
+ "import random\n",
+ "import math\n",
+ "import time\n",
+ "from datetime import datetime\n",
+ "\n",
+ "import requests\n",
+ "\n",
+ "import json\n",
+ "import urllib\n",
+ "from multiprocessing.pool import ThreadPool\n",
+ "import threading\n",
+ "from urllib.request import urlopen\n",
+ "import urllib.request\n",
+ "\n",
+ "import pandas as pd\n",
+ "from scipy.io import loadmat\n",
+ "import numpy as np\n",
+ "%matplotlib inline\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "from tqdm import tqdm_notebook as tqdm\n",
+ "%reload_ext autoreload\n",
+ "%autoreload 2\n",
+ "import sys\n",
+ "sys.path.append('/work/megapixels_dev/megapixels/')\n",
+ "from app.utils import file_utils, im_utils"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load Metadata"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dir_msceleb = '/data_store_hdd/datasets/people/msceleb/media/original/'\n",
+ "# os.listdir returns entries in arbitrary order; sort so that list positions\n",
+ "# (reused later as the CSV index and for the first-N subset) are reproducible.\n",
+ "# Entry names carry an 'm.' prefix, which is rewritten to the '/m/' form used as kg_id.\n",
+ "kg_ids_msceleb = sorted(x.replace('m.', '/m/') for x in os.listdir(dir_msceleb))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Read the Knowledge Graph API key from disk. The with-block closes the file handle,\n",
+ "# and strip() drops surrounding whitespace (e.g. a trailing newline) so that it is\n",
+ "# not url-encoded into every request.\n",
+ "with open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key') as f_api_key:\n",
+ "    api_key = f_api_key.read().strip()\n",
+ "url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_kg_meta(obj, url):\n",
+ "    '''Request `url` and flatten the first returned item into a flat record.\n",
+ "\n",
+ "    obj: dict with a 'kg_id' key; the id is copied into the returned record.\n",
+ "    url: complete request URL (endpoint plus encoded query string).\n",
+ "\n",
+ "    Returns a new dict. 'kg_id', 'score', 'description', 'url' and 'accessed' are\n",
+ "    always present. When the response contains an item, 'name',\n",
+ "    'description_extended', 'description_license' and 'description_url' are added,\n",
+ "    plus 'image_url' if the item has an image. 'accessed' is True once a response\n",
+ "    was received and decoded. Request and parsing failures do not raise; their\n",
+ "    message is stored under 'error'.\n",
+ "    '''\n",
+ "    result = {'kg_id': obj['kg_id'], 'score': 0.0, 'description': '', 'url': '', 'accessed': False}  # default\n",
+ "    try:\n",
+ "        # the with-block closes the HTTP response instead of leaking the connection\n",
+ "        with urllib.request.urlopen(url) as http_response:\n",
+ "            json_response = http_response.read()\n",
+ "    except Exception as e:\n",
+ "        # 'accessed' stays False, so the record can be retried in a later pass\n",
+ "        result['error'] = str(e)\n",
+ "        return result\n",
+ "    try:\n",
+ "        response = json.loads(json_response)\n",
+ "        items = response.get('itemListElement', [])\n",
+ "        result['accessed'] = True\n",
+ "        if items:\n",
+ "            item = items[0]\n",
+ "            # default to a dict (not a list) so the .get() calls below cannot fail\n",
+ "            item_result = item.get('result', {})\n",
+ "            result['description'] = item_result.get('description', '')\n",
+ "            det_desc = item_result.get('detailedDescription') or {}\n",
+ "            result['description_extended'] = det_desc.get('articleBody', '')\n",
+ "            result['description_license'] = det_desc.get('license', '')\n",
+ "            result['description_url'] = det_desc.get('url', '')\n",
+ "            result_img = item_result.get('image') or {}\n",
+ "            if result_img:\n",
+ "                result['image_url'] = result_img.get('contentUrl', '')\n",
+ "            result['name'] = item_result.get('name', '')\n",
+ "            result['score'] = item.get('resultScore', 0.0)\n",
+ "            result['url'] = item_result.get('url', '')\n",
+ "    except Exception as e:\n",
+ "        result['error'] = str(e)\n",
+ "    return result\n",
+ "\n",
+ "\n",
+ "def get_kg_from_name(obj):\n",
+ "    '''Placeholder for a lookup by name; it was declared without a body.'''\n",
+ "    # NOTE(review): a leftover draft in this cell built the request parameters\n",
+ "    #   {'query': q, 'limit': 5, 'indent': True, 'key': api_key}\n",
+ "    # with `q` never defined; presumably it was meant for this lookup - confirm\n",
+ "    # where the query string should come from before implementing.\n",
+ "    raise NotImplementedError('get_kg_from_name is not implemented')\n",
+ "\n",
+ "\n",
+ "def get_kg_from_kg_id(obj):\n",
+ "    '''Look up a single entity by its Knowledge Graph id.\n",
+ "\n",
+ "    obj: dict with a 'kg_id' key and an optional 'accessed' flag.\n",
+ "\n",
+ "    Returns `obj` itself when it is already flagged as accessed, which makes\n",
+ "    repeated passes over the same records cheap. Otherwise returns the new\n",
+ "    record built by get_kg_meta. Raises KeyError if 'kg_id' is missing.\n",
+ "    '''\n",
+ "    # TODO detect 503 service unavailable\n",
+ "    # a record without the flag (fresh seed) counts as not yet accessed\n",
+ "    if obj.get('accessed', False):\n",
+ "        return obj\n",
+ "    params = {\n",
+ "        'ids': obj['kg_id'],\n",
+ "        'limit': 1,\n",
+ "        'indent': True,\n",
+ "        'key': api_key,\n",
+ "    }\n",
+ "    url = f'{url_kg_api}?{urllib.parse.urlencode(params)}'\n",
+ "    return get_kg_meta(obj, url)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Seed one record per identity. 'accessed': False marks a record as not yet fetched;\n",
+ "# get_kg_from_kg_id reads this flag to decide whether a record can be skipped.\n",
+ "unmapped_persons = [{'kg_id': x, 'accessed': False} for x in kg_ids_msceleb]\n",
+ "opt_threads = 10\n",
+ "pbar = tqdm(total=len(unmapped_persons))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# define thread mapping function\n",
+ "def pool_map_persons(obj):\n",
+ "    '''Thread-pool worker: advance the progress bar and look up one record.\n",
+ "\n",
+ "    obj: dict passed straight to get_kg_from_kg_id.\n",
+ "    Returns whatever get_kg_from_kg_id returns for that record.\n",
+ "    Uses the module-level progress bar `pbar`.\n",
+ "    '''\n",
+ "    pbar.update(1)\n",
+ "    # the lookup function defined in this notebook is get_kg_from_kg_id\n",
+ "    kg_obj = get_kg_from_kg_id(obj)\n",
+ "    return kg_obj"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#mapped_persons_bkup = mapped_persons.copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "886ce68bd7484d2fa4ab2da0beec5359",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=93418), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Run the lookups on a thread pool.\n",
+ "# First run on a fresh kernel: start from unmapped_persons.\n",
+ "# Re-runs: feed the previous results back in so the same cell can be used to retry.\n",
+ "persons_todo = globals().get('mapped_persons', unmapped_persons)\n",
+ "\n",
+ "# the with-blocks close the progress bar and release the worker threads when done;\n",
+ "# the bar total matches the list that is actually mapped\n",
+ "with tqdm(total=len(persons_todo)) as pbar, ThreadPool(opt_threads) as pool:\n",
+ "    mapped_persons = pool.map(pool_map_persons, persons_todo)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "93418"
+ ]
+ },
+ "execution_count": 71,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(mapped_persons)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[{'kg_id': '/m/0dlnwb0', 'score': 14.806737, 'description': 'American internet celebrity', 'url': '', 'accessed': True, 'description_extended': 'Keenan Cahill is an American Internet celebrity from Chicago, Illinois who lip-syncs to popular songs on YouTube.\\nCahill launched his first famous lipsynced YouTube video on August 28, 2010 on the Katy Perry song Teenage Dream. ', 'description_license': 'CC BY-SA 3.0', 'description_url': 'https://en.wikipedia.org/wiki/Keenan_Cahill', 'name': 'Keenan Cahill'}, {'kg_id': '/m/047rtd1', 'score': 12.298853, 'description': 'Canadian film actor', 'url': '', 'accessed': True, 'description_extended': '', 'description_license': '', 'description_url': '', 'name': 'Nicholas Elia'}, {'kg_id': '/m/04j9rz9', 'score': 11.539564, 'description': 'Investor', 'url': '', 'accessed': True, 'description_extended': 'Nick Leslau is an English commercial property investor, with an estimated fortune in the Sunday Times Rich List of £350 million. Leslau is Chairman and Chief Executive of Prestbury Investment Holdings Limited and Chairman of Prestbury Investments LLP. ', 'description_license': 'CC BY-SA 3.0', 'description_url': 'https://en.wikipedia.org/wiki/Nick_Leslau', 'name': 'Nick Leslau'}]"
+ ]
+ },
+ "execution_count": 77,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# peek at the last three records; a negative slice works for any list length\n",
+ "mapped_persons[-3:]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 86,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "5\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Shorten the Creative Commons attribution: records whose license field holds the\n",
+ "# long Wikipedia URL get the short license name instead; count how many were changed.\n",
+ "cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'\n",
+ "cc_short = 'CC BY-SA 3.0'\n",
+ "nchanged = 0\n",
+ "for mapped_person in mapped_persons:\n",
+ "    license = mapped_person.get('description_license', None)\n",
+ "    if license != cc_long:\n",
+ "        continue  # missing or already short: leave untouched\n",
+ "    mapped_person['description_license'] = cc_short\n",
+ "    nchanged += 1\n",
+ "print(nchanged)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 87,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# List the kg_id of every record not flagged as accessed, then print how many there are.\n",
+ "kg_ids_not_accessed = []\n",
+ "for mapped_person in mapped_persons:\n",
+ "    if mapped_person.get('accessed', False):\n",
+ "        continue\n",
+ "    kg_ids_not_accessed.append(mapped_person['kg_id'])\n",
+ "    print(mapped_person['kg_id'])\n",
+ "n_empty = len(kg_ids_not_accessed)\n",
+ "print(n_empty)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 88,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Build the identity table from the list of records, name its row index 'index',\n",
+ "# and write it out as a UTF-16 encoded CSV.\n",
+ "fp_mapped_persons = '/data_store_hdd/datasets/people/msceleb/metadata/identity_kg.csv'\n",
+ "df_mapped_persons = pd.DataFrame(mapped_persons).rename_axis('index')\n",
+ "df_mapped_persons.to_csv(fp_mapped_persons, encoding='utf-16')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_mapped_persons.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 89,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Small version: only the first `limit` records, written as a UTF-16 encoded CSV\n",
+ "# whose file name carries the limit.\n",
+ "limit = 1000\n",
+ "df_mapped_persons_sm = pd.DataFrame(mapped_persons[:limit]).rename_axis('index')\n",
+ "fp_mapped_persons_sm = f'/data_store_hdd/datasets/people/msceleb/metadata/identity_kg_0_{limit}.csv'\n",
+ "df_mapped_persons_sm.to_csv(fp_mapped_persons_sm, encoding='utf-16')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'kg_id': '/m/03c2nqz', 'score': 14.279573, 'description': 'Brazilian soccer player', 'url': '', 'accessed': True, 'description_extended': 'Cleiton Ribeiro Xavier is a Brazilian professional footballer who plays as an attacking midfielder for Vitória. He is known by his powerful and accurate free kicks, dribbling skills and passes.', 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License', 'description_url': 'https://en.wikipedia.org/wiki/Cleiton_Xavier', 'image_url': 'http://t3.gstatic.com/images?q=tbn:ANd9GcSPzkNDBjtWX3f_oov7vOTlTxBNFrfIqEaIwJR26AsLfsBbP8H9', 'name': 'Cleiton Xavier'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "#a = get_kg_from_kg_obj({'kg_id': '/m/03c2nqz', 'accessed': False})\n",
+ "#print(a)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python [conda env:megapixels]",
+ "language": "python",
+ "name": "conda-env-megapixels-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}