diff options
Diffstat (limited to 'megapixels/notebooks/datasets/names_kg.ipynb')
| -rw-r--r-- | megapixels/notebooks/datasets/names_kg.ipynb | 243 |
1 files changed, 243 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/names_kg.ipynb b/megapixels/notebooks/datasets/names_kg.ipynb new file mode 100644 index 00000000..ab4edc4b --- /dev/null +++ b/megapixels/notebooks/datasets/names_kg.ipynb @@ -0,0 +1,243 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Find Knowledge Graph Names" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import os.path as osp\n", + "from os.path import join\n", + "from glob import glob\n", + "import random\n", + "import math\n", + "import time\n", + "from datetime import datetime\n", + "\n", + "import requests\n", + "\n", + "import json\n", + "import urllib\n", + "from multiprocessing.pool import ThreadPool\n", + "import threading\n", + "from urllib.request import urlopen\n", + "import urllib.request\n", + "\n", + "\n", + "import cv2 as cv\n", + "import pandas as pd\n", + "from scipy.io import loadmat\n", + "import numpy as np\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from tqdm import tqdm_notebook as tqdm\n", + "%reload_ext autoreload\n", + "%autoreload 2\n", + "import sys\n", + "sys.path.append('/work/megapixels_dev/megapixels/')\n", + "from app.utils import file_utils, im_utils" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "dir_msceleb = '/data_store_hdd/datasets/people/msceleb/media/original/'\n", + "kg_ids_msceleb = [x.replace('m.', '/m/') for x in os.listdir(dir_msceleb)]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "api_key = open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key').read()\n", + "url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + 
"metadata": {}, + "outputs": [], + "source": [ + "def get_kg_from_kg_obj(obj):\n", + " # TODO detect 503 service unavailable\n", + " timeout_error_msg = b'HTTP Error 503: Service Unavailable'\n", + " url_error_msg = b'HTTP Error 400: Bad Request'\n", + " global api_key, url_kg_api\n", + " kg_id = obj['kg_id']\n", + " params = {\n", + " 'ids': kg_id,\n", + " 'limit': 1,\n", + " 'indent': True,\n", + " 'key': api_key,\n", + " }\n", + " url = f'{url_kg_api}?{urllib.parse.urlencode(params)}'\n", + " result = {'kg_id': kg_id, 'score': 0.0, 'description': '', 'url':'', 'accessed': False} # default\n", + " try:\n", + " json_response = urllib.request.urlopen(url).read()\n", + " except Exception as e:\n", + " result['error'] = str(e)\n", + " else:\n", + " try:\n", + " response = json.loads(json_response)\n", + " items = response.get('itemListElement', [])\n", + " result['accessed'] = True\n", + " if items:\n", + " item = items[0]\n", + " item_result = item.get('result', [])\n", + " result['description'] = item_result.get('description', '')\n", + " det_desc = item_result.get('detailedDescription', '')\n", + " if det_desc:\n", + " result['description_extended'] = det_desc.get('articleBody','')\n", + " result['description_license'] = det_desc.get('license','')\n", + " result['description_url'] = det_desc.get('url','')\n", + " else:\n", + " result['description_extended'] = ''\n", + " result['description_license'] = ''\n", + " result['description_url'] = ''\n", + " result_img = item_result.get('image', '')\n", + " if result_img:\n", + " result['image_url'] = result_img.get('contentUrl', '')\n", + " result['name'] = item_result.get('name', '')\n", + " result['score'] = item.get('resultScore', 0.0)\n", + " result['url'] = item_result.get('url', '')\n", + " except Exception as e:\n", + " result['error'] = str(e)\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "unmapped_persons = [{'kg_id': x} for 
# One request object per Knowledge Graph id.
unmapped_persons = [{'kg_id': x} for x in kg_ids_msceleb]

opt_threads = 10


# define thread mapping function
def pool_map_persons(obj):
    """Resolve one ``{'kg_id': ...}`` record and tick the shared progress bar.

    Runs inside the worker threads. ``pbar`` is the module-level progress bar
    bound by the ``with`` statement below and is looked up at call time.

    Args:
        obj: dict holding the id to look up under ``'kg_id'``.

    Returns:
        The dict produced by ``get_kg_from_kg_obj(obj)``.
    """
    kg_obj = get_kg_from_kg_obj(obj)
    # Tick only after the lookup finished so the bar shows completed work.
    pbar.update(1)
    return kg_obj


# start threading
# A single progress bar (the earlier version created a second, orphaned one that
# was never closed) and a pool that is shut down when the block exits, so
# re-running the cell does not leak worker threads. pool.map() blocks until all
# lookups are done and returns the results in the order of `unmapped_persons`.
with tqdm(total=len(unmapped_persons)) as pbar, ThreadPool(opt_threads) as pool:
    mapped_persons = pool.map(pool_map_persons, unmapped_persons)
He was also an active politician in the Socialist Union of Popular Forces.',\n", + " 'url': 'https://en.wikipedia.org/wiki/Mohamed_Guessous',\n", + " 'license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'},\n", + " 'score': 11.046742}\n", + " ```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:megapixels]", + "language": "python", + "name": "conda-env-megapixels-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} |
