{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Find Knowledge Graph Names" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import os.path as osp\n", "from os.path import join\n", "from glob import glob\n", "import random\n", "import math\n", "import time\n", "from datetime import datetime\n", "\n", "import requests\n", "\n", "import json\n", "import urllib\n", "from multiprocessing.pool import ThreadPool\n", "import threading\n", "from urllib.request import urlopen\n", "import urllib.request\n", "\n", "\n", "import cv2 as cv\n", "import pandas as pd\n", "from scipy.io import loadmat\n", "import numpy as np\n", "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "\n", "from tqdm import tqdm_notebook as tqdm\n", "%reload_ext autoreload\n", "%autoreload 2\n", "import sys\n", "sys.path.append('/work/megapixels_dev/megapixels/')\n", "from app.utils import file_utils, im_utils" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load Metadata" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "dir_msceleb = '/data_store_hdd/datasets/people/msceleb/media/original/'\n", "kg_ids_msceleb = [x.replace('m.', '/m/') for x in os.listdir(dir_msceleb)]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "api_key = open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key').read()\n", "url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def get_kg_from_kg_obj(obj):\n", " # TODO detect 503 service unavailable\n", " timeout_error_msg = b'HTTP Error 503: Service Unavailable'\n", " url_error_msg = b'HTTP Error 400: Bad Request'\n", " global api_key, url_kg_api\n", " kg_id = obj['kg_id']\n", " params = {\n", " 'ids': kg_id,\n", " 'limit': 1,\n", " 'indent': True,\n", " 'key': api_key,\n", " }\n", " url = f'{url_kg_api}?{urllib.parse.urlencode(params)}'\n", " result = {'kg_id': kg_id, 'score': 0.0, 'description': '', 'url':'', 'accessed': False} # default\n", " try:\n", " json_response = urllib.request.urlopen(url).read()\n", " except Exception as e:\n", " result['error'] = str(e)\n", " else:\n", " try:\n", " response = json.loads(json_response)\n", " items = response.get('itemListElement', [])\n", " result['accessed'] = True\n", " if items:\n", " item = items[0]\n", " item_result = item.get('result', [])\n", " result['description'] = item_result.get('description', '')\n", " det_desc = item_result.get('detailedDescription', '')\n", " if det_desc:\n", " result['description_extended'] = det_desc.get('articleBody','')\n", " result['description_license'] = det_desc.get('license','')\n", " result['description_url'] = det_desc.get('url','')\n", " else:\n", " result['description_extended'] = ''\n", " result['description_license'] = ''\n", " result['description_url'] = ''\n", " result_img = item_result.get('image', '')\n", " if result_img:\n", " result['image_url'] = result_img.get('contentUrl', '')\n", " result['name'] = item_result.get('name', '')\n", " result['score'] = item.get('resultScore', 0.0)\n", " result['url'] = item_result.get('url', '')\n", " except Exception as e:\n", " result['error'] = str(e)\n", " return result" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "unmapped_persons = [{'kg_id': x} for x in kg_ids_msceleb]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2d0733764379489aa82ed20f20edbb9b", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=93418), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f38b47614c5b4894b7e026b6a46a5057", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=93418), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "opt_threads = 10\n", "pbar = tqdm(total=len(unmapped_persons))\n", "\n", "# define thread mapping function\n", "def pool_map_persons(obj):\n", " global pbar\n", " pbar.update(1)\n", " kg_obj = get_kg_from_kg_obj(obj)\n", " return kg_obj\n", "\n", "# convert to thread pool\n", "mapped_persons = []\n", "pool = ThreadPool(opt_threads)\n", "\n", "# start threading\n", "with tqdm(total=len(unmapped_persons)) as pbar:\n", " mapped_persons = pool.map(pool_map_persons, unmapped_persons)\n", "\n", "# close tqdm\n", "pbar.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "{'@id': 'kg:/m/01008l96',\n", " 'name': 'Mohamed Guessous',\n", " '@type': ['Thing', 'Person'],\n", " 'description': 'Moroccan sociologist',\n", " 'image': {'contentUrl': 'http://t2.gstatic.com/images?q=tbn:ANd9GcTAHGBU-4ZzSqcMbDPnSHZA10u0L9Hnppdvt_AnFdQzOYnS0aHM',\n", " 'url': 'https://en.wikipedia.org/wiki/Mohamed_Guessous'},\n", " 'detailedDescription': {'articleBody': 'Mohamed Guessous was a Moroccan sociologist. He was also an active politician in the Socialist Union of Popular Forces.',\n", " 'url': 'https://en.wikipedia.org/wiki/Mohamed_Guessous',\n", " 'license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'},\n", " 'score': 11.046742}\n", " ```" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:megapixels]", "language": "python", "name": "conda-env-megapixels-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 2 }