{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Knowledge Graph MS Celeb" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import os.path as osp\n", "from os.path import join\n", "from glob import glob\n", "import random\n", "import math\n", "import time\n", "from datetime import datetime\n", "\n", "import requests\n", "\n", "import json\n", "import urllib\n", "from multiprocessing.pool import ThreadPool\n", "import threading\n", "from urllib.request import urlopen\n", "import urllib.request\n", "\n", "import pandas as pd\n", "from scipy.io import loadmat\n", "import numpy as np\n", "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "\n", "from tqdm import tqdm_notebook as tqdm\n", "%reload_ext autoreload\n", "%autoreload 2\n", "import sys\n", "sys.path.append('/work/megapixels_dev/megapixels/')\n", "from app.utils import file_utils, im_utils" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load Metadata" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "dir_msceleb = '/data_store_hdd/datasets/people/msceleb/media/original/'\n", "kg_ids_msceleb = [x.replace('m.', '/m/') for x in os.listdir(dir_msceleb)]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "api_key = open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key').read()\n", "url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "def get_kg_meta(obj, url):\n", " \n", "def get_kg_from_name(obj):\n", " \n", "def get_kg_from_kg_id(obj):\n", " # TODO detect 503 service unavailable\n", " if obj['accessed']:\n", " return obj\n", " global api_key, url_kg_api\n", " kg_id = obj['kg_id']\n", " params = {\n", " 'query': q,\n", " 'limit': 5,\n", " 'indent': True,\n", " 'key': api_key,\n", " }\n", " \n", " params = {\n", " 'ids': kg_id,\n", " 'limit': 1,\n", " 'indent': True,\n", " 'key': api_key,\n", " }\n", " url = f'{url_kg_api}?{urllib.parse.urlencode(params)}'\n", " result = {'kg_id': kg_id, 'score': 0.0, 'description': '', 'url':'', 'accessed': False} # default\n", " try:\n", " json_response = urllib.request.urlopen(url).read()\n", " except Exception as e:\n", " result['error'] = str(e)\n", " else:\n", " try:\n", " response = json.loads(json_response)\n", " items = response.get('itemListElement', [])\n", " result['accessed'] = True\n", " if items:\n", " item = items[0]\n", " item_result = item.get('result', [])\n", " result['description'] = item_result.get('description', '')\n", " det_desc = item_result.get('detailedDescription', '')\n", " if det_desc:\n", " result['description_extended'] = det_desc.get('articleBody','')\n", " result['description_license'] = det_desc.get('license','')\n", " result['description_url'] = det_desc.get('url','')\n", " else:\n", " result['description_extended'] = ''\n", " result['description_license'] = ''\n", " result['description_url'] = ''\n", " result_img = item_result.get('image', '')\n", " if result_img:\n", " result['image_url'] = result_img.get('contentUrl', '')\n", " result['name'] = item_result.get('name', '')\n", " result['score'] = item.get('resultScore', 0.0)\n", " result['url'] = item_result.get('url', '')\n", " except Exception as e:\n", " result['error'] = str(e)\n", " return result" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "unmapped_persons = [{'kg_id': x} for x in kg_ids_msceleb]\n", "opt_threads = 10\n", "pbar = tqdm(total=len(unmapped_persons))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# define thread mapping function\n", "def pool_map_persons(obj):\n", " global pbar\n", " pbar.update(1)\n", " kg_obj = get_kg_from_kg_obj(obj)\n", " return kg_obj" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "#mapped_persons_bkup = mapped_persons.copy()" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "886ce68bd7484d2fa4ab2da0beec5359", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=93418), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# convert to thread pool\n", "#mapped_persons = []\n", "pool = ThreadPool(opt_threads)\n", "\n", "# start threading\n", "with tqdm(total=len(unmapped_persons)) as pbar:\n", " mapped_persons = pool.map(pool_map_persons, mapped_persons)\n", "\n", "# close tqdm\n", "pbar.close()" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "93418" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(mapped_persons)" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'kg_id': '/m/0dlnwb0', 'score': 14.806737, 'description': 'American internet celebrity', 'url': '', 'accessed': True, 'description_extended': 'Keenan Cahill is an American Internet celebrity from Chicago, Illinois who lip-syncs to popular songs on YouTube.\\nCahill launched his first famous lipsynced YouTube video on August 28, 2010 on the Katy Perry song Teenage Dream. ', 'description_license': 'CC BY-SA 3.0', 'description_url': 'https://en.wikipedia.org/wiki/Keenan_Cahill', 'name': 'Keenan Cahill'}, {'kg_id': '/m/047rtd1', 'score': 12.298853, 'description': 'Canadian film actor', 'url': '', 'accessed': True, 'description_extended': '', 'description_license': '', 'description_url': '', 'name': 'Nicholas Elia'}, {'kg_id': '/m/04j9rz9', 'score': 11.539564, 'description': 'Investor', 'url': '', 'accessed': True, 'description_extended': 'Nick Leslau is an English commercial property investor, with an estimated fortune in the Sunday Times Rich List of £350 million. Leslau is Chairman and Chief Executive of Prestbury Investment Holdings Limited and Chairman of Prestbury Investments LLP. ', 'description_license': 'CC BY-SA 3.0', 'description_url': 'https://en.wikipedia.org/wiki/Nick_Leslau', 'name': 'Nick Leslau'}]" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mapped_persons[93415:]" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "5\n" ] } ], "source": [ "# reduce CC attribution string\n", "cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'\n", "cc_short = 'CC BY-SA 3.0'\n", "nchanged = 0\n", "for mapped_person in mapped_persons:\n", " license = mapped_person.get('description_license',None)\n", " if license == cc_long:\n", " nchanged += 1\n", " mapped_person['description_license'] = cc_short\n", "print(nchanged)" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] } ], "source": [ "# find number not accessed\n", "n_empty = 0\n", "for mapped_person in mapped_persons:\n", " if not mapped_person.get('accessed', False):\n", " n_empty += 1\n", " print(mapped_person['kg_id'])\n", "print(n_empty)" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [], "source": [ "# create dataframe\n", "df_mapped_persons = pd.DataFrame.from_dict(mapped_persons)\n", "df_mapped_persons.index.name = 'index'\n", "fp_mapped_persons = '/data_store_hdd/datasets/people/msceleb/metadata/identity_kg.csv'\n", "df_mapped_persons.to_csv(fp_mapped_persons, encoding = 'utf-16')" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "df_mapped_persons.head()" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [], "source": [ "# create small version\n", "limit = 1000\n", "fp_mapped_persons_sm = f'/data_store_hdd/datasets/people/msceleb/metadata/identity_kg_0_{limit}.csv'\n", "df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons[0:limit])\n", "df_mapped_persons_sm.index.name = 'index'\n", "df_mapped_persons_sm.to_csv(fp_mapped_persons_sm, encoding = 'utf-16')" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'kg_id': '/m/03c2nqz', 'score': 14.279573, 'description': 'Brazilian soccer player', 'url': '', 'accessed': True, 'description_extended': 'Cleiton Ribeiro Xavier is a Brazilian professional footballer who plays as an attacking midfielder for Vitória. He is known by his powerful and accurate free kicks, dribbling skills and passes.', 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License', 'description_url': 'https://en.wikipedia.org/wiki/Cleiton_Xavier', 'image_url': 'http://t3.gstatic.com/images?q=tbn:ANd9GcSPzkNDBjtWX3f_oov7vOTlTxBNFrfIqEaIwJR26AsLfsBbP8H9', 'name': 'Cleiton Xavier'}\n" ] } ], "source": [ "#a = get_kg_from_kg_obj({'kg_id': '/m/03c2nqz', 'accessed': False})\n", "#print(a)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:megapixels]", "language": "python", "name": "conda-env-megapixels-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 2 }