import json
import urllib
import urllib.parse
import urllib.request

from app.settings import app_cfg
from app.utils import logger_utils


class WikipediaAPI:
    """Small client for the English Wikipedia MediaWiki `prefixsearch` query API."""

    url_base = 'https://en.wikipedia.org/w/api.php'
    log = logger_utils.Logger.getLogger()

    # Example request:
    # https://en.wikipedia.org/w/api.php?redirects=&
    #   ppprop=displaytitle&prop=pageprops|pageimages|description&generator=prefixsearch
    #   &action=query&format=json&piprop=thumbnail&pithumbsize=160&pilimit=6
    #   &gpssearch=Vicente+Fox&gpsnamespace=0&gpslimit=6
    def _url_builder(self, q):
        """Return the full query URL that prefix-searches Wikipedia for `q`.

        Only the single best match is requested (`gpslimit=1`, `pilimit=1`).
        """
        # https://www.mediawiki.org/wiki/API%3aProperties#Info%3a_Parameters
        params = {
            'redirects': '',
            'ppprop': 'displaytitle',
            'prop': 'pageprops|pageimages|description',
            'generator': 'prefixsearch',
            'action': 'query',
            'format': 'json',
            'piprop': 'thumbnail',
            # 'pithumbsize': 160,
            'pilimit': 1,
            'gpssearch': q,
            'gpsnamespace': 0,
            'gpslimit': 1,
        }
        return f'{self.url_base}?{urllib.parse.urlencode(params)}'

    def _api_search(self, url):
        """Fetch `url` and extract the first page hit into a flat dict.

        Returns a dict that always has `wp_description`, `wp_page_id`,
        `wp_name` (empty strings when nothing matched) and `wp_accessed`.
        This method never raises: on any failure `wp_accessed` is False and
        `wp_error` holds the error message as a string.
        """
        result = {
            'wp_description': '',
            'wp_page_id': '',
            'wp_name': '',
        }
        try:
            # Context manager closes the HTTP response even if parsing fails.
            with urllib.request.urlopen(url) as http_response:
                response = json.loads(http_response.read())
            result['wp_accessed'] = True

            query = response.get('query') or {}
            pages = query.get('pages') or {}
            if pages:
                # `pages` is keyed by page id; only one result was requested.
                page_id = next(iter(pages))
                if int(page_id) != -1:
                    page = pages[page_id]
                    result['wp_name'] = page['title']
                    result['wp_page_id'] = page_id
                    # The description is not always available.
                    result['wp_description'] = page.get('description', '')
        except Exception as e:
            # Best-effort lookup: record the failure instead of raising.
            # Stored as text so the result dict stays serialisable.
            result['wp_error'] = str(e)
            result['wp_accessed'] = False
        return result

    def get_meta(self, query_obj, verbose=False):
        """Look up `query_obj['query']` on Wikipedia unless already done.

        If `query_obj` has a truthy `wp_accessed`, it is returned unchanged.
        Otherwise a new result dict from `_api_search` is returned. With
        `verbose` set, the request URL is logged at debug level and printed.
        """
        if query_obj.get('wp_accessed', False):
            return query_obj
        url = self._url_builder(query_obj['query'])
        if verbose:
            self.log.debug(f'querying: {url}')
            print(url)
        return self._api_search(url)

    def search(self, q):
        """Search the Wikipedia API for the query string `q`."""
        return self._api_search(self._url_builder(q))


class GoogleKnowledgeGraph:
    """Small client for the Google Knowledge Graph Search API (`entities:search`)."""

    url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'
    log = logger_utils.Logger.getLogger()
    fp_api_key = app_cfg.FP_KNOWLEDGE_GRAPH_ENV

    def __init__(self, api_key=None):
        """Store the API key, reading it from `fp_api_key` when none is given."""
        if api_key is not None:
            self._api_key = api_key
        else:
            # Close the file promptly and drop the trailing newline, which
            # would otherwise be URL-encoded into the `key` parameter.
            with open(self.fp_api_key) as fp:
                self._api_key = fp.read().strip()

    def _get_kg_meta(self, result_obj, params):
        """Query the Knowledge Graph and write `kg_*` fields into `result_obj`.

        `params` supplies the search criterion (`query` or `ids`); it is
        copied, not modified. `result_obj` is updated in place and returned.
        On a request or parse failure only `kg_error` (message string) and
        `kg_accessed=False` are set. Otherwise `kg_accessed` is True and all
        `kg_*` fields are filled, with empty values when nothing matched.
        """
        request_params = dict(params)
        request_params['indent'] = True  # JSON indent
        request_params['key'] = self._api_key
        request_params['limit'] = 1
        # Restricts returned entities to those of the specified types. For
        # example, `Person` (as defined in http://schema.org/Person) restricts
        # the results to entities representing people. If multiple types are
        # specified, returned entities contain one or more of these types.
        request_params['types'] = 'Person'
        # Enables prefix (initial substring) match against names and aliases
        # of entities. For example, a prefix `Jung` will match entities and
        # aliases such as `Jung`, `Jungle`, and `Jung-ho Kang`.
        request_params['prefix'] = False
        url = f'{self.url_kg_api}?{urllib.parse.urlencode(request_params)}'

        try:
            with urllib.request.urlopen(url) as http_response:
                response = json.loads(http_response.read())
        except Exception as e:
            # Best-effort lookup: record the failure instead of raising.
            result_obj['kg_error'] = str(e)
            result_obj['kg_accessed'] = False
            return result_obj

        items = response.get('itemListElement', [])
        if items:
            item = items[0]
            entity = item.get('result') or {}
            detailed = entity.get('detailedDescription') or {}
            image = entity.get('image') or {}
            # The API returns the entity URL inside `result`; fall back to
            # the list element for compatibility with the previous lookup.
            result_obj['kg_url'] = entity.get('url', item.get('url', ''))
            result_obj['kg_description'] = entity.get('description', '')
            result_obj['kg_id'] = entity.get('@id', '').replace('kg:', '')
            result_obj['kg_name'] = entity.get('name', '')
            result_obj['kg_score'] = item.get('resultScore', 0.0)
            result_obj['kg_bio'] = detailed.get('articleBody', '')
            result_obj['kg_bio_url'] = detailed.get('url', '')
            result_obj['kg_image_url'] = image.get('contentUrl', '')
        else:
            # The search was valid but returned no results.
            result_obj['kg_url'] = ''
            result_obj['kg_description'] = ''
            result_obj['kg_id'] = ''
            result_obj['kg_name'] = ''
            result_obj['kg_score'] = 0
            result_obj['kg_bio'] = ''
            result_obj['kg_bio_url'] = ''
            result_obj['kg_image_url'] = ''
        result_obj['kg_error'] = ''
        result_obj['kg_accessed'] = True
        return result_obj

    def get_kg_from_name(self, obj):
        """Fill `obj` with Knowledge Graph data found by `obj['query']`.

        Returns `obj` unchanged when it already has a truthy `kg_accessed`.
        """
        if obj.get('kg_accessed', False):
            return obj
        return self._get_kg_meta(obj, {'query': obj['query']})

    def get_kg_from_kg_id(self, obj):
        """Fill `obj` with Knowledge Graph data found by its entity id.

        The id is read from `obj['kg_id']`, the key `_get_kg_meta` writes.
        The misspelled legacy key `kg_ig` is still accepted as a fallback.
        Returns `obj` unchanged when it already has a truthy `kg_accessed`.
        """
        if obj.get('kg_accessed', False):
            return obj
        kg_id = obj['kg_id'] if 'kg_id' in obj else obj['kg_ig']
        return self._get_kg_meta(obj, {'ids': kg_id})