diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2019-12-01 11:07:59 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2019-12-01 11:07:59 +0100 |
| commit | d9c3928e542faabaf8a9cb3d235029939cb65f09 (patch) | |
| tree | 7f5093e8be5c3473d056c5da4c40947a21ca2b7e /cli/app/utils/api_utils.py | |
utilz
Diffstat (limited to 'cli/app/utils/api_utils.py')
| -rw-r--r-- | cli/app/utils/api_utils.py | 170 |
1 files changed, 170 insertions, 0 deletions
# diff --git a/cli/app/utils/api_utils.py b/cli/app/utils/api_utils.py
# new file mode 100644
# index 0000000..a4dad50
# --- /dev/null
# +++ b/cli/app/utils/api_utils.py
# @@ -0,0 +1,170 @@
'''Small clients for the Wikipedia and Google Knowledge Graph search APIs.

Both clients work on plain dicts: a lookup reads the search term from the
dict's ``query`` key and reports its outcome through ``wp_*`` / ``kg_*`` keys.
'''
import json
import urllib
import urllib.parse
import urllib.request

from app.settings import app_cfg
from app.utils import logger_utils


class WikipediaAPI:
  '''Looks up the first prefix-search hit for a name on English Wikipedia.'''

  url_base = 'https://en.wikipedia.org/w/api.php'
  log = logger_utils.Logger.getLogger()
  # Example request:
  # https://en.wikipedia.org/w/api.php?redirects=&
  # ppprop=displaytitle&prop=pageprops|pageimages|description&generator=prefixsearch
  # &action=query&format=json&piprop=thumbnail&pithumbsize=160&pilimit=6&gpssearch=Vicente+Fox&gpsnamespace=0&gpslimit=6

  def _url_builder(self, q):
    '''Returns the full API URL for a prefix search on `q`.

    The request asks for a single result (`gpslimit=1`, `pilimit=1`) in
    namespace 0, with page props, page image and description attached.
    '''
    # https://www.mediawiki.org/wiki/API%3aProperties#Info%3a_Parameters
    params = {
      'redirects': '',
      'ppprop': 'displaytitle',
      'prop': 'pageprops|pageimages|description',
      'generator': 'prefixsearch',
      'action': 'query',
      'format': 'json',
      'piprop': 'thumbnail',
      #'pithumbsize': 160,
      'pilimit': 1,
      'gpssearch': q,
      'gpsnamespace': 0,
      'gpslimit': 1
    }
    return f'{self.url_base}?{urllib.parse.urlencode(params)}'

  def _api_search(self, url):
    '''Fetches `url` and returns a dict describing the first page found.

    The returned dict always has `wp_description`, `wp_page_id`, `wp_name`
    (empty strings when nothing matched) and `wp_accessed`. On any failure
    `wp_accessed` is False and `wp_error` holds the error message; this
    method never raises.
    '''
    obj = {
      'wp_description': '',
      'wp_page_id': '',
      'wp_name': ''
    }
    try:
      # context manager so the HTTP response is always closed
      with urllib.request.urlopen(url) as resp:
        response = json.loads(resp.read())
      obj['wp_accessed'] = True
      pages = (response.get('query') or {}).get('pages') or {}
      if pages:
        # only one result is requested, so take the first (only) key
        page_id = next(iter(pages))
        if int(page_id) != -1:
          page = pages[page_id]
          # populate with successful result
          obj['wp_name'] = page['title']
          obj['wp_page_id'] = page_id
          obj['wp_description'] = page.get('description', '')  # not always available
    except Exception as e:
      # Deliberately broad: lookups are best-effort, failures are reported
      # in the result. Stored as text (not the exception object) so the
      # result stays serializable and matches `kg_error` below.
      obj['wp_error'] = str(e)
      obj['wp_accessed'] = False
    return obj

  def get_meta(self, query_obj, verbose=False):
    '''Looks up `query_obj['query']` unless the object was already resolved.

    Returns `query_obj` itself when its `wp_accessed` flag is truthy;
    otherwise returns a NEW result dict from the API (the input is not
    modified or merged). With `verbose`, the request URL is logged at debug
    level and printed.
    '''
    if query_obj.get('wp_accessed', False):
      return query_obj
    url = self._url_builder(query_obj['query'])
    if verbose:
      self.log.debug(f'querying: {url}')
      print(url)
    return self._api_search(url)

  def search(self, q):
    '''Searches Wikipedia API for query string `q` and returns the result dict.'''
    return self._api_search(self._url_builder(q))


class GoogleKnowledgeGraph:
  '''Looks up a single `Person` entity in the Google Knowledge Graph.'''

  url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'
  log = logger_utils.Logger.getLogger()
  fp_api_key = app_cfg.FP_KNOWLEDGE_GRAPH_ENV

  def __init__(self, api_key=None):
    '''Uses `api_key` if given, otherwise reads the key from `fp_api_key`.'''
    if api_key is not None:
      self._api_key = api_key
    else:
      # close the file and drop the trailing newline, which would otherwise
      # be URL-encoded into the `key` request parameter
      with open(self.fp_api_key) as fp:
        self._api_key = fp.read().strip()

  def _get_kg_meta(self, result_obj, params):
    '''Runs one entity search and writes the outcome into `result_obj`.

    `params` must carry the search selector (`query` or `ids`); the common
    request parameters are added to it in place. `result_obj` is updated
    in place and returned. On success `kg_accessed` is True and the `kg_*`
    fields are filled (empty values when nothing matched); on failure
    `kg_accessed` is False and `kg_error` holds the error message. This
    method never raises for network or decoding errors.
    '''
    params['indent'] = True  # JSON indent
    params['key'] = self._api_key
    params['limit'] = 1
    # Restricts returned entities to those of the specified types.
    # For example, you can specify `Person` (as defined in http://schema.org/Person)
    # to restrict the results to entities representing people.
    # If multiple types are specified, returned entities will contain one or more of these type
    params['types'] = 'Person'
    # Enables prefix (initial substring) match against names and
    # aliases of entities. For example, a prefix `Jung` will match entities
    # and aliases such as `Jung`, `Jungle`, and `Jung-ho Kang`.
    params['prefix'] = False

    url = f'{self.url_kg_api}?{urllib.parse.urlencode(params)}'
    try:
      with urllib.request.urlopen(url) as resp:
        response = json.loads(resp.read())
    except Exception as e:
      # Deliberately broad: best-effort lookup, failure reported in result.
      # Also covers a malformed (non-JSON) response body.
      result_obj['kg_error'] = str(e)
      result_obj['kg_accessed'] = False
      return result_obj

    items = response.get('itemListElement', [])
    if items:
      item = items[0]
      # `or {}` guards against a missing/null section so `.get` is safe
      item_result = item.get('result') or {}
      det_desc = item_result.get('detailedDescription') or {}
      result_img = item_result.get('image') or {}
      # the API nests `url` inside `result`; item-level lookup kept as fallback
      result_obj['kg_url'] = item_result.get('url', '') or item.get('url', '')
      result_obj['kg_description'] = item_result.get('description', '')
      result_obj['kg_id'] = item_result.get('@id', '').replace('kg:', '')
      result_obj['kg_name'] = item_result.get('name', '')
      result_obj['kg_score'] = item.get('resultScore', 0.0)
      result_obj['kg_bio'] = det_desc.get('articleBody', '')
      result_obj['kg_bio_url'] = det_desc.get('url', '')
      result_obj['kg_image_url'] = result_img.get('contentUrl', '')
    else:
      # search was valid but no results
      result_obj['kg_url'] = ''
      result_obj['kg_description'] = ''
      result_obj['kg_id'] = ''
      result_obj['kg_name'] = ''
      result_obj['kg_score'] = 0
      result_obj['kg_bio'] = ''
      result_obj['kg_bio_url'] = ''
      result_obj['kg_image_url'] = ''

    # clear any error left over from an earlier failed attempt
    result_obj['kg_error'] = ''
    result_obj['kg_accessed'] = True
    return result_obj

  def get_kg_from_name(self, obj):
    '''Searches by `obj['query']`; returns `obj` unchanged if already accessed.'''
    if obj.get('kg_accessed', False):
      return obj
    params = {'query': obj['query']}
    return self._get_kg_meta(obj, params)

  def get_kg_from_kg_id(self, obj):
    '''Searches by `obj['kg_id']`; returns `obj` unchanged if already accessed.'''
    if obj.get('kg_accessed', False):
      return obj
    # `kg_id` is the key written by `_get_kg_meta` (with the `kg:` prefix removed)
    params = {'ids': obj['kg_id']}
    return self._get_kg_meta(obj, params)
