summaryrefslogtreecommitdiff
path: root/cli/app/utils/api_utils.py
diff options
context:
space:
mode:
authorJules Laplace <julescarbon@gmail.com>2019-12-01 11:07:59 +0100
committerJules Laplace <julescarbon@gmail.com>2019-12-01 11:07:59 +0100
commitd9c3928e542faabaf8a9cb3d235029939cb65f09 (patch)
tree7f5093e8be5c3473d056c5da4c40947a21ca2b7e /cli/app/utils/api_utils.py
utilz
Diffstat (limited to 'cli/app/utils/api_utils.py')
-rw-r--r--cli/app/utils/api_utils.py170
1 files changed, 170 insertions, 0 deletions
diff --git a/cli/app/utils/api_utils.py b/cli/app/utils/api_utils.py
new file mode 100644
index 0000000..a4dad50
--- /dev/null
+++ b/cli/app/utils/api_utils.py
@@ -0,0 +1,170 @@
+import json
+import urllib
+import urllib.request
+
+from app.settings import app_cfg
+from app.utils import logger_utils
+
+
class WikipediaAPI:
    """Small client for the English Wikipedia ``prefixsearch`` query API.

    Every lookup returns a flat dict with the keys ``wp_name``,
    ``wp_page_id``, ``wp_description`` and ``wp_accessed``; ``wp_error`` is
    added when the request or the response handling failed.
    """

    url_base = 'https://en.wikipedia.org/w/api.php'
    log = logger_utils.Logger.getLogger()
    # Example of the request built by _url_builder:
    # https://en.wikipedia.org/w/api.php?redirects=&
    # ppprop=displaytitle&prop=pageprops|pageimages|description&generator=prefixsearch
    # &action=query&format=json&piprop=thumbnail&pithumbsize=160&pilimit=6&gpssearch=Vicente+Fox&gpsnamespace=0&gpslimit=6

    def _url_builder(self, q):
        """Return the API URL that prefix-searches Wikipedia for ``q``.

        Both ``pilimit`` and ``gpslimit`` are set to 1, so at most one page
        is requested.
        """
        # https://www.mediawiki.org/wiki/API%3aProperties#Info%3a_Parameters
        params = {
            'redirects': '',
            'ppprop': 'displaytitle',
            'prop': 'pageprops|pageimages|description',
            'generator': 'prefixsearch',
            'action': 'query',
            'format': 'json',
            'piprop': 'thumbnail',
            'pilimit': 1,
            'gpssearch': q,
            'gpsnamespace': 0,
            'gpslimit': 1,
        }
        return f'{self.url_base}?{urllib.parse.urlencode(params)}'

    def _api_search(self, url):
        """Fetch ``url`` and flatten the first returned page into a dict.

        Never raises: any failure (network, JSON decoding, unexpected
        response shape) is reported through ``wp_error`` (as a string) with
        ``wp_accessed`` set to False. A successful request that matched no
        page returns the empty fields with ``wp_accessed`` set to True.
        """
        # start from an empty result so the keys exist even without a match
        obj = {
            'wp_description': '',
            'wp_page_id': '',
            'wp_name': '',
        }
        try:
            # the context manager closes the HTTP response even on failure
            with urllib.request.urlopen(url) as http_response:
                response = json.loads(http_response.read())
            obj['wp_accessed'] = True
            query = response.get('query', None)
            pages = query.get('pages', {}) if query else {}
            if pages:
                # only one page was requested; take the first (only) key
                page_id = next(iter(pages))
                if int(page_id) != -1:
                    page = pages[page_id]
                    # populate with successful result
                    obj['wp_name'] = page['title']
                    obj['wp_page_id'] = page_id
                    # description is not always available
                    obj['wp_description'] = page.get('description', '')
        except Exception as e:
            # Deliberately broad: a lookup is best-effort and must not abort
            # the caller. The message is stored as text so the result stays
            # serializable, matching how GoogleKnowledgeGraph records kg_error.
            obj['wp_error'] = str(e)
            obj['wp_accessed'] = False
        return obj

    def get_meta(self, query_obj, verbose=False):
        """Look up ``query_obj['query']`` unless it was already looked up.

        If ``query_obj`` has a truthy ``wp_accessed`` it is returned
        unchanged. Otherwise a new result dict from ``_api_search`` is
        returned; ``query_obj`` itself is not modified. With ``verbose`` the
        request URL is logged at debug level and printed.
        """
        if query_obj.get('wp_accessed', False):
            return query_obj
        url = self._url_builder(query_obj['query'])
        if verbose:
            self.log.debug(f'querying: {url}')
            print(url)
        return self._api_search(url)

    def search(self, q):
        """Search the Wikipedia API for the query string ``q``."""
        return self._api_search(self._url_builder(q))
+
+
class GoogleKnowledgeGraph:
    """Small client for the Google Knowledge Graph Search API.

    Lookups write their findings into the dict they are given, using
    ``kg_``-prefixed keys, and return that same dict.
    """

    url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'
    log = logger_utils.Logger.getLogger()
    fp_api_key = app_cfg.FP_KNOWLEDGE_GRAPH_ENV

    def __init__(self, api_key=None):
        """Store the API key, reading it from ``fp_api_key`` if not given."""
        if api_key is not None:
            self._api_key = api_key
        else:
            # Close the file deterministically and drop surrounding
            # whitespace: a trailing newline would otherwise be URL-encoded
            # into the ``key`` request parameter.
            with open(self.fp_api_key) as fp:
                self._api_key = fp.read().strip()

    def _get_kg_meta(self, result_obj, params):
        """Run one search and write the first hit into ``result_obj``.

        ``params`` must already contain the search criterion (``query`` or
        ``ids``); the fixed request options are added to it in place.

        On a request or JSON-decoding failure, ``kg_error`` holds the message
        and ``kg_accessed`` is False. Otherwise ``kg_accessed`` is True,
        ``kg_error`` is '' and the ``kg_*`` fields hold the first result, or
        empty values when the search returned nothing.

        Returns ``result_obj`` (the same object, mutated).
        """
        params['indent'] = True  # JSON indent
        params['key'] = self._api_key
        params['limit'] = 1
        # Restricts returned entities to those of the specified types.
        # For example, you can specify `Person` (as defined in
        # http://schema.org/Person) to restrict the results to entities
        # representing people. If multiple types are specified, returned
        # entities will contain one or more of these types.
        params['types'] = 'Person'
        # Enables prefix (initial substring) match against names and aliases
        # of entities. For example, a prefix `Jung` will match entities and
        # aliases such as `Jung`, `Jungle`, and `Jung-ho Kang`.
        params['prefix'] = False

        url = f'{self.url_kg_api}?{urllib.parse.urlencode(params)}'
        try:
            # the context manager closes the HTTP response; decoding is kept
            # inside the try so a malformed body is recorded, not raised
            with urllib.request.urlopen(url) as http_response:
                response = json.loads(http_response.read())
        except Exception as e:
            result_obj['kg_error'] = str(e)
            result_obj['kg_accessed'] = False
            return result_obj

        # Values for "search was valid but no results"; also clears any
        # kg_error left over from an earlier failed attempt on this object.
        result_obj.update({
            'kg_url': '',
            'kg_description': '',
            'kg_id': '',
            'kg_name': '',
            'kg_score': 0,
            'kg_bio': '',
            'kg_bio_url': '',
            'kg_image_url': '',
            'kg_error': '',
        })

        items = response.get('itemListElement', [])
        if items:
            item = items[0]
            # `or {}` guards against a missing or null sub-object so the
            # chained .get() calls below cannot fail
            item_result = item.get('result') or {}
            det_desc = item_result.get('detailedDescription') or {}
            result_img = item_result.get('image') or {}
            # The entity's `url` is a field of `result`; the wrapper element
            # is still consulted as a fallback for backward compatibility.
            result_obj['kg_url'] = (
                item_result.get('url', '') or item.get('url', ''))
            result_obj['kg_description'] = item_result.get('description', '')
            result_obj['kg_id'] = item_result.get('@id', '').replace('kg:', '')
            result_obj['kg_name'] = item_result.get('name', '')
            result_obj['kg_score'] = item.get('resultScore', 0.0)
            result_obj['kg_bio'] = det_desc.get('articleBody', '')
            result_obj['kg_bio_url'] = det_desc.get('url', '')
            result_obj['kg_image_url'] = result_img.get('contentUrl', '')

        result_obj['kg_accessed'] = True
        return result_obj

    def get_kg_from_name(self, obj):
        """Search by ``obj['query']`` unless ``obj`` was already looked up."""
        if obj.get('kg_accessed', False):
            return obj
        params = {'query': obj['query']}
        return self._get_kg_meta(obj, params)

    def get_kg_from_kg_id(self, obj):
        """Search by ``obj['kg_id']`` unless ``obj`` was already looked up.

        Raises:
            KeyError: if ``obj`` carries no Knowledge Graph id.
        """
        if obj.get('kg_accessed', False):
            return obj
        # The id is stored under 'kg_id' (see _get_kg_meta). The misspelled
        # 'kg_ig' key that this method used to read is still honoured so
        # existing callers keep working.
        kg_id = obj.get('kg_id') or obj.get('kg_ig')
        if not kg_id:
            raise KeyError('kg_id')
        params = {'ids': kg_id}
        return self._get_kg_meta(obj, params)