1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
|
import json
import urllib
import urllib.request
from app.settings import app_cfg
from app.utils import file_utils, im_utils, logger_utils
class WikipediaAPI:
    """Small client for the English Wikipedia MediaWiki prefix-search API.

    Each lookup returns a flat dict with these keys:

    * ``wp_name``        -- title of the best matching page ('' if none)
    * ``wp_page_id``     -- page id as returned by the API ('' if none)
    * ``wp_description`` -- short description ('' if none / not available)
    * ``wp_accessed``    -- True when the API answered with parseable JSON,
      False when the request or the parsing failed
    * ``wp_error``       -- error message; only present when the lookup failed
    """

    url_base = 'https://en.wikipedia.org/w/api.php'
    log = logger_utils.Logger.getLogger()

    # Example of the kind of request built by `_url_builder`:
    # https://en.wikipedia.org/w/api.php?redirects=&
    # ppprop=displaytitle&prop=pageprops|pageimages|description&generator=prefixsearch
    # &action=query&format=json&piprop=thumbnail&pithumbsize=160&pilimit=6&gpssearch=Vicente+Fox&gpsnamespace=0&gpslimit=6

    def _url_builder(self, q):
        """Return the full API URL that prefix-searches Wikipedia for `q`.

        :param q: query string (e.g. a person's name); it is URL-encoded here
        :returns: (str) request URL limited to a single result
        """
        # https://www.mediawiki.org/wiki/API%3aProperties#Info%3a_Parameters
        params = {
            'redirects': '',
            'ppprop': 'displaytitle',
            'prop': 'pageprops|pageimages|description',
            'generator': 'prefixsearch',
            'action': 'query',
            'format': 'json',
            'piprop': 'thumbnail',
            # 'pithumbsize': 160,  # thumbnail size is currently left at the API default
            'pilimit': 1,
            'gpssearch': q,
            'gpsnamespace': 0,
            'gpslimit': 1,
        }
        return f'{self.url_base}?{urllib.parse.urlencode(params)}'

    def _api_search(self, url):
        """Fetch `url` and extract the first page of the result.

        Never raises: any failure (network, JSON decoding, unexpected payload)
        is reported through ``wp_accessed = False`` and ``wp_error``.

        :param url: request URL, normally produced by `_url_builder`
        :returns: (dict) result record, see the class docstring for its keys
        """
        # start from an empty record so the keys exist even when nothing matches
        obj = {
            'wp_description': '',
            'wp_page_id': '',
            'wp_name': '',
        }
        try:
            # context manager guarantees the HTTP response is closed
            with urllib.request.urlopen(url) as http_response:
                response = json.loads(http_response.read())
            obj['wp_accessed'] = True
            # 'query' is absent when nothing matched; 'pages' maps page id -> page
            pages = (response.get('query') or {}).get('pages') or {}
            if pages:
                page_id = next(iter(pages))
                # a page id of -1 is treated as "no result"
                if int(page_id) != -1:
                    page = pages[page_id]
                    # populate with successful result
                    obj['wp_name'] = page['title']
                    obj['wp_page_id'] = page_id
                    obj['wp_description'] = page.get('description', '')  # not always available
        except Exception as e:
            # Best-effort lookup: record the failure instead of propagating it.
            # The message is stored as text so the record stays serializable.
            obj['wp_error'] = str(e)
            obj['wp_accessed'] = False
        return obj

    def get_meta(self, query_obj, verbose=False):
        """Look up `query_obj['query']` on Wikipedia unless it was already done.

        :param query_obj: dict with a 'query' string; if its 'wp_accessed' entry
            is truthy the dict is returned untouched and no request is made
        :param verbose: when True, log and print the request URL
        :returns: (dict) `query_obj` itself when already accessed, otherwise the
            new record built by `_api_search` (which contains only wp_* keys)
        """
        if query_obj.get('wp_accessed', False):
            return query_obj
        url = self._url_builder(query_obj['query'])
        if verbose:
            self.log.debug(f'querying: {url}')
            print(url)
        return self._api_search(url)

    def search(self, q):
        """Search the Wikipedia API for the query string `q`.

        :param q: query string
        :returns: (dict) result record, see the class docstring for its keys
        """
        return self._api_search(self._url_builder(q))
class GoogleKnowledgeGraph:
    """Small client for the Google Knowledge Graph Search API.

    Lookups write their outcome into the dict passed by the caller using
    these keys: ``kg_url``, ``kg_description``, ``kg_id``, ``kg_name``,
    ``kg_score``, ``kg_bio``, ``kg_bio_url``, ``kg_image_url``,
    ``kg_error`` ('' on success, message on failure) and ``kg_accessed``
    (True once the API answered, False when the request failed).
    """

    url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'
    log = logger_utils.Logger.getLogger()
    fp_api_key = app_cfg.FP_KNOWLEDGE_GRAPH_ENV

    def __init__(self, api_key=None):
        """Store the API key.

        :param api_key: key to use; when None the key is read from the file
            at `fp_api_key`
        """
        if api_key is not None:
            self._api_key = api_key
        else:
            # Close the file deterministically and drop surrounding whitespace:
            # a trailing newline would otherwise be URL-encoded into the key.
            with open(self.fp_api_key) as fp:
                self._api_key = fp.read().strip()

    def _get_kg_meta(self, result_obj, params):
        """Query the API and write the first matching entity into `result_obj`.

        Never raises for request/parsing problems: they are reported through
        ``kg_accessed = False`` and ``kg_error``.

        :param result_obj: dict updated in place with the kg_* keys
        :param params: dict holding the search criterion ('query' or 'ids');
            it is not modified
        :returns: (dict) `result_obj`
        """
        query_params = dict(params)  # work on a copy, leave the caller's dict alone
        query_params['indent'] = True  # JSON indent
        query_params['key'] = self._api_key
        query_params['limit'] = 1
        # Restricts returned entities to those of the specified types.
        # For example, you can specify `Person` (as defined in http://schema.org/Person)
        # to restrict the results to entities representing people.
        # If multiple types are specified, returned entities will contain one
        # or more of these types.
        query_params['types'] = 'Person'
        # Enables prefix (initial substring) match against names and
        # aliases of entities. For example, a prefix `Jung` will match entities
        # and aliases such as `Jung`, `Jungle`, and `Jung-ho Kang`.
        query_params['prefix'] = False
        url = f'{self.url_kg_api}?{urllib.parse.urlencode(query_params)}'

        try:
            # context manager guarantees the HTTP response is closed;
            # a malformed JSON body is reported like any other request failure
            with urllib.request.urlopen(url) as http_response:
                response = json.loads(http_response.read())
        except Exception as e:
            result_obj['kg_error'] = str(e)
            result_obj['kg_accessed'] = False
            return result_obj

        items = response.get('itemListElement') or []
        if items:
            item = items[0]
            item_result = item.get('result') or {}
            det_desc = item_result.get('detailedDescription') or {}
            result_img = item_result.get('image') or {}
            # Per the Knowledge Graph Search API response format `url` is a
            # field of `result`; the item-level lookup is kept as a fallback.
            result_obj['kg_url'] = item_result.get('url') or item.get('url', '')
            result_obj['kg_description'] = item_result.get('description', '')
            result_obj['kg_id'] = item_result.get('@id', '').replace('kg:', '')
            result_obj['kg_name'] = item_result.get('name', '')
            result_obj['kg_score'] = item.get('resultScore', 0.0)
            result_obj['kg_bio'] = det_desc.get('articleBody', '')
            result_obj['kg_bio_url'] = det_desc.get('url', '')
            result_obj['kg_image_url'] = result_img.get('contentUrl', '')
        else:
            # search was valid but no results
            result_obj['kg_url'] = ''
            result_obj['kg_description'] = ''
            result_obj['kg_id'] = ''
            result_obj['kg_name'] = ''
            result_obj['kg_score'] = 0
            result_obj['kg_bio'] = ''
            result_obj['kg_bio_url'] = ''
            result_obj['kg_image_url'] = ''
        # the request succeeded: clear any error left by an earlier failed attempt
        result_obj['kg_error'] = ''
        result_obj['kg_accessed'] = True
        return result_obj

    def get_kg_from_name(self, obj):
        """Look up `obj['query']` by name unless `obj` was already resolved.

        :param obj: dict with a 'query' string; returned untouched when its
            'kg_accessed' entry is truthy
        :returns: (dict) `obj`, updated in place with the kg_* keys
        """
        if obj.get('kg_accessed', False):
            return obj
        return self._get_kg_meta(obj, {'query': obj['query']})

    def get_kg_from_kg_id(self, obj):
        """Look up `obj['kg_id']` by entity id unless `obj` was already resolved.

        :param obj: dict with a 'kg_id' entry; returned untouched when its
            'kg_accessed' entry is truthy
        :returns: (dict) `obj`, updated in place with the kg_* keys
        :raises KeyError: if `obj` has no 'kg_id' entry
        """
        if obj.get('kg_accessed', False):
            return obj
        # 'kg_id' is the key written by `_get_kg_meta`
        return self._get_kg_meta(obj, {'ids': obj['kg_id']})
|