summary | refs | log | tree | commit | diff
path: root/cli/app/thesaurus/api.py
diff options
context:
space:
mode:
author: Jules Laplace <julescarbon@gmail.com> 2020-03-31 20:12:34 +0200
committer: Jules Laplace <julescarbon@gmail.com> 2020-03-31 20:12:34 +0200
commit: 2b407d1f4a608d0ac23592ff16def77797e4fa41 (patch)
tree: 44d7b146d56d083e9d1c9fd2979831ea14d19334 /cli/app/thesaurus/api.py
init thesaurus api client
Diffstat (limited to 'cli/app/thesaurus/api.py')
-rw-r--r-- cli/app/thesaurus/api.py 71
1 files changed, 71 insertions, 0 deletions
diff --git a/cli/app/thesaurus/api.py b/cli/app/thesaurus/api.py
new file mode 100644
index 0000000..ad0dd92
--- /dev/null
+++ b/cli/app/thesaurus/api.py
@@ -0,0 +1,71 @@
+import os
+import requests
+
+from app.utils.util import *
+from app.settings import app_cfg
+
class Thesaurus:
    """Disk-cached front end for :class:`ThesaurusAPI`.

    Each lookup result is stored as a JSON file so that a given key is
    requested from the remote API at most once.
    """

    def __init__(self):
        """Create the underlying API client."""
        self.api = ThesaurusAPI()

    def load(self, base_path, word, api_fn):
        """Return the cached result for *word*, fetching it on a cache miss.

        The cache file is ``<base_path>/<first two chars of sha256(word)>/
        <word>.json``. The bucket directory is created if it does not exist.

        Args:
            base_path: Root directory of the cache.
            word: Lookup key; also used verbatim as the cache file stem.
            api_fn: Callable invoked as ``api_fn(word)`` when nothing is
                cached yet.

        Returns:
            The content of the cache file if it exists, otherwise whatever
            ``api_fn`` returned (which is written to the cache first).
        """
        # NOTE(review): sha256 / read_json / write_json are not defined in
        # this file; presumably they come from the app.utils.util star
        # import -- confirm.
        bucket_dir = os.path.join(base_path, sha256(word)[0:2])
        os.makedirs(bucket_dir, exist_ok=True)
        cache_file = os.path.join(bucket_dir, word + '.json')

        if not os.path.exists(cache_file):
            fetched = api_fn(word)
            write_json(cache_file, fetched)
            return fetched

        return read_json(cache_file)

    def search(self, word):
        """Search for *word*, caching under ``app_cfg.SEARCH_PATH``."""
        return self.load(app_cfg.SEARCH_PATH, word, self.api.search)

    def category(self, id):
        """Fetch category *id*, caching under ``app_cfg.CATEGORY_PATH``.

        The id is converted to ``str`` before being used as the cache key
        and before being passed on to the API client.
        """
        return self.load(app_cfg.CATEGORY_PATH, str(id), self.api.category)
+
class ThesaurusAPI:
    """Small HTTP client that scrapes search and category data from ht.ac.uk.

    Both public methods are best-effort: a non-200 response yields an empty
    value (``[]`` for :meth:`search`, ``""`` for :meth:`category`) instead of
    raising. Network-level errors raised by ``requests`` are not caught.
    """

    SEARCH_ENDPOINT = "https://ht.ac.uk/category-selection/"
    CATEGORY_ENDPOINT = "https://ht.ac.uk/api/v1/loadCategory-v2.php"
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    }
    # Seconds to wait for the server; without a timeout requests.get() can
    # block forever on a stalled connection.
    TIMEOUT = 30

    def search(self, word):
        """Search the site for *word*.

        Args:
            word: Search term, sent as the ``qsearch`` query parameter.

        Returns:
            The decoded JSON found inside the page's ``resultsTimelineData``
            div, or ``[]`` if the response status is not 200 or the div is
            missing or empty.

        Raises:
            ValueError: If the div content is present but is not valid JSON.
        """
        resp = requests.get(
            self.SEARCH_ENDPOINT,
            params={'qsearch': word},
            headers=self.HEADERS,
            timeout=self.TIMEOUT,
        )
        if resp.status_code != 200:
            return []
        return self._parse_search(resp.text)

    def category(self, id):
        """Load one category and the words listed in it.

        Args:
            id: Category identifier, sent as the ``id`` query parameter.

        Returns:
            A dict with keys ``id``, ``category``, ``classification`` and
            ``words`` (a list of ``{'word', 'years'}`` dicts), or ``""`` if
            the response status is not 200.
        """
        resp = requests.get(
            self.CATEGORY_ENDPOINT,
            params={'id': id},
            headers=self.HEADERS,
            timeout=self.TIMEOUT,
        )
        if resp.status_code != 200:
            return ""
        return self._parse_category(id, resp.text)

    @staticmethod
    def _between(text, start, end):
        """Return the text after the first *start* up to the next *end*.

        Returns ``''`` when *start* does not occur in *text*.
        """
        pieces = text.split(start)
        if len(pieces) < 2:
            return ''
        return pieces[1].split(end)[0]

    @classmethod
    def _parse_search(cls, html):
        """Extract and decode the JSON payload embedded in a search page."""
        # Imported here so the method does not depend on `json` leaking in
        # through the module's star import.
        import json

        # The payload is the text *after* the marker div's opening tag, up to
        # its closing tag.
        payload = cls._between(html, '<div id="resultsTimelineData">', '</div>')
        if not payload.strip():
            return []
        return json.loads(payload)

    @classmethod
    def _parse_category(cls, id, raw):
        """Scrape classification, category name and word list from *raw*."""
        classification = cls._between(
            raw, "<span style='font-size: 0.6em'>", '</span>')
        category = cls._between(raw, "<br />", '</h2>')

        words = []
        # Every entry starts with a bold word; whatever precedes the first
        # <b> is page header and is skipped.
        for chunk in raw.split('<b>')[1:]:
            word, closing, rest = chunk.partition('</b>')
            if not closing:
                # Unterminated <b>: not a well-formed entry, skip it.
                continue
            # NOTE(review): assumes the date range is the text between </b>
            # and the following ' <span' -- confirm against live markup.
            years = rest.split(' <span')[0].strip()
            words.append({
                'word': word,
                'years': years,
            })

        return {
            'id': id,
            'category': category,
            'classification': classification,
            'words': words,
        }