1 files changed, 71 insertions, 0 deletions
diff --git a/cli/app/thesaurus/api.py b/cli/app/thesaurus/api.py
new file mode 100644
index 0000000..ad0dd92
--- /dev/null
+++ b/cli/app/thesaurus/api.py
@@ -0,0 +1,71 @@
+import os
+import requests
+
+from app.utils.util import *
+from app.settings import app_cfg
+
+class Thesaurus:
+  """Disk-cached front end to ThesaurusAPI.
+
+  Every lookup is first tried against a JSON file on disk and only goes to
+  the network on a cache miss.
+  NOTE(review): sha256, read_json and write_json are presumably provided by
+  the star import from app.utils.util - confirm.
+  """
+
+  def __init__(self):
+    self.api = ThesaurusAPI()
+
+  def load(self, base_path, word, api_fn):
+    """Return the data for `word`, calling `api_fn(word)` on a cache miss.
+
+    The cache file is <base_path>/<first two chars of sha256(word)>/<word>.json.
+    Empty results are returned to the caller but not written to disk.
+    """
+    hash_path = os.path.join(base_path, sha256(word)[0:2])
+    os.makedirs(hash_path, exist_ok=True)
+    # A path separator inside the key would make the file land outside
+    # hash_path, so flatten separators in the filename only.
+    filename = word.replace('/', '_').replace(os.sep, '_') + '.json'
+    path = os.path.join(hash_path, filename)
+    if os.path.exists(path):
+      return read_json(path)
+    data = api_fn(word)
+    # ThesaurusAPI returns an empty value ([] or "") when a request fails;
+    # caching that would turn a transient failure into a permanent one.
+    if data:
+      write_json(path, data)
+    return data
+
+  def search(self, word):
+    """Return search results for `word`, cached under app_cfg.SEARCH_PATH."""
+    return self.load(app_cfg.SEARCH_PATH, word, self.api.search)
+
+  def category(self, id):
+    """Return the category record for `id`, cached under app_cfg.CATEGORY_PATH.
+
+    `id` is converted to str before use, so the API also receives a string.
+    """
+    return self.load(app_cfg.CATEGORY_PATH, str(id), self.api.category)
+
+class ThesaurusAPI:
+  """Minimal HTTP client for the ht.ac.uk search and category endpoints.
+
+  Both methods pull data out of the response text by splitting on literal
+  markup, so they depend on the exact page structure the site serves.
+  """
+  SEARCH_ENDPOINT = "https://ht.ac.uk/category-selection/"
+  CATEGORY_ENDPOINT = "https://ht.ac.uk/api/v1/loadCategory-v2.php"
+  # Browser-like User-Agent sent with every request.
+  HEADERS = {
+    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
+  }
+  # Seconds before a request is abandoned; without a timeout requests can
+  # wait on an unresponsive server indefinitely.
+  TIMEOUT = 30
+
+  def search(self, word):
+    """Search for `word` and return the JSON embedded in the result page.
+
+    Returns [] when the response is not HTTP 200 or when the
+    resultsTimelineData <div> is missing from the page.
+    Raises ValueError (json.JSONDecodeError) if the <div> content is not
+    valid JSON.
+    """
+    import json  # local import: this module does not import json explicitly
+    resp = requests.get(
+      self.SEARCH_ENDPOINT,
+      params={'qsearch': word},
+      headers=self.HEADERS,
+      timeout=self.TIMEOUT,
+    )
+    if resp.status_code != 200:
+      return []
+    # The payload is the text between the opening tag and the first
+    # closing </div> that follows it.
+    _, marker, tail = resp.text.partition('<div id="resultsTimelineData">')
+    if not marker:
+      return []
+    return json.loads(tail.split('</div>')[0])
+
+  def category(self, id):
+    """Fetch category `id` and parse its heading and word list.
+
+    Returns "" when the response is not HTTP 200. Otherwise returns a dict
+    with keys 'id', 'category', 'classification' and 'words', where each
+    entry of 'words' is {'word': ..., 'years': ...}.
+    Raises IndexError if the heading markup is not present in the page.
+    """
+    # `id` shadows the builtin, but the parameter name is part of the
+    # public signature and is kept for callers.
+    resp = requests.get(
+      self.CATEGORY_ENDPOINT,
+      params={'id': id},
+      headers=self.HEADERS,
+      timeout=self.TIMEOUT,
+    )
+    if resp.status_code != 200:
+      return ""
+    raw = resp.text
+    classification = raw.split("<span style='font-size: 0.6em'>")[1].split('</span>')[0]
+    category = raw.split("<br />")[1].split('</h2>')[0]
+    words = []
+    # Each chunk after a '<b>' starts with one bolded word.
+    for chunk in raw.split('<b>')[1:]:
+      # partition copes with chunks that hold several '</b>'; a chunk with
+      # no closing tag is malformed and is skipped.
+      word, closing, rest = chunk.partition('</b>')
+      if not closing:
+        continue
+      # NOTE(review): presumably the dates are the text that follows the
+      # bold word, up to the next ' <span' - confirm against a live page.
+      years = rest.split(' <span')[0].strip()
+      words.append({
+        'word': word,
+        'years': years,
+      })
+    return {
+      'id': id,
+      'category': category,
+      'classification': classification,
+      'words': words,
+    }
+\ No newline at end of file