cli/app/thesaurus/api.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71

import json
import os

import requests

from app.utils.util import *
from app.settings import app_cfg

class Thesaurus:
  """Front end to ThesaurusAPI that keeps a JSON copy of every lookup on disk."""

  def __init__(self):
    """Create the API client that performs the actual lookups."""
    self.api = ThesaurusAPI()

  def load(self, base_path, word, api_fn):
    """Return the data stored for `word`, calling `api_fn(word)` on a miss.

    The file lives at `<base_path>/<first two chars of sha256(word)>/<word>.json`.
    The directory is created if needed. When the file already exists its
    contents are returned via `read_json`; otherwise the result of `api_fn`
    is handed to `write_json` and returned as-is.

    NOTE(review): `sha256`, `read_json` and `write_json` are not defined in
    this file; presumably they come from the `app.utils.util` star import.
    """
    digest = sha256(word)
    bucket_dir = os.path.join(base_path, digest[:2])
    os.makedirs(bucket_dir, exist_ok=True)
    cache_file = os.path.join(bucket_dir, word + '.json')
    if not os.path.exists(cache_file):
      # Miss: fetch, store next to the other entries of this bucket, return.
      fetched = api_fn(word)
      write_json(cache_file, fetched)
      return fetched
    return read_json(cache_file)

  def search(self, word):
    """Look up `word` through `self.api.search`, stored under SEARCH_PATH."""
    return self.load(
      base_path=app_cfg.SEARCH_PATH,
      word=word,
      api_fn=self.api.search,
    )

  def category(self, id):
    """Look up category `id` through `self.api.category`, stored under CATEGORY_PATH."""
    # The id is stringified because it becomes part of the file name.
    key = str(id)
    return self.load(
      base_path=app_cfg.CATEGORY_PATH,
      word=key,
      api_fn=self.api.category,
    )

class ThesaurusAPI:
  """HTTP client for the ht.ac.uk search page and category endpoint.

  Network access lives in `search` and `category`; the text scraping is done
  by the private `_parse_*` helpers so it can be exercised without a request.
  """
  SEARCH_ENDPOINT = "https://ht.ac.uk/category-selection/"
  CATEGORY_ENDPOINT = "https://ht.ac.uk/api/v1/loadCategory-v2.php"
  HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
  }
  # Seconds to wait for the server; without a timeout `requests` may block forever.
  TIMEOUT = 30

  def search(self, word):
    """Query the search page for `word`.

    Returns the decoded JSON found in the page's `resultsTimelineData` div,
    or `[]` when the response is not HTTP 200, the div is missing, or its
    content is not valid JSON.
    """
    query = {
      'qsearch': word,
    }
    resp = requests.get(self.SEARCH_ENDPOINT, params=query, headers=self.HEADERS, timeout=self.TIMEOUT)
    if resp.status_code != 200:
      return []
    return self._parse_search(resp.text)

  def category(self, id):
    """Fetch category `id` and scrape its heading and word list.

    Returns `""` when the response is not HTTP 200; otherwise a dict with the
    keys `id`, `category`, `classification` and `words`, where `words` is a
    list of `{'word': ..., 'years': ...}` dicts.
    """
    query = {
      'id': id,
    }
    resp = requests.get(self.CATEGORY_ENDPOINT, params=query, headers=self.HEADERS, timeout=self.TIMEOUT)
    if resp.status_code != 200:
      return ""
    return self._parse_category(id, resp.text)

  @staticmethod
  def _between(text, start, end):
    """Return the text after the first `start` up to the next `end` ('' if `start` is absent)."""
    _, found, tail = text.partition(start)
    if not found:
      return ''
    return tail.split(end)[0]

  @staticmethod
  def _parse_search(html):
    """Decode the JSON held inside the `resultsTimelineData` div of `html`."""
    # The payload sits AFTER the opening tag and runs to the first closing div.
    _, found, tail = html.partition('<div id="resultsTimelineData">')
    if not found:
      return []
    payload = tail.split('</div>')[0]
    try:
      return json.loads(payload)
    except ValueError:
      # json.JSONDecodeError subclasses ValueError; treat an unreadable
      # payload like a failed request.
      return []

  @classmethod
  def _parse_category(cls, id, raw):
    """Scrape classification, category name and words out of `raw`."""
    classification = cls._between(raw, "<span style='font-size: 0.6em'>", '</span>')
    category = cls._between(raw, "<br />", '</h2>')
    words = []
    # Every entry starts with a bold word; the chunk before the first <b> is the heading.
    for chunk in raw.split('<b>')[1:]:
      # partition never raises, whatever the number of '</b>' in the chunk.
      word, _, rest = chunk.partition('</b>')
      # NOTE(review): the years are taken from the text that follows the bold
      # word, up to the next ' <span' -- confirm against a live response.
      years = rest.split(' <span')[0].strip()
      words.append({
        'word': word,
        'years': years,
      })
    return {
      'id': id,
      'category': category,
      'classification': classification,
      'words': words,
    }