path: root/cli/app/thesaurus/api.py
blob: 89e5ad13281803492f9bec44f2efeb69f51e9bab
import os
import re
import json
import time
import requests
from hashlib import sha256

from app.utils.util import read_json, write_json
from app.settings import app_cfg

class Thesaurus:
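  """Caching front end for ThesaurusAPI: responses are stored on disk as JSON
  and reused on subsequent lookups."""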
  def __init__(self):
    self.api = ThesaurusAPI()

  def load(self, base_path, word, api_fn):
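    """Return cached JSON for `word` if present; otherwise call `api_fn`
    (retrying on errors) and cache the result under a sha256-prefixed path."""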
    sha = sha256(word.encode('utf-8')).hexdigest()
    hash_path = os.path.join(base_path, sha[0:2])
    os.makedirs(hash_path, exist_ok=True)
    clean_word = re.sub('[^0-9a-zA-Z]+', '*', word)
    path = os.path.join(hash_path, clean_word + '.json')
    if os.path.exists(path):
      return read_json(path)
    data = None
    while data is None:
      try:
        data = api_fn(word)
      except Exception as e:
        # Likely a transient HTTP/connection error; back off and retry.
        print(f"Got HTTP error for {word}: {e}, sleeping for 5 seconds")
        time.sleep(5)
    write_json(path, data)
    return data

  def search(self, word):
    return self.load(app_cfg.SEARCH_PATH, word, self.api.search)

  def category(self, id):
    return self.load(app_cfg.CATEGORY_PATH, str(id), self.api.category)

class ThesaurusAPI:
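  """Thin HTTP client for the ht.ac.uk thesaurus pages; results are scraped
  out of the returned HTML."""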
  SEARCH_ENDPOINT = "https://ht.ac.uk/category-selection/"
  CATEGORY_ENDPOINT = "https://ht.ac.uk/api/v1/loadCategory-v2.php"
  HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
  }

  def search(self, word):
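    """Look up `word` on the category-selection page and return the matching
    categories parsed out of the embedded results JSON."""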
    word = word.split('<')[0]
    word = word.split('/')[0]
    word = word.replace('(', '').replace(')', '')
    if len(word) < 1:
      return { 'word': word, 'categories': [] }
    query = {
      'qsearch': word,
    }
    resp = requests.get(self.SEARCH_ENDPOINT, params=query, headers=self.HEADERS)
    if resp.status_code != 200:
      # Return the same shape as the success path so callers can rely on it.
      return { 'word': word, 'categories': [] }
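    # The search results are embedded in the page as JSON inside the
    # "resultsTimelineData" div; strip the surrounding markup before parsing.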
    data = resp.text
    data = data.split('<div id="resultsTimelineData">')[1].split('</div>')[0]
    data = data.replace('<span class="oesc">', '')
    data = data.replace('</span>', '')
    try:
      rows = json.loads(data)
    except Exception as e:
      print(f"Error loading JSON for {word}")
      print(data)
      # raise e
      return {
        'word': word,
        'categories': [],
      }
    cats = []
    for row in rows:
      cat, years = row['popup'].split(']: ')
      cat = cat.split('[')[1]
      cats.append({
        'catid': row['catid'],
        'catnum': row['catnum'],
        'category': cat,
        'years': years,
      })
    return {
      'word': word,
      'categories': cats,
    }

  def category(self, id):
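    """Fetch a category by id and return its number, label, and word/year
    list scraped from the returned HTML fragment."""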
    query = {
      'id': id,
    }
    resp = requests.get(self.CATEGORY_ENDPOINT, params=query, headers=self.HEADERS)
    if resp.status_code != 200:
      # Return the same shape as the success path so callers can rely on it.
      return { 'catid': id, 'catnum': '', 'category': '', 'words': [] }
    raw = resp.text
    catnum = raw.split("<span style='font-size: 0.6em'>")[1].split('</span>')[0]
    category = raw.split("<br />")[1].split('</h2>')[0].replace("<span style='font-size: 0.6em'>", "").replace("</span>", "")
    raw_words = raw.split('"><b>')[1:]
    words = []
    for word in raw_words:
      word_parts = word.split('</b>')
      word = word_parts[0]
      years = word_parts[1].split(' <span')[0].strip()
      # Normalize en dashes in year ranges to plain hyphens.
      years = years.replace('\u2013', '-')
      words.append({
        'word': word,
        'years': years,
      })
    return {
      'catid': id,
      'catnum': catnum,
      'category': category,
      'words': words,
    }
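
if __name__ == '__main__':
  # Minimal usage sketch, assuming app_cfg.SEARCH_PATH and app_cfg.CATEGORY_PATH
  # point at writable cache directories; not part of the CLI entry points.
  thesaurus = Thesaurus()
  result = thesaurus.search('dog')
  print(result['word'], 'has', len(result['categories']), 'categories')
  if result['categories']:
    print(thesaurus.category(result['categories'][0]['catid']))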