1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
|
import json
import os

import requests

from app.utils.util import *
from app.settings import app_cfg
class Thesaurus:
    """Disk-cached front end for ThesaurusAPI lookups.

    Each lookup is stored as a JSON file under a two-character
    sub-directory taken from the start of ``sha256(key)``. A later call
    with the same key is answered from that file without calling the API.
    """

    def __init__(self):
        """Create the API client used for cache misses."""
        self.api = ThesaurusAPI()

    def load(self, base_path, word, api_fn):
        """Return the cached result for *word*, fetching it on a miss.

        Args:
            base_path: Root directory of the cache to use.
            word: Lookup key; also used to derive the cache file name.
            api_fn: Callable taking *word* and returning the data to cache.

        Returns:
            The content of the cache file if it exists, otherwise whatever
            ``api_fn(word)`` returns.
        """
        sha = sha256(word)
        hash_path = os.path.join(base_path, sha[0:2])
        os.makedirs(hash_path, exist_ok=True)
        path = os.path.join(hash_path, self._cache_filename(word, sha))
        if os.path.exists(path):
            return read_json(path)
        data = api_fn(word)
        # The API client reports failures as empty values ([] or "").
        # Writing those to disk would turn one transient error into a
        # permanent empty answer, so only non-empty results are cached.
        if data:
            write_json(path, data)
        return data

    def search(self, word):
        """Return the search results for *word*, cached under SEARCH_PATH."""
        return self.load(app_cfg.SEARCH_PATH, word, self.api.search)

    def category(self, id):
        """Return category *id*, cached under CATEGORY_PATH by ``str(id)``."""
        return self.load(app_cfg.CATEGORY_PATH, str(id), self.api.category)

    @staticmethod
    def _cache_filename(word, sha):
        """Return a file name for *word* that stays inside the cache folder.

        Ordinary words keep the ``<word>.json`` name. A word containing a
        path component (for example ``and/or`` or ``../x``) would resolve
        to a different directory, so the digest is used as the name instead.
        """
        # NOTE(review): assumes sha256() returns a path-safe string; its
        # first two characters are already used as a directory name.
        stem = word if os.path.basename(word) == word else sha
        return stem + '.json'
class ThesaurusAPI:
    """Scraping client for the ht.ac.uk search page and category endpoint.

    Both public methods send a GET request with a browser-like User-Agent
    and extract the payload from the returned markup by plain string
    splitting; no HTML parser is involved.
    """

    SEARCH_ENDPOINT = "https://ht.ac.uk/category-selection/"
    CATEGORY_ENDPOINT = "https://ht.ac.uk/api/v1/loadCategory-v2.php"
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    }
    # Seconds to wait for the server. Without a timeout a stalled
    # connection would block the caller indefinitely.
    TIMEOUT = 10

    def search(self, word):
        """Query the search page for *word*.

        Returns:
            The decoded JSON found in the ``resultsTimelineData`` div, or
            ``[]`` when the response is not 200 or the div is missing/empty.

        Raises:
            ValueError: If the div content is not valid JSON.
        """
        resp = requests.get(
            self.SEARCH_ENDPOINT,
            params={'qsearch': word},
            headers=self.HEADERS,
            timeout=self.TIMEOUT,
        )
        if resp.status_code != 200:
            return []
        return self._parse_search(resp.text)

    def category(self, id):
        """Fetch and parse the category page for *id*.

        Returns:
            A dict with keys ``id``, ``category``, ``classification`` and
            ``words`` (a list of ``{'word', 'years'}`` dicts), or ``""``
            when the response is not 200 or the expected markup is missing.
        """
        resp = requests.get(
            self.CATEGORY_ENDPOINT,
            params={'id': id},
            headers=self.HEADERS,
            timeout=self.TIMEOUT,
        )
        if resp.status_code != 200:
            return ""
        return self._parse_category(id, resp.text)

    @staticmethod
    def _between(text, start, end):
        """Return the text after the first *start* up to *end*, else None.

        The segment also stops at a second occurrence of *start*, matching
        ``text.split(start)[1].split(end)[0]``.
        """
        pieces = text.split(start, 2)
        if len(pieces) < 2:
            return None
        return pieces[1].split(end)[0]

    @classmethod
    def _parse_search(cls, html):
        """Decode the JSON embedded in the ``resultsTimelineData`` div."""
        payload = cls._between(html, '<div id="resultsTimelineData">', '</div>')
        if payload is None or not payload.strip():
            return []
        return json.loads(payload)

    @classmethod
    def _parse_category(cls, id, html):
        """Extract classification, category name and word list from *html*."""
        classification = cls._between(html, "<span style='font-size: 0.6em'>", '</span>')
        category = cls._between(html, "<br />", '</h2>')
        if classification is None or category is None:
            # Same "nothing usable" value as a non-200 response.
            return ""
        words = []
        for chunk in html.split('<b>')[1:]:
            word, closing, rest = chunk.partition('</b>')
            if not closing:
                # Unterminated <b>: no reliable word/years boundary.
                continue
            # The years are read from the text after </b>, up to the next
            # <span>. NOTE(review): layout inferred from the split markers
            # used here; confirm against a live page.
            years = rest.split(' <span')[0].strip()
            words.append({
                'word': word,
                'years': years,
            })
        return {
            'id': id,
            'category': category,
            'classification': classification,
            'words': words,
        }
|