import os import requests from hashlib import sha256 from app.utils.util import * from app.settings import app_cfg class Thesaurus: def __init__(self): self.api = ThesaurusAPI() def load(self, base_path, word, api_fn): sha = sha256(word) hash_path = os.path.join(base_path, sha[0:2]) os.makedirs(hash_path, exist_ok=True) path = os.path.join(hash_path, word + '.json') if os.path.exists(path): return read_json(path) data = api_fn(word) write_json(path, data) return data def search(self, word): return self.load(app_cfg.SEARCH_PATH, word, self.api.search) def category(self, id): return self.load(app_cfg.CATEGORY_PATH, str(id), self.api.category) class ThesaurusAPI: SEARCH_ENDPOINT = "https://ht.ac.uk/category-selection/" CATEGORY_ENDPOINT = "https://ht.ac.uk/api/v1/loadCategory-v2.php" HEADERS = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', } def search(self, word): query = { 'qsearch': word, } resp = requests.get(self.SEARCH_ENDPOINT, params=query, headers=self.HEADERS) if resp.status_code != 200: return [] data = resp.text data = data.split('
')[1].split('
')[0] # print(data) rows = json.loads(data) cats = [] for row in rows: cat, years = row['popup'].split(']: ') cat = cat.split('[')[1] cats.append({ 'catid': row['catid'], 'catnum': row['catnum'], 'category': cat, 'years': years, }) return { 'word': word, 'categories': cats, } def category(self, id): query = { 'id': id, } resp = requests.get(self.CATEGORY_ENDPOINT, params=query, headers=self.HEADERS) if resp.status_code != 200: return "" raw = resp.text catnum = raw.split("")[1].split('')[0] category = raw.split("
")[1].split('')[0].replace("", "").replace("", "") raw_words = raw.split('">')[1:] words = [] for word in raw_words: word, rest = word.split('') years = word.split('