import os import re import time import requests from hashlib import sha256 from app.utils.util import * from app.settings import app_cfg class Thesaurus: def __init__(self): self.api = ThesaurusAPI() def load(self, base_path, word, api_fn): sha = sha256(word) hash_path = os.path.join(base_path, sha[0:2]) os.makedirs(hash_path, exist_ok=True) clean_word = re.sub('[^0-9a-zA-Z]+', '*', word) path = os.path.join(hash_path, clean_word + '.json') if os.path.exists(path): return read_json(path) data = None while data is None: try: data = api_fn(word) except Exception as e: print("Got HTTP error, sleeping for 5 seconds") time.sleep(5) pass write_json(path, data) return data def search(self, word): return self.load(app_cfg.SEARCH_PATH, word, self.api.search) def category(self, id): return self.load(app_cfg.CATEGORY_PATH, str(id), self.api.category) class ThesaurusAPI: SEARCH_ENDPOINT = "https://ht.ac.uk/category-selection/" CATEGORY_ENDPOINT = "https://ht.ac.uk/api/v1/loadCategory-v2.php" HEADERS = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', } def search(self, word): word = word.split('<')[0] word = word.split('/')[0] word = word.replace('(', '').replace(')', '') if len(word) < 1: return { 'word': word, 'categories': [] } query = { 'qsearch': word, } resp = requests.get(self.SEARCH_ENDPOINT, params=query, headers=self.HEADERS) if resp.status_code != 200: return [] data = resp.text data = data.split('