diff options
Diffstat (limited to 'cli/app/thesaurus')
| -rw-r--r-- | cli/app/thesaurus/api.py | 71 |
1 files changed, 71 insertions, 0 deletions
# cli/app/thesaurus/api.py -- new file (mode 100644, index 0000000..ad0dd92)
"""Disk-cached client for the ht.ac.uk thesaurus search and category pages."""
import json
import os

import requests

from app.utils.util import *
from app.settings import app_cfg


def _between(text, start, end):
    """Return the part of *text* after the first *start* and before the next *end*.

    Returns an empty string when *start* does not occur, so callers do not
    crash with IndexError on unexpected markup. When *end* is missing,
    everything after *start* is returned.
    """
    _, found, tail = text.partition(start)
    if not found:
        return ''
    return tail.partition(end)[0]


class Thesaurus:
    """Front end that serves lookups from a JSON file cache before hitting the API."""

    def __init__(self):
        self.api = ThesaurusAPI()

    def load(self, base_path, word, api_fn):
        """Return the cached result for *word*, fetching it with *api_fn* on a miss.

        Cache files live at ``<base_path>/<first two chars of sha256(word)>/<word>.json``.
        ``sha256``, ``read_json`` and ``write_json`` are expected to come from
        the ``app.utils.util`` star import -- NOTE(review): confirm.

        Args:
            base_path: Root directory of this cache.
            word: Lookup key; also used as the cache file name.
            api_fn: Callable taking *word* and returning JSON-serializable data.

        Returns:
            The cached or freshly fetched data.
        """
        sha = sha256(word)
        hash_path = os.path.join(base_path, sha[0:2])
        os.makedirs(hash_path, exist_ok=True)

        # The key comes from user input; neutralise path separators so a
        # term such as "and/or" cannot point outside the cache directory.
        file_name = word
        for sep in (os.sep, os.altsep):
            if sep:
                file_name = file_name.replace(sep, '_')
        path = os.path.join(hash_path, file_name + '.json')

        if os.path.exists(path):
            return read_json(path)

        data = api_fn(word)
        # The API methods return an empty value on HTTP failure; do not
        # persist those, otherwise a transient error is cached forever.
        if data:
            write_json(path, data)
        return data

    def search(self, word):
        """Return search results for *word*, cached under ``app_cfg.SEARCH_PATH``."""
        return self.load(app_cfg.SEARCH_PATH, word, self.api.search)

    def category(self, id):
        """Return category *id*, cached under ``app_cfg.CATEGORY_PATH``."""
        return self.load(app_cfg.CATEGORY_PATH, str(id), self.api.category)


class ThesaurusAPI:
    """Thin HTTP client that extracts data from ht.ac.uk responses."""

    SEARCH_ENDPOINT = "https://ht.ac.uk/category-selection/"
    CATEGORY_ENDPOINT = "https://ht.ac.uk/api/v1/loadCategory-v2.php"
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    }
    # Seconds to wait for the server; requests never times out by default.
    TIMEOUT = 10

    def search(self, word):
        """Search for *word* and return the decoded ``resultsTimelineData`` payload.

        Returns an empty list when the response status is not 200 or the
        page does not contain the ``resultsTimelineData`` div.

        Raises:
            ValueError: If the div content is not valid JSON.
        """
        query = {
            'qsearch': word,
        }
        resp = requests.get(
            self.SEARCH_ENDPOINT,
            params=query,
            headers=self.HEADERS,
            timeout=self.TIMEOUT,
        )
        if resp.status_code != 200:
            return []
        # The JSON sits between the opening marker and the next closing div.
        payload = _between(resp.text, '<div id="resultsTimelineData">', '</div>')
        if not payload.strip():
            return []
        return json.loads(payload)

    def category(self, id):
        """Fetch category *id* and parse it out of the returned HTML fragment.

        Returns:
            A dict with keys ``id``, ``category``, ``classification`` and
            ``words`` (a list of ``{'word', 'years'}`` dicts), or an empty
            string when the response status is not 200.
        """
        query = {
            'id': id,
        }
        resp = requests.get(
            self.CATEGORY_ENDPOINT,
            params=query,
            headers=self.HEADERS,
            timeout=self.TIMEOUT,
        )
        if resp.status_code != 200:
            return ""
        raw = resp.text
        classification = _between(raw, "<span style='font-size: 0.6em'>", '</span>')
        category = _between(raw, "<br />", '</h2>')

        words = []
        # Each entry starts with a bold word; whatever follows the closing
        # tag up to the next " <span" is taken as the year range.
        # NOTE(review): markup layout is inferred from the original parsing
        # code -- confirm against a live response.
        for chunk in raw.split('<b>')[1:]:
            word, _, rest = chunk.partition('</b>')
            years = rest.split(' <span')[0].strip()
            words.append({
                'word': word,
                'years': years,
            })
        return {
            'id': id,
            'category': category,
            'classification': classification,
            'words': words,
        }
\ No newline at end of file |
