From 2b407d1f4a608d0ac23592ff16def77797e4fa41 Mon Sep 17 00:00:00 2001 From: Jules Laplace Date: Tue, 31 Mar 2020 20:12:34 +0200 Subject: init thesaurus api client --- cli/app/settings/app_cfg.py | 10 +++ cli/app/thesaurus/api.py | 71 ++++++++++++++++++++ cli/app/utils/click_factory.py | 145 +++++++++++++++++++++++++++++++++++++++++ cli/app/utils/logger_utils.py | 68 +++++++++++++++++++ cli/app/utils/util.py | 15 +++++ cli/cli.py | 49 ++++++++++++++ cli/commands/api/category.py | 19 ++++++ cli/commands/api/search.py | 19 ++++++ 8 files changed, 396 insertions(+) create mode 100644 cli/app/settings/app_cfg.py create mode 100644 cli/app/thesaurus/api.py create mode 100644 cli/app/utils/click_factory.py create mode 100644 cli/app/utils/logger_utils.py create mode 100644 cli/app/utils/util.py create mode 100755 cli/cli.py create mode 100644 cli/commands/api/category.py create mode 100644 cli/commands/api/search.py (limited to 'cli') diff --git a/cli/app/settings/app_cfg.py b/cli/app/settings/app_cfg.py new file mode 100644 index 0000000..952e76b --- /dev/null +++ b/cli/app/settings/app_cfg.py @@ -0,0 +1,10 @@ +import os + +CLICK_GROUPS = { + 'api': 'commands/api', +} + +DATA_STORE = 'data_store' + +SEARCH_PATH = os.path.join(DATA_STORE, "search") +CATEGORIES_PATH = os.path.join(DATA_STORE, "categories") diff --git a/cli/app/thesaurus/api.py b/cli/app/thesaurus/api.py new file mode 100644 index 0000000..ad0dd92 --- /dev/null +++ b/cli/app/thesaurus/api.py @@ -0,0 +1,71 @@ +import os +import requests + +from app.utils.util import * +from app.settings import app_cfg + +class Thesaurus: + def __init__(self): + self.api = ThesaurusAPI() + + def load(self, base_path, word, api_fn): + sha = sha256(word) + hash_path = os.path.join(base_path, sha[0:2]) + os.makedirs(hash_path, exist_ok=True) + path = os.path.join(hash_path, word + '.json') + if os.path.exists(path): + return read_json(path) + data = api_fn(word) + write_json(path, data) + return data + + def search(self, word): + return self.load(app_cfg.SEARCH_PATH, word, self.api.search) + + def category(self, id): + return self.load(app_cfg.CATEGORY_PATH, str(id), self.api.category) + +class ThesaurusAPI: + SEARCH_ENDPOINT = "https://ht.ac.uk/category-selection/" + CATEGORY_ENDPOINT = "https://ht.ac.uk/api/v1/loadCategory-v2.php" + HEADERS = { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', + } + + def search(self, word): + query = { + 'qsearch': word, + } + resp = requests.get(self.SEARCH_ENDPOINT, params=query, headers=self.HEADERS) + if resp.status_code != 200: + return [] + data = resp.text + data = data.split('
') + data = data[0].split('
') + return json.loads(data) + + def category(self, id): + query = { + 'id': id, + } + resp = requests.get(self.CATEGORY_ENDPOINT, params=query, headers=self.HEADERS) + if resp.status_code != 200: + return "" + raw = resp.text + classification = raw.split("")[1].split('')[0] + category = raw.split("
")[1].split('')[0] + raw_words = raw.split('')[1:] + words = [] + for word in raw_words: + word, rest = word.split('') + years = word.split('