From 2b407d1f4a608d0ac23592ff16def77797e4fa41 Mon Sep 17 00:00:00 2001 From: Jules Laplace Date: Tue, 31 Mar 2020 20:12:34 +0200 Subject: init thesaurus api client --- .gitignore | 192 +++++++++++++++++++++++++++++++++++++++++ cli/app/settings/app_cfg.py | 10 +++ cli/app/thesaurus/api.py | 71 +++++++++++++++ cli/app/utils/click_factory.py | 145 +++++++++++++++++++++++++++++++ cli/app/utils/logger_utils.py | 68 +++++++++++++++ cli/app/utils/util.py | 15 ++++ cli/cli.py | 49 +++++++++++ cli/commands/api/category.py | 19 ++++ cli/commands/api/search.py | 19 ++++ 9 files changed, 588 insertions(+) create mode 100644 .gitignore create mode 100644 cli/app/settings/app_cfg.py create mode 100644 cli/app/thesaurus/api.py create mode 100644 cli/app/utils/click_factory.py create mode 100644 cli/app/utils/logger_utils.py create mode 100644 cli/app/utils/util.py create mode 100755 cli/cli.py create mode 100644 cli/commands/api/category.py create mode 100644 cli/commands/api/search.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5ee34bb --- /dev/null +++ b/.gitignore @@ -0,0 +1,192 @@ +# Custom +3rdparty/ +*_local_* +static/js/dist/ +webpack-stats.dev.json +3rdparty/ +_local/ +*db.sqlite3 +__pycache__ +deploy_settings.py +s3cfg +.DS_Store +node_modules/ +client_secret.json +sheets.googleapis.com-python.json + +# ------------------------------------------ +# GitHub Python .gitignore ruleset +# ------------------------------------------ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python + +env/ +build/ +develop-eggs/ +#dist/ +downloads/ +eggs/ +.eggs/ +#lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environment +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mypy +.mypy_cache/ + +backend/backend/ +backend/media +backend/static/* + +*.swp +*.pkl + +.git-old/ + +old-package.json + +*.tar + +.creds + +check/static/assets/js/dist/index.js +static/public/user_content + +*.obj +*.ply + +# google API key :-o +.api_key + +site/datasets/final/*.csv + +flask.log +flask.log.* + +*.sql + +.idea + +static/js/dev + +v1_search +v1_common +v1_frontend + +static/js/dev/*.hot-update.json +static/js/dev/0.* + +etc + +static/js/dist + +data_store/docker/mysql/* +data_store/docker/redis/* +data_store/media/* +data_store/models/* +data_store/indexes/* +data_store/incoming/* +data_store/uploads/* +data_store/features/* +data_store/exports/* + +!data_store/.gitkeep +!data_store/media/.gitkeep +!data_store/models/.gitkeep +!data_store/indexes/.gitkeep +!data_store/incoming/.gitkeep +!data_store/uploads/.gitkeep +!data_store/features/.gitkeep +!data_store/exports/.gitkeep +!data_store/docker/.gitkeep +!data_store/docker/mysql/.gitkeep +!data_store/docker/redis/.gitkeep + diff --git a/cli/app/settings/app_cfg.py b/cli/app/settings/app_cfg.py new file mode 100644 index 0000000..952e76b --- /dev/null +++ b/cli/app/settings/app_cfg.py @@ -0,0 +1,10 @@ +import os + 
+CLICK_GROUPS = { + 'api': 'commands/api', +} + +DATA_STORE = 'data_store' + +SEARCH_PATH = os.path.join(DATA_STORE, "search") +CATEGORIES_PATH = os.path.join(DATA_STORE, "categories") diff --git a/cli/app/thesaurus/api.py b/cli/app/thesaurus/api.py new file mode 100644 index 0000000..ad0dd92 --- /dev/null +++ b/cli/app/thesaurus/api.py @@ -0,0 +1,71 @@ +import os +import requests + +from app.utils.util import * +from app.settings import app_cfg + +class Thesaurus: + def __init__(self): + self.api = ThesaurusAPI() + + def load(self, base_path, word, api_fn): + sha = sha256(word) + hash_path = os.path.join(base_path, sha[0:2]) + os.makedirs(hash_path, exist_ok=True) + path = os.path.join(hash_path, word + '.json') + if os.path.exists(path): + return read_json(path) + data = api_fn(word) + write_json(path, data) + return data + + def search(self, word): + return self.load(app_cfg.SEARCH_PATH, word, self.api.search) + + def category(self, id): + return self.load(app_cfg.CATEGORIES_PATH, str(id), self.api.category) + +class ThesaurusAPI: + SEARCH_ENDPOINT = "https://ht.ac.uk/category-selection/" + CATEGORY_ENDPOINT = "https://ht.ac.uk/api/v1/loadCategory-v2.php" + HEADERS = { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', + } + + def search(self, word): + query = { + 'qsearch': word, + } + resp = requests.get(self.SEARCH_ENDPOINT, params=query, headers=self.HEADERS) + if resp.status_code != 200: + return [] + data = resp.text + data = data.split('
') + data = data[0].split('
') + return json.loads(data) + + def category(self, id): + query = { + 'id': id, + } + resp = requests.get(self.CATEGORY_ENDPOINT, params=query, headers=self.HEADERS) + if resp.status_code != 200: + return "" + raw = resp.text + classification = raw.split("")[1].split('')[0] + category = raw.split("
")[1].split('')[0] + raw_words = raw.split('')[1:] + words = [] + for word in raw_words: + word, rest = word.split('') + years = word.split('