diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2020-04-02 15:31:46 +0200 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2020-04-02 15:31:46 +0200 |
| commit | c14810fe9f663d46b5477088d06047fea66d1524 (patch) | |
| tree | 72eebe1589b71147f003ebb55c66a20eaf5bce6a | |
| parent | 0b55e297d5088962fe8397903041c2b1737c7cdd (diff) | |
ability to skip words if you don't like the connection
| -rw-r--r-- | cli/app/settings/app_cfg.py | 1 | ||||
| -rw-r--r-- | cli/app/thesaurus/api.py | 39 | ||||
| -rw-r--r-- | cli/commands/bridge/words.py | 183 |
3 files changed, 178 insertions, 45 deletions
diff --git a/cli/app/settings/app_cfg.py b/cli/app/settings/app_cfg.py index 09f557b..ee1be51 100644 --- a/cli/app/settings/app_cfg.py +++ b/cli/app/settings/app_cfg.py @@ -10,6 +10,7 @@ from app.settings import types CLICK_GROUPS = { 'api': 'commands/api', + 'bridge': 'commands/bridge', } # ----------------------------------------------------------------------------- diff --git a/cli/app/thesaurus/api.py b/cli/app/thesaurus/api.py index 98e0210..467d5fb 100644 --- a/cli/app/thesaurus/api.py +++ b/cli/app/thesaurus/api.py @@ -1,4 +1,6 @@ import os +import re +import time import requests from hashlib import sha256 @@ -13,10 +15,18 @@ class Thesaurus: sha = sha256(word) hash_path = os.path.join(base_path, sha[0:2]) os.makedirs(hash_path, exist_ok=True) - path = os.path.join(hash_path, word + '.json') + clean_word = re.sub('[^0-9a-zA-Z]+', '*', word) + path = os.path.join(hash_path, clean_word + '.json') if os.path.exists(path): return read_json(path) - data = api_fn(word) + data = None + while data is None: + try: + data = api_fn(word) + except Exception as e: + print("Got HTTP error, sleeping for 5 seconds") + time.sleep(5) + pass write_json(path, data) return data @@ -34,6 +44,11 @@ class ThesaurusAPI: } def search(self, word): + word = word.split('<')[0] + word = word.split('/')[0] + word = word.replace('(', '').replace(')', '') + if len(word) < 1: + return { 'word': word, 'categories': [] } query = { 'qsearch': word, } @@ -42,8 +57,18 @@ class ThesaurusAPI: return [] data = resp.text data = data.split('<div id="resultsTimelineData">')[1].split('</div>')[0] - # print(data) - rows = json.loads(data) + data = data.replace('<span class="oesc">', '') + data = data.replace('</span>', '') + try: + rows = json.loads(data) + except Exception as e: + print(f"Error loading JSON for {word}") + print(data) + # raise e + return { + 'word': word, + 'categories': [], + } cats = [] for row in rows: cat, years = row['popup'].split(']: ') @@ -72,8 +97,10 @@ class ThesaurusAPI: 
raw_words = raw.split('"><b>')[1:] words = [] for word in raw_words: - word, rest = word.split('</b>') - years = word.split(' <span')[0].strip() + word_partz = word.split('</b>') + word = word_partz[0] + years = word_partz[1].split(' <span')[0].strip() + years = years.replace('\u2013', '-') words.append({ 'word': word, 'years': years, diff --git a/cli/commands/bridge/words.py b/cli/commands/bridge/words.py index 5ebb078..d695bc9 100644 --- a/cli/commands/bridge/words.py +++ b/cli/commands/bridge/words.py @@ -3,7 +3,9 @@ Find connections between two words """ import click +import random import simplejson as json +from tqdm import tqdm from app.thesaurus.api import Thesaurus @@ -12,75 +14,178 @@ from app.thesaurus.api import Thesaurus help='Starting word') @click.option('-b', '--b', 'opt_word_b', required=True, help='Ending word') +@click.option('-oe', '--include_oe', 'opt_include_oe', is_flag=True, + help='Whether to include OE/archaic words') +@click.option('-sl', '--include_slang', 'opt_include_slang', is_flag=True, + help='Whether to include slang/colloquial words') +@click.option('-w', '--words_per_step', 'opt_words_per_step', default=20, + help='Number of words to check per step') +@click.option('-c', '--categories_per_word', 'opt_categories_per_word', default=3, + help='Number of categories to check per word') @click.pass_context -def cli(ctx, opt_word_a, opt_word_b): +def cli(ctx, opt_word_a, opt_word_b, opt_include_oe, opt_include_slang, opt_words_per_step, opt_categories_per_word): """Find connections between two words """ thesaurus = Thesaurus() print(f"Starting word: {opt_word_a}") print(f"Ending word: {opt_word_b}") visited = set() - results_a = thesaurus.search(opt_word_a) - results_b = thesaurus.search(opt_word_b) - # use sets - # make set of results_a - # find overlap with results_b - # if there's no match... - # search for first word in results_a - # loop over results... 
- # print(json.dumps(results, indent=2)) - dist = 0 - marked = {} + step = 0 + max_dist = 0 + marked = { opt_word_a: 0 } queue = [opt_word_a] + newqueue = [] + skip = {} found = False # First compute distance to each node to find a path while len(queue): - dist = dist + 1 - print(f"Iteration: distance {dist}, {len(queue)} items in queue") - newqueue = [] - for word_q in queue: + step = step + 1 + print(f"Iteration step {step}, distance {max_dist}, {len(queue) + len(newqueue)} items in queue") + print(f"Words: {', '.join(queue[:7])} ...") + if step > 1: + print_chain(thesaurus, opt_word_a, queue, marked, skip, prompt_to_remove=False) + for word_q in tqdm(queue): word_result = thesaurus.search(word_q) - for cat in word_result['categories']: + # print(json.dumps(word_result, indent=2)) + categories = word_result['categories'] + if step > 1 and len(categories) > opt_categories_per_word: + categories = categories[:opt_categories_per_word] + for cat in categories: catid = cat['catid'] if catid in marked: continue - marked[catid] = dist + marked[catid] = marked[word_q] + 1 category_result = thesaurus.category(catid) + # print(json.dumps(category_result, indent=2)) for word_c in category_result['words']: word_n = word_c['word'] + years = word_c['years'].lower() + if not opt_include_oe and (('oe' in years and 'oe-' not in years) or 'arch' in years): + continue + if not opt_include_slang and 'slang' in years or 'colloq' in years or 'Scots' in years: + continue + if '<' in word_n or '/' in word_n or ',' in word_n: + word_n = word_n.split("<")[0] + word_n = word_n.split(",")[0] + word_n = word_n.split("/")[0].strip() if word_n in marked: continue - marked[word_n] = dist + marked[word_n] = marked[catid] + 1 + max_dist = max(max_dist, marked[word_n]) if word_n == opt_word_b: thesaurus.search(word_n) - found = True - break + # print(queue) + print_chain(thesaurus, opt_word_a, [opt_word_b], marked, skip, prompt_to_remove=True) newqueue.append(word_n) - queue = newqueue - - if not 
Found: - print(f"No path found, distance of {dist} reached, {len(marked)} nodes checked") + if step > 1 and len(newqueue) > opt_words_per_step: + random.shuffle(newqueue) + queue = newqueue[:opt_words_per_step] + newqueue = newqueue[opt_words_per_step:] + else: + queue = [] + newqueue + if not found: + print(f"No path found, step {step} reached, {len(marked)} nodes checked") return - # Then follow the chain of shortest distance to follow the path - word_n = opt_word_b +def print_chain(thesaurus, opt_word_a, opt_words_b, marked, skip, prompt_to_remove=False): + """Follow the chain of shortest distance from the end back to the start""" + # print(opt_word_a) + word_n = opt_words_b[0] + dist = marked[word_n] + tries = 0 + depth_tries = 0 + chain = [] + skip_here = [] + cat_reverse = {} while word_n != opt_word_a: - dist = 999999 - next_catid = "" + if tries > len(opt_words_b): + return next_word = "" + break_loop = False word_result = thesaurus.search(word_n) - print(f"-> {word_result['word']}") - for cat in word_result['categories'] + chain.append(word_result['word']) + # print(word_result['word']) + for cat in word_result['categories']: catid = cat['catid'] + if (word_n in skip and catid in skip[word_n]) or catid in skip_here: + continue if catid in marked and marked[catid] < dist: dist = marked[catid] - next_catid = catid - cat_result = thesaurus.category(catid) - print(f"-> {cat_result['category']}") - for word_c in category_result: - word_n = word_c['word'] - if word_n in marked and marked[word_n] < dist: - next_word = word_n + # print(f"{dist}: {catid}") + cat_result = thesaurus.category(catid) + for word_c in cat_result['words']: + word_m = word_c['word'] + if '<' in word_m or '/' in word_m or ',' in word_m: + word_m = word_m.split("<")[0] + word_n = word_n.split(",")[0] + word_m = word_m.split("/")[0].strip() + # print(word_m) + if (catid in skip and word_m in skip[catid]) or word_m in skip_here: + continue + if word_m == opt_word_a or (word_m in marked and 
marked[word_m] < dist): + dist = marked[word_m] + # print(f"{dist}: {word_m}") + next_word = word_m + break_loop = True + break + if break_loop: + cat_name = cat_result['category'] + cat_reverse[cat_name] = catid + chain.append(cat_name) + break + if next_word == '': + if depth_tries < 100 and len(chain) > 2: + to_skip = chain[-1] + if to_skip in cat_reverse: + to_skip = cat_reverse[to_skip] + skip_here.append(to_skip) + to_skip = chain[-2] + if to_skip in cat_reverse: + to_skip = cat_reverse[to_skip] + skip_here.append(to_skip) + chain.pop() + chain.pop() + word_n = chain[-1] + depth_tries += 1 + elif depth_tries >= 100: + tries += 1 + if tries >= len(opt_words_b): + return + word_n = opt_words_b[tries] + chain = [] + dist = marked[word_n] + depth_tries = 0 + continue word_n = next_word - print(f"-> {word_n}") + chain.append(opt_word_a) + chain = list(reversed(chain)) + for i, word in enumerate(chain): + if (i % 2) == 0: + print(f"{i+1} -> {word}") + else: + print(f"{i+1} => {word}") + if prompt_to_remove: + print("If you don't like this path, enter the IDs of words to remove separated by spaces, or Ctrl-C to exit.") + ids = input("Enter numbers > ") + ids = ids.split(" ") + for id in ids: + if len(id): + try: + id = int(id) + id -= 1 + word_a = chain[id] + print(f"Removing {word_a}") + if word_a in cat_reverse: + word_a = cat_reverse[word_a] + word_b = chain[id-1] + print(f"Connected upward to {word_b}") + if word_b in cat_reverse: + word_b = cat_reverse[word_b] + if word_a in skip: + skip[word_a].append(word_b) + else: + skip[word_a] = [word_b] + except Exception as e: + continue + print(skip) |
