""" Find connections between two words """

import sys
import time
import random

import click
import simplejson as json
from tqdm import tqdm

from app.thesaurus.api import Thesaurus


@click.command()
@click.option('-a', '--a', 'opt_word_a', required=True,
              help='Starting word')
@click.option('-b', '--b', 'opt_word_b', required=True,
              help='Ending word')
@click.option('-oe', '--include_oe', 'opt_include_oe', is_flag=True,
              help='Whether to include OE/archaic words')
@click.option('-sl', '--include_slang', 'opt_include_slang', is_flag=True,
              help='Whether to include slang/colloquial words')
@click.option('-w', '--words_per_step', 'opt_words_per_step', default=20,
              help='Number of words to check per step')
@click.option('-c', '--categories_per_word', 'opt_categories_per_word', default=3,
              help='Number of categories to check per word')
@click.option('-d', '--min_depth', 'opt_min_depth', default=10,
              help='Minimum depth of matches')
@click.option('-sh', '--use_shortest_path', 'opt_use_shortest_path', is_flag=True,
              help='Use shortest path between words')
@click.pass_context
def cli(ctx, opt_word_a, opt_word_b, opt_include_oe, opt_include_slang,
        opt_words_per_step, opt_categories_per_word, opt_min_depth,
        opt_use_shortest_path):
    """ Find connections between two words """
    # NOTE: the docstring above is what click prints as the command help, so
    # it is kept short; implementation notes live in comments instead.
    #
    # Two trees are grown alternately, one from each word.  Whenever a word
    # ends up in both trees (and the combined depth is large enough) the
    # solver prints the chain and asks the user whether to break a link.
    thesaurus = Thesaurus()
    solver = TreeSolver(thesaurus, opt_word_a, opt_word_b, opt_include_oe,
                        opt_include_slang, opt_words_per_step,
                        opt_categories_per_word, opt_min_depth,
                        opt_use_shortest_path)

    print(f"Starting word: {opt_word_a}")
    print(f"Ending word: {opt_word_b}")

    queue_a = [opt_word_a]
    queue_b = [opt_word_b]

    while True:
        queue_a = solver.build_tree(words=queue_a, tree=solver.tree_a,
                                    target=solver.tree_b)
        if solver.should_reset:
            # The user broke a link: restart both searches from the roots.
            queue_a, queue_b = [opt_word_a], [opt_word_b]
            solver.reset()

        queue_b = solver.build_tree(words=queue_b, tree=solver.tree_b,
                                    target=solver.tree_a)
        if solver.should_reset:
            queue_a, queue_b = [opt_word_a], [opt_word_b]
            solver.reset()

        print(f"[depth] {solver.max_dist} [queue a] {len(queue_a)} "
              f"[queue b] {len(queue_b)} [skips] {len(solver.skips)}")

        # With both queues empty nothing can ever be added again; stop
        # instead of spinning forever on empty work lists.
        if not queue_a and not queue_b:
            print("Search exhausted: no more words to explore")
            break


class TreeSolver:
    """Grow two depth-labelled trees of words/categories and join them.

    Each tree is a dict mapping a node (a word or a category id) to its
    distance from the tree's root word.  Words and categories alternate:
    a word leads to the categories returned by ``thesaurus.search`` and a
    category leads to the words returned by ``thesaurus.category``.

    Attributes:
        tree_a, tree_b: depth maps rooted at ``word_a`` / ``word_b``.
        skips: list of ``(parent, child)`` edges the user asked to avoid.
        should_reset: set when the user breaks a chain; the caller is
            expected to call :meth:`reset` and restart its queues.
        max_dist: largest depth assigned to a word since the last reset.
    """

    def __init__(self, thesaurus, word_a, word_b, include_oe, include_slang,
                 words_per_step, categories_per_word, min_depth,
                 use_shortest_path):
        self.thesaurus = thesaurus
        self.word_a = word_a
        self.word_b = word_b
        self.include_oe = include_oe
        self.include_slang = include_slang
        self.words_per_step = words_per_step
        self.categories_per_word = categories_per_word
        self.skips = []
        self.min_depth = min_depth
        self.use_shortest_path = use_shortest_path
        self.max_dist = 0
        self.reset()

    def reset(self):
        """Discard both trees and start again from the two root words.

        ``skips`` is deliberately kept so previously broken links stay
        broken across restarts.
        """
        self.tree_a = {self.word_a: 0}
        self.tree_b = {self.word_b: 0}
        self.should_reset = False
        # Depth statistics describe the current trees, so restart them too.
        self.max_dist = 0

    def build_tree(self, words=None, tree=None, target=None, depth=999):
        """Expand ``tree`` by one step and return the next work queue.

        Args:
            words: queue of words to expand; at most ``words_per_step`` are
                processed, the remainder is carried over to the result.
            tree: depth map being grown (mutated in place).  Every entry of
                ``words`` must already be a key of ``tree``.
            target: the opposite depth map, used to detect meeting points.
            depth: unused; kept for backward compatibility.

        Returns:
            The shuffled list of words to expand next, or ``[]`` as soon as
            ``should_reset`` has been raised.
        """
        # None sentinels instead of mutable defaults shared between calls.
        words = [] if words is None else words
        tree = {} if tree is None else tree
        target = {} if target is None else target

        next_queue = list(words[self.words_per_step:])
        batch = words[:self.words_per_step]

        for word in tqdm(batch):
            categories = self.thesaurus.search(word)['categories']
            random.shuffle(categories)
            checked = 0
            for category in categories:
                # Stop once exactly `categories_per_word` were checked.
                if checked >= self.categories_per_word:
                    break
                catid = category['catid']
                # add_skip() stores the category id exactly as it appears in
                # the chain, so match both the raw and the stringified form.
                if ((word, str(catid)) in self.skips
                        or (word, catid) in self.skips):
                    continue
                if catid in tree:
                    continue
                tree[catid] = tree[word] + 1
                found = self.process_category(catid, tree, target)
                if self.should_reset:
                    return []
                if found:
                    next_queue += found
                checked += 1

        random.shuffle(next_queue)
        return next_queue

    def process_category(self, catid, tree, target):
        """Add the words of category ``catid`` to ``tree``.

        Returns:
            The words that were newly added to ``tree`` (to be queued for
            expansion), or ``[]`` if a reset was requested while processing.
        """
        queue = []
        category_result = self.thesaurus.category(catid)
        for category_word in category_result['words']:
            word = self.fix_word(category_word['word'])
            years = category_word['years'].lower()
            if (catid, word) in self.skips:
                continue
            added = self.process_word(word, years, catid, tree, target)
            if added:
                queue.append(added)
            if self.should_reset:
                # The trees are about to be thrown away; nothing to queue.
                return []
        return queue

    def process_word(self, word, years, catid, tree, target):
        """Register ``word`` in ``tree`` and report a chain if trees meet.

        Args:
            word: cleaned word taken from category ``catid``.
            years: lowercased date/usage label of the word, used for the
                OE/archaic and slang filters.
            catid: category the word was reached through (must be in tree).
            tree: depth map being grown (mutated in place).
            target: the opposite depth map.

        Returns:
            ``word`` if it was newly added to ``tree``, otherwise ``None``.
        """
        if not self.include_oe and self.is_oe(years):
            return None
        if not self.include_slang and self.is_slang(years):
            return None

        is_new = word not in tree
        if is_new:
            tree[word] = tree[catid] + 1
            self.max_dist = max(self.max_dist, tree[word])

        # The word is a meeting point when it is present in both trees.
        if word in target and self.is_deep_enough(word):
            self.make_chain(hinge=word, can_remove=True)

        return word if is_new else None

    def is_deep_enough(self, word):
        """Return True if the combined depth of ``word`` reaches min_depth.

        ``word`` must be present in both ``tree_a`` and ``tree_b``.
        """
        return (self.tree_a[word] + self.tree_b[word]) >= self.min_depth

    def make_chain(self, hinge, can_remove=True):
        """Display the chain through ``hinge`` and optionally break a link.

        Args:
            hinge: node present in both trees.
            can_remove: when true, prompt the user for the index of a chain
                item whose incoming link should be skipped from now on.

        Returns:
            True if the user entered a valid index, otherwise False
            (including when no complete chain could be reconstructed).
        """
        chain_a = self.descend_chain(hinge, self.tree_a)
        chain_b = self.descend_chain(hinge, self.tree_b)
        if chain_a is None or chain_b is None:
            return False

        chain = list(reversed(chain_a)) + [hinge] + chain_b
        self.display_chain(chain)

        if not can_remove:
            return False

        tqdm.write("Enter a number to break the chain, enter to keep searching, or Ctrl-C to exit")
        tqdm.write("")
        answer = input("> ").strip()
        if not (answer and self.is_integer(answer)):
            return False

        position = int(answer)
        if not 0 <= position < len(chain):
            # Don't crash a long-running search on a typo.
            tqdm.write(f"Index {position} is out of range, keeping chain")
            return False

        item = chain[position]
        if item in chain_a:
            self.add_skip(item, chain_a)
            self.should_reset = True
        if item in chain_b:
            self.add_skip(item, chain_b)
            self.should_reset = True
        return True

    def add_skip(self, item, chain):
        """Record the link leading to ``item`` so it is avoided in future.

        ``chain`` is ordered from the hinge towards the root, so the element
        following ``item`` is the node it was reached from.  The root itself
        has no incoming link and is ignored.
        """
        index = chain.index(item)
        if index == len(chain) - 1:
            return
        prev_item = chain[index + 1]
        self.skips.append((prev_item, item))
        if self.is_integer(item):
            tqdm.write(f"Removing: {prev_item} => {self.get_category_name(item)}")
        else:
            tqdm.write(f"Removing: {self.get_category_name(prev_item)} => {item}")

    def descend_chain(self, word, tree):
        """Walk from ``word`` down to the root of ``tree``.

        Returns:
            The nodes visited after ``word`` (ending with the root), an
            empty list when ``word`` already is the root, or ``None`` when
            no path to the root could be found.
        """
        chain = []
        current = word
        # A node at depth 0 is the root: nothing (more) to descend.
        while tree[current] != 0:
            parent = self._find_parent(current, tree)
            if parent is None:
                return None
            chain.append(parent)
            current = parent
        return chain

    def _find_parent(self, node, tree):
        """Return a random neighbour of ``node`` that is closer to the root."""
        if self.is_integer(node):
            # Category id: candidates are the words of that category.
            category_words = self.thesaurus.category(node)['words']
            random.shuffle(category_words)
            for category_word in category_words:
                candidate = self.fix_word(category_word['word'])
                if (candidate != node and candidate in tree
                        and self.can_descend(tree, candidate, node)):
                    return candidate
        else:
            # Word: candidates are the categories the word belongs to.
            categories = self.thesaurus.search(node)['categories']
            random.shuffle(categories)
            for category in categories:
                candidate = category['catid']
                if (candidate != node and candidate in tree
                        and self.can_descend(tree, candidate, node)):
                    return candidate
        return None

    def can_descend(self, tree, word_x, word_y):
        """Return True if a chain may step from ``word_y`` to ``word_x``.

        With ``use_shortest_path`` any strictly shallower node is accepted;
        otherwise the node must be exactly one level shallower.
        """
        if self.use_shortest_path:
            return tree[word_x] < tree[word_y]
        return tree[word_x] == tree[word_y] - 1

    def display_chain(self, chain):
        """Print the chain, one indexed item per line.

        Categories are shown by name with ``->``, words with ``=>``.
        """
        tqdm.write("")
        for i, word in enumerate(chain):
            if self.is_integer(word):
                word = self.get_category_name(word)
                tqdm.write(f"{i} -> {word}")
            else:
                tqdm.write(f"{i} => {word}")
        tqdm.write("")

    def get_category_name(self, catid):
        """Return the ``'category'`` field of the thesaurus entry ``catid``."""
        category = self.thesaurus.category(catid)
        return category['category']

    def is_integer(self, s):
        """Return True if ``int(s)`` succeeds (used to spot category ids)."""
        try:
            int(s)
        except (TypeError, ValueError):
            return False
        return True

    def is_oe(self, years):
        """Return True if the usage label marks an OE or archaic word."""
        return (('oe' in years and 'oe-' not in years) or 'arch' in years)

    def is_slang(self, years):
        """Return True if the usage label marks slang, colloquial or Scots.

        The comparison is case-insensitive: callers pass a lowercased label,
        so the former check for ``'Scots'`` could never match.
        """
        years = years.lower()
        return 'slang' in years or 'colloq' in years or 'scots' in years

    def fix_word(self, word):
        """Strip markup and alternatives, keeping only the leading word.

        Everything from the first ``<``, ``,`` or ``/`` onwards is dropped
        and surrounding whitespace is removed.
        """
        for separator in ("<", ",", "/"):
            word = word.split(separator)[0]
        return word.strip()