ability to skip words if you don't like the connection

author: Jules Laplace <julescarbon@gmail.com> 2020-04-02 15:31:46 +0200
committer: Jules Laplace <julescarbon@gmail.com> 2020-04-02 15:31:46 +0200
commit: c14810fe9f663d46b5477088d06047fea66d1524 (patch)
tree: 72eebe1589b71147f003ebb55c66a20eaf5bce6a
parent: 0b55e297d5088962fe8397903041c2b1737c7cdd (diff)
3 files changed, 178 insertions, 45 deletions
diff --git a/cli/app/settings/app_cfg.py b/cli/app/settings/app_cfg.py
index 09f557b..ee1be51 100644
--- a/cli/app/settings/app_cfg.py
+++ b/cli/app/settings/app_cfg.py
@@ -10,6 +10,7 @@ from app.settings import types
 
 CLICK_GROUPS = {
   'api': 'commands/api',
+  'bridge': 'commands/bridge',
 }
 
 # -----------------------------------------------------------------------------
diff --git a/cli/app/thesaurus/api.py b/cli/app/thesaurus/api.py
index 98e0210..467d5fb 100644
--- a/cli/app/thesaurus/api.py
+++ b/cli/app/thesaurus/api.py
@@ -1,4 +1,6 @@
 import os
+import re
+import time
 import requests
 from hashlib import sha256
 
@@ -13,10 +15,18 @@ class Thesaurus:
     sha = sha256(word)
     hash_path = os.path.join(base_path, sha[0:2])
     os.makedirs(hash_path, exist_ok=True)
-    path = os.path.join(hash_path, word + '.json')
+    clean_word = re.sub('[^0-9a-zA-Z]+', '*', word)
+    path = os.path.join(hash_path, clean_word + '.json')
     if os.path.exists(path):
       return read_json(path)
-    data = api_fn(word)
+    data = None
+    while data is None:
+      try:
+        data = api_fn(word)
+      except Exception as e:
+        print("Got HTTP error, sleeping for 5 seconds")
+        time.sleep(5)
+        pass
     write_json(path, data)
     return data
 
@@ -34,6 +44,11 @@ class ThesaurusAPI:
   }
 
   def search(self, word):
+    word = word.split('<')[0]
+    word = word.split('/')[0]
+    word = word.replace('(', '').replace(')', '')
+    if len(word) < 1:
+      return { 'word': word, 'categories': [] }
     query = {
       'qsearch': word,
     }
@@ -42,8 +57,18 @@ class ThesaurusAPI:
       return []
     data = resp.text
     data = data.split('<div id="resultsTimelineData">')[1].split('</div>')[0]
-    # print(data)
-    rows = json.loads(data)
+    data = data.replace('<span class="oesc">', '')
+    data = data.replace('</span>', '')
+    try:
+      rows = json.loads(data)
+    except Exception as e:
+      print(f"Error loading JSON for {word}")
+      print(data)
+      # raise e
+      return {
+        'word': word,
+        'categories': [],
+      }
     cats = []
     for row in rows:
       cat, years = row['popup'].split(']: ')
@@ -72,8 +97,10 @@ class ThesaurusAPI:
     raw_words = raw.split('"><b>')[1:]
     words = []
     for word in raw_words:
-      word, rest = word.split('</b>')
-      years = word.split(' <span')[0].strip()
+      word_partz = word.split('</b>')
+      word = word_partz[0]
+      years = word_partz[1].split(' <span')[0].strip()
+      years = years.replace('\u2013', '-')
       words.append({
         'word': word,
         'years': years,
diff --git a/cli/commands/bridge/words.py b/cli/commands/bridge/words.py
index 5ebb078..d695bc9 100644
--- a/cli/commands/bridge/words.py
+++ b/cli/commands/bridge/words.py
@@ -3,7 +3,9 @@ Find connections between two words
 """
 
 import click
+import random
 import simplejson as json
+from tqdm import tqdm
 
 from app.thesaurus.api import Thesaurus
 
@@ -12,75 +14,178 @@ from app.thesaurus.api import Thesaurus
   help='Starting word')
 @click.option('-b', '--b', 'opt_word_b', required=True,
   help='Ending word')
+@click.option('-oe', '--include_oe', 'opt_include_oe', is_flag=True,
+  help='Whether to include OE/archaic words')
+@click.option('-sl', '--include_slang', 'opt_include_slang', is_flag=True,
+  help='Whether to include slang/colloquial words')
+@click.option('-w', '--words_per_step', 'opt_words_per_step', default=20,
+  help='Number of words to check per step')
+@click.option('-c', '--categories_per_word', 'opt_categories_per_word', default=3,
+  help='Number of categories to check per word')
 @click.pass_context
-def cli(ctx, opt_word_a, opt_word_b):
+def cli(ctx, opt_word_a, opt_word_b, opt_include_oe, opt_include_slang, opt_words_per_step, opt_categories_per_word):
   """Find connections between two words
   """
   thesaurus = Thesaurus()
   print(f"Starting word: {opt_word_a}")
   print(f"Ending word: {opt_word_b}")
   visited = set()
-  results_a = thesaurus.search(opt_word_a)
-  results_b = thesaurus.search(opt_word_b)
-  # use sets
-  # make set of results_a
-  # find overlap with results_b
-  # if there's no match...
-  #   search for first word in results_a
-  #   loop over results...
-  # print(json.dumps(results, indent=2))
 
-  dist = 0
-  marked = {}
+  step = 0
+  max_dist = 0
+  marked = { opt_word_a: 0 }
   queue = [opt_word_a]
+  newqueue = []
+  skip = {}
   found = False
   # First compute distance to each node to find a path
   while len(queue):
-    dist = dist + 1
-    print(f"Iteration: distance {dist}, {len(queue)} items in queue")
-    newqueue = []
-    for word_q in queue:
+    step = step + 1
+    print(f"Iteration step {step}, distance {max_dist}, {len(queue) + len(newqueue)} items in queue")
+    print(f"Words: {', '.join(queue[:7])} ...")
+    if step > 1:
+      print_chain(thesaurus, opt_word_a, queue, marked, skip, prompt_to_remove=False)
+    for word_q in tqdm(queue):
       word_result = thesaurus.search(word_q)
-      for cat in word_result['categories']:
+      # print(json.dumps(word_result, indent=2))
+      categories = word_result['categories']
+      if step > 1 and len(categories) > opt_categories_per_word:
+        categories = categories[:opt_categories_per_word]
+      for cat in categories:
         catid = cat['catid']
         if catid in marked:
           continue
-        marked[catid] = dist
+        marked[catid] = marked[word_q] + 1
         category_result = thesaurus.category(catid)
+        # print(json.dumps(category_result, indent=2))
         for word_c in category_result['words']:
           word_n = word_c['word']
+          years = word_c['years'].lower()
+          if not opt_include_oe and (('oe' in years and 'oe-' not in years) or 'arch' in years):
+            continue
+          if not opt_include_slang and 'slang' in years or 'colloq' in years or 'Scots' in years:
+            continue
+          if '<' in word_n or '/' in word_n or ',' in word_n:
+            word_n = word_n.split("<")[0]
+            word_n = word_n.split(",")[0]
+            word_n = word_n.split("/")[0].strip()
           if word_n in marked:
             continue
-          marked[word_n] = dist
+          marked[word_n] = marked[catid] + 1
+          max_dist = max(max_dist, marked[word_n])
           if word_n == opt_word_b:
             thesaurus.search(word_n)
-            found = True
-            break
+            # print(queue)
+            print_chain(thesaurus, opt_word_a, [opt_word_b], marked, skip, prompt_to_remove=True)
           newqueue.append(word_n)
-    queue = newqueue
-
-  if not Found:
-    print(f"No path found, distance of {dist} reached, {len(marked)} nodes checked")
+    if step > 1 and len(newqueue) > opt_words_per_step:
+      random.shuffle(newqueue)
+      queue = newqueue[:opt_words_per_step]
+      newqueue = newqueue[opt_words_per_step:]
+    else:
+      queue = [] + newqueue
+  if not found:
+    print(f"No path found, step {step} reached, {len(marked)} nodes checked")
     return
 
-  # Then follow the chain of shortest distance to follow the path
-  word_n = opt_word_b
+def print_chain(thesaurus, opt_word_a, opt_words_b, marked, skip, prompt_to_remove=False):
+  """Follow the chain of shortest distance from the end back to the start"""
+  # print(opt_word_a)
+  word_n = opt_words_b[0]
+  dist = marked[word_n]
+  tries = 0
+  depth_tries = 0
+  chain = []
+  skip_here = []
+  cat_reverse = {}
   while word_n != opt_word_a:
-    dist = 999999
-    next_catid = ""
+    if tries > len(opt_words_b):
+      return
     next_word = ""
+    break_loop = False
     word_result = thesaurus.search(word_n)
-    print(f"-> {word_result['word']}")
-    for cat in word_result['categories']
+    chain.append(word_result['word'])
+    # print(word_result['word'])
+    for cat in word_result['categories']:
       catid = cat['catid']
+      if (word_n in skip and catid in skip[word_n]) or catid in skip_here:
+        continue
       if catid in marked and marked[catid] < dist:
         dist = marked[catid]
-        next_catid = catid
-    cat_result = thesaurus.category(catid)
-    print(f"-> {cat_result['category']}")
-    for word_c in category_result:
-      word_n = word_c['word']
-      if word_n in marked and marked[word_n] < dist:
-        next_word = word_n
+        # print(f"{dist}: {catid}")
+        cat_result = thesaurus.category(catid)
+        for word_c in cat_result['words']:
+          word_m = word_c['word']
+          if '<' in word_m or '/' in word_m or ',' in word_m:
+            word_m = word_m.split("<")[0]
+            word_n = word_n.split(",")[0]
+            word_m = word_m.split("/")[0].strip()
+          # print(word_m)
+          if (catid in skip and word_m in skip[catid]) or word_m in skip_here:
+            continue
+          if word_m == opt_word_a or (word_m in marked and marked[word_m] < dist):
+            dist = marked[word_m]
+            # print(f"{dist}: {word_m}")
+            next_word = word_m
+            break_loop = True
+            break
+      if break_loop:
+        cat_name = cat_result['category']
+        cat_reverse[cat_name] = catid
+        chain.append(cat_name)
+        break
+    if next_word == '':
+      if depth_tries < 100 and len(chain) > 2:
+        to_skip = chain[-1]
+        if to_skip in cat_reverse:
+          to_skip = cat_reverse[to_skip]
+        skip_here.append(to_skip)
+        to_skip = chain[-2]
+        if to_skip in cat_reverse:
+          to_skip = cat_reverse[to_skip]
+        skip_here.append(to_skip)
+        chain.pop()
+        chain.pop()
+        word_n = chain[-1]
+        depth_tries += 1
+      elif depth_tries >= 100:
+        tries += 1
+        if tries >= len(opt_words_b):
+          return
+        word_n = opt_words_b[tries]
+        chain = []
+        dist = marked[word_n]
+        depth_tries = 0
+      continue
     word_n = next_word
-  print(f"-> {word_n}")
+  chain.append(opt_word_a)
+  chain = list(reversed(chain))
+  for i, word in enumerate(chain):
+    if (i % 2) == 0:
+      print(f"{i+1} -> {word}")
+    else:
+      print(f"{i+1}  => {word}")
+  if prompt_to_remove:
+    print("If you don't like this path, enter the IDs of words to remove separated by spaces, or Ctrl-C to exit.")
+    ids = input("Enter numbers > ")
+    ids = ids.split(" ")
+    for id in ids:
+      if len(id):
+        try:
+          id = int(id)
+          id -= 1
+          word_a = chain[id]
+          print(f"Removing {word_a}")
+          if word_a in cat_reverse:
+            word_a = cat_reverse[word_a]
+          word_b = chain[id-1]
+          print(f"Connected upward to {word_b}")
+          if word_b in cat_reverse:
+            word_b = cat_reverse[word_b]
+          if word_a in skip:
+            skip[word_a].append(word_b)
+          else:
+            skip[word_a] = [word_b]
+        except Exception as e:
+          continue
+    print(skip)
author	Jules Laplace <julescarbon@gmail.com>	2020-04-02 15:31:46 +0200
committer	Jules Laplace <julescarbon@gmail.com>	2020-04-02 15:31:46 +0200
commit	c14810fe9f663d46b5477088d06047fea66d1524 (patch)
tree	72eebe1589b71147f003ebb55c66a20eaf5bce6a
parent	0b55e297d5088962fe8397903041c2b1737c7cdd (diff)