summary | refs | log | tree | commit | diff
path: root/cli
diff options
context:
space:
mode:
author: Jules Laplace <julescarbon@gmail.com> 2020-04-02 15:31:46 +0200
committer: Jules Laplace <julescarbon@gmail.com> 2020-04-02 15:31:46 +0200
commit: c14810fe9f663d46b5477088d06047fea66d1524 (patch)
tree: 72eebe1589b71147f003ebb55c66a20eaf5bce6a /cli
parent: 0b55e297d5088962fe8397903041c2b1737c7cdd (diff)
ability to skip words if you dont like the connection
Diffstat (limited to 'cli')
-rw-r--r-- cli/app/settings/app_cfg.py 1
-rw-r--r-- cli/app/thesaurus/api.py 39
-rw-r--r-- cli/commands/bridge/words.py 183
3 files changed, 178 insertions, 45 deletions
diff --git a/cli/app/settings/app_cfg.py b/cli/app/settings/app_cfg.py
index 09f557b..ee1be51 100644
--- a/cli/app/settings/app_cfg.py
+++ b/cli/app/settings/app_cfg.py
@@ -10,6 +10,7 @@ from app.settings import types
CLICK_GROUPS = {
'api': 'commands/api',
+ 'bridge': 'commands/bridge',
}
# -----------------------------------------------------------------------------
diff --git a/cli/app/thesaurus/api.py b/cli/app/thesaurus/api.py
index 98e0210..467d5fb 100644
--- a/cli/app/thesaurus/api.py
+++ b/cli/app/thesaurus/api.py
@@ -1,4 +1,6 @@
import os
+import re
+import time
import requests
from hashlib import sha256
@@ -13,10 +15,18 @@ class Thesaurus:
sha = sha256(word)
hash_path = os.path.join(base_path, sha[0:2])
os.makedirs(hash_path, exist_ok=True)
- path = os.path.join(hash_path, word + '.json')
+ clean_word = re.sub('[^0-9a-zA-Z]+', '*', word)
+ path = os.path.join(hash_path, clean_word + '.json')
if os.path.exists(path):
return read_json(path)
- data = api_fn(word)
+ data = None
+ while data is None:
+ try:
+ data = api_fn(word)
+ except Exception as e:
+ print("Got HTTP error, sleeping for 5 seconds")
+ time.sleep(5)
+ pass
write_json(path, data)
return data
@@ -34,6 +44,11 @@ class ThesaurusAPI:
}
def search(self, word):
+ word = word.split('<')[0]
+ word = word.split('/')[0]
+ word = word.replace('(', '').replace(')', '')
+ if len(word) < 1:
+ return { 'word': word, 'categories': [] }
query = {
'qsearch': word,
}
@@ -42,8 +57,18 @@ class ThesaurusAPI:
return []
data = resp.text
data = data.split('<div id="resultsTimelineData">')[1].split('</div>')[0]
- # print(data)
- rows = json.loads(data)
+ data = data.replace('<span class="oesc">', '')
+ data = data.replace('</span>', '')
+ try:
+ rows = json.loads(data)
+ except Exception as e:
+ print(f"Error loading JSON for {word}")
+ print(data)
+ # raise e
+ return {
+ 'word': word,
+ 'categories': [],
+ }
cats = []
for row in rows:
cat, years = row['popup'].split(']: ')
@@ -72,8 +97,10 @@ class ThesaurusAPI:
raw_words = raw.split('"><b>')[1:]
words = []
for word in raw_words:
- word, rest = word.split('</b>')
- years = word.split(' <span')[0].strip()
+ word_partz = word.split('</b>')
+ word = word_partz[0]
+ years = word_partz[1].split(' <span')[0].strip()
+ years = years.replace('\u2013', '-')
words.append({
'word': word,
'years': years,
diff --git a/cli/commands/bridge/words.py b/cli/commands/bridge/words.py
index 5ebb078..d695bc9 100644
--- a/cli/commands/bridge/words.py
+++ b/cli/commands/bridge/words.py
@@ -3,7 +3,9 @@ Find connections between two words
"""
import click
+import random
import simplejson as json
+from tqdm import tqdm
from app.thesaurus.api import Thesaurus
@@ -12,75 +14,178 @@ from app.thesaurus.api import Thesaurus
help='Starting word')
@click.option('-b', '--b', 'opt_word_b', required=True,
help='Ending word')
+@click.option('-oe', '--include_oe', 'opt_include_oe', is_flag=True,
+ help='Whether to include OE/archaic words')
+@click.option('-sl', '--include_slang', 'opt_include_slang', is_flag=True,
+ help='Whether to include slang/colloquial words')
+@click.option('-w', '--words_per_step', 'opt_words_per_step', default=20,
+ help='Number of words to check per step')
+@click.option('-c', '--categories_per_word', 'opt_categories_per_word', default=3,
+ help='Number of categories to check per word')
@click.pass_context
-def cli(ctx, opt_word_a, opt_word_b):
+def cli(ctx, opt_word_a, opt_word_b, opt_include_oe, opt_include_slang, opt_words_per_step, opt_categories_per_word):
"""Find connections between two words
"""
thesaurus = Thesaurus()
print(f"Starting word: {opt_word_a}")
print(f"Ending word: {opt_word_b}")
visited = set()
- results_a = thesaurus.search(opt_word_a)
- results_b = thesaurus.search(opt_word_b)
- # use sets
- # make set of results_a
- # find overlap with results_b
- # if there's no match...
- # search for first word in results_a
- # loop over results...
- # print(json.dumps(results, indent=2))
- dist = 0
- marked = {}
+ step = 0
+ max_dist = 0
+ marked = { opt_word_a: 0 }
queue = [opt_word_a]
+ newqueue = []
+ skip = {}
found = False
# First compute distance to each node to find a path
while len(queue):
- dist = dist + 1
- print(f"Iteration: distance {dist}, {len(queue)} items in queue")
- newqueue = []
- for word_q in queue:
+ step = step + 1
+ print(f"Iteration step {step}, distance {max_dist}, {len(queue) + len(newqueue)} items in queue")
+ print(f"Words: {', '.join(queue[:7])} ...")
+ if step > 1:
+ print_chain(thesaurus, opt_word_a, queue, marked, skip, prompt_to_remove=False)
+ for word_q in tqdm(queue):
word_result = thesaurus.search(word_q)
- for cat in word_result['categories']:
+ # print(json.dumps(word_result, indent=2))
+ categories = word_result['categories']
+ if step > 1 and len(categories) > opt_categories_per_word:
+ categories = categories[:opt_categories_per_word]
+ for cat in categories:
catid = cat['catid']
if catid in marked:
continue
- marked[catid] = dist
+ marked[catid] = marked[word_q] + 1
category_result = thesaurus.category(catid)
+ # print(json.dumps(category_result, indent=2))
for word_c in category_result['words']:
word_n = word_c['word']
+ years = word_c['years'].lower()
+ if not opt_include_oe and (('oe' in years and 'oe-' not in years) or 'arch' in years):
+ continue
+ if not opt_include_slang and 'slang' in years or 'colloq' in years or 'Scots' in years:
+ continue
+ if '<' in word_n or '/' in word_n or ',' in word_n:
+ word_n = word_n.split("<")[0]
+ word_n = word_n.split(",")[0]
+ word_n = word_n.split("/")[0].strip()
if word_n in marked:
continue
- marked[word_n] = dist
+ marked[word_n] = marked[catid] + 1
+ max_dist = max(max_dist, marked[word_n])
if word_n == opt_word_b:
thesaurus.search(word_n)
- found = True
- break
+ # print(queue)
+ print_chain(thesaurus, opt_word_a, [opt_word_b], marked, skip, prompt_to_remove=True)
newqueue.append(word_n)
- queue = newqueue
-
- if not Found:
- print(f"No path found, distance of {dist} reached, {len(marked)} nodes checked")
+ if step > 1 and len(newqueue) > opt_words_per_step:
+ random.shuffle(newqueue)
+ queue = newqueue[:opt_words_per_step]
+ newqueue = newqueue[opt_words_per_step:]
+ else:
+ queue = [] + newqueue
+ if not found:
+ print(f"No path found, step {step} reached, {len(marked)} nodes checked")
return
- # Then follow the chain of shortest distance to follow the path
- word_n = opt_word_b
+def print_chain(thesaurus, opt_word_a, opt_words_b, marked, skip, prompt_to_remove=False):
+ """Follow the chain of shortest distance from the end back to the start"""
+ # print(opt_word_a)
+ word_n = opt_words_b[0]
+ dist = marked[word_n]
+ tries = 0
+ depth_tries = 0
+ chain = []
+ skip_here = []
+ cat_reverse = {}
while word_n != opt_word_a:
- dist = 999999
- next_catid = ""
+ if tries > len(opt_words_b):
+ return
next_word = ""
+ break_loop = False
word_result = thesaurus.search(word_n)
- print(f"-> {word_result['word']}")
- for cat in word_result['categories']
+ chain.append(word_result['word'])
+ # print(word_result['word'])
+ for cat in word_result['categories']:
catid = cat['catid']
+ if (word_n in skip and catid in skip[word_n]) or catid in skip_here:
+ continue
if catid in marked and marked[catid] < dist:
dist = marked[catid]
- next_catid = catid
- cat_result = thesaurus.category(catid)
- print(f"-> {cat_result['category']}")
- for word_c in category_result:
- word_n = word_c['word']
- if word_n in marked and marked[word_n] < dist:
- next_word = word_n
+ # print(f"{dist}: {catid}")
+ cat_result = thesaurus.category(catid)
+ for word_c in cat_result['words']:
+ word_m = word_c['word']
+ if '<' in word_m or '/' in word_m or ',' in word_m:
+ word_m = word_m.split("<")[0]
+ word_n = word_n.split(",")[0]
+ word_m = word_m.split("/")[0].strip()
+ # print(word_m)
+ if (catid in skip and word_m in skip[catid]) or word_m in skip_here:
+ continue
+ if word_m == opt_word_a or (word_m in marked and marked[word_m] < dist):
+ dist = marked[word_m]
+ # print(f"{dist}: {word_m}")
+ next_word = word_m
+ break_loop = True
+ break
+ if break_loop:
+ cat_name = cat_result['category']
+ cat_reverse[cat_name] = catid
+ chain.append(cat_name)
+ break
+ if next_word == '':
+ if depth_tries < 100 and len(chain) > 2:
+ to_skip = chain[-1]
+ if to_skip in cat_reverse:
+ to_skip = cat_reverse[to_skip]
+ skip_here.append(to_skip)
+ to_skip = chain[-2]
+ if to_skip in cat_reverse:
+ to_skip = cat_reverse[to_skip]
+ skip_here.append(to_skip)
+ chain.pop()
+ chain.pop()
+ word_n = chain[-1]
+ depth_tries += 1
+ elif depth_tries >= 100:
+ tries += 1
+ if tries >= len(opt_words_b):
+ return
+ word_n = opt_words_b[tries]
+ chain = []
+ dist = marked[word_n]
+ depth_tries = 0
+ continue
word_n = next_word
- print(f"-> {word_n}")
+ chain.append(opt_word_a)
+ chain = list(reversed(chain))
+ for i, word in enumerate(chain):
+ if (i % 2) == 0:
+ print(f"{i+1} -> {word}")
+ else:
+ print(f"{i+1} => {word}")
+ if prompt_to_remove:
+ print("If you don't like this path, enter the IDs of words to remove separated by spaces, or Ctrl-C to exit.")
+ ids = input("Enter numbers > ")
+ ids = ids.split(" ")
+ for id in ids:
+ if len(id):
+ try:
+ id = int(id)
+ id -= 1
+ word_a = chain[id]
+ print(f"Removing {word_a}")
+ if word_a in cat_reverse:
+ word_a = cat_reverse[word_a]
+ word_b = chain[id-1]
+ print(f"Connected upward to {word_b}")
+ if word_b in cat_reverse:
+ word_b = cat_reverse[word_b]
+ if word_a in skip:
+ skip[word_a].append(word_b)
+ else:
+ skip[word_a] = [word_b]
+ except Exception as e:
+ continue
+ print(skip)