add finneganizer

author: Jules Laplace <julescarbon@gmail.com> 2020-05-13 14:43:32 +0200
committer: Jules Laplace <julescarbon@gmail.com> 2020-05-13 14:43:32 +0200
commit: 4014fe1faca0ac38c0e40962bc14dd1124780a38 (patch)
tree: c1c1e81e65cd11020c4393b9e6ddbe41a73bccec
parent: 3e837b0b2c2607e699a2997578a580f1c3e605b2 (diff)
3 files changed, 116 insertions, 20 deletions
diff --git a/cli/app/utils/word_utils.py b/cli/app/utils/word_utils.py
new file mode 100644
index 0000000..cbbcb85
--- /dev/null
+++ b/cli/app/utils/word_utils.py
@@ -0,0 +1,22 @@
+def is_oe(years):
+  return 'arch' in years or ('oe' in years and 'oe-' not in years)  # 'arch' marker, or 'oe' present without 'oe-'
+
+def is_slang(years):
+  return any(marker in years for marker in ('slang', 'colloq'))  # either marker is enough
+
+def is_scots(years):
+  return 'scots' in years.lower()  # callers in this patch lowercase `years` first, so a capitalised 'Scots' test could never match
+
+def fix_word(word):
+  # Keep only the text before the first '<', ',' or '/', then trim whitespace.
+  head = word
+  for separator in ('<', ',', '/'):
+    head = head.split(separator)[0]
+  return head.strip()
+
+def is_compound_word(word):
+  # A word is treated as compound when it contains a hyphen or a space.
+  for joiner in ('-', ' '):
+    if joiner in word:
+      return True
+  return False
diff --git a/cli/commands/bridge/finnegan.py b/cli/commands/bridge/finnegan.py
new file mode 100644
index 0000000..4a6242b
--- /dev/null
+++ b/cli/commands/bridge/finnegan.py
@@ -0,0 +1,88 @@
+import click
+import random
+import simplejson as json
+from tqdm import tqdm
+import itertools
+
+from app.thesaurus.api import Thesaurus
+from app.utils.word_utils import is_oe, is_slang, is_scots, is_compound_word, fix_word
+
+@click.command()
+@click.option('-a', '--word_a', 'opt_word', required=True,
+  help='Word to split')
+@click.option('-l', '--length', 'opt_min_len', required=True, type=int, default=2,
+  help='Minimum length')
+@click.option('-s', '--splits', 'opt_splits', required=True, type=int, default=2,
+  help='Number of pieces to split the word into')
+@click.option('-c', '--categories', 'opt_category_count', required=True, type=int, default=2,
+  help='Categories to use for synonyms per word-fragment')
+@click.option('-oe', '--include_oe', 'opt_include_oe', is_flag=True,
+  help='Whether to include OE/archaic words')
+@click.option('-sl', '--include_slang', 'opt_include_slang', is_flag=True,
+  help='Whether to include slang/colloquial words')
+@click.option('-sc', '--include_scots', 'opt_include_scots', is_flag=True,
+  help='Whether to include Scots words')
+@click.option('-comp', '--include_compound', 'opt_include_compound_words', is_flag=True,
+  help='Whether to include compound words (words that contain a space or hyphen)')
+@click.pass_context
+def cli(ctx, opt_word, opt_min_len, opt_splits, opt_category_count, opt_include_oe, opt_include_slang, opt_include_scots, opt_include_compound_words):
+  """Split a word into pieces, substitute the pieces with thesaurus entries, and print each result on its own line"""
+  finneganizer = Finneganizer(opt_min_len, opt_include_oe, opt_include_slang, opt_include_scots, opt_include_compound_words, opt_category_count)
+  new_words = finneganizer.process_pieces(opt_word, opt_splits - 1)  # N pieces need N - 1 cuts
+  for word in new_words:
+    print(word)
+
+class Finneganizer:
+  """Rebuild a word by replacing its pieces with thesaurus synonyms."""
+  def __init__(self, opt_min_len, include_oe, include_slang, include_scots, include_compound_words, category_count):
+    self.thesaurus = Thesaurus()
+    self.opt_min_len = opt_min_len  # shortest piece a cut may leave on either side
+    self.include_oe = include_oe
+    self.include_slang = include_slang
+    self.include_scots = include_scots
+    self.include_compound_words = include_compound_words
+    self.category_count = category_count  # categories consulted per lookup
+
+  def process_pieces(self, word, opt_splits):
+    """Recursively find synonyms for the pieces of a word.
+
+    With `opt_splits` cuts left to make, returns `word` followed by every head-synonym + tail-result combination; at 0 cuts it returns the word's own synonyms.
+    """
+    if opt_splits == 0:
+      return self.get_synonyms(word)
+    words = [word]
+    # Upper bound is inclusive so the tail, like the head, may be exactly opt_min_len long.
+    for index in range(self.opt_min_len, len(word) - self.opt_min_len + 1):
+      synonyms_a = self.get_synonyms(word[:index])
+      synonyms_b = self.process_pieces(word[index:], opt_splits - 1)
+      for synonym_a, synonym_b in itertools.product(synonyms_a, synonyms_b):
+        words.append(synonym_a + synonym_b)
+    return words
+
+  def get_synonyms(self, word_a):
+    """Get synonyms from up to `category_count` randomly chosen categories.
+
+    Returns a list that starts with `word_a`, followed by each category word that passes the include_* filters and differs from `word_a`.
+    """
+    categories = self.thesaurus.search(word_a)['categories']
+    if not categories:
+      return [word_a]
+    random.shuffle(categories)
+    results = [word_a]
+    for category in categories[:self.category_count]:
+      category_result = self.thesaurus.category(category['catid'])
+      for category_word in category_result['words']:
+        word = fix_word(category_word['word'])
+        years = category_word['years'].lower()
+        if not self.include_oe and is_oe(years):
+          continue
+        if not self.include_slang and is_slang(years):
+          continue
+        if not self.include_scots and is_scots(years):
+          continue
+        if not self.include_compound_words and is_compound_word(word):
+          continue
+        if word != word_a:
+          results.append(word)
+    # Return only once every selected category has been visited.
+    return results
diff --git a/cli/commands/bridge/words.py b/cli/commands/bridge/words.py
index 27d0bd4..65a67c0 100644
--- a/cli/commands/bridge/words.py
+++ b/cli/commands/bridge/words.py
@@ -10,6 +10,7 @@ import simplejson as json
 from tqdm import tqdm
 
 from app.thesaurus.api import Thesaurus
+from app.utils.word_utils import is_oe, is_slang, is_scots, fix_word
 
 @click.command()
 @click.option('-a', '--a', 'opt_word_a', required=True,
@@ -115,7 +116,7 @@ class TreeSolver:
     queue = []
     category_result = self.thesaurus.category(catid)
     for category_word in category_result['words']:
-      word = self.fix_word(category_word['word'])
+      word = fix_word(category_word['word'])
       years = category_word['years'].lower()
       if (catid, word,) in self.skips:
         continue
@@ -127,11 +128,11 @@ class TreeSolver:
     return queue
 
   def process_word(self, word, years, catid, tree, target):
-    if not self.include_oe and self.is_oe(years):
+    if not self.include_oe and is_oe(years):
       return None
-    if not self.include_slang and self.is_slang(years):
+    if not self.include_slang and is_slang(years):
       return None
-    if not self.include_scots and self.is_scots(years):
+    if not self.include_scots and is_scots(years):
       return None
     if word not in tree:
       tree[word] = tree[catid] + 1
@@ -190,7 +191,7 @@ class TreeSolver:
         if self.shuffle:
           random.shuffle(category_words)
         for category_word in category_words:
-          cat_word = self.fix_word(category_word['word'])
+          cat_word = fix_word(category_word['word'])
           if cat_word != word and cat_word in tree and self.can_descend(tree, cat_word, word):
             chain.append(cat_word)
             match = cat_word
@@ -248,18 +249,3 @@ class TreeSolver:
     except Exception as e:
       return False
 
-  def is_oe(self, years):
-    return (('oe' in years and 'oe-' not in years) or 'arch' in years)
-
-  def is_slang(self, years):
-    return 'slang' in years or 'colloq' in years
-
-  def is_scots(self, years):
-    return 'Scots' in years
-
-  def fix_word(self, word):
-    if '<' in word or '/' in word or ',' in word:
-      word = word.split("<")[0]
-      word = word.split(",")[0]
-      word = word.split("/")[0]
-    return word.strip()
author	Jules Laplace <julescarbon@gmail.com>	2020-05-13 14:43:32 +0200
committer	Jules Laplace <julescarbon@gmail.com>	2020-05-13 14:43:32 +0200
commit	4014fe1faca0ac38c0e40962bc14dd1124780a38 (patch)
tree	c1c1e81e65cd11020c4393b9e6ddbe41a73bccec
parent	3e837b0b2c2607e699a2997578a580f1c3e605b2 (diff)