summaryrefslogtreecommitdiff
path: root/cli/commands/bridge/finnegan.py
diff options
context:
space:
mode:
authorJules Laplace <julescarbon@gmail.com>2020-05-13 14:43:32 +0200
committerJules Laplace <julescarbon@gmail.com>2020-05-13 14:43:32 +0200
commit4014fe1faca0ac38c0e40962bc14dd1124780a38 (patch)
treec1c1e81e65cd11020c4393b9e6ddbe41a73bccec /cli/commands/bridge/finnegan.py
parent3e837b0b2c2607e699a2997578a580f1c3e605b2 (diff)
add finneganizer
Diffstat (limited to 'cli/commands/bridge/finnegan.py')
-rw-r--r--cli/commands/bridge/finnegan.py88
1 files changed, 88 insertions, 0 deletions
diff --git a/cli/commands/bridge/finnegan.py b/cli/commands/bridge/finnegan.py
new file mode 100644
index 0000000..4a6242b
--- /dev/null
+++ b/cli/commands/bridge/finnegan.py
@@ -0,0 +1,88 @@
+import click
+import random
+import simplejson as json
+from tqdm import tqdm
+import itertools
+
+from app.thesaurus.api import Thesaurus
+from app.utils.word_utils import is_oe, is_slang, is_scots, is_compound_word, fix_word
+
# Command-line entry point for the "finneganizer".
#
# NOTE: click uses the function docstring below as the command's --help text,
# so it is runtime output and is deliberately kept short; the details live in
# these comments instead.
#
# Options:
#   --word_a            the word to cut into pieces (required)
#   --length            minimum number of characters in each piece
#   --splits            maximum number of pieces; `splits - 1` cuts are passed
#                       on to Finneganizer.process_pieces
#   --categories        how many thesaurus categories are sampled per piece
#   --include_*         flags that disable the corresponding word filters
@click.command()
@click.option('-a', '--word_a', 'opt_word', required=True,
    help='Word to split')
@click.option('-l', '--length', 'opt_min_len', required=True, type=int, default=2,
    help='Minimum length')
@click.option('-s', '--splits', 'opt_splits', required=True, type=int, default=2,
    help='Maximum number of pieces to split the word into')
@click.option('-c', '--categories', 'opt_category_count', required=True, type=int, default=2,
    help='Categories to use for synonyms per word-fragment')
@click.option('-oe', '--include_oe', 'opt_include_oe', is_flag=True,
    help='Whether to include OE/archaic words')
@click.option('-sl', '--include_slang', 'opt_include_slang', is_flag=True,
    help='Whether to include slang/colloquial words')
@click.option('-sc', '--include_scots', 'opt_include_scots', is_flag=True,
    help='Whether to include Scots words')
@click.option('-comp', '--include_compound', 'opt_include_compound_words', is_flag=True,
    help='Whether to include compound words (words that contain a space or hyphen)')
@click.pass_context
def cli(ctx, opt_word, opt_min_len, opt_splits, opt_category_count, opt_include_oe, opt_include_slang, opt_include_scots, opt_include_compound_words):
    """Split a word into pieces and substitute the pieces with entries from the thesaurus"""
    finneganizer = Finneganizer(opt_min_len, opt_include_oe, opt_include_slang, opt_include_scots, opt_include_compound_words, opt_category_count)
    # N pieces require N - 1 cuts, hence the decrement.
    new_words = finneganizer.process_pieces(opt_word, opt_splits - 1)
    # One generated word per line on stdout.
    for word in new_words:
        print(word)
+
class Finneganizer:
    """Generate new words by cutting a word into pieces and swapping the
    pieces for thesaurus entries.

    The instance holds the thesaurus handle plus the filtering options that
    decide which thesaurus entries may be used as replacements.
    """

    def __init__(self, opt_min_len, include_oe, include_slang, include_scots, include_compound_words, category_count):
        """Open the thesaurus and remember the options.

        Args:
            opt_min_len: Minimum number of characters in each piece of a split.
            include_oe: Keep entries flagged by ``is_oe``.
            include_slang: Keep entries flagged by ``is_slang``.
            include_scots: Keep entries flagged by ``is_scots``.
            include_compound_words: Keep entries flagged by ``is_compound_word``.
            category_count: Maximum number of thesaurus categories sampled
                for each lookup.
        """
        self.thesaurus = Thesaurus()
        self.opt_min_len = opt_min_len
        self.include_oe = include_oe
        self.include_slang = include_slang
        self.include_scots = include_scots
        self.include_compound_words = include_compound_words
        self.category_count = category_count

    def process_pieces(self, word, opt_splits):
        """Recursively find synonyms for the pieces of a word.

        Args:
            word: The word (or remaining tail of a word) to process.
            opt_splits: Number of cuts still allowed. Zero or a negative
                value means "do not cut": the word is looked up whole.

        Returns:
            When no cut is allowed, the result of ``get_synonyms(word)``.
            Otherwise a list that starts with ``word`` itself, followed by
            every concatenation of a replacement for the head with a result
            for the tail, for each cut position that leaves at least
            ``opt_min_len`` characters on both sides. Duplicates are not
            removed.
        """
        # `<= 0` rather than `== 0`: a negative count (e.g. `--splits 0` on
        # the command line) must not bypass the limit and recurse until the
        # word runs out.
        if opt_splits <= 0:
            return self.get_synonyms(word)

        words = [word]
        # The last valid cut leaves exactly `opt_min_len` characters in the
        # tail, so the upper bound is inclusive.
        last_index = len(word) - self.opt_min_len
        for index in range(self.opt_min_len, last_index + 1):
            synonyms_a = self.get_synonyms(word[:index])
            synonyms_b = self.process_pieces(word[index:], opt_splits - 1)
            words.extend(
                synonym_a + synonym_b
                for synonym_a, synonym_b in itertools.product(synonyms_a, synonyms_b)
            )
        return words

    def get_synonyms(self, word_a):
        """Get synonyms from a random selection of thesaurus categories.

        Args:
            word_a: The word (or word fragment) to look up.

        Returns:
            A list whose first element is ``word_a`` itself, followed by the
            words of up to ``category_count`` randomly chosen categories
            that pass the configured filters. If the thesaurus reports no
            categories the list contains only ``word_a``.
        """
        categories = self.thesaurus.search(word_a)['categories']
        if not categories:
            return [word_a]

        # Shuffle a copy so the list handed back by the thesaurus is left
        # untouched.
        categories = list(categories)
        random.shuffle(categories)

        results = [word_a]
        for category in categories[:self.category_count]:
            category_result = self.thesaurus.category(category['catid'])
            for category_word in category_result['words']:
                word = fix_word(category_word['word'])
                # The filters below inspect the lower-cased 'years' field of
                # the entry (presumably it carries usage labels - confirm
                # against the thesaurus data).
                years = category_word['years'].lower()
                if not self.include_oe and is_oe(years):
                    continue
                if not self.include_slang and is_slang(years):
                    continue
                if not self.include_scots and is_scots(years):
                    continue
                if not self.include_compound_words and is_compound_word(word):
                    continue
                # The looked-up word is already the first result.
                if word != word_a:
                    results.append(word)
        return results