diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2020-05-13 14:43:32 +0200 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2020-05-13 14:43:32 +0200 |
| commit | 4014fe1faca0ac38c0e40962bc14dd1124780a38 (patch) | |
| tree | c1c1e81e65cd11020c4393b9e6ddbe41a73bccec | |
| parent | 3e837b0b2c2607e699a2997578a580f1c3e605b2 (diff) | |
add finneganizer
| -rw-r--r-- | cli/app/utils/word_utils.py | 22 | ||||
| -rw-r--r-- | cli/commands/bridge/finnegan.py | 88 | ||||
| -rw-r--r-- | cli/commands/bridge/words.py | 26 |
3 files changed, 116 insertions, 20 deletions
diff --git a/cli/app/utils/word_utils.py b/cli/app/utils/word_utils.py
new file mode 100644
index 0000000..cbbcb85
--- /dev/null
+++ b/cli/app/utils/word_utils.py
@@ -0,0 +1,22 @@
+def is_oe(years):
+  return (('oe' in years and 'oe-' not in years) or 'arch' in years)
+
+def is_slang(years):
+  return 'slang' in years or 'colloq' in years
+
+def is_scots(years):
+  return 'Scots' in years
+
+def fix_word(word):
+  if '<' in word or '/' in word or ',' in word:
+    word = word.split("<")[0]
+    word = word.split(",")[0]
+    word = word.split("/")[0]
+  return word.strip()
+
+def is_compound_word(word):
+  if '-' in word:
+    return True
+  if ' ' in word:
+    return True
+  return False
diff --git a/cli/commands/bridge/finnegan.py b/cli/commands/bridge/finnegan.py
new file mode 100644
index 0000000..4a6242b
--- /dev/null
+++ b/cli/commands/bridge/finnegan.py
@@ -0,0 +1,88 @@
+import click
+import random
+import simplejson as json
+from tqdm import tqdm
+import itertools
+
+from app.thesaurus.api import Thesaurus
+from app.utils.word_utils import is_oe, is_slang, is_scots, is_compound_word, fix_word
+
+@click.command()
+@click.option('-a', '--word_a', 'opt_word', required=True,
+  help='Word to split')
+@click.option('-l', '--length', 'opt_min_len', required=True, type=int, default=2,
+  help='Minimum length')
+@click.option('-s', '--splits', 'opt_splits', required=True, type=int, default=2,
+  help='Minimum length')
+@click.option('-c', '--categories', 'opt_category_count', required=True, type=int, default=2,
+  help='Categories to use for synonyms per word-fragment')
+@click.option('-oe', '--include_oe', 'opt_include_oe', is_flag=True,
+  help='Whether to include OE/archaic words')
+@click.option('-sl', '--include_slang', 'opt_include_slang', is_flag=True,
+  help='Whether to include slang/colloquial words')
+@click.option('-sc', '--include_scots', 'opt_include_scots', is_flag=True,
+  help='Whether to include Scots words')
+@click.option('-comp', '--include_compound', 'opt_include_compound_words', is_flag=True,
+  help='Whether to include compound words (words that contain a space or hyphen)')
+@click.pass_context
+def cli(ctx, opt_word, opt_min_len, opt_splits, opt_category_count, opt_include_oe, opt_include_slang, opt_include_scots, opt_include_compound_words):
+  """Split a word into pieces and substitute the pieces with entries from the thesaurus"""
+  finneganizer = Finneganizer(opt_min_len, opt_include_oe, opt_include_slang, opt_include_scots, opt_include_compound_words, opt_category_count)
+  new_words = finneganizer.process_pieces(opt_word, opt_splits - 1)
+  for word in new_words:
+    print(word)
+
+class Finneganizer:
+  def __init__(self, opt_min_len, include_oe, include_slang, include_scots, include_compound_words, category_count):
+    self.thesaurus = Thesaurus()
+    self.opt_min_len = opt_min_len
+    self.include_oe = include_oe
+    self.include_slang = include_slang
+    self.include_scots = include_scots
+    self.include_compound_words = include_compound_words
+    self.category_count = category_count
+
+  def process_pieces(self, word, opt_splits):
+    """Recursively find synonyms for the pieces of a word"""
+    if opt_splits == 0:
+      return self.get_synonyms(word)
+
+    index = self.opt_min_len
+    end_len = len(word) - self.opt_min_len
+    words = [word]
+
+    while index < end_len:
+      word_a = word[:index]
+      word_b = word[index:]
+      synonyms_a = self.get_synonyms(word_a)
+      synonyms_b = self.process_pieces(word_b, opt_splits - 1)
+      for synonym_a, synonym_b in itertools.product(synonyms_a, synonyms_b):
+        words.append(synonym_a + synonym_b)
+      index += 1
+    return words
+
+  def get_synonyms(self, word_a):
+    """Get synonyms from a random category"""
+    categories = self.thesaurus.search(word_a)['categories']
+    if not categories or len(categories) == 0:
+      return [word_a]
+    random.shuffle(categories)
+    categories = categories[:self.category_count]
+    results = [ word_a ]
+    for category in categories:
+      catid = category['catid']
+      category_result = self.thesaurus.category(catid)
+      for category_word in category_result['words']:
+        word = fix_word(category_word['word'])
+        years = category_word['years'].lower()
+        if not self.include_oe and is_oe(years):
+          continue
+        if not self.include_slang and is_slang(years):
+          continue
+        if not self.include_scots and is_scots(years):
+          continue
+        if not self.include_compound_words and is_compound_word(word):
+          continue
+        if word != word_a:
+          results.append(word)
+    return results
diff --git a/cli/commands/bridge/words.py b/cli/commands/bridge/words.py
index 27d0bd4..65a67c0 100644
--- a/cli/commands/bridge/words.py
+++ b/cli/commands/bridge/words.py
@@ -10,6 +10,7 @@ import simplejson as json
 from tqdm import tqdm
 
 from app.thesaurus.api import Thesaurus
+from app.utils.word_utils import is_oe, is_slang, is_scots, fix_word
 
 @click.command()
 @click.option('-a', '--a', 'opt_word_a', required=True,
@@ -115,7 +116,7 @@ class TreeSolver:
     queue = []
     category_result = self.thesaurus.category(catid)
     for category_word in category_result['words']:
-      word = self.fix_word(category_word['word'])
+      word = fix_word(category_word['word'])
       years = category_word['years'].lower()
       if (catid, word,) in self.skips:
         continue
@@ -127,11 +128,11 @@ class TreeSolver:
     return queue
 
   def process_word(self, word, years, catid, tree, target):
-    if not self.include_oe and self.is_oe(years):
+    if not self.include_oe and is_oe(years):
       return None
-    if not self.include_slang and self.is_slang(years):
+    if not self.include_slang and is_slang(years):
       return None
-    if not self.include_scots and self.is_scots(years):
+    if not self.include_scots and is_scots(years):
       return None
     if word not in tree:
       tree[word] = tree[catid] + 1
@@ -190,7 +191,7 @@ class TreeSolver:
     if self.shuffle:
       random.shuffle(category_words)
     for category_word in category_words:
-      cat_word = self.fix_word(category_word['word'])
+      cat_word = fix_word(category_word['word'])
       if cat_word != word and cat_word in tree and self.can_descend(tree, cat_word, word):
         chain.append(cat_word)
         match = cat_word
@@ -248,18 +249,3 @@ class TreeSolver:
     except Exception as e:
       return False
 
-  def is_oe(self, years):
-    return (('oe' in years and 'oe-' not in years) or 'arch' in years)
-
-  def is_slang(self, years):
-    return 'slang' in years or 'colloq' in years
-
-  def is_scots(self, years):
-    return 'Scots' in years
-
-  def fix_word(self, word):
-    if '<' in word or '/' in word or ',' in word:
-      word = word.split("<")[0]
-      word = word.split(",")[0]
-      word = word.split("/")[0]
-    return word.strip()
