summaryrefslogtreecommitdiff
path: root/cli/commands/bridge/finnegan.py
blob: 376ff7bdacbbbc0bd3c151e7a66ad6325ac0b0bf (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import click
import random
import simplejson as json
from tqdm import tqdm
import itertools

from app.thesaurus.api import Thesaurus
from app.utils.word_utils import is_oe, is_slang, is_scots, is_compound_word, fix_word

@click.command()
@click.option('-a', '--word_a', 'opt_word', required=True,
  help='Word to split')
@click.option('-l', '--length', 'opt_min_len', required=True, type=int, default=2,
  help='Minimum length')
# The help text here used to be a copy of the --length help; it now describes the option itself.
@click.option('-s', '--splits', 'opt_splits', required=True, type=int, default=2,
  help='Number of pieces to split the word into')
@click.option('-c', '--categories', 'opt_category_count', required=True, type=int, default=2,
  help='Categories to use for synonyms per word-fragment')
@click.option('-oe', '--include_oe', 'opt_include_oe', is_flag=True,
  help='Whether to include OE/archaic words')
@click.option('-sl', '--include_slang', 'opt_include_slang', is_flag=True,
  help='Whether to include slang/colloquial words')
@click.option('-sc', '--include_scots', 'opt_include_scots', is_flag=True,
  help='Whether to include Scots words')
@click.option('-comp', '--include_compound', 'opt_include_compound_words', is_flag=True,
  help='Whether to include compound words (words that contain a space or hyphen)')
@click.pass_context
def cli(ctx, opt_word, opt_min_len, opt_splits, opt_category_count, opt_include_oe, opt_include_slang, opt_include_scots, opt_include_compound_words):
  """Split a word into pieces and substitute the pieces with entries from the thesaurus"""
  # NOTE: the docstring above is what click prints as the command's --help text,
  # so implementation notes live in comments instead.

  # Arguments are passed positionally; their order must match Finneganizer.__init__.
  finneganizer = Finneganizer(opt_min_len, opt_include_oe, opt_include_slang, opt_include_scots, opt_include_compound_words, opt_category_count)

  # process_pieces counts cuts rather than pieces: N pieces need N - 1 cuts.
  new_words = finneganizer.process_pieces(opt_word, opt_splits - 1)

  # One generated word per output line.
  for word in new_words:
    print(word)

class Finneganizer:
  """Generate word variants by cutting a word into pieces and swapping pieces for thesaurus entries.

  The filtering flags decide which thesaurus entries may be used as
  replacements; `category_count` limits how many thesaurus categories are
  consulted per piece.
  """

  def __init__(self, opt_min_len, include_oe, include_slang, include_scots, include_compound_words, category_count):
    """Store the configuration and create the thesaurus client.

    opt_min_len -- minimum number of characters on each side of a cut
    include_oe -- keep entries flagged by `is_oe`
    include_slang -- keep entries flagged by `is_slang`
    include_scots -- keep entries flagged by `is_scots`
    include_compound_words -- keep entries flagged by `is_compound_word`
    category_count -- number of categories to draw synonyms from per piece;
      a value of 0 or less means "use every category found"
    """
    self.thesaurus = Thesaurus()
    self.opt_min_len = opt_min_len
    self.include_oe = include_oe
    self.include_slang = include_slang
    self.include_scots = include_scots
    self.include_compound_words = include_compound_words
    self.category_count = category_count

  def process_pieces(self, word, opt_splits):
    """Recursively find synonyms for the pieces of a word

    `opt_splits` is the number of cuts still to make. With no cuts left the
    whole `word` is looked up directly. Otherwise every cut position that
    leaves at least `opt_min_len` characters on both sides is tried: the head
    is replaced by each of its synonyms and combined with every variant of
    the tail (which is processed with one cut fewer).

    Returns a list that starts with `word` itself, followed by every
    head-synonym + tail-variant concatenation. If `word` is too short to be
    cut, the list contains only `word`.
    """
    if opt_splits == 0:
      return self.get_synonyms(word)

    # Last cut position that still leaves `opt_min_len` characters in the tail.
    end_len = len(word) - self.opt_min_len
    words = [word]

    # The upper bound is inclusive: a tail of exactly `opt_min_len` characters
    # is a valid piece (the previous `index < end_len` test skipped that cut).
    for index in range(self.opt_min_len, end_len + 1):
      head = word[:index]
      tail = word[index:]
      # Look up the head before recursing into the tail so thesaurus calls
      # (and the random category draws) happen in a stable order.
      head_synonyms = self.get_synonyms(head)
      tail_variants = self.process_pieces(tail, opt_splits - 1)
      words.extend(
        head_synonym + tail_variant
        for head_synonym, tail_variant in itertools.product(head_synonyms, tail_variants)
      )
    return words

  def get_synonyms(self, word_a):
    """Get synonyms from a random category

    Searches the thesaurus for `word_a`, picks up to `category_count`
    categories at random (all of them when `category_count` is 0 or less) and
    collects the words of those categories that pass the include_* filters.

    Returns a list whose first element is always `word_a`; when the search
    yields no categories the list contains only `word_a`. The list may
    contain duplicates if the same word appears in several categories.
    """
    categories = self.thesaurus.search(word_a)['categories']
    if not categories:
      return [word_a]

    # Shuffle a copy so the list owned by the thesaurus result is not reordered.
    categories = list(categories)
    random.shuffle(categories)
    if self.category_count > 0:
      categories = categories[:self.category_count]

    results = [word_a]
    for category in categories:
      category_result = self.thesaurus.category(category['catid'])
      for category_word in category_result['words']:
        word = fix_word(category_word['word'])
        # NOTE(review): the register checks are applied to the lowercased
        # 'years' field - presumably that field carries usage labels; confirm
        # against app.utils.word_utils.
        years = category_word['years'].lower()
        if not self.include_oe and is_oe(years):
          continue
        if not self.include_slang and is_slang(years):
          continue
        if not self.include_scots and is_scots(years):
          continue
        if not self.include_compound_words and is_compound_word(word):
          continue
        # The looked-up word is already the first result; don't repeat it.
        if word != word_a:
          results.append(word)
    return results