summary | refs | log | tree | commit | diff
path: root/lib/search/snippet.js
diff options
context:
space:
mode:
Diffstat (limited to 'lib/search/snippet.js')
-rw-r--r-- lib/search/snippet.js 103
1 files changed, 103 insertions, 0 deletions
diff --git a/lib/search/snippet.js b/lib/search/snippet.js
new file mode 100644
index 0000000..de71911
--- /dev/null
+++ b/lib/search/snippet.js
@@ -0,0 +1,103 @@
+var util = require('../util/util')
+
+// Placeholder for a JavaScript bold_terms(s, terms): the body is empty, so
+// calling it does nothing with its arguments and returns undefined.
+function bold_terms (s, terms) {
+
+}
+# bold_terms($string, $terms)
+#
+# Passes $string through $self->strip_html, then wraps every whole-word,
+# case-insensitive occurrence of a search term in <b>...</b>.
+#
+#   $string - text to highlight
+#   $terms  - array ref of search terms; each is matched as literal text
+#
+# Returns the highlighted string.  Undefined or empty terms are ignored; if
+# no usable term remains the stripped string is returned unchanged.
+sub bold_terms {
+    my ($self, $string, $terms) = @_;
+    $string = $self->strip_html($string);
+
+    # Escape each term so regex metacharacters in a search term ("c++", "(",
+    # "a.b") can neither break the pattern nor change what it matches.
+    # Longest terms go first so the alternation prefers the longest match.
+    my @literals = map  { quotemeta($_) }
+                   sort { length($b) <=> length($a) }
+                   grep { defined($_) && length($_) } @{ $terms || [] };
+
+    # nothing to highlight
+    return $string unless @literals;
+
+    # Substitute in a single pass over one alternation: the <b> tags that are
+    # inserted are never rescanned, so a term such as "b" cannot match inside
+    # markup added for another term.  The lookarounds behave like \b for
+    # ordinary words but also work when a term starts or ends with a
+    # non-word character.
+    my $alternation = join '|', @literals;
+    $string =~ s{(?<!\w)($alternation)(?!\w)}{<b>$1</b>}gi;
+
+    return $string;
+}
+# bold_snippet($string, $terms)
+#
+# Convenience wrapper: builds an excerpt of $string with snippet() and then
+# highlights the search terms in that excerpt with bold_terms().
+# Returns whatever bold_terms() returns.
+sub bold_snippet {
+    my $self = shift;
+    my ($text, $search_terms) = @_;
+
+    # first cut the text down to the context around the matches ...
+    my $excerpt = $self->snippet($text, $search_terms);
+
+    # ... then mark up the terms inside that excerpt
+    return $self->bold_terms($excerpt, $search_terms);
+}
+# snippet($string, $terms)
+#
+# Builds a short excerpt of $string: every word that contains a search term
+# (and whose matched text is not a stopword according to
+# $self->is_stopword) is kept together with up to $pad words of context
+# before and after it.  Runs of skipped words are shown as "...".
+#
+#   $string - text to excerpt; it is passed through $self->strip_html first
+#   $terms  - array ref of search terms; each is matched as literal text,
+#             case-insensitively, on word boundaries
+#
+# Returns the excerpt as a single space-joined string.  It always starts and
+# ends with "..."; when nothing matches the result is just "...".
+sub snippet {
+    my ($self, $string, $terms) = @_;
+
+    # clean up the string we got
+    $string = $self->strip_html($string);
+
+    # Escape each term so regex metacharacters in a search term ("c++", "(",
+    # "a.b") can neither break the pattern nor change what it matches.
+    # Longest terms go first so the alternation prefers the longest match.
+    my @literals = map  { quotemeta($_) }
+                   sort { length($b) <=> length($a) }
+                   grep { defined($_) && length($_) } @{ $terms || [] };
+
+    my $match_re;
+    if (@literals) {
+        # The lookarounds behave like \b for ordinary words but also work
+        # when a term starts or ends with a non-word character.
+        my $alternation = join '|', @literals;
+        $match_re = qr/(?<!\w)($alternation)(?!\w)/i;
+    }
+    else {
+        # No usable terms: keep the previous fallback, where the empty
+        # pattern matched any word containing a word character (so, unless
+        # is_stopword('') is true, the excerpt is the start of the text).
+        $match_re = qr/\b()\b/;
+    }
+
+    # split the text into words; the ' ' form also skips leading whitespace,
+    # so there is never an empty first "word"
+    my @words = split ' ', $string;
+
+    # words (and ellipses) that make up the eventual snippet
+    my @words_matched;
+
+    # index in @words of the last word copied so far; undef until the first
+    my $last_idx;
+
+    # number of words still to copy as trailing context after a match
+    my $aggr = 0;
+
+    # amount of context to show, in number of words surrounding a match
+    my $pad = 4;
+
+    for my $i (0 .. $#words) {
+        # does this word contain a (non-stopword) match?
+        if ($words[$i] =~ $match_re && !$self->is_stopword($1)) {
+            # leading context starts $pad words back, clamped to the start
+            my $from = $i - $pad;
+            $from = 0 if $from < 0;
+
+            if (defined $last_idx && $from <= $last_idx + 1) {
+                # The context touches or overlaps words already copied:
+                # continue right after them.  No ellipsis, because nothing
+                # was skipped, and no word is copied twice.
+                $from = $last_idx + 1;
+            }
+            else {
+                # words were skipped (or this is the first match)
+                push @words_matched, "...";
+            }
+
+            # copy the leading context and the matching word itself
+            push @words_matched, @words[$from .. $i];
+            $last_idx = $i;
+
+            # enter aggregate mode -- add the next $pad words
+            $aggr = $pad;
+        }
+        elsif ($aggr) {
+            # trailing context for the previous match
+            push @words_matched, $words[$i];
+            $last_idx = $i;
+            $aggr--;
+        }
+
+        # Keep snippets to a modest length.  The check runs after copying,
+        # so the list can end slightly above 30 entries.
+        last if scalar @words_matched > 30;
+    }
+
+    # add a trailing ellipsis
+    push @words_matched, "...";
+
+    # create the snippet from the saved context words
+    return join " ", @words_matched;
+}
\ No newline at end of file