1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
|
var util = require('../util/util')
// bold_terms(s, terms): empty stub -- the body contains no statements, so a
// call does nothing with either argument.
// NOTE(review): this looks like a JavaScript placeholder sitting above the
// Perl sub of the same name -- confirm whether it belongs in this file.
function bold_terms (s, terms) {
}
# bold_terms($string, $terms)
#
# Strips markup from $string via $self->strip_html, then wraps every
# case-insensitive, whole-word occurrence of any search term in <b>...</b>.
#
#   $string - text to highlight
#   $terms  - array ref of search terms; undef and empty entries are ignored
#
# Returns the highlighted string. When there are no usable terms the
# stripped string is returned unchanged.
sub bold_terms
{
    my ($self, $string, $terms) = @_;

    $string = $self->strip_html($string);

    # Terms are matched literally: quotemeta keeps characters such as
    # "(" or "+" in a search term from being read as regex syntax.
    # Longest terms go first so that "new york" wins over "new".
    my @literals =
        map  { quotemeta($_) }
        sort { length($b) <=> length($a) }
        grep { defined($_) && length($_) }
        @{ $terms || [] };

    return $string if !defined $string || !@literals;

    # One pass over a single alternation. Substituting term by term would
    # let a later term (e.g. "b") match inside the <b> tags inserted for an
    # earlier one. The lookarounds behave like \b for ordinary words but
    # also work for terms that begin or end with punctuation ("c++").
    my $alternation = join '|', @literals;
    $string =~ s{(?<!\w)($alternation)(?!\w)}{<b>$1</b>}gi;

    return $string;
}
# bold_snippet($string, $terms)
#
# Convenience wrapper: asks $self->snippet for an excerpt of $string built
# around $terms, then passes that excerpt and the same $terms to
# $self->bold_terms and returns the result.
sub bold_snippet
{
    my $self = shift;
    my ($string, $terms) = @_;

    # excerpt first, highlighting second
    my $excerpt = $self->snippet($string, $terms);

    return $self->bold_terms($excerpt, $terms);
}
# snippet($string, $terms)
#
# Builds a short excerpt of $string made of the words surrounding each
# search-term hit.
#
#   $string - text to excerpt; markup is removed with $self->strip_html
#   $terms  - array ref of search terms, matched literally, whole-word and
#             case-insensitively; undef and empty entries are ignored
#
# A word counts as a hit when it contains a term and $self->is_stopword
# returns false for the matched text. Each hit contributes up to $pad words
# before it and $pad words after it. Every run of context starts with "..."
# and the snippet always ends with "...", so a string with no hits (or a
# call with no usable terms) yields just "...".
sub snippet
{
    my ($self, $string, $terms) = @_;

    # amount of context to show, in number of words on each side of a match
    my $pad = 4;

    # clean up the string we got
    $string = $self->strip_html($string);

    # Build one regex out of the search terms. quotemeta makes each term
    # literal, so a term like "foo(" or "c++" cannot break or change the
    # pattern. Empty terms are dropped because an empty alternative would
    # make every word look like a hit. The lookarounds act like \b for
    # ordinary words but also work for terms edged with punctuation.
    my @literals = map { quotemeta($_) }
                   grep { defined($_) && length($_) } @{ $terms || [] };
    my $term_re;
    if (@literals)
    {
        my $alternation = join '|', @literals;
        $term_re = qr/(?<!\w)($alternation)(?!\w)/i;
    }

    # split into words; the ' ' form skips leading whitespace, so no empty
    # first "word" ends up in the snippet
    my @words = defined $string ? split(' ', $string) : ();

    # indexes of @words already copied, so no word is added twice
    my %seen;
    # words in the eventual snippet
    my @words_matched;
    # how many more words of trailing context are still owed after a hit
    my $aggr = 0;

    for my $i (0 .. $#words)
    {
        # does this word contain a term?
        my $hit = (defined $term_re && $words[$i] =~ $term_re) ? $1 : undef;

        if (defined $hit && ! $self->is_stopword($hit))
        {
            # not in the middle of trailing context: open a new run
            push @words_matched, "..." if ! $aggr;

            # copy up to $pad preceding words plus the hit itself; the
            # window is clamped at the start of the text
            my $first = $i - $pad;
            $first = 0 if $first < 0;
            for my $idx ($first .. $i)
            {
                next if $seen{$idx}++;
                push @words_matched, $words[$idx];
            }

            # owe the next $pad words as trailing context
            $aggr = $pad;
        }
        elsif ($aggr)
        {
            # trailing context after an earlier hit
            push @words_matched, $words[$i];
            $seen{$i}++;
            $aggr--;
        }

        # keep snippets to a modest length
        last if @words_matched > 30;
    }

    # add a trailing ellipsis
    push @words_matched, "...";

    return join " ", @words_matched;
}
|