summaryrefslogtreecommitdiff
path: root/bucky2/lib/Bucky/Search.pm
diff options
context:
space:
mode:
Diffstat (limited to 'bucky2/lib/Bucky/Search.pm')
-rw-r--r--bucky2/lib/Bucky/Search.pm413
1 files changed, 413 insertions, 0 deletions
diff --git a/bucky2/lib/Bucky/Search.pm b/bucky2/lib/Bucky/Search.pm
new file mode 100644
index 0000000..43c23f7
--- /dev/null
+++ b/bucky2/lib/Bucky/Search.pm
@@ -0,0 +1,413 @@
+package Bucky::Search;
+
+use base 'Bucky';
+
+use Data::Dumper;
+use DB_File;
+
+sub index
+ {
+ my ($self, $index) = @_;
+ $self->{'_index'} = $index || $self->index_read;
+ return $self->{'_index'};
+ }
+sub index_read
+ {
+ my ($self) = @_;
+ my %index;
+ tie %index, "DB_File", $self->index_filename, O_RDONLY, 0666, $DB_HASH ;
+ return \%index;
+ }
+sub index_write
+ {
+ my ($self) = @_;
+ my %index;
+ tie %index, "DB_File", $self->index_filename, O_CREAT|O_RDWR, 0666, $DB_HASH ;
+ return \%index;
+ }
+sub index_close
+ {
+ my ($self, $index) = @_;
+ $index ||= $self->index;
+ untie %$index;
+ }
+sub index_filename
+ { "gross.db" }
+sub auto_index
+ {
+ my ($self, $auto_index) = @_;
+ $self->{'_auto_index'} = $auto_index || $self->auto_index_read;
+ return $self->{'_auto_index'};
+ }
+sub auto_index_read
+ {
+ my ($self) = @_;
+ my %auto_index;
+ tie %auto_index, "DB_File", $self->auto_index_filename, O_RDONLY, 0666, $DB_HASH ;
+ return \%auto_index;
+ }
+sub auto_index_write
+ {
+ my ($self) = @_;
+ my %auto_index;
+ tie %auto_index, "DB_File", $self->auto_index_filename, O_CREAT|O_RDWR, 0666, $DB_HASH ;
+ return \%auto_index;
+ }
+sub auto_index_close
+ {
+ my ($self, $auto_index) = @_;
+ $auto_index ||= $self->auto_index;
+ untie %$auto_index;
+ }
+sub auto_index_filename
+ { "auto.db" }
+sub lexicon_store
+ {
+ my ($self, $lexicon) = @_;
+ my $index = $self->index_write;
+ foreach my $term (keys %$lexicon)
+ {
+ next if $self->is_stopword($term);
+ my $serialized = $self->serialize_matches($lexicon->{$term});
+ next unless $serialized;
+ $index->{$term} = $serialized;
+ }
+ $self->index_close($index);
+ }
+sub unserialize_matches
+ {
+ my ($self, $serialized_string) = @_;
+ my @serialized_matches = split ",", $serialized_string;
+ my @matches;
+ foreach my $serialized_match (@serialized_matches)
+ {
+ my ($thread, $comment, $file, $strength) = split " ", $serialized_match;
+ my $match = {};
+ $match->{'thread'} = $thread;
+ $match->{'comment'} = $comment;
+ $match->{'file'} = $file;
+ $match->{'strength'} = $strength;
+ push @matches, $match;
+ }
+ return \@matches;
+ }
+sub serialize_matches
+ {
+ my ($self, $matches) = @_;
+ my @serialized_matches;
+ foreach my $match (values %$matches)
+ {
+ next unless $match && ref($match) eq "HASH";
+ my $string = join " ",
+ $match->{'thread'},
+ $match->{'comment'},
+ $match->{'file'},
+ $match->{'strength'};
+ next unless $string;
+ push @serialized_matches, $string if $string;
+ }
+ return undef unless scalar @serialized_matches;
+ return join ",", @serialized_matches;
+ }
+my $STOPWORDS = {( map { lc $_, 1 } qw(
+a about above across adj after again against all almost alone along also
+although always am among an and another any anybody anyone anything anywhere
+apart are around as aside at away be because been before behind being below
+besides between beyond both but by can cannot could deep did do does doing done
+down downwards during each either else enough etc even ever every everybody
+everyone except far few for forth from get gets got had hardly has have having
+her here herself him himself his how however i if in indeed instead into inward
+is it its itself just kept many maybe might mine more most mostly much must
+myself near neither next no nobody none nor not nothing nowhere of off often on
+only onto or other others ought our ours out outside over own p per please plus
+pp quite rather really said seem self selves several shall she should since so
+some somebody somewhat still such than that the their theirs them themselves
+then there therefore these they this thorough thoroughly those through thus to
+together too toward towards under until up upon v very was well were what
+whatever when whenever where whether which while who whom whose will with
+within without would yet young your yourself s ) )};
+sub is_stopword
+ {
+ my ($self, $term) = @_;
+ return exists $STOPWORDS->{lc $term};
+ }
+sub autocomplete
+ {
+ my ($self, $query) = @_;
+ return unless $query;
+ my $terms = parse_terms($query);
+ my $last_term = pop @$terms;
+
+ my $auto_index = $self->auto_index;
+ my $guess_term = $auto_index->{$last_term};
+ my $guess_full = join " ", @$terms, $guess_term;
+ my $guess_tail = $guess_term;
+ $guess_tail =~ s/^$last_term//;
+
+ my $guess = {};
+ $guess->{'full'} = $guess_full;
+ $guess->{'tail'} = $guess_tail;
+ $guess->{'term'} = $guess_term;
+ return $guess;
+ }
+sub search_light
+ {
+ my ($self, $query, $start, $limit) = @_;
+ return unless $query;
+ $start ||= 0;
+ $limit ||= 10;
+ my $scores = {};
+ my $terms = parse_terms($query);
+ my $index = $self->index;
+ foreach my $term (@$terms)
+ {
+ next if $self->is_stopword($term);
+ next unless my $serial = $index->{$term};
+ my $results = $self->unserialize_matches($serial);
+ foreach my $result (@$results)
+ {
+ my $thread = $result->{'thread'};
+ $scores->{$thread} ||= {};
+ $scores->{$thread}->{thread} ||= $result->{'thread'};
+ $scores->{$thread}->{file} ||= $result->{'file'};
+ $scores->{$thread}->{strength} += $result->{'strength'};
+ $scores->{$thread}->{count}++;
+ }
+ }
+ my $total = scalar keys %$scores;
+ my $i = 0;
+ my $to_display = $limit;
+ my $threads = {};
+ my $comments_to_get = [];
+ my $files_to_get = [];
+ foreach my $match (sort { $b->{count} <=> $a->{count} || $b->{strength} * $b->{count} <=> $a->{strength} * $a->{count} } values %$scores )
+ {
+ next if $i++ < $start;
+ my $thread = $self->thread( $match->{'thread'} );
+ next unless $thread;
+ # next if $thread->{'private'};
+ last if $to_display-- == 0;
+ push @$results, $match;
+ push @$comments_to_get, $match->{'comment'} if $match->{'comment'};
+ if ( $match->{'file'} )
+ { push @$files_to_get, $match->{'file'}; }
+ if ( $thread->{'flagged'} )
+ { push @$files_to_get, $thread->{'flagged'}; }
+ $threads->{ $thread->{'id'} } = $thread;
+ }
+ my $files = $self->files_by_id($files_to_get);
+ # $self->log_query($query, $total);
+ return
+ {
+ start => $start + $limit,
+ limit => $limit,
+ total => $total,
+ results => $results,
+ threads => $threads,
+ files => $files,
+ terms => $terms,
+ };
+ }
+sub search
+ {
+ my ($self, $query, $start, $limit) = @_;
+ return unless $query;
+ $start ||= 0;
+ $limit ||= 10;
+ my $scores = {};
+ my $terms = parse_terms($query);
+ my $index = $self->index;
+ foreach my $term (@$terms)
+ {
+ next if $self->is_stopword($term);
+ next unless my $serial = $index->{$term};
+ my $results = $self->unserialize_matches($serial);
+ foreach my $result (@$results)
+ {
+ my $thread = $result->{'thread'};
+ $scores->{$thread} ||= {};
+ $scores->{$thread}->{thread} ||= $result->{'thread'};
+ $scores->{$thread}->{comment} ||= $result->{'comment'};
+ $scores->{$thread}->{file} ||= $result->{'file'};
+ $scores->{$thread}->{strength} += $result->{'strength'};
+ $scores->{$thread}->{count}++;
+ }
+ }
+ my $total = scalar keys %$scores;
+ my $i = 0;
+ my $to_display = $limit;
+ my $threads = {};
+ my $comments_to_get = [];
+ my $files_to_get = [];
+ foreach my $match (sort { $b->{count} <=> $a->{count} || $b->{strength} * $b->{count} <=> $a->{strength} * $a->{count} } values %$scores )
+ {
+ next if $i++ < $start;
+ my $thread = $self->thread( $match->{'thread'} );
+ next unless $thread;
+ next if $thread->{'private'};
+ last if $to_display-- == 0;
+ push @$results, $match;
+ push @$comments_to_get, $match->{'comment'} if $match->{'comment'};
+ if ( $match->{'file'} )
+ { push @$files_to_get, $match->{'file'}; }
+ if ( $thread->{'flagged'} )
+ { push @$files_to_get, $thread->{'flagged'}; }
+ $threads->{ $thread->{'id'} } = $thread;
+ }
+ my $files = $self->files_by_id($files_to_get);
+ my $comments = $self->comments_by_id($comments_to_get);
+ $self->log_query($query, $total);
+ return
+ {
+ start => $start + $limit,
+ limit => $limit,
+ total => $total,
+ results => $results,
+ threads => $threads,
+ comments => $comments,
+ files => $files,
+ terms => $terms,
+ };
+ }
+sub score_display
+ {
+ my ($obj) = @_;
+ return scalar(keys %{$obj->{terms}}) . "x" . $obj->{count};
+ }
+sub display_object
+ {
+ my ($obj) = @_;
+ my ($type, $id) = split ":", $obj;
+ my $thread = $bucky->thread($id);
+ my $title = $thread ? $thread->_title : "* * *";
+ return $type . " " . $id . "\t" . $title;
+ }
+sub bold_terms
+ {
+ my ($self, $string, $terms) = @_;
+ $string = $self->strip_html($string);
+ foreach my $term (@$terms)
+ {
+ $string =~ s/\b($term)\b/<b>$1<\/b>/gi;
+ }
+ return $string;
+ }
+sub bold_snippet
+ {
+ my ($self, $string, $terms) = @_;
+ my $snippet = $self->snippet($string, $terms);
+ return $self->bold_terms($snippet, $terms);
+ }
+sub snippet
+ {
+ my ($self, $string, $terms) = @_;
+
+ # clean up the string we got
+ $string = $self->strip_html($string);
+
+ # create a regex out of the search terms
+ my $term_re = join "|", @$terms;
+
+ # take the string to be snippetized and split it into words
+ my @words = split /\s+/, $string;
+
+ # deduper for matching @words indexes, so we don't add a word twice
+ my $index_matches = {};
+
+ # words in the eventual snippet
+ my @words_matched;
+
+ # the snippet itself
+ my $snippet = '';
+
+ # counter for aggregating context after a match
+ my $aggr = 0;
+
+ # amount of context to show, in number of words surrounding a match
+ my $pad = 4;
+
+ # loop over each of the words in the string
+ for (my $i = 0; $i < scalar @words; $i++)
+ {
+ # does this word contain a match?
+ if ($words[$i] =~ /\b($term_re)\b/i && ! $self->is_stopword($1))
+ {
+ # if we aren't already aggregating, add an ellipsis
+ if (! $aggr)
+ {
+ push @words_matched, "...";
+ }
+ # look backward $pad words
+ for (my $j = -$pad; $j < 1; $j++)
+ {
+ # create a new index from the offset
+ my $idx = $i + $j;
+
+ # is this a valid index? has it already been encountered?
+ next if $idx < 0;
+ next if $idx > scalar @words;
+ next if exists $index_matches->{$i+$j};
+
+ # checks out, save this word
+ push @words_matched, $words[$i+$j];
+
+ # note the matching index in our deduper
+ $index_matches->{$i+$j} ++;
+ }
+ # enter aggregate mode -- add the next $pad words
+ $aggr = $pad;
+ }
+ # have we been told to aggregate?
+ elsif ($aggr)
+ {
+ # save this word
+ push @words_matched, $words[$i];
+
+ # add index to the deduper
+ $index_matches->{$i} ++;
+
+ # one less word to aggregate
+ $aggr--;
+ }
+ # keep snippets to a modest length
+ last if scalar @words_matched > 30;
+ }
+ # add a trailing ellipsis
+ push @words_matched, "...";
+
+ # create the snippet from the saved context words
+ $snippet = join " ", @words_matched;
+
+ return $snippet;
+ }
+sub parse_terms
+ {
+ my ($s) = @_;
+ my @terms = split /(\W+)/, lc($s);
+ my $words = [];
+ my $count = 0;
+ foreach my $term (@terms)
+ {
+ if ( $term !~ /\W/ )
+ {
+ push @$words, $term;
+ }
+ }
+ return $words;
+ }
+sub log
+ {
+ my ($self, $date) = @_;
+ my $criteria = {};
+ $criteria->{'date'} = $date if $date;
+ return $self->db->select("search_log", $criteria);
+ }
+sub log_query
+ {
+ my ($self, $query, $total) = @_;
+ return unless $query;
+ my $date = time;
+ $matches ||= '0';
+ $self->db->insert("search_log", { query => $query, date => $date, matches => $total });
+ }
+1;