diff options
Diffstat (limited to 'bucky2/lib/Bucky/Search.pm')
| -rw-r--r-- | bucky2/lib/Bucky/Search.pm | 413 |
1 files changed, 413 insertions, 0 deletions
diff --git a/bucky2/lib/Bucky/Search.pm b/bucky2/lib/Bucky/Search.pm new file mode 100644 index 0000000..43c23f7 --- /dev/null +++ b/bucky2/lib/Bucky/Search.pm @@ -0,0 +1,413 @@ +package Bucky::Search; + +use base 'Bucky'; + +use Data::Dumper; +use DB_File; + +sub index + { + my ($self, $index) = @_; + $self->{'_index'} = $index || $self->index_read; + return $self->{'_index'}; + } +sub index_read + { + my ($self) = @_; + my %index; + tie %index, "DB_File", $self->index_filename, O_RDONLY, 0666, $DB_HASH ; + return \%index; + } +sub index_write + { + my ($self) = @_; + my %index; + tie %index, "DB_File", $self->index_filename, O_CREAT|O_RDWR, 0666, $DB_HASH ; + return \%index; + } +sub index_close + { + my ($self, $index) = @_; + $index ||= $self->index; + untie %$index; + } +sub index_filename + { "gross.db" } +sub auto_index + { + my ($self, $auto_index) = @_; + $self->{'_auto_index'} = $auto_index || $self->auto_index_read; + return $self->{'_auto_index'}; + } +sub auto_index_read + { + my ($self) = @_; + my %auto_index; + tie %auto_index, "DB_File", $self->auto_index_filename, O_RDONLY, 0666, $DB_HASH ; + return \%auto_index; + } +sub auto_index_write + { + my ($self) = @_; + my %auto_index; + tie %auto_index, "DB_File", $self->auto_index_filename, O_CREAT|O_RDWR, 0666, $DB_HASH ; + return \%auto_index; + } +sub auto_index_close + { + my ($self, $auto_index) = @_; + $auto_index ||= $self->auto_index; + untie %$auto_index; + } +sub auto_index_filename + { "auto.db" } +sub lexicon_store + { + my ($self, $lexicon) = @_; + my $index = $self->index_write; + foreach my $term (keys %$lexicon) + { + next if $self->is_stopword($term); + my $serialized = $self->serialize_matches($lexicon->{$term}); + next unless $serialized; + $index->{$term} = $serialized; + } + $self->index_close($index); + } +sub unserialize_matches + { + my ($self, $serialized_string) = @_; + my @serialized_matches = split ",", $serialized_string; + my @matches; + foreach my $serialized_match (@serialized_matches) + { + my ($thread, $comment, $file, $strength) = split " ", $serialized_match; + my $match = {}; + $match->{'thread'} = $thread; + $match->{'comment'} = $comment; + $match->{'file'} = $file; + $match->{'strength'} = $strength; + push @matches, $match; + } + return \@matches; + } +sub serialize_matches + { + my ($self, $matches) = @_; + my @serialized_matches; + foreach my $match (values %$matches) + { + next unless $match && ref($match) eq "HASH"; + my $string = join " ", + $match->{'thread'}, + $match->{'comment'}, + $match->{'file'}, + $match->{'strength'}; + next unless $string; + push @serialized_matches, $string if $string; + } + return undef unless scalar @serialized_matches; + return join ",", @serialized_matches; + } +my $STOPWORDS = {( map { lc $_, 1 } qw( +a about above across adj after again against all almost alone along also +although always am among an and another any anybody anyone anything anywhere +apart are around as aside at away be because been before behind being below +besides between beyond both but by can cannot could deep did do does doing done +down downwards during each either else enough etc even ever every everybody +everyone except far few for forth from get gets got had hardly has have having +her here herself him himself his how however i if in indeed instead into inward +is it its itself just kept many maybe might mine more most mostly much must +myself near neither next no nobody none nor not nothing nowhere of off often on +only onto or other others ought our ours out outside over own p per please plus +pp quite rather really said seem self selves several shall she should since so +some somebody somewhat still such than that the their theirs them themselves +then there therefore these they this thorough thoroughly those through thus to +together too toward towards under until up upon v very was well were what +whatever when whenever where whether which while who whom whose will with +within without would yet young your yourself s ) )}; +sub is_stopword + { + my ($self, $term) = @_; + return exists $STOPWORDS->{lc $term}; + } +sub autocomplete + { + my ($self, $query) = @_; + return unless $query; + my $terms = parse_terms($query); + my $last_term = pop @$terms; + + my $auto_index = $self->auto_index; + my $guess_term = $auto_index->{$last_term}; + my $guess_full = join " ", @$terms, $guess_term; + my $guess_tail = $guess_term; + $guess_tail =~ s/^$last_term//; + + my $guess = {}; + $guess->{'full'} = $guess_full; + $guess->{'tail'} = $guess_tail; + $guess->{'term'} = $guess_term; + return $guess; + } +sub search_light + { + my ($self, $query, $start, $limit) = @_; + return unless $query; + $start ||= 0; + $limit ||= 10; + my $scores = {}; + my $terms = parse_terms($query); + my $index = $self->index; + foreach my $term (@$terms) + { + next if $self->is_stopword($term); + next unless my $serial = $index->{$term}; + my $results = $self->unserialize_matches($serial); + foreach my $result (@$results) + { + my $thread = $result->{'thread'}; + $scores->{$thread} ||= {}; + $scores->{$thread}->{thread} ||= $result->{'thread'}; + $scores->{$thread}->{file} ||= $result->{'file'}; + $scores->{$thread}->{strength} += $result->{'strength'}; + $scores->{$thread}->{count}++; + } + } + my $total = scalar keys %$scores; + my $i = 0; + my $to_display = $limit; + my $threads = {}; + my $comments_to_get = []; + my $files_to_get = []; + foreach my $match (sort { $b->{count} <=> $a->{count} || $b->{strength} * $b->{count} <=> $a->{strength} * $a->{count} } values %$scores ) + { + next if $i++ < $start; + my $thread = $self->thread( $match->{'thread'} ); + next unless $thread; + # next if $thread->{'private'}; + last if $to_display-- == 0; + push @$results, $match; + push @$comments_to_get, $match->{'comment'} if $match->{'comment'}; + if ( $match->{'file'} ) + { push @$files_to_get, $match->{'file'}; } + if ( $thread->{'flagged'} ) + { push @$files_to_get, $thread->{'flagged'}; } + $threads->{ $thread->{'id'} } = $thread; + } + my $files = $self->files_by_id($files_to_get); + # $self->log_query($query, $total); + return + { + start => $start + $limit, + limit => $limit, + total => $total, + results => $results, + threads => $threads, + files => $files, + terms => $terms, + }; + } +sub search + { + my ($self, $query, $start, $limit) = @_; + return unless $query; + $start ||= 0; + $limit ||= 10; + my $scores = {}; + my $terms = parse_terms($query); + my $index = $self->index; + foreach my $term (@$terms) + { + next if $self->is_stopword($term); + next unless my $serial = $index->{$term}; + my $results = $self->unserialize_matches($serial); + foreach my $result (@$results) + { + my $thread = $result->{'thread'}; + $scores->{$thread} ||= {}; + $scores->{$thread}->{thread} ||= $result->{'thread'}; + $scores->{$thread}->{comment} ||= $result->{'comment'}; + $scores->{$thread}->{file} ||= $result->{'file'}; + $scores->{$thread}->{strength} += $result->{'strength'}; + $scores->{$thread}->{count}++; + } + } + my $total = scalar keys %$scores; + my $i = 0; + my $to_display = $limit; + my $threads = {}; + my $comments_to_get = []; + my $files_to_get = []; + foreach my $match (sort { $b->{count} <=> $a->{count} || $b->{strength} * $b->{count} <=> $a->{strength} * $a->{count} } values %$scores ) + { + next if $i++ < $start; + my $thread = $self->thread( $match->{'thread'} ); + next unless $thread; + next if $thread->{'private'}; + last if $to_display-- == 0; + push @$results, $match; + push @$comments_to_get, $match->{'comment'} if $match->{'comment'}; + if ( $match->{'file'} ) + { push @$files_to_get, $match->{'file'}; } + if ( $thread->{'flagged'} ) + { push @$files_to_get, $thread->{'flagged'}; } + $threads->{ $thread->{'id'} } = $thread; + } + my $files = $self->files_by_id($files_to_get); + my $comments = $self->comments_by_id($comments_to_get); + $self->log_query($query, $total); + return + { + start => $start + $limit, + limit => $limit, + total => $total, + results => $results, + threads => $threads, + comments => $comments, + files => $files, + terms => $terms, + }; + } +sub score_display + { + my ($obj) = @_; + return scalar(keys %{$obj->{terms}}) . "x" . $obj->{count}; + } +sub display_object + { + my ($obj) = @_; + my ($type, $id) = split ":", $obj; + my $thread = $bucky->thread($id); + my $title = $thread ? $thread->_title : "* * *"; + return $type . " " . $id . "\t" . $title; + } +sub bold_terms + { + my ($self, $string, $terms) = @_; + $string = $self->strip_html($string); + foreach my $term (@$terms) + { + $string =~ s/\b($term)\b/<b>$1<\/b>/gi; + } + return $string; + } +sub bold_snippet + { + my ($self, $string, $terms) = @_; + my $snippet = $self->snippet($string, $terms); + return $self->bold_terms($snippet, $terms); + } +sub snippet + { + my ($self, $string, $terms) = @_; + + # clean up the string we got + $string = $self->strip_html($string); + + # create a regex out of the search terms + my $term_re = join "|", @$terms; + + # take the string to be snippetized and split it into words + my @words = split /\s+/, $string; + + # deduper for matching @words indexes, so we don't add a word twice + my $index_matches = {}; + + # words in the eventual snippet + my @words_matched; + + # the snippet itself + my $snippet = ''; + + # counter for aggregating context after a match + my $aggr = 0; + + # amount of context to show, in number of words surrounding a match + my $pad = 4; + + # loop over each of the words in the string + for (my $i = 0; $i < scalar @words; $i++) + { + # does this word contain a match? + if ($words[$i] =~ /\b($term_re)\b/i && ! $self->is_stopword($1)) + { + # if we aren't already aggregating, add an ellipsis + if (! $aggr) + { + push @words_matched, "..."; + } + # look backward $pad words + for (my $j = -$pad; $j < 1; $j++) + { + # create a new index from the offset + my $idx = $i + $j; + + # is this a valid index? has it already been encountered? + next if $idx < 0; + next if $idx > scalar @words; + next if exists $index_matches->{$i+$j}; + + # checks out, save this word + push @words_matched, $words[$i+$j]; + + # note the matching index in our deduper + $index_matches->{$i+$j} ++; + } + # enter aggregate mode -- add the next $pad words + $aggr = $pad; + } + # have we been told to aggregate? + elsif ($aggr) + { + # save this word + push @words_matched, $words[$i]; + + # add index to the deduper + $index_matches->{$i} ++; + + # one less word to aggregate + $aggr--; + } + # keep snippets to a modest length + last if scalar @words_matched > 30; + } + # add a trailing ellipsis + push @words_matched, "..."; + + # create the snippet from the saved context words + $snippet = join " ", @words_matched; + + return $snippet; + } +sub parse_terms + { + my ($s) = @_; + my @terms = split /(\W+)/, lc($s); + my $words = []; + my $count = 0; + foreach my $term (@terms) + { + if ( $term !~ /\W/ ) + { + push @$words, $term; + } + } + return $words; + } +sub log + { + my ($self, $date) = @_; + my $criteria = {}; + $criteria->{'date'} = $date if $date; + return $self->db->select("search_log", $criteria); + } +sub log_query + { + my ($self, $query, $total) = @_; + return unless $query; + my $date = time; + $matches ||= '0'; + $self->db->insert("search_log", { query => $query, date => $date, matches => $total }); + } +1; |
