diff options
Diffstat (limited to 'search/lib/Bucky/Search.pm')
| -rw-r--r-- | search/lib/Bucky/Search.pm | 413 |
1 files changed, 0 insertions, 413 deletions
diff --git a/search/lib/Bucky/Search.pm b/search/lib/Bucky/Search.pm deleted file mode 100644 index c3516a1..0000000 --- a/search/lib/Bucky/Search.pm +++ /dev/null @@ -1,413 +0,0 @@ -package Bucky::Search; - -use base 'Bucky'; - -use Data::Dumper; -use DB_File; - -sub index - { - my ($self, $index) = @_; - $self->{'_index'} = $index || $self->index_read; - return $self->{'_index'}; - } -sub index_read - { - my ($self) = @_; - my %index; - tie %index, "DB_File", $self->index_filename, O_RDONLY, 0666, $DB_HASH ; - return \%index; - } -sub index_write - { - my ($self) = @_; - my %index; - tie %index, "DB_File", $self->index_filename, O_CREAT|O_RDWR, 0666, $DB_HASH ; - return \%index; - } -sub index_close - { - my ($self, $index) = @_; - $index ||= $self->index; - untie %$index; - } -sub index_filename - { "search_init.db" } -sub auto_index - { - my ($self, $auto_index) = @_; - $self->{'_auto_index'} = $auto_index || $self->auto_index_read; - return $self->{'_auto_index'}; - } -sub auto_index_read - { - my ($self) = @_; - my %auto_index; - tie %auto_index, "DB_File", $self->auto_index_filename, O_RDONLY, 0666, $DB_HASH ; - return \%auto_index; - } -sub auto_index_write - { - my ($self) = @_; - my %auto_index; - tie %auto_index, "DB_File", $self->auto_index_filename, O_CREAT|O_RDWR, 0666, $DB_HASH ; - return \%auto_index; - } -sub auto_index_close - { - my ($self, $auto_index) = @_; - $auto_index ||= $self->auto_index; - untie %$auto_index; - } -sub auto_index_filename - { "auto.db" } -sub lexicon_store - { - my ($self, $lexicon) = @_; - my $index = $self->index_write; - foreach my $term (keys %$lexicon) - { - next if $self->is_stopword($term); - my $serialized = $self->serialize_matches($lexicon->{$term}); - next unless $serialized; - $index->{$term} = $serialized; - } - $self->index_close($index); - } -sub unserialize_matches - { - my ($self, $serialized_string) = @_; - my @serialized_matches = split ",", $serialized_string; - my @matches; - foreach my $serialized_match (@serialized_matches) - { - my ($thread, $comment, $file, $strength) = split " ", $serialized_match; - my $match = {}; - $match->{'thread'} = $thread; - $match->{'comment'} = $comment; - $match->{'file'} = $file; - $match->{'strength'} = $strength; - push @matches, $match; - } - return \@matches; - } -sub serialize_matches - { - my ($self, $matches) = @_; - my @serialized_matches; - foreach my $match (values %$matches) - { - next unless $match && ref($match) eq "HASH"; - my $string = join " ", - $match->{'thread'}, - $match->{'comment'}, - $match->{'file'}, - $match->{'strength'}; - next unless $string; - push @serialized_matches, $string if $string; - } - return undef unless scalar @serialized_matches; - return join ",", @serialized_matches; - } -my $STOPWORDS = {( map { lc $_, 1 } qw( -a about above across adj after again against all almost alone along also -although always am among an and another any anybody anyone anything anywhere -apart are around as aside at away be because been before behind being below -besides between beyond both but by can cannot could deep did do does doing done -down downwards during each either else enough etc even ever every everybody -everyone except far few for forth from get gets got had hardly has have having -her here herself him himself his how however i if in indeed instead into inward -is it its itself just kept many maybe might mine more most mostly much must -myself near neither next no nobody none nor not nothing nowhere of off often on -only onto or other others ought our ours out outside over own p per please plus -pp quite rather really said seem self selves several shall she should since so -some somebody somewhat still such than that the their theirs them themselves -then there therefore these they this thorough thoroughly those through thus to -together too toward towards under until up upon v very was well were what -whatever when whenever where whether which while who whom whose will with -within without would yet young your yourself s ) )}; -sub is_stopword - { - my ($self, $term) = @_; - return exists $STOPWORDS->{lc $term}; - } -sub autocomplete - { - my ($self, $query) = @_; - return unless $query; - my $terms = parse_terms($query); - my $last_term = pop @$terms; - - my $auto_index = $self->auto_index; - my $guess_term = $auto_index->{$last_term}; - my $guess_full = join " ", @$terms, $guess_term; - my $guess_tail = $guess_term; - $guess_tail =~ s/^$last_term//; - - my $guess = {}; - $guess->{'full'} = $guess_full; - $guess->{'tail'} = $guess_tail; - $guess->{'term'} = $guess_term; - return $guess; - } -sub search_light - { - my ($self, $query, $start, $limit) = @_; - return unless $query; - $start ||= 0; - $limit ||= 10; - my $scores = {}; - my $terms = parse_terms($query); - my $index = $self->index; - foreach my $term (@$terms) - { - next if $self->is_stopword($term); - next unless my $serial = $index->{$term}; - my $results = $self->unserialize_matches($serial); - foreach my $result (@$results) - { - my $thread = $result->{'thread'}; - $scores->{$thread} ||= {}; - $scores->{$thread}->{thread} ||= $result->{'thread'}; - $scores->{$thread}->{file} ||= $result->{'file'}; - $scores->{$thread}->{strength} += $result->{'strength'}; - $scores->{$thread}->{count}++; - } - } - my $total = scalar keys %$scores; - my $i = 0; - my $to_display = $limit; - my $threads = {}; - my $comments_to_get = []; - my $files_to_get = []; - foreach my $match (sort { $b->{count} <=> $a->{count} || $b->{strength} * $b->{count} <=> $a->{strength} * $a->{count} } values %$scores ) - { - next if $i++ < $start; - my $thread = $self->thread( $match->{'thread'} ); - next unless $thread; - next if $thread->{'private'}; - last if $to_display-- == 0; - push @$results, $match; - push @$comments_to_get, $match->{'comment'} if $match->{'comment'}; - if ( $match->{'file'} ) - { push @$files_to_get, $match->{'file'}; } - if ( $thread->{'flagged'} ) - { push @$files_to_get, $thread->{'flagged'}; } - $threads->{ $thread->{'id'} } = $thread; - } - my $files = $self->files_by_id($files_to_get); - # $self->log_query($query, $total); - return - { - start => $start + $limit, - limit => $limit, - total => $total, - results => $results, - threads => $threads, - files => $files, - terms => $terms, - }; - } -sub search - { - my ($self, $query, $start, $limit) = @_; - return unless $query; - $start ||= 0; - $limit ||= 10; - my $scores = {}; - my $terms = parse_terms($query); - my $index = $self->index; - foreach my $term (@$terms) - { - next if $self->is_stopword($term); - next unless my $serial = $index->{$term}; - my $results = $self->unserialize_matches($serial); - foreach my $result (@$results) - { - my $thread = $result->{'thread'}; - $scores->{$thread} ||= {}; - $scores->{$thread}->{thread} ||= $result->{'thread'}; - $scores->{$thread}->{comment} ||= $result->{'comment'}; - $scores->{$thread}->{file} ||= $result->{'file'}; - $scores->{$thread}->{strength} += $result->{'strength'}; - $scores->{$thread}->{count}++; - } - } - my $total = scalar keys %$scores; - my $i = 0; - my $to_display = $limit; - my $threads = {}; - my $comments_to_get = []; - my $files_to_get = []; - foreach my $match (sort { $b->{count} <=> $a->{count} || $b->{strength} * $b->{count} <=> $a->{strength} * $a->{count} } values %$scores ) - { - next if $i++ < $start; - my $thread = $self->thread( $match->{'thread'} ); - next unless $thread; - next if $thread->{'private'}; - last if $to_display-- == 0; - push @$results, $match; - push @$comments_to_get, $match->{'comment'} if $match->{'comment'}; - if ( $match->{'file'} ) - { push @$files_to_get, $match->{'file'}; } - if ( $thread->{'flagged'} ) - { push @$files_to_get, $thread->{'flagged'}; } - $threads->{ $thread->{'id'} } = $thread; - } - my $files = $self->files_by_id($files_to_get); - my $comments = $self->comments_by_id($comments_to_get); - $self->log_query($query, $total); - return - { - start => $start + $limit, - limit => $limit, - total => $total, - results => $results, - threads => $threads, - comments => $comments, - files => $files, - terms => $terms, - }; - } -sub score_display - { - my ($obj) = @_; - return scalar(keys %{$obj->{terms}}) . "x" . $obj->{count}; - } -sub display_object - { - my ($obj) = @_; - my ($type, $id) = split ":", $obj; - my $thread = $bucky->thread($id); - my $title = $thread ? $thread->_title : "* * *"; - return $type . " " . $id . "\t" . $title; - } -sub bold_terms - { - my ($self, $string, $terms) = @_; - $string = $self->strip_html($string); - foreach my $term (@$terms) - { - $string =~ s/\b($term)\b/<b>$1<\/b>/gi; - } - return $string; - } -sub bold_snippet - { - my ($self, $string, $terms) = @_; - my $snippet = $self->snippet($string, $terms); - return $self->bold_terms($snippet, $terms); - } -sub snippet - { - my ($self, $string, $terms) = @_; - - # clean up the string we got - $string = $self->strip_html($string); - - # create a regex out of the search terms - my $term_re = join "|", @$terms; - - # take the string to be snippetized and split it into words - my @words = split /\s+/, $string; - - # deduper for matching @words indexes, so we don't add a word twice - my $index_matches = {}; - - # words in the eventual snippet - my @words_matched; - - # the snippet itself - my $snippet = ''; - - # counter for aggregating context after a match - my $aggr = 0; - - # amount of context to show, in number of words surrounding a match - my $pad = 4; - - # loop over each of the words in the string - for (my $i = 0; $i < scalar @words; $i++) - { - # does this word contain a match? - if ($words[$i] =~ /\b($term_re)\b/i && ! $self->is_stopword($1)) - { - # if we aren't already aggregating, add an ellipsis - if (! $aggr) - { - push @words_matched, "..."; - } - # look backward $pad words - for (my $j = -$pad; $j < 1; $j++) - { - # create a new index from the offset - my $idx = $i + $j; - - # is this a valid index? has it already been encountered? - next if $idx < 0; - next if $idx > scalar @words; - next if exists $index_matches->{$i+$j}; - - # checks out, save this word - push @words_matched, $words[$i+$j]; - - # note the matching index in our deduper - $index_matches->{$i+$j} ++; - } - # enter aggregate mode -- add the next $pad words - $aggr = $pad; - } - # have we been told to aggregate? - elsif ($aggr) - { - # save this word - push @words_matched, $words[$i]; - - # add index to the deduper - $index_matches->{$i} ++; - - # one less word to aggregate - $aggr--; - } - # keep snippets to a modest length - last if scalar @words_matched > 30; - } - # add a trailing ellipsis - push @words_matched, "..."; - - # create the snippet from the saved context words - $snippet = join " ", @words_matched; - - return $snippet; - } -sub parse_terms - { - my ($s) = @_; - my @terms = split /(\W+)/, lc($s); - my $words = []; - my $count = 0; - foreach my $term (@terms) - { - if ( $term !~ /\W/ ) - { - push @$words, $term; - } - } - return $words; - } -sub log - { - my ($self, $date) = @_; - my $criteria = {}; - $criteria->{'date'} = $date if $date; - return $self->db->select("search_log", $criteria); - } -sub log_query - { - my ($self, $query, $total) = @_; - return unless $query; - my $date = time; - $matches ||= '0'; - $self->db->insert("search_log", { query => $query, date => $date, matches => $total }); - } -1; |
