diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2017-12-08 01:34:52 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2017-12-08 01:34:52 +0100 |
| commit | 3a4f027ec05aa5fdf4098ceb0dab09f69c5e0b8b (patch) | |
| tree | 8a0c5bebff6a40e77bda8b02142d99a7c448545e /search/bin/build-index | |
| parent | 340c3080b38518976c5c833399d8e07a7fc561bf (diff) | |
adding perl search index builder
Diffstat (limited to 'search/bin/build-index')
| -rwxr-xr-x | search/bin/build-index | 112 |
1 files changed, 112 insertions, 0 deletions
#!/usr/bin/perl
# build-index: rebuild the Bucky search index.
#
# Loads threads, comments and files through the Bucky::Search db handle,
# builds an in-memory lexicon (term -> thread id -> hit record), hands it to
# lexicon_store, then installs the resulting index file into ../cgi-bin and
# runs ./build-autocomplete.
#
# Every path used here is relative ("../lib", "../cgi-bin", "./..."), so the
# script has to be started from the directory it lives in.
use strict;
use warnings;
use lib "../lib";
use Bucky;
use DB_File;
#require Time::Stopwatch;   # the package is inlined at the bottom of this file
tie my $timer, 'Time::Stopwatch';

print_timer($timer, "Initialized");

my $bucky = Bucky::Search->new;

# The row filter is passed as a lone hash key whose value is undef, exactly as
# before; writing the undef out avoids an "Odd number of elements" warning.
# NOTE(review): $keywords is only consumed by the disabled loop below.
my $keywords = $bucky->db->select("keyword");
my $threads  = $bucky->db->select("thread",  { "id > 1"     => undef });
my $files    = $bucky->db->select("file");
my $comments = $bucky->db->select("comment", { "thread > 1" => undef });

print_timer($timer, "Loaded mysql");

# $lexicon->{term}{thread_id} = { thread, comment, file, strength }
my $lexicon = {};
my $total   = 0;
#foreach my $keyword (@$keywords)
#  {
#  my $id = $keyword->{$id};
#  $lexicon->{ $keyword->{'keyword'} }++;
#  $total++;
#  }
foreach my $thread (@$threads) {
    $total += parse_terms({
        string => $thread->{'title'},
        thread => $thread->{'id'},
    });
}
foreach my $comment (@$comments) {
    $total += parse_terms({
        string  => $comment->{'comment'},
        thread  => $comment->{'thread'},
        comment => $comment->{'id'},
    });
}
foreach my $file (@$files) {
    $total += parse_terms({
        string => $file->{'filename'},
        thread => $file->{'thread'},
        file   => $file->{'id'},
    });
}

print_timer($timer, "Created index");

my $unique = scalar keys %$lexicon;
print "--- WORD COUNT: " . $total . "\n";
print "--- UNIQUE WORDS: " . $unique . "\n";

$bucky->lexicon_store($lexicon);

my $file = $bucky->index_filename;

print_timer($timer, "Dumped $file");

my $new_index  = "./$file";
my $live_index = "../cgi-bin/$file";

# Never move the live index aside unless there is something to replace it.
-e $new_index
    or die "build-index: $new_index was not written; leaving $live_index untouched\n";

print "NEW: "; system("/bin/ls", "-l", $new_index);
print "OLD: "; system("/bin/ls", "-l", $live_index);

# Keep one generation of backup. A failure here is not fatal: on a first run
# there is no previous index to move.
system("/bin/mv", $live_index, "$live_index.1") == 0
    or warn "build-index: could not back up $live_index (status $?)\n";

system("/bin/cp", $new_index, $live_index) == 0
    or die "build-index: could not install $new_index as $live_index (status $?)\n";

system("/usr/bin/perl", "./build-autocomplete") == 0
    or die "build-index: ./build-autocomplete failed (status $?)\n";
exit;

# parse_terms(\%args) -> number of terms indexed
#
# Splits $args->{string} into lower-cased word terms (underscores count as
# separators) and records each one in the file-level $lexicon under
# $args->{thread}.
#
#   string  - text to index; undef is treated as the empty string
#   thread  - thread id the text belongs to; when false nothing is indexed
#   comment - comment id, or omitted when the text is not a comment
#   file    - file id, or omitted when the text is not a filename
#
# Text with neither a comment nor a file id is taken to be a thread title and
# adds 2 to the term's strength; any other text adds 1.
sub parse_terms {
    my ($args) = @_;

    my $thread = $args->{'thread'};
    return 0 unless $thread;

    my $comment = $args->{'comment'} || '0';
    my $file    = $args->{'file'}    || '0';
    my $string  = defined $args->{'string'} ? $args->{'string'} : '';

    # \w matches "_", so turn underscores into separators first.
    $string =~ tr/_/ /;

    my $weight = ($comment eq '0' && $file eq '0') ? 2 : 1;
    my $count  = 0;

    # Match the word runs directly. Splitting on /(\W+)/ yields an empty
    # leading field whenever the text starts with a non-word character, and
    # that empty string used to be indexed and counted as a term.
    foreach my $term ($string =~ /(\w+)/g) {
        my $hit = $lexicon->{ lc $term }{$thread} ||= {};

        # ||= keeps the first true id seen for this term/thread pair; a '0'
        # placeholder is false and is replaced by a later real id.
        $hit->{'thread'}  ||= $thread;
        $hit->{'comment'} ||= $comment;
        $hit->{'file'}    ||= $file;
        $hit->{'strength'} += $weight;

        $count++;
    }

    return $count;
}

# print_timer($seconds, $label)
#
# Writes "<seconds> s <label>" to STDERR. Callers pass the tied $timer, whose
# value is the number of seconds elapsed since it was tied.
sub print_timer {
    my ($elapsed, $label) = @_;
    printf STDERR "%3.2f s %s\n", $elapsed, $label;
    return;
}

################################################

# Tied scalar that reads as the number of seconds elapsed since it was tied.
# Uses Time::HiRes when it can be loaded, otherwise whole-second time().
package Time::Stopwatch;

# The script calls exit before run time reaches this point, so the version has
# to be assigned at compile time to take effect.
our $VERSION;
BEGIN { $VERSION = '1.00'; }

use strict;
use constant HIRES => eval { local $SIG{__DIE__}; require Time::HiRes };

# tie $var, 'Time::Stopwatch' [, $initial_seconds]
# Stores the start time, back-dated by the optional initial reading.
sub TIESCALAR {
    my $pkg = shift;
    my $time = (HIRES ? Time::HiRes::time() : time()) - (@_ ? shift() : 0);
    return bless \$time, $pkg;
}

# Reading the variable yields the seconds elapsed since the start time.
sub FETCH { (HIRES ? Time::HiRes::time() : time()) - ${$_[0]}; }

# Assigning N resets the start time so that the variable now reads N.
sub STORE { ${$_[0]} = (HIRES ? Time::HiRes::time() : time()) - $_[1]; }

1;
