summary refs log tree commit diff
path: root/search/bin/build-index
diff options
context:
space:
mode:
author Jules Laplace <julescarbon@gmail.com> 2017-12-08 01:34:52 +0100
committer Jules Laplace <julescarbon@gmail.com> 2017-12-08 01:34:52 +0100
commit 3a4f027ec05aa5fdf4098ceb0dab09f69c5e0b8b (patch)
tree 8a0c5bebff6a40e77bda8b02142d99a7c448545e /search/bin/build-index
parent 340c3080b38518976c5c833399d8e07a7fc561bf (diff)
adding perl search index builder
Diffstat (limited to 'search/bin/build-index')
-rwxr-xr-x search/bin/build-index 112
1 files changed, 112 insertions, 0 deletions
diff --git a/search/bin/build-index b/search/bin/build-index
new file mode 100755
index 0000000..b838924
--- /dev/null
+++ b/search/bin/build-index
@@ -0,0 +1,112 @@
+#!/usr/bin/perl
+use strict;
+use lib "../lib";
+use Bucky;
+use DB_File;
+#require Time::Stopwatch;
+# Stopwatch: every read of $timer yields the seconds elapsed since this tie
+# (see the Time::Stopwatch package at the end of this file).
+tie my $timer, 'Time::Stopwatch';
+
+# Unbuffer STDOUT.  The "NEW: " / "OLD: " labels printed further down have no
+# trailing newline and are followed by /bin/ls child processes that write to
+# the same descriptor; without autoflush the labels can come out after the
+# listings they are meant to introduce.
+$| = 1;
+
+print_timer($timer, "Initialized");
+
+my $bucky = Bucky::Search->new;
+
+# Rows to index.
+# NOTE(review): {"id > 1"} is a one-element hash constructor, i.e.
+# { "id > 1" => undef }; presumably the db layer reads the key as a raw
+# WHERE condition -- confirm against Bucky's select().
+my $threads  = $bucky->db->select("thread", {"id > 1"});
+my $files    = $bucky->db->select("file");
+my $comments = $bucky->db->select("comment", {"thread > 1"});
+
+print_timer($timer, "Loaded mysql");
+
+# $lexicon is filled in by parse_terms(): term => thread id => match record.
+# $total counts every term occurrence that was indexed.
+my $lexicon = {};
+my $total   = 0;
+
+# Thread titles carry neither a comment nor a file id, which is what makes
+# parse_terms() give their terms the higher strength.
+foreach my $thread (@$threads) {
+    $total += parse_terms({
+        string => $thread->{'title'},
+        thread => $thread->{'id'},
+    });
+}
+foreach my $comment (@$comments) {
+    $total += parse_terms({
+        string  => $comment->{'comment'},
+        thread  => $comment->{'thread'},
+        comment => $comment->{'id'},
+    });
+}
+foreach my $file (@$files) {
+    $total += parse_terms({
+        string => $file->{'filename'},
+        thread => $file->{'thread'},
+        file   => $file->{'id'},
+    });
+}
+
+print_timer($timer, "Created index");
+
+my $unique = scalar keys %$lexicon;
+print "--- WORD COUNT: " . $total . "\n";
+print "--- UNIQUE WORDS: " . $unique . "\n";
+
+$bucky->lexicon_store($lexicon);
+
+my $file = $bucky->index_filename;
+
+print_timer($timer, "Dumped $file");
+
+print "NEW: "; system("/bin/ls", "-l", "./$file");
+print "OLD: "; system("/bin/ls", "-l", "../cgi-bin/$file");
+
+# Run an external command (list form, so no shell is involved) and report a
+# failure on STDERR instead of ignoring it.  The build stays best-effort:
+# a failed step is reported but does not abort the remaining steps.
+my $run = sub {
+    my @command = @_;
+    return 1 if system(@command) == 0;
+    warn "build-index: '@command' failed (status $?)\n";
+    return 0;
+};
+
+# Publish: keep the previous index as "<name>.1", install the fresh one,
+# then rebuild the autocomplete data.
+$run->("/bin/mv", "../cgi-bin/$file", "../cgi-bin/$file.1");
+$run->("/bin/cp", "./$file", "../cgi-bin/$file");
+$run->("/usr/bin/perl", "./build-autocomplete");
+exit;
+
+# parse_terms(\%args) -> number of terms indexed
+#
+# Breaks $args->{string} into lower-cased words and records each one in the
+# file-level $lexicon hash ref, keyed first by term and then by thread id.
+#
+# Arguments (all passed in one hash ref):
+#   string  - text to index; underscores are treated as word separators
+#   thread  - thread id the text belongs to; a false value skips the record
+#   comment - comment id, when the text is a comment (defaults to '0')
+#   file    - file id, when the text is a filename (defaults to '0')
+#
+# Each $lexicon->{$term}{$thread} record holds 'thread', 'comment', 'file'
+# and an accumulated 'strength'.
+#
+# Returns the number of terms found in the string; 0 when the record is
+# skipped or the string is undefined or contains no words.
+sub parse_terms {
+    my ($args) = @_;
+
+    my $thread  = $args->{'thread'} or return 0;
+    my $comment = $args->{'comment'} || '0';
+    my $file    = $args->{'file'}    || '0';
+    my $string  = $args->{'string'};
+    return 0 unless defined $string;
+
+    # "_" is a \w character, so turn it into a separator before matching.
+    $string =~ tr/_/ /;
+
+    # Text with neither a comment id nor a file id is a thread title:
+    # give its terms an extra bump.
+    my $weight = ($comment eq '0' && $file eq '0') ? 2 : 1;
+
+    # Match the words directly.  Splitting on /(\W+)/ instead produces an
+    # empty leading field whenever the string starts with a non-word
+    # character, and that empty string would be indexed as a term.
+    my $count = 0;
+    foreach my $term ($string =~ /(\w+)/g) {
+        my $entry = $lexicon->{ lc $term }{$thread} ||= {};
+
+        # ||= keeps the first true id seen for this term/thread pair; a '0'
+        # placeholder left by a title can still be replaced later.
+        $entry->{'thread'}  ||= $thread;
+        $entry->{'comment'} ||= $comment;
+        $entry->{'file'}    ||= $file;
+
+        $entry->{'strength'} += $weight;
+        $count++;
+    }
+    return $count;
+}
+
+# print_timer($elapsed, $label)
+#
+# Writes one progress line to STDERR: the elapsed value formatted with two
+# decimals, followed by " s " and the label.  Callers in this script pass
+# the tied $timer stopwatch as the first argument.
+sub print_timer {
+    my ($elapsed, $label) = @_;
+    printf STDERR "%3.2f s %s\n", $elapsed, $label;
+}
+
+################################################
+
+package Time::Stopwatch;
+
+# Tied-scalar stopwatch.
+#
+#   tie my $timer, 'Time::Stopwatch';       # starts at 0
+#   tie my $timer, 'Time::Stopwatch', 10;   # starts at 10 seconds
+#   print $timer;                           # seconds elapsed so far
+#   $timer = 0;                             # restart from the given value
+#
+# Uses Time::HiRes for sub-second resolution when that module can be
+# loaded, and falls back to the built-in one-second time() otherwise.
+
+use strict;
+
+# Package version.  Declared with "our" so that it is a real package
+# variable ($Time::Stopwatch::VERSION, Time::Stopwatch->VERSION), and set in
+# BEGIN because the enclosing script calls exit before a run-time
+# assignment placed down here would ever execute.
+our $VERSION;
+BEGIN { $VERSION = '1.00' }
+
+# True when Time::HiRes could be loaded.  $SIG{__DIE__} is localised so a
+# caller-installed die handler does not fire on the probe.
+use constant HIRES => eval { local $SIG{__DIE__}; require Time::HiRes };
+
+# Current time in seconds, fractional when Time::HiRes is available.
+sub _now {
+    return HIRES ? Time::HiRes::time() : time();
+}
+
+# tie hook: the object is a reference to the start time, shifted back by
+# the optional initial reading.
+sub TIESCALAR {
+    my ($class, $initial) = @_;
+    my $start = _now() - (defined $initial ? $initial : 0);
+    return bless \$start, $class;
+}
+
+# Read hook: seconds elapsed since the stored start time.
+sub FETCH {
+    my ($self) = @_;
+    return _now() - ${$self};
+}
+
+# Write hook: move the start time so the stopwatch now reads the assigned
+# value.
+sub STORE {
+    my ($self, $reading) = @_;
+    return ${$self} = _now() - $reading;
+}
+
+1;
+