#!/usr/bin/perl
#
# Builds an inverted word index over thread titles, file names and comment
# bodies fetched through Bucky's database handle, then dumps it to the
# DB_File hash "gross.db".  Each key is a lower-cased term; each value is a
# space-separated list of "type:id" postings (one entry per occurrence).
# Timing information is written to STDERR, word statistics to STDOUT.

use strict;
use warnings;
use lib "../lib";
use Bucky;
use DB_File;

# Time::Stopwatch is embedded at the bottom of this file, so no require is
# needed; its subs are compiled before this tie executes.
#require Time::Stopwatch;
tie my $timer, 'Time::Stopwatch';
print_timer($timer, "Initialized");

my $bucky    = Bucky->new;
my $keywords = $bucky->db->select("keyword");
my $threads  = $bucky->db->select("thread");
my $files    = $bucky->db->select("file");
my $comments = $bucky->db->select("comment");
print_timer($timer, "Loaded mysql");

# term => array ref of "type:id" postings; filled in by parse_terms().
my $lexicon = {};
my $total   = 0;

# Keyword indexing is currently disabled.
#foreach my $keyword (@$keywords)
#    {
#    my $id = $keyword->{$id};
#    $lexicon->{ $keyword->{'keyword'} }++;
#    $total++;
#    }

# Every source is posted against its owning thread, hence type "t" throughout.
foreach my $thread (@$threads) {
    $total += parse_terms({
        string => $thread->{'title'},
        type   => "t",
        id     => $thread->{'id'},
    });
}
foreach my $file (@$files) {
    $total += parse_terms({
        string => $file->{'filename'},
        type   => "t",
        id     => $file->{'thread'},
    });
}
foreach my $comment (@$comments) {
    $total += parse_terms({
        string => $comment->{'comment'},
        type   => "t",
        id     => $comment->{'thread'},
    });
}
print_timer($timer, "Created index");

my $unique = scalar keys %$lexicon;
print "--- WORD COUNT: " . $total . "\n";
print "--- UNIQUE WORDS: " . $unique . "\n";

my %index;
tie %index, "DB_File", "gross.db", O_CREAT | O_RDWR, 0666, $DB_HASH
    or die "Cannot open gross.db: $!\n";

# Most frequent terms first.  The comparison must be on the number of
# postings; comparing the array refs themselves would order by memory
# address.  Ties are broken alphabetically so the report is deterministic.
my @terms_by_frequency = sort {
    scalar(@{ $lexicon->{$b} }) <=> scalar(@{ $lexicon->{$a} })
        or $a cmp $b
} keys %$lexicon;

foreach my $term (@terms_by_frequency) {
    my $postings = $lexicon->{$term};
    print scalar(@$postings) . "\t" . $term . "\n";
    $index{$term} = join " ", @$postings;
}
untie %index;
print_timer($timer, "Dumped db_file");
exit;

# parse_terms(\%args) -> number of terms found
#
# Splits $args->{string} into word terms and appends the posting
# "$args->{type}:$args->{id}" to the file-level $lexicon once per occurrence
# of each (lower-cased) term.  Underscores are treated as word separators.
# Returns how many terms were recorded; 0 for an undefined or empty string.
sub parse_terms {
    my ($args) = @_;

    my $s = $args->{'string'};
    return 0 unless defined $s;

    my $id = $args->{'type'} . ":" . $args->{'id'};
    $s =~ s/_/ /g;

    my $count = 0;
    # Match runs of word characters directly instead of splitting on \W+:
    # split produces a leading empty field when the string starts with a
    # non-word character, which would be indexed as an empty term.
    while ($s =~ /(\w+)/g) {
        push @{ $lexicon->{ lc $1 } }, $id;
        $count++;
    }
    return $count;
}

# print_timer($elapsed, $label)
#
# Writes "<elapsed> s <label>" to STDERR.
sub print_timer {
    my ($elapsed, $label) = @_;
    printf STDERR "%3.2f s %s\n", $elapsed, $label;
    return;
}

################################################3
# Tied-scalar stopwatch: reading the scalar yields the seconds elapsed since
# it was tied (or last stored to); storing a value sets the elapsed time to
# that value.  Uses Time::HiRes when it can be loaded, otherwise time().
package Time::Stopwatch;
our $VERSION = '1.00';

use strict;
use constant HIRES => eval { local $SIG{__DIE__}; require Time::HiRes };

# Records the start time, offset by an optional initial elapsed value.
sub TIESCALAR {
    my $pkg  = shift;
    my $time = (HIRES ? Time::HiRes::time() : time()) - (@_ ? shift() : 0);
    bless \$time, $pkg;
}

# Elapsed seconds since the recorded start time.
sub FETCH {
    (HIRES ? Time::HiRes::time() : time()) - ${ $_[0] };
}

# Resets the start time so the elapsed time equals the stored value.
sub STORE {
    ${ $_[0] } = (HIRES ? Time::HiRes::time() : time()) - $_[1];
}

1;