diff options
Diffstat (limited to 'bucky2/t/count-words.pl')
| -rwxr-xr-x | bucky2/t/count-words.pl | 102 |
1 files changed, 102 insertions, 0 deletions
#!/usr/bin/perl
# Recovered from: diff --git a/bucky2/t/count-words.pl b/bucky2/t/count-words.pl
#                 (new file, mode 100755, index 0000000..a6ffc14)
#
# count-words.pl -- build a word -> posting-list index from the Bucky database.
#
# Loads the "keyword", "thread", "file" and "comment" tables through
# Bucky's db handle, tokenizes thread titles, file names and comment bodies,
# prints word statistics to STDOUT (timings go to STDERR), and dumps the
# resulting index into the DB_File hash "gross.db" in the current directory.
use strict;
use warnings;
use lib "../lib";
use Bucky;
use DB_File;
#require Time::Stopwatch;

# $timer is tied to the Time::Stopwatch package defined at the bottom of this
# file; every read yields the number of seconds elapsed since the tie.
tie my $timer, 'Time::Stopwatch';

print_timer($timer, "Initialized");

my $bucky = Bucky->new;

# Each select result is dereferenced below as an array of hash refs.
my $keywords = $bucky->db->select("keyword");    # loaded, but only used by the disabled loop below
my $threads  = $bucky->db->select("thread");
my $files    = $bucky->db->select("file");
my $comments = $bucky->db->select("comment");

print_timer($timer, "Loaded mysql");

# $lexicon maps a lower-cased term to an array ref of "type:id" tags, one
# entry per occurrence.  It is filled in by parse_terms().
my $lexicon = {};
my $total   = 0;

# Disabled in the original source:
#foreach my $keyword (@$keywords)
#    {
#    my $id = $keyword->{$id};
#    $lexicon->{ $keyword->{'keyword'} }++;
#    $total++;
#    }

# NOTE(review): all three sources are tagged with type "t", and files and
# comments are filed under their 'thread' field rather than their own id --
# presumably everything is indexed per thread; confirm this is intended.
foreach my $thread (@$threads) {
    $total += parse_terms({ string => $thread->{'title'}, type => "t", id => $thread->{'id'} });
}
foreach my $file (@$files) {
    $total += parse_terms({ string => $file->{'filename'}, type => "t", id => $file->{'thread'} });
}
foreach my $comment (@$comments) {
    $total += parse_terms({ string => $comment->{'comment'}, type => "t", id => $comment->{'thread'} });
}

print_timer($timer, "Created index");

my $unique = scalar keys %$lexicon;
print "--- WORD COUNT: " . $total . "\n";
print "--- UNIQUE WORDS: " . $unique . "\n";

my %index;
tie %index, "DB_File", "gross.db", O_CREAT|O_RDWR, 0666, $DB_HASH
    or die "Cannot open gross.db: $!\n";

# Most frequent terms first.  The values of %$lexicon are array refs, so the
# occurrence count is the length of the array; comparing the refs themselves
# with <=> would order the terms by memory address.  Ties are broken
# alphabetically so the output order is reproducible.
my @terms_by_frequency = sort {
    scalar @{ $lexicon->{$b} } <=> scalar @{ $lexicon->{$a} }
        or $a cmp $b
} keys %$lexicon;

foreach my $term (@terms_by_frequency) {
    my $postings = $lexicon->{$term};
    print scalar(@$postings) . "\t" . $term . "\n";
    $index{$term} = join " ", @$postings;
}

untie %index;

print_timer($timer, "Dumped db_file");
exit;

# parse_terms(\%args) -- tokenize a string and record each term in $lexicon.
#
# Arguments (hash ref):
#   string - text to tokenize; undef is treated as containing no terms
#   type   - tag prefix
#   id     - tag suffix; every term is recorded under the tag "type:id"
#
# Underscores are turned into spaces first, so they separate words just like
# any other non-word character.  Terms are lower-cased before being stored.
#
# Returns the number of terms found in the string.
sub parse_terms {
    my ($args) = @_;
    my $s = $args->{'string'};
    return 0 unless defined $s;

    my $id = $args->{'type'} . ":" . $args->{'id'};
    $s =~ s/_/ /g;

    my $count = 0;
    foreach my $term ( split /\W+/, $s ) {
        # split yields a leading empty field when the string starts with a
        # separator; that is not a word and must not be indexed or counted.
        next if $term eq '';
        push @{ $lexicon->{ lc $term } }, $id;
        $count++;
    }
    return $count;
}

# print_timer($seconds, $label) -- write "<seconds> s <label>" to STDERR.
sub print_timer {
    my ($seconds, $label) = @_;
    printf STDERR "%3.2f s %s\n", $seconds, $label;
    return;
}

################################################

# Time::Stopwatch -- tied-scalar stopwatch.
#
# Reading the tied scalar returns the seconds elapsed since it was tied (or
# since the stored start offset); assigning N to it restarts the clock so
# that it reads N at the moment of assignment.  Time::HiRes is used when it
# can be loaded, otherwise the resolution falls back to whole seconds.
package Time::Stopwatch;
our $VERSION = '1.00';    # not reached at run time: the script exits above

use strict;
use constant HIRES => eval { local $SIG{__DIE__}; require Time::HiRes };

# tie $scalar, 'Time::Stopwatch' [, $initial_seconds]
sub TIESCALAR {
    my $pkg = shift;
    my $time = (HIRES ? Time::HiRes::time() : time()) - (@_ ? shift() : 0);
    bless \$time, $pkg;
}

# The object is a reference to the start timestamp.
sub FETCH { (HIRES ? Time::HiRes::time() : time()) - ${$_[0]}; }
sub STORE { ${$_[0]} = (HIRES ? Time::HiRes::time() : time()) - $_[1]; }

1;
