summary refs log tree commit diff
path: root/bucky2/t/count-words.pl
diff options
context:
space:
mode:
author Jules Laplace <carbon@melanarchy.org> 2013-08-02 17:23:25 -0500
committer Jules Laplace <carbon@melanarchy.org> 2013-08-02 17:23:25 -0500
commit e76b691e78e273226cba9284cb8cd22a423319ed (patch)
tree a58d22f69869fe2bf3885f81bdda4952f87ff6d7 /bucky2/t/count-words.pl
parent 753f60c7d4769fa72d3b910e491f37db6f130898 (diff)
bucky2
Diffstat (limited to 'bucky2/t/count-words.pl')
-rwxr-xr-x bucky2/t/count-words.pl 102
1 files changed, 102 insertions, 0 deletions
diff --git a/bucky2/t/count-words.pl b/bucky2/t/count-words.pl
new file mode 100755
index 0000000..a6ffc14
--- /dev/null
+++ b/bucky2/t/count-words.pl
@@ -0,0 +1,102 @@
+#!/usr/bin/perl
+use strict;
+use lib "../lib";
+use Bucky;
+use DB_File;
+#require Time::Stopwatch;
+# Build a word -> postings index from thread titles, file names and comments,
+# print word statistics to STDOUT, and dump the index into the DB_File hash
+# "gross.db". Progress timings go to STDERR via print_timer().
+
+# Reading $timer yields elapsed seconds. Time::Stopwatch is defined at the
+# bottom of this file; its subs are compiled before this statement runs.
+tie my $timer, 'Time::Stopwatch';
+
+print_timer($timer, "Initialized");
+
+# Direct method call instead of the indirect-object form `new Bucky`.
+my $bucky = Bucky->new;
+
+# Each result is used below as an array ref of row hashes.
+# $keywords is only consumed by the disabled loop further down.
+my $keywords = $bucky->db->select("keyword");
+my $threads  = $bucky->db->select("thread");
+my $files    = $bucky->db->select("file");
+my $comments = $bucky->db->select("comment");
+
+print_timer($timer, "Loaded mysql");
+
+# $lexicon maps a lower-cased word to an array ref of "type:id" postings.
+# parse_terms() fills it and returns the number of words it recorded.
+my $lexicon = {};
+my $total = 0;
+
+# Disabled keyword-table pass, kept for reference.
+# NOTE(review): it would need fixing before being re-enabled -- `$keyword->{$id}`
+# uses an undeclared $id, and `++` does not match the array-ref values used below.
+#foreach my $keyword (@$keywords)
+# {
+# my $id = $keyword->{$id};
+# $lexicon->{ $keyword->{'keyword'} }++;
+# $total++;
+# }
+
+# Every posting is recorded as type "t" keyed by a thread id: the thread's own
+# id for titles, the owning thread for files and comments.
+foreach my $thread (@$threads) {
+    $total += parse_terms({ string => $thread->{'title'}, type => "t", id => $thread->{'id'} });
+}
+foreach my $file (@$files) {
+    $total += parse_terms({ string => $file->{'filename'}, type => "t", id => $file->{'thread'} });
+}
+foreach my $comment (@$comments) {
+    $total += parse_terms({ string => $comment->{'comment'}, type => "t", id => $comment->{'thread'} });
+}
+
+print_timer($timer, "Created index");
+
+my $unique = scalar keys %$lexicon;
+print "--- WORD COUNT: " . $total . "\n";
+print "--- UNIQUE WORDS: " . $unique . "\n";
+
+# Fail loudly if the database cannot be opened; an unchecked tie would leave
+# %index as a plain in-memory hash and nothing would be written to disk.
+my %index;
+tie %index, "DB_File", "gross.db", O_CREAT|O_RDWR, 0666, $DB_HASH
+    or die "Cannot open gross.db: $!\n";
+
+# Most frequent words first. Compare posting COUNTS: comparing the array refs
+# themselves with <=> would order by memory address. Ties are broken
+# alphabetically so the listing is deterministic.
+my @by_frequency = sort {
+    scalar(@{ $lexicon->{$b} }) <=> scalar(@{ $lexicon->{$a} })
+        or $a cmp $b
+} keys %$lexicon;
+
+foreach my $term (@by_frequency) {
+    my $postings = $lexicon->{$term};
+    print scalar(@$postings) . "\t" . $term . "\n";
+    # Stored value: the space-separated posting list for this word.
+    $index{$term} = join " ", @$postings;
+}
+
+untie %index;
+
+print_timer($timer, "Dumped db_file");
+exit;
+
+# parse_terms(\%args) -> count
+#
+# Tokenises $args->{string} and records every word in the file-level $lexicon.
+#
+#   string  text to index; undef is treated as containing no words
+#   type    posting prefix
+#   id      posting id
+#
+# For each word, the posting "type:id" is appended to the array ref stored
+# under the lower-cased word in $lexicon. Returns the number of words recorded.
+sub parse_terms {
+    my ($args) = @_;
+
+    my $s = $args->{'string'};
+    return 0 unless defined $s;
+
+    my $id = $args->{'type'} . ":" . $args->{'id'};
+
+    # \w matches "_", so turn underscores into separators before splitting.
+    $s =~ s/_/ /g;
+
+    # Split on separator runs without capturing them. split still yields an
+    # empty leading field when the string starts with a separator; drop it so
+    # the empty string is never indexed or counted as a word.
+    my @terms = grep { length } split /\W+/, $s;
+
+    foreach my $term (@terms) {
+        # Autovivification creates the posting list on first use.
+        push @{ $lexicon->{ lc $term } }, $id;
+    }
+
+    return scalar @terms;
+}
+
+# print_timer($seconds, $label)
+#
+# Writes one progress line to STDERR: the elapsed seconds formatted with two
+# decimals, the unit "s", then the label.
+sub print_timer {
+    my ($elapsed, $label) = @_;
+    printf STDERR "%3.2f s %s\n", $elapsed, $label;
+}
+
+################################################
+
+package Time::Stopwatch;
+
+# Tied-scalar stopwatch.
+#
+#   tie my $t, 'Time::Stopwatch';       # starts at 0 seconds
+#   tie my $t, 'Time::Stopwatch', 30;   # starts as if 30 seconds had elapsed
+#   print $t;                           # seconds elapsed so far
+#   $t = 0;                             # restart from the given value
+
+use strict;
+
+# A package variable (not a `my` lexical) so Time::Stopwatch->VERSION can see
+# it. Assigned in BEGIN so it is set at compile time, even when the enclosing
+# script exits before run-time statements in this package are reached.
+our $VERSION;
+BEGIN { $VERSION = '1.00' }
+
+# True when Time::HiRes could be loaded. The local __DIE__ handler keeps a
+# caller-installed die hook from firing if the module is missing.
+use constant HIRES => eval { local $SIG{__DIE__}; require Time::HiRes };
+
+# Current time in seconds: Time::HiRes::time() when available, else time().
+sub _now { return HIRES ? Time::HiRes::time() : time() }
+
+# tie constructor. The optional argument is the number of seconds the
+# stopwatch should already show. The object is a blessed scalar ref holding
+# the (possibly back-dated) start time.
+sub TIESCALAR {
+    my ($pkg, $elapsed) = @_;
+    my $start = _now() - (defined $elapsed ? $elapsed : 0);
+    return bless \$start, $pkg;
+}
+
+# Reading the tied scalar returns the seconds elapsed since the start time.
+sub FETCH { return _now() - ${ $_[0] } }
+
+# Assigning N back-dates the start time so the stopwatch reads N right now.
+sub STORE { ${ $_[0] } = _now() - $_[1] }
+
+1;
+