summaryrefslogtreecommitdiff
path: root/bucky2/t/count-words.pl
blob: a6ffc143227ff1d82dafb75d9a7e53c8f7d84892 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/perl
use strict;
use lib "../lib";
use Bucky;
use DB_File;
#require Time::Stopwatch;
# Stopwatch: each read of $timer yields the seconds elapsed since this tie
# (implemented by the Time::Stopwatch package at the bottom of this file).
tie my $timer, 'Time::Stopwatch';

print_timer($timer, "Initialized");

# Direct class-method call rather than the indirect-object form "new Bucky",
# which perl can misparse if a sub named "new" is ever in scope.
my $bucky = Bucky->new;

# Load each table up front. The thread, file and comment results are
# iterated further down as array references of hash rows.
# NOTE(review): $keywords is only referenced by the disabled keyword loop
# below -- confirm whether this query is still needed.
my $keywords = $bucky->db->select("keyword");
my $threads = $bucky->db->select("thread");
my $files = $bucky->db->select("file");
my $comments = $bucky->db->select("comment");

print_timer($timer, "Loaded mysql");

# $lexicon maps a lower-cased term to an array ref of "type:id" tags (filled
# in by parse_terms); $total accumulates the number of terms seen overall.
my $lexicon = {};
my $total = 0;
# Disabled: keyword counting, kept for reference.
#foreach my $keyword (@$keywords)
#	{
#	my $id = $keyword->{$id};
#	$lexicon->{ $keyword->{'keyword'} }++;
#	$total++;
#	}

# Each entry: rows to scan, the column holding the text to index, and the
# column holding the thread id the text is filed under. Every source is
# tagged with type "t".
for my $source (
	[ $threads,  'title',    'id'     ],
	[ $files,    'filename', 'thread' ],
	[ $comments, 'comment',  'thread' ],
	)
	{
	my ($rows, $text_field, $id_field) = @$source;
	for my $row (@$rows)
		{
		$total += parse_terms({ string => $row->{$text_field}, type => "t", id => $row->{$id_field} });
		}
	}

print_timer($timer, "Created index");

# Summary figures: total terms parsed and number of distinct terms.
my $unique = scalar keys %$lexicon;
print "--- WORD COUNT: " . $total . "\n";
print "--- UNIQUE WORDS: " . $unique . "\n";

# Persist the index as a Berkeley DB hash file. Fail loudly if the file
# cannot be opened; otherwise the writes below would land in a plain
# in-memory hash and be silently discarded.
my %index;
tie(%index, "DB_File", "gross.db", O_CREAT|O_RDWR, 0666, $DB_HASH)
	or die "Cannot open gross.db: $!\n";

# Most frequent terms first. The comparison must be on the number of
# postings: comparing the array references themselves with <=> would order
# by memory address. Ties are broken alphabetically so output is stable.
foreach my $term (sort { scalar(@{ $lexicon->{$b} }) <=> scalar(@{ $lexicon->{$a} }) or $a cmp $b } keys %$lexicon)
	{
	my $postings = $lexicon->{$term};
	print scalar(@$postings) . "\t" . $term . "\n";
	# Stored value is the space-separated list of "type:id" tags.
	$index{$term} = join " ", @$postings;
	}

untie %index;

print_timer($timer, "Dumped db_file");
# Stop here; everything below is sub and package definitions.
exit;

# Split a string into word terms and record each occurrence in the
# file-level $lexicon.
#
# Takes one hash ref with keys:
#   string - text to tokenize (underscores are treated as spaces)
#   type   - tag prefix
#   id     - identifier; combined with type as "type:id"
#
# For every term found, "type:id" is appended to $lexicon->{ lc(term) }.
# Returns the number of terms recorded (0 for an undefined or empty string).
sub parse_terms
	{
	my ($args) = @_;
	my $s = $args->{'string'};
	return 0 unless defined $s;
	my $id = $args->{'type'} . ":" . $args->{'id'};
	$s =~ s/_/ /g;
	my $count = 0;
	foreach my $term (split /\W+/, $s)
		{
		# split yields a leading empty field when the string starts with a
		# non-word character; that is not a term and must not be indexed.
		next if $term eq '';
		push @{ $lexicon->{ lc($term) } }, $id;
		$count++;
		}
	return $count;
	}

# Write one progress line to STDERR: the elapsed seconds (first argument,
# formatted with two decimals) followed by a label (second argument).
sub print_timer
	{
	my ($elapsed, $label) = @_;
	my $line = sprintf("%3.2f s %s\n", $elapsed, $label);
	print STDERR $line;
	}

################################################
# Inlined Time::Stopwatch package: the tied-scalar timer used by the script above.

package Time::Stopwatch;

# Tied-scalar stopwatch: reading the tied variable returns the seconds
# elapsed since it was tied (or last assigned); assigning a number resets
# the stopwatch so that it currently reads that number.

use strict;

# Package variable rather than a lexical so that Time::Stopwatch->VERSION
# can see it. Assigned in BEGIN because the script above exits before any
# run-time statement in this part of the file is reached.
our $VERSION;
BEGIN { $VERSION = '1.00'; }

# True when Time::HiRes could be loaded; otherwise the built-in
# whole-second time() is used.
use constant HIRES => eval { local $SIG{__DIE__}; require Time::HiRes };

# Current time in seconds, sub-second resolution when available.
sub _now { return HIRES ? Time::HiRes::time() : time(); }

# tie $var, 'Time::Stopwatch' [, $initial_seconds]
# The object is a blessed reference to the start time; an optional initial
# reading shifts that start time into the past.
sub TIESCALAR {
    my $pkg = shift;
    my $time = _now() - (@_ ? shift() : 0);
    return bless \$time, $pkg;
}

# Reading the variable: seconds elapsed since the stored start time.
sub FETCH { return _now() - ${$_[0]}; }

# Assigning to the variable: move the start time so the stopwatch now reads
# the assigned value.
sub STORE { ${$_[0]} = _now() - $_[1]; }

1;