1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
|
#!/usr/bin/perl
use strict;
use lib "../lib";
use Bucky;
use DB_File;
#require Time::Stopwatch;
# Stopwatch used for progress reporting: reading $timer yields the number of
# seconds elapsed since it was tied (see the Time::Stopwatch package below).
tie my $timer, 'Time::Stopwatch';
print_timer($timer, "Initialized");

# Load every table that contributes text to the index.
my $bucky    = Bucky->new;
my $keywords = $bucky->db->select("keyword");   # unused while the keyword pass below is disabled
my $threads  = $bucky->db->select("thread");
my $files    = $bucky->db->select("file");
my $comments = $bucky->db->select("comment");
print_timer($timer, "Loaded mysql");

# $lexicon maps a lower-cased word to an array ref of "type:id" postings.
# It is filled in by parse_terms(), which reads this file-level variable.
my $lexicon = {};
my $total = 0;

#foreach my $keyword (@$keywords)
# {
# my $id = $keyword->{$id};
# $lexicon->{ $keyword->{'keyword'} }++;
# $total++;
# }

# Thread titles, file names and comment bodies are all indexed under the id
# of the thread they are attached to.
foreach my $thread (@$threads)
{
    $total += parse_terms({ string => $thread->{'title'}, type => "t", id => $thread->{'id'} });
}
foreach my $file (@$files)
{
    $total += parse_terms({ string => $file->{'filename'}, type => "t", id => $file->{'thread'} });
}
foreach my $comment (@$comments)
{
    $total += parse_terms({ string => $comment->{'comment'}, type => "t", id => $comment->{'thread'} });
}
print_timer($timer, "Created index");

my $unique = scalar keys %$lexicon;
print "--- WORD COUNT: " . $total . "\n";
print "--- UNIQUE WORDS: " . $unique . "\n";

# Fail loudly if the database file cannot be opened; otherwise %index would
# just be an ordinary in-memory hash and nothing would be written to disk.
my %index;
tie(%index, "DB_File", "gross.db", O_CREAT|O_RDWR, 0666, $DB_HASH)
    or die "Cannot open gross.db: $!\n";

# Most frequent terms first. The values in $lexicon are array references, so
# the posting counts must be compared (comparing the references themselves
# would order by memory address). Ties are broken alphabetically so the
# report is stable between runs.
my @terms_by_frequency = sort {
    scalar(@{ $lexicon->{$b} }) <=> scalar(@{ $lexicon->{$a} })
        || $a cmp $b
} keys %$lexicon;

foreach my $term (@terms_by_frequency)
{
    my $postings = $lexicon->{$term};
    print scalar(@$postings) . "\t" . $term . "\n";
    $index{$term} = join " ", @$postings;
}
untie %index;
print_timer($timer, "Dumped db_file");
exit;
# parse_terms(\%args) - tokenize a string and record its words in $lexicon.
#
# Arguments (hash reference):
#   string - text to tokenize; underscores are treated as word separators
#   type   - prefix of the posting recorded for each word
#   id     - identifier joined to the type as "type:id"
#
# Every word is lower-cased and the "type:id" posting is appended to that
# word's list in the file-level $lexicon hash (one posting per occurrence).
# Returns the number of words found (0 for an undefined or empty string).
sub parse_terms
{
    my ($args) = @_;
    my $s = $args->{'string'};
    return 0 unless defined $s;

    my $id = $args->{'type'} . ":" . $args->{'id'};

    # "_" counts as a word character in regexes, so convert it to a space
    # first to make names like "foo_bar" index as two words.
    $s =~ s/_/ /g;

    # Match runs of word characters directly. Splitting on /(\W+)/ produced
    # an empty leading field whenever the string began with a non-word
    # character, and that empty string was then indexed and counted.
    my $count = 0;
    while ( $s =~ /(\w+)/g )
    {
        my $term = lc($1);
        push @{ $lexicon->{$term} }, $id;
        $count++;
    }
    return $count;
}
# print_timer($seconds, $label) - write a progress line to STDERR in the
# form "<seconds to two decimals> s <label>".
sub print_timer
{
    my ($elapsed, $label) = @_;
    my $line = sprintf("%3.2f s %s\n", $elapsed, $label);
    print STDERR $line;
}
################################################
# Time::Stopwatch - tie a scalar so that reading it returns elapsed seconds.
#
#   tie my $timer, 'Time::Stopwatch';        # starts counting from 0
#   tie my $timer, 'Time::Stopwatch', 10;    # starts counting from 10
#   print $timer;                            # seconds since the start
#   $timer = 0;                              # restart from the given value
#
# Time::HiRes is used for sub-second resolution when it can be loaded;
# otherwise the built-in time() is used.
package Time::Stopwatch;
use strict;

# A package variable (not a lexical) so that $Time::Stopwatch::VERSION and
# Time::Stopwatch->VERSION can see it. It is assigned at compile time
# because this package is inlined after the script's "exit", so a plain
# runtime assignment here would never be executed.
our $VERSION;
BEGIN { $VERSION = '1.00' }

# True when Time::HiRes loaded. The local __DIE__ handler keeps a failed
# require from triggering any die hook installed by the caller.
use constant HIRES => eval { local $SIG{__DIE__}; require Time::HiRes };

# Current time in seconds from the best clock available.
sub _now { return HIRES ? Time::HiRes::time() : time() }

# The object is a reference to the clock reading that corresponds to a
# stopwatch value of zero; an optional initial value shifts that reading back.
sub TIESCALAR {
    my ($class, $initial) = @_;
    my $start = _now() - (defined $initial ? $initial : 0);
    return bless \$start, $class;
}

# Reading the scalar returns the seconds elapsed since the zero point.
sub FETCH {
    my ($self) = @_;
    return _now() - $$self;
}

# Assigning a value moves the zero point so the stopwatch now reads $value.
sub STORE {
    my ($self, $value) = @_;
    $$self = _now() - $value;
}
1;
|