blob: 93a41a7262b97bbb53f3f91f43db3f69c641e039 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
|
package Rest::Dailyrotten;
use base 'Rest';
# my $topsy_data = $self->rest_get_raw($self->topsy_query($page));
# $self->write_data("../tmp/topsy_call", $topsy_data);
# exit;
# Fetch one year of Daily Rotten archive pages, extract the posts from each
# page and write the combined result as XML.
#
# Parameters:
#   $year - optional archive year; defaults to 2009.
#
# Steps:
#   1. Fetch the archive calendar for the year via $self->rest_get_raw.
#   2. Collect every linked "_YEAR-N-N.html" day page found in it.
#   3. Drop the last collected page (see the comment at the pop below).
#   4. For each remaining page, use the copy under ../tmp/dr/raw/ if
#      $self->read_data returns one, otherwise fetch and store it.
#   5. Parse each page with $self->dailyrotten_posts and write the list of
#      { file, post } records to ../tmp/dr/YEAR.xml via $self->write_xml.
#
# Returns whatever $self->write_xml returns.
sub dailyrotten_get
{
    my ($self, $year) = @_;
    $year = 2009 unless defined $year;

    my $archive_url = "http://www.dailyrotten.com/archive/$year/";
    my $dailyrotten_calendar = $self->rest_get_raw($archive_url);

    # Quote the interpolated URL and year and escape the dot before "html":
    # unquoted, every "." would match any character and let look-alike
    # links through.
    my $day_link = qr{<a href="\Q$archive_url\E(_\Q$year\E-\d+-\d+\.html)">};

    # An undefined calendar response is treated as having no lines.
    my @lines = defined $dailyrotten_calendar
        ? split(/\n/, $dailyrotten_calendar)
        : ();

    my $valid = [];
    foreach my $line (@lines)
    {
        if ($line =~ $day_link)
        {
            push @$valid, $1;
        }
    }

    # skip the last day so we can get accurate forum count later
    pop(@$valid);

    my $xml_data = [];
    foreach my $file (@$valid)
    {
        my $raw_file = "../tmp/dr/raw/$file";
        my $raw_data = $self->read_data($raw_file);
        if (!$raw_data)
        {
            # Pause before each network fetch (presumably to go easy on
            # the remote server -- confirm before shortening).
            sleep 5;
            $raw_data = $self->rest_get_raw($archive_url . $file);
            $self->write_data($raw_file, $raw_data);
        }
        my $posts = $self->dailyrotten_posts($raw_data);
        push @$xml_data, { file => $file, post => $posts };
    }

    return $self->write_xml("../tmp/dr/$year.xml", $xml_data);
}
# Load the stored post data for one archive year.
#
# Parameters:
#   $year - optional archive year; defaults to 2009, so a call without
#           arguments reads ../tmp/dr/2009.xml (the path dailyrotten_get
#           writes for 2009).
#
# Returns whatever $self->read_xml returns for ../tmp/dr/YEAR.xml.
sub dailyrotten_load
{
    my ($self, $year) = @_;
    $year = 2009 unless defined $year;

    return $self->read_xml("../tmp/dr/$year.xml");
}
# Extract post records from the raw HTML of one archive page.
#
# Parameters:
#   $raw_data - page HTML as a single string; undef yields no records.
#
# The page is scanned line by line. Three independent patterns fill the
# record currently being built:
#   url      - target of a line-leading 'Read article...' link
#   title    - text of a link carrying class="newslink"
#   comments - the number N from 'Comments (N)'
# A 'Comments (N)' match closes the current record and starts a new one, so
# a record holds whichever url/title lines were seen since the previous one.
#
# Returns an array reference of hash references.
sub dailyrotten_posts
{
    my ($self, $raw_data) = @_;

    my $recs = [];
    return $recs unless defined $raw_data;

    my $rec = {};
    foreach my $line (split /\n/, $raw_data)
    {
        # Captures are non-greedy so they stop at the first closing
        # delimiter; a greedy capture would swallow any further anchors
        # that follow on the same line.
        if ($line =~ /^<a href="(.*?)" target="_blank">Read article\.\.\.<\/a>/)
        {
            $rec->{'url'} = $1;
        }
        if ($line =~ /class="newslink">(.*?)<\/a>/)
        {
            $rec->{'title'} = $1;
        }
        if ($line =~ /Comments \((\d+)\)/)
        {
            $rec->{'comments'} = $1;
            push @$recs, $rec;
            $rec = {};
        }
    }
    return $recs;
}
1;
|