diff options
Diffstat (limited to 'bucky2/lib/Rest')
| -rw-r--r-- | bucky2/lib/Rest/Dailyrotten.pm | 76 | ||||
| -rw-r--r-- | bucky2/lib/Rest/Topsy.pm | 185 | ||||
| -rw-r--r-- | bucky2/lib/Rest/Twitter.pm | 129 |
3 files changed, 390 insertions, 0 deletions
# ===========================================================================
# bucky2/lib/Rest/Dailyrotten.pm   (new file, index 0000000..93a41a7)
# ===========================================================================
package Rest::Dailyrotten;
use strict;
use warnings;
use base 'Rest';

# Seconds to pause before each archive page that is not already cached.
my $DAILYROTTEN_FETCH_DELAY = 5;

# dailyrotten_get([$year])
#
# Downloads the dailyrotten.com archive calendar for $year (default 2009),
# collects every "_YYYY-M-D.html" link found on it, fetches each of those
# pages (using ../tmp/dr/raw/<file> as a cache), extracts the posts with
# dailyrotten_posts() and writes the combined result to ../tmp/dr/<year>.xml.
#
# Returns whatever the inherited write_xml() returns.
# Dies if $year is not a four-digit number (it is used to build file paths).
sub dailyrotten_get {
    my ($self, $year) = @_;
    $year = 2009 unless defined $year;
    die "invalid year '$year'" unless $year =~ /^\d{4}\z/;

    my $archive_url = "http://www.dailyrotten.com/archive/$year/";
    my $calendar    = $self->rest_get_raw($archive_url);
    $calendar = "" unless defined $calendar;

    # The URL is quoted so its dots and slashes match literally.
    my @files;
    foreach my $line (split "\n", $calendar) {
        if ($line =~ /<a href="\Q$archive_url\E(_$year-\d+-\d+\.html)">/) {
            push @files, $1;
        }
    }

    # skip the last day so we can get accurate forum count later
    pop @files;

    my $xml_data = [];
    foreach my $file (@files) {
        my $cache_path = "../tmp/dr/raw/$file";
        my $raw_data   = $self->read_data($cache_path);
        if (!$raw_data) {
            # Not cached yet: be polite to the remote server, then fetch.
            sleep $DAILYROTTEN_FETCH_DELAY;
            $raw_data = $self->rest_get_raw($archive_url . $file);
            $self->write_data($cache_path, $raw_data);
        }
        push @$xml_data,
            { file => $file, post => $self->dailyrotten_posts($raw_data) };
    }

    return $self->write_xml("../tmp/dr/$year.xml", $xml_data);
}

# dailyrotten_load([$year])
#
# Reads back the XML file written by dailyrotten_get() for $year
# (default 2009) and returns the result of the inherited read_xml().
sub dailyrotten_load {
    my ($self, $year) = @_;
    $year = 2009 unless defined $year;
    die "invalid year '$year'" unless $year =~ /^\d{4}\z/;
    return $self->read_xml("../tmp/dr/$year.xml");
}

# dailyrotten_posts($raw_data)
#
# Scans one archive page line by line.  A record accumulates 'url' and
# 'title' as they are seen and is closed (pushed) when the
# "Comments (N)" line is reached.
#
# Returns an array ref of { url, title, comments } hashes (keys are only
# present if the corresponding line was found before the comment count).
sub dailyrotten_posts {
    my ($self, $raw_data) = @_;
    return [] unless defined $raw_data;

    my $recs = [];
    my $rec  = {};
    foreach my $line (split "\n", $raw_data) {
        # Unused so far: the archive date line could be matched with
        #   /Daily Rotten Archives<\/font><br>(.*)<br>/
        if ($line =~ /^<a href="(.*)" target="_blank">Read article\.\.\.<\/a>/) {
            $rec->{'url'} = $1;
        }
        if ($line =~ /class="newslink">(.*)<\/a>/) {
            $rec->{'title'} = $1;
        }
        if ($line =~ /Comments \((\d+)\)/) {
            $rec->{'comments'} = $1;
            push @$recs, $rec;
            $rec = {};
        }
    }
    return $recs;
}

1;

# ===========================================================================
# bucky2/lib/Rest/Topsy.pm   (new file, index 0000000..3ef045e)
# ===========================================================================
package Rest::Topsy;
use strict;
use warnings;
use base 'Rest';
use Data::Dumper;
use XML::Simple;

# topsy_search($query)
#
# Runs a topsy.com search, stores the raw response under
# ../tmp/nndb/topsy/<initials>/<query>.txt (initials = first two word
# characters of the query) and scrapes "count"/"label" span pairs.
#
# Returns a hash ref mapping lower-cased label => count (commas removed).
# A label is only recorded when a non-zero count immediately preceded it.
sub topsy_search {
    my ($self, $query) = @_;
    my $topsy_data = $self->rest_get_raw("http://topsy.com/s", { q => $query });
    $topsy_data = "" unless defined $topsy_data;

    (my $initials = $query) =~ s/\W//g;
    $initials = substr($initials, 0, 2);

    my $dir = "../tmp/nndb/topsy/$initials";
    if (!-e $dir) {
        mkdir $dir or warn "can't create $dir: $!";
    }

    # A slash in the query must not escape the cache directory.
    (my $file_name = $query) =~ tr{/\0}{_};
    $self->write_data("$dir/$file_name.txt", $topsy_data);

    my $rank = {};
    my $value;
    foreach my $line (split "\n", $topsy_data) {
        next unless $line =~ /<span class="(count|label)">(.*)<\/span>/;
        my ($token, $text) = ($1, $2);
        if ($token eq "count") {
            # Strip every thousands separator, not just the first one.
            ($value = $text) =~ tr/,//d;
        }
        else {
            $rank->{lc $text} = $value if $value;
            undef $value;
        }
    }
    return $rank;
}

# topsy_get()
#
# Resumes (or starts) paging through a topsy concept listing.  The first
# page is read from the previously saved file ../tmp/topsy_call; to refresh
# that file by hand, run:
#   my $topsy_data = $self->rest_get_raw($self->topsy_query($page));
#   $self->write_data("../tmp/topsy_call", $topsy_data);
#
# Entries already saved in ../tmp/topsy_entries.xml take precedence over the
# ones parsed from the saved first page.  Remaining pages are fetched with a
# 10-14 second pause between requests, and the XML file is rewritten after
# every page so an interrupted run can be resumed.
#
# Returns the list of all collected entries.
sub topsy_get {
    my ($self) = @_;

    my $topsy_data = $self->read_data("../tmp/topsy_call");
    $topsy_data = "" unless defined $topsy_data;

    my $script_data   = $self->topsy_script_data($topsy_data) || {};
    my $topsy_entries = $self->topsy_entries($topsy_data);
    my $xml_entries   = $self->topsy_load;
    if (ref $xml_entries eq 'ARRAY' && @$xml_entries) {
        $topsy_entries = $xml_entries;
    }

    # Page size comes from the page's own script block when usable.
    my $per_page = 10;
    if (defined $script_data->{'perpage'}
        && $script_data->{'perpage'} =~ /^[1-9]\d*\z/) {
        $per_page = $script_data->{'perpage'};
    }

    my $page      = int(scalar(@$topsy_entries) / $per_page) || 1;
    my $last_page = defined $script_data->{'pages'} ? $script_data->{'pages'} : "";
    my $total     = defined $script_data->{'total'} ? $script_data->{'total'} : "";

    print "$last_page pages\n";

    while ($page < ($last_page || 0)) {
        $page++;
        print $page . "...";
        my $page_data    = $self->rest_get_raw($self->topsy_query($page));
        my $page_entries = $self->topsy_entries($page_data);
        push @$topsy_entries, @$page_entries;
        $self->write_xml("../tmp/topsy_entries.xml", $topsy_entries);
        sleep 10 + (int rand 5);
    }

    print "Expected " . $total . ", got " . scalar(@$topsy_entries) . "\n";

    $self->write_xml("../tmp/topsy_entries.xml", $topsy_entries);

    return @$topsy_entries;
}

# topsy_load()
#
# Reads ../tmp/topsy_entries.xml via the inherited read_xml(), prints how
# many entries were loaded and returns read_xml()'s result unchanged.
sub topsy_load {
    my ($self) = @_;
    my $page_entries = $self->read_xml("../tmp/topsy_entries.xml");
    my $count = ref $page_entries eq 'ARRAY' ? scalar(@$page_entries) : 0;
    print "Loaded " . $count . " entries\n";
    return $page_entries;
}

# topsy_entries($data)
#
# Splits a concept page on '<div class="concept-rank">' and turns each chunk
# into a hash.  Within a chunk, a class="..." attribute selects the current
# key; the next usable value found (text between '>' and '<', or the quoted
# description text) is stored under that key and the key is cleared.
# A url(...) on any line is stored as 'tile'.  Parsing of a chunk stops at
# the first inline <script> tag.
#
# Returns an array ref of non-empty entry hashes.
sub topsy_entries {
    my ($self, $data) = @_;
    return [] unless defined $data;

    my $entries = [];
    foreach my $chunk (split /<div class="concept-rank">/, $data) {
        next if $chunk =~ /concept-list-re/;

        my %entry;
        my $key = "";
      LINE:
        foreach my $raw_line (split "\n", $chunk) {
            my $line = $self->trim($raw_line);
            next LINE unless $line;

            if ($line =~ /class="(\w+)"/) {
                $key = $1;
            }
            if ($line =~ /url\((.*)\)/) {
                $entry{'tile'} = $1;
            }

            # Branch order matters: the generic >...< match wins over the
            # description match whenever both would apply.
            my $value = "";
            if ($key eq "total" && $line =~ />(\d+)</) {
                $value = $1;
            }
            elsif ($line =~ />(.*)</) {
                $value = $1;
            }
            elsif ($key eq "description" && $line =~ /&ldquo;(.*)&rdquo;/) {
                $value = $1;
                $value =~ s/ http.*$//;
            }

            if ($key && $value) {
                $entry{$key} = $value;
                $key = "";
            }
            last LINE if $line =~ /script type="text/;
        }

        push @$entries, \%entry if %entry;
    }
    return $entries;
}

# topsy_script_data($data)
#
# Extracts the paging information that the page embeds as a script block:
#
#   Recipes.re= {
#     page: 2,
#     total: 964,
#     perpage: 10,
#     pages: 97
#   };
#
# Returns a hash ref with whichever of page/total/perpage/pages were found,
# or undef if none were.
sub topsy_script_data {
    my ($self, $data) = @_;
    return undef unless defined $data;

    my %script_data;
    foreach my $raw_line (split "\n", $data) {
        next unless $raw_line =~ /:/;
        (my $line = $raw_line) =~ s/\s+//g;
        $line =~ tr/,//d;
        my ($k, $v) = split /:/, $line;
        next unless defined $k && defined $v && length $v;
        # Exact key match only, so e.g. "pagetitle" is not mistaken for "page".
        next unless $k =~ /^(?:page|total|perpage|pages)\z/;
        $script_data{$k} = $v;
    }
    return %script_data ? \%script_data : undef;
}

# topsy_query($page)
#
# Returns the (url, query-hash-ref) pair for requesting page $page of the
# concept listing for the URL previously stored with url().
sub topsy_query {
    my ($self, $page) = @_;
    my %query = (
        "page"        => $page,
        "sort_method" => "",
        "url"         => $self->url,
        "class"       => "UB::Concept::List::Re",
    );
    return ("http://topsy.com/concept", \%query);
}

# url([$url])
#
# Combined getter/setter for the object's 'url' field.  A true argument
# replaces the stored value; the current value is always returned.
sub url {
    my ($self, $url) = @_;
    $self->{'url'} = $url if $url;
    return $self->{'url'};
}

1;

# ===========================================================================
# bucky2/lib/Rest/Twitter.pm   (new file, index 0000000..00220a6)
# ===========================================================================
package Rest::Twitter;
use strict;
use warnings;
use base 'Rest';
use Data::Dumper;

my $twitter_status_uri = "http://twitter.com/statuses/mentions.xml";
my $twitter_update_uri = "http://twitter.com/statuses/update.xml";
my $twitter_dm_uri     = "http://twitter.com/direct_messages.xml";
my $twitter_dm_new_uri = "http://twitter.com/direct_messages/new.xml";

# dm_post($user, $tweet)
#
# Sends a direct message.  Whitespace runs in the text are collapsed to a
# single space.  Returns nothing when either argument is false, otherwise
# the result of the inherited rest_post().
sub dm_post {
    my ($self, $user, $tweet) = @_;
    return unless $user && $tweet;
    $tweet =~ s/\s+/ /g;
    print ">>> D $user: $tweet\n";
    return $self->rest_post($twitter_dm_new_uri,
        { text => $tweet, user => $user });
}

# tweet_post($tweet [, $replyid])
#
# Posts a status update, optionally as a reply to status $replyid.
# Whitespace runs in the text are collapsed to a single space.  Returns
# nothing for an empty/undefined tweet, otherwise the result of rest_post().
sub tweet_post {
    my ($self, $tweet, $replyid) = @_;
    return unless defined $tweet && length $tweet;
    $tweet =~ s/\s+/ /g;
    print ">>> $tweet\n";
    return $self->rest_post($twitter_update_uri,
        { status => $tweet, in_reply_to_status_id => $replyid });
}

# _status_by_id($status)
#
# Normalizes the decoded payload into a hash keyed by message id.
#   DEFAULT: plural behavior   - already a hash of id => record.
#   CATCH:   singular behavior - a lone record that carries its own 'id'.
# Anything that is not a hash ref yields an empty hash.
sub _status_by_id {
    my ($self, $status) = @_;
    return {} unless ref $status eq 'HASH';
    return { $status->{'id'} => $status } if exists $status->{'id'};
    return $status;
}

# _advance_since_id($key, $since_id, @ids)
#
# Persists the largest of @ids under $key when it exceeds $since_id.
sub _advance_since_id {
    my ($self, $key, $since_id, @ids) = @_;
    my $last_id = 0;
    foreach my $id (@ids) {
        $last_id = $id if $id > $last_id;
    }
    if ($last_id > ($since_id || 0)) {
        $self->since_id($key, $last_id);
    }
    return;
}

# dm_get()
#
# Fetches direct messages newer than the stored "dm" since-id and advances
# that since-id to the newest message seen.
#
# Returns an array ref (possibly empty) of
#   { id, tweet, user, type => "dm" }
# hashes in ascending id order.
sub dm_get {
    my ($self) = @_;
    my $twitter_since_id = $self->since_id("dm");
    my $dm_data = $self->rest_get($twitter_dm_uri,
        { since_id => $twitter_since_id });

    my $payload = ref $dm_data eq 'HASH' ? $dm_data->{'direct_message'} : undef;
    my $by_id   = $self->_status_by_id($payload);

    my $dms = [];
    my @ids;
    foreach my $id (sort { $a <=> $b } keys %$by_id) {
        my $record = $by_id->{$id};
        next unless ref $record eq 'HASH';
        push @ids, $id;
        push @$dms,
            {
                id    => $id,
                tweet => $record->{'text'},
                user  => $record->{'sender_screen_name'},
                type  => "dm",
            };
    }

    $self->_advance_since_id("dm", $twitter_since_id, @ids);
    return $dms;
}

# tweet_get()
#
# Fetches mentions newer than the stored "status" since-id and advances
# that since-id to the newest status seen.
#
# Returns (undef, undef) when the response carries no 'status' element
# (kept for existing callers); otherwise an array ref of
#   { id, tweet, user, type => "tweet" }
# hashes in ascending id order.
sub tweet_get {
    my ($self) = @_;
    my $twitter_since_id = $self->since_id("status");
    my $tweet_data = $self->rest_get($twitter_status_uri,
        { since_id => $twitter_since_id });
    return (undef, undef)
        unless ref $tweet_data eq 'HASH' && exists $tweet_data->{'status'};

    my $by_id = $self->_status_by_id($tweet_data->{'status'});

    my $tweets = [];
    my @ids;
    foreach my $id (sort { $a <=> $b } keys %$by_id) {
        my $record = $by_id->{$id};
        next unless ref $record eq 'HASH';
        push @ids, $id;
        my $author = ref $record->{'user'} eq 'HASH' ? $record->{'user'} : {};
        push @$tweets,
            {
                id    => $id,
                tweet => $record->{'text'},
                user  => $author->{'screen_name'},
                type  => "tweet",
            };
    }

    $self->_advance_since_id("status", $twitter_since_id, @ids);
    return $tweets;
}

# since_id($key [, $id])
#
# Getter/setter for the per-$key high-water mark ("dm" or "status").
# The value is cached in $self->{since} and backed by a file (see
# _since_id_file).  The persisted value is loaded before any comparison, so
# a smaller $id never overwrites a larger one already on disk.
#
# Returns the current since-id for $key, or nothing if $key is false.
sub since_id {
    my ($self, $key, $id) = @_;
    return unless $key;

    my $since = $self->{'since'} ||= {};
    if (!exists $since->{$key}) {
        $since->{$key} = $self->_since_id_read($key);
    }

    if ($id && ($since->{$key} || 0) < $id) {
        $since->{$key} = $id;
        $self->_since_id_write($key, $id);
    }
    return $since->{$key};
}

# _since_id_read($key)
#
# Returns the trimmed first line of the since-id file for $key, or undef
# if the file is empty.  Dies if the file cannot be opened.
sub _since_id_read {
    my ($self, $key) = @_;
    my $file = $self->_since_id_file($key);
    open my $fh, '<', $file or die "can't read $file: $!";
    my $line = <$fh>;    # scalar context: first line only
    close $fh;
    return defined $line ? $self->trim($line) : undef;
}

# _since_id_write($key, $id)
#
# Overwrites the since-id file for $key with $id.  Dies on I/O failure so a
# lost high-water mark does not go unnoticed.
sub _since_id_write {
    my ($self, $key, $id) = @_;
    my $file = $self->_since_id_file($key);
    open my $fh, '>', $file or die "can't write $file: $!";
    print {$fh} $id . "\n";
    close $fh or die "can't close $file: $!";
    return;
}

# _since_id_file($key)
#
# Builds the path ../tmp/twitter-<user>-<key>.  Both the directory and the
# file must already exist; dies otherwise.
sub _since_id_file {
    my ($self, $key) = @_;
    my $tmp_dir = "../tmp";
    my $file    = $tmp_dir . "/twitter-" . $self->user . "-" . $key;
    foreach my $path ($tmp_dir, $file) {
        die("can't find $path") unless -e $path;
    }
    return $file;
}

1;
