summary refs log tree commit diff
path: root/bucky2/lib/Rest
diff options
context:
space:
mode:
Diffstat (limited to 'bucky2/lib/Rest')
-rw-r--r-- bucky2/lib/Rest/Dailyrotten.pm 76
-rw-r--r-- bucky2/lib/Rest/Topsy.pm 185
-rw-r--r-- bucky2/lib/Rest/Twitter.pm 129
3 files changed, 390 insertions, 0 deletions
diff --git a/bucky2/lib/Rest/Dailyrotten.pm b/bucky2/lib/Rest/Dailyrotten.pm
new file mode 100644
index 0000000..93a41a7
--- /dev/null
+++ b/bucky2/lib/Rest/Dailyrotten.pm
@@ -0,0 +1,76 @@
+package Rest::Dailyrotten;
+use base 'Rest';
+
+# my $topsy_data = $self->rest_get_raw($self->topsy_query($page));
+# $self->write_data("../tmp/topsy_call", $topsy_data);
+# exit;
+
+
+# Mirror one year of the Daily Rotten archive and summarise it as XML.
+#
+# Fetches the archive index for $year, collects the per-day page names it
+# links to, obtains each page (from ../tmp/dr/raw/ when read_data yields
+# something for it, otherwise over HTTP, saving the download there), runs
+# dailyrotten_posts over it and finally hands the combined list to
+# write_xml as ../tmp/dr/$year.xml.
+#
+# Params:  $year - optional archive year; defaults to 2009.
+# Returns: the result of write_xml, or nothing when the index could not
+#          be fetched.
+sub dailyrotten_get
+    {
+    my ($self, $year) = @_;
+    $year ||= 2009;
+    my $archive_url = "http://www.dailyrotten.com/archive/$year/";
+    my $dailyrotten_calendar = $self->rest_get_raw($archive_url);
+    return unless defined $dailyrotten_calendar;
+
+    # \Q...\E keeps the dots and slashes of the URL literal, and the dot in
+    # ".html" is escaped so only real page names are captured.
+    my $link_re = qr/<a href="\Q$archive_url\E(_\Q$year\E-\d+-\d+\.html)">/;
+    my $valid = [];
+    foreach my $line (split "\n", $dailyrotten_calendar)
+        {
+        if ($line =~ $link_re)
+            {
+            push @$valid, $1;
+            }
+        }
+    # skip the last day so we can get accurate forum count later
+    pop(@$valid);
+
+    my $xml_data = [];
+    foreach my $file (@$valid)
+        {
+        my $raw_data = $self->read_data("../tmp/dr/raw/$file");
+        if (!$raw_data)
+            {
+            # Pause before each download (presumably to throttle requests
+            # against the remote site).
+            sleep 5;
+            my $page_url = $archive_url . $file;
+            $raw_data = $self->rest_get_raw($page_url);
+            if (!$raw_data)
+                {
+                # Do not cache or parse an empty response; a later run can
+                # retry this day.
+                warn "no data for $page_url, skipping\n";
+                next;
+                }
+            $self->write_data("../tmp/dr/raw/$file", $raw_data);
+            }
+        my $posts = $self->dailyrotten_posts($raw_data);
+        push @$xml_data, { file => $file, post => $posts };
+        }
+    return $self->write_xml("../tmp/dr/$year.xml", $xml_data);
+    }
+# Load the per-year summary from ../tmp/dr/$year.xml.
+#
+# Params:  $year - optional archive year; defaults to 2009, the year the
+#                  file name was originally hard-coded to.
+# Returns: whatever read_xml returns for that file.
+sub dailyrotten_load
+    {
+    my ($self, $year) = @_;
+    $year ||= 2009;
+    return $self->read_xml("../tmp/dr/$year.xml");
+    }
+# Pull the posts out of one raw archive page.
+#
+# The page is scanned line by line.  A "Read article..." link supplies the
+# post's url, a "newslink" anchor supplies its title, and a
+# "Comments (N)" marker supplies the comment count and closes the post.
+#
+# Params:  $raw_data - page text.
+# Returns: arrayref of hashrefs with the keys url, title and comments
+#          (a key is absent when its line was not seen before the
+#          Comments marker).
+sub dailyrotten_posts
+    {
+    my ($self, $raw_data) = @_;
+    my @posts;
+    my %post;
+    for my $html_line (split "\n", $raw_data)
+        {
+        $post{'url'} = $1
+            if $html_line =~ /^<a href="(.*)" target="_blank">Read article\.\.\.<\/a>/;
+        $post{'title'} = $1
+            if $html_line =~ /class="newslink">(.*)<\/a>/;
+
+        # Only the Comments marker finishes a post.
+        next unless $html_line =~ /Comments \((\d+)\)/;
+        $post{'comments'} = $1;
+        push @posts, { %post };
+        %post = ();
+        }
+    return \@posts;
+    }
+
+1;
diff --git a/bucky2/lib/Rest/Topsy.pm b/bucky2/lib/Rest/Topsy.pm
new file mode 100644
index 0000000..3ef045e
--- /dev/null
+++ b/bucky2/lib/Rest/Topsy.pm
@@ -0,0 +1,185 @@
+package Rest::Topsy;
+use base 'Rest';
+use Data::Dumper;
+use XML::Simple;
+
+# Run a Topsy search and return the counters shown on the result page.
+#
+# The raw response is also saved to
+# ../tmp/nndb/topsy/<initials>/<query>.txt, where <initials> is the first
+# two word characters of the query.
+#
+# Params:  $query - search string, sent as the "q" parameter.
+# Returns: hashref mapping each lower-cased label to the count that
+#          preceded it on the page (thousands separators removed).
+sub topsy_search
+    {
+    my ($self, $query) = @_;
+    my $topsy_data = $self->rest_get_raw("http://topsy.com/s", { q => $query });
+
+    my $initials = $query;
+    $initials =~ s/\W//g;
+    $initials = substr($initials, 0, 2);
+    my $cache_dir = "../tmp/nndb/topsy/$initials";
+    if (! -e $cache_dir)
+        {
+        # Builtin mkdir instead of spawning /bin/mkdir; a failure is
+        # reported rather than silently ignored.
+        mkdir $cache_dir or warn "can't create $cache_dir: $!\n";
+        }
+    # NOTE(review): $query goes into the file name unmodified, so a query
+    # containing "/" lands outside $cache_dir - confirm callers only pass
+    # plain names.
+    $self->write_data("$cache_dir/$query.txt", $topsy_data);
+
+    my @lines = defined $topsy_data ? split("\n", $topsy_data) : ();
+    my $value = undef;
+    my $rank = {};
+    foreach my $line (@lines)
+        {
+        next unless $line =~ /<span class="(count|label)">(.*)<\/span>/;
+        my ($token, $text) = ($1, $2);
+        if ($token eq "count")
+            {
+            $value = $text;
+            # Remove every comma: "1,234,567" must become 1234567, not
+            # "1234,567".
+            $value =~ tr/,//d;
+            }
+        elsif ($token eq "label")
+            {
+            # A label is only recorded when a count came just before it.
+            $rank->{lc $text} = $value if $value;
+            undef $value;
+            }
+        }
+    return $rank;
+    }
+# Collect all entries of a Topsy concept listing, resuming from disk.
+#
+# The first page is read from ../tmp/topsy_call.  If
+# ../tmp/topsy_entries.xml already holds entries, those replace the
+# first-page entries and the page counter resumes from their number.  The
+# remaining pages are then fetched one at a time, with the accumulated
+# list written back to the XML file after every page.
+#
+# Returns: the accumulated entries as a list of hashrefs.
+sub topsy_get
+    {
+    my ($self) = @_;
+
+    # The snapshot read below is not fetched here; the commented-out code
+    # this replaces suggests it was captured once with:
+    #   my $topsy_data = $self->rest_get_raw($self->topsy_query($page));
+    #   $self->write_data("../tmp/topsy_call", $topsy_data);
+    my $topsy_data = $self->read_data("../tmp/topsy_call");
+
+    # topsy_script_data returns undef when it finds no counters.
+    my $topsy_script_data = $self->topsy_script_data($topsy_data) || {};
+    my $topsy_entries = $self->topsy_entries($topsy_data);
+    my $xml_entries = $self->topsy_load;
+    if ($xml_entries && scalar(@$xml_entries))
+        {
+        $topsy_entries = $xml_entries;
+        }
+
+    # Ten entries per page: resume after the last complete page on disk.
+    my $page = int(scalar(@$topsy_entries) / 10) || 1;
+    my $last_page = $topsy_script_data->{'pages'} || 0;
+
+    print "$last_page pages\n";
+
+    while ($page < $last_page)
+        {
+        $page++;
+        print $page . "...";
+        my $page_data = $self->rest_get_raw($self->topsy_query($page));
+        my $page_entries = $self->topsy_entries($page_data);
+        push @$topsy_entries, @$page_entries;
+        # Checkpoint after every page so an interrupted run can resume.
+        $self->write_xml("../tmp/topsy_entries.xml", $topsy_entries);
+        sleep 10 + (int rand 5);
+        }
+
+    print "Expected " . $topsy_script_data->{'total'} . ", got " . scalar(@$topsy_entries)."\n";
+
+    $self->write_xml("../tmp/topsy_entries.xml", $topsy_entries);
+
+    # The old code returned @$page_entries, a variable that only existed
+    # inside the loop, so callers always received an empty list.
+    return @$topsy_entries;
+    }
+# Read the saved entry list from ../tmp/topsy_entries.xml, report how many
+# entries it holds on STDOUT, and return it unchanged.
+sub topsy_load
+    {
+    my ($self) = @_;
+    my $stored = $self->read_xml("../tmp/topsy_entries.xml");
+    my $count = scalar(@$stored);
+    print "Loaded " . $count . " entries\n";
+    return $stored;
+    }
+
+# Split a Topsy listing page into one hash per entry.
+#
+# The page is cut at every '<div class="concept-rank">'; pieces that
+# mention "concept-list-re" are ignored.  Within a piece, the most recent
+# class="..." attribute names the field, and the next captured text is
+# stored under that name.  A url(...) value is stored under 'tile'.
+# Scanning of a piece ends at the first line containing a text script tag.
+#
+# Params:  $data - raw page text.
+# Returns: arrayref of hashrefs, one per piece that produced any field.
+sub topsy_entries
+    {
+    my ($self, $data) = @_;
+    my @parsed;
+    for my $chunk (split '<div class="concept-rank">', $data)
+        {
+        next if $chunk =~ /concept-list-re/;
+        my %fields;
+        my $field = "";
+        for my $raw_line (split "\n", $chunk)
+            {
+            my $text = $self->trim($raw_line);
+            next unless $text;
+
+            $field = $1 if $text =~ /class\=\"(\w+)\"/;
+            $fields{'tile'} = $1 if $text =~ /url\((.*)\)/;
+
+            # The order of these tests matters: the generic >...< capture
+            # wins over the description pattern whenever both would match.
+            my $found = "";
+            if ($field eq "total" && $text =~ />(\d+)</)
+                {
+                $found = $1;
+                }
+            elsif ($text =~ />(.*)</)
+                {
+                $found = $1;
+                }
+            elsif ($field eq "description" && $text =~ /\&ldquo\;(.*)\&rdquo\;/)
+                {
+                $found = $1;
+                $found =~ s/ http.*$//;
+                }
+
+            if ($field && $found)
+                {
+                $fields{$field} = $found;
+                # Forget the field name until the next class attribute.
+                $field = undef;
+                }
+            last if $text =~ /script type\=\"text/;
+            }
+        push @parsed, { %fields } if scalar keys %fields;
+        }
+    return \@parsed;
+    }
+# Extract the pagination counters from the inline script of a Topsy page.
+# The script being parsed looks like this:
+#
+#   Recipes.re= {
+#       page: 2,
+#       total: 964,
+#       perpage: 10,
+#       pages: 97
+#   };
+#
+# Params:  $data - raw page text.
+# Returns: hashref holding whichever of page, total, perpage and pages
+#          were found, or undef when none were.
+sub topsy_script_data
+    {
+    my ($self, $data) = @_;
+    my @lines = defined $data ? split("\n", $data) : ();
+    my $script_data = {};
+    foreach my $line (@lines)
+        {
+        next unless $line =~ /\:/;
+        $line =~ s/\s+//g;
+        $line =~ s/\,//;
+        my ($k, $v) = split ":", $line;
+        # Test for presence, not truth, so a value of 0 (e.g. "total: 0")
+        # is kept.
+        next unless defined $k && defined $v && length $v;
+        # Anchored at both ends so look-alike keys such as "pagename" or
+        # "totally" are not mistaken for counters.
+        next unless $k =~ /^(?:page|total|perpage|pages)$/;
+        $script_data->{$k} = $v;
+        }
+    return scalar keys %$script_data ? $script_data : undef;
+    }
+# Build the request for one page of the concept listing.
+#
+# Params:  $page - page number to request.
+# Returns: a two-element list: the endpoint URL and a hashref of query
+#          parameters (page, sort_method, url, class).  The url parameter
+#          is the value currently held by this object's url accessor.
+sub topsy_query
+    {
+    my ($self, $page) = @_;
+    my $endpoint = "http://topsy.com/concept";
+    my %params =
+        (
+        "page"        => $page,
+        "sort_method" => "",
+        "url"         => $self->url,
+        "class"       => "UB::Concept::List::Re",
+        );
+    return ($endpoint, \%params);
+    }
+# Combined getter/setter for the object's 'url' field.
+#
+# Params:  $url - optional new value; ignored when false.
+# Returns: the stored url after any update.
+sub url
+    {
+    my ($self, $url) = @_;
+    $self->{'url'} = $url if $url;
+    return $self->{'url'};
+    }
+1;
diff --git a/bucky2/lib/Rest/Twitter.pm b/bucky2/lib/Rest/Twitter.pm
new file mode 100644
index 0000000..00220a6
--- /dev/null
+++ b/bucky2/lib/Rest/Twitter.pm
@@ -0,0 +1,129 @@
+package Rest::Twitter;
+use base 'Rest';
+use Data::Dumper;
+# Twitter REST endpoints (XML responses) used by the methods in this package.
+my $twitter_status_uri = "http://twitter.com/statuses/mentions.xml";     # read by tweet_get
+my $twitter_update_uri = "http://twitter.com/statuses/update.xml";       # posted to by tweet_post
+my $twitter_dm_uri = "http://twitter.com/direct_messages.xml";           # read by dm_get
+my $twitter_dm_new_uri = "http://twitter.com/direct_messages/new.xml";   # posted to by dm_post
+
+# Send a direct message.
+#
+# Runs of whitespace in the text are collapsed to single spaces and the
+# message is echoed to STDOUT before it is posted.
+#
+# Params:  $user  - recipient.
+#          $tweet - message text.
+# Returns: the result of rest_post, or nothing when either argument is
+#          false.
+sub dm_post
+    {
+    my ($self, $user, $tweet) = @_;
+    if (!$user || !$tweet)
+        {
+        return;
+        }
+    $tweet =~ s/\s+/ /g;
+    print ">>> D $user: $tweet\n";
+    my %message = (text => $tweet, user => $user);
+    return $self->rest_post($twitter_dm_new_uri, \%message);
+    }
+# Post a status update, optionally as a reply.
+#
+# Runs of whitespace in the text are collapsed to single spaces and the
+# status is echoed to STDOUT before it is posted.
+#
+# Params:  $tweet   - status text.
+#          $replyid - optional id of the status being replied to.
+# Returns: the result of rest_post, or nothing when there is no text to
+#          post.
+sub tweet_post
+    {
+    my ($self, $tweet, $replyid) = @_;
+    # Same guard as dm_post: never post an empty status, and never run the
+    # substitution below on undef.
+    return unless defined $tweet && $tweet =~ /\S/;
+    $tweet =~ s/\s+/ /g;
+    print ">>> $tweet\n";
+    return $self->rest_post($twitter_update_uri, {status => $tweet, in_reply_to_status_id => $replyid});
+    }
+# Fetch direct messages newer than the stored "dm" checkpoint.
+#
+# Returns: arrayref of hashrefs { id, tweet, user, type => "dm" }.  When a
+#          message with an id above the checkpoint was seen, the
+#          checkpoint is advanced to the highest id.
+sub dm_get
+    {
+    my ($self) = @_;
+    my $since = $self->since_id("dm");
+    my $response = $self->rest_get($twitter_dm_uri, {since_id => $since});
+
+    # Usual case: several messages, keyed by id (presumably how rest_get
+    # folds the XML - confirm).
+    my $messages = $response->{'direct_message'};
+    # A lone message arrives as the message itself, recognisable by its
+    # own 'id' key; wrap it so the loop below sees the keyed form.
+    if (exists($messages->{'id'}))
+        {
+        $messages = { $messages->{'id'} => $messages };
+        }
+
+    my @dms;
+    my $newest = 0;
+    for my $id (keys %$messages)
+        {
+        $newest = $id if $id > $newest;
+        my $text = $messages->{$id}->{'text'};
+        my $sender = $messages->{$id}->{'sender_screen_name'};
+        push @dms, {id => $id, tweet => $text, user => $sender, type => "dm"};
+        }
+
+    $self->since_id("dm", $newest) if $newest > $since;
+    return \@dms;
+    }
+# Fetch mentions newer than the stored "status" checkpoint.
+#
+# Returns: arrayref of hashrefs { id, tweet, user, type => "tweet" }, or
+#          (undef, undef) when the response has no 'status' element.  When
+#          a status with an id above the checkpoint was seen, the
+#          checkpoint is advanced to the highest id.
+sub tweet_get
+    {
+    my ($self) = @_;
+    my $since = $self->since_id("status");
+    my $response = $self->rest_get($twitter_status_uri, {since_id => $since});
+    if (!exists($response->{'status'}))
+        {
+        return (undef, undef);
+        }
+
+    # Usual case: several statuses, keyed by id (presumably how rest_get
+    # folds the XML - confirm).
+    my $statuses = $response->{'status'};
+    # A lone status arrives as the status itself, recognisable by its own
+    # 'id' key; wrap it so the loop below sees the keyed form.
+    if (exists($statuses->{'id'}))
+        {
+        $statuses = { $statuses->{'id'} => $statuses };
+        }
+
+    my @tweets;
+    my $newest = 0;
+    for my $id (keys %$statuses)
+        {
+        $newest = $id if $id > $newest;
+        my $text = $statuses->{$id}->{'text'};
+        my $author = $statuses->{$id}->{'user'}->{'screen_name'};
+        push @tweets, {id => $id, tweet => $text, user => $author, type => "tweet"};
+        }
+
+    $self->since_id("status", $newest) if $newest > $since;
+    return \@tweets;
+    }
+# Get, and optionally advance, the checkpoint id stored under $key.
+#
+# The value is cached in $self->{'since'} and loaded through
+# _since_id_read the first time a key is used.  A supplied $id replaces
+# the checkpoint only when it is larger, and is then persisted through
+# _since_id_write.
+#
+# Params:  $key - checkpoint name (the callers here use "dm" and "status").
+#          $id  - optional candidate id.
+# Returns: the current checkpoint, or nothing when $key is false.
+sub since_id
+    {
+    my ($self, $key, $id) = @_;
+    return unless $key;
+    $self->{'since'} ||= {};
+    my $since = $self->{'since'};
+
+    # Load the persisted value BEFORE comparing.  Comparing against an
+    # unloaded (undef) value let any id, even a smaller one, overwrite the
+    # checkpoint on disk.
+    if (! exists($since->{$key}))
+        {
+        $since->{$key} = $self->_since_id_read($key);
+        }
+    if ($id && ($since->{$key} || 0) < $id)
+        {
+        $since->{$key} = $id;
+        $self->_since_id_write($key, $id);
+        }
+    return $since->{$key};
+    }
+# Read the persisted checkpoint for $key.
+#
+# Params:  $key - checkpoint name, mapped to a path by _since_id_file.
+# Returns: the first line of the file after trim, or undef when the file
+#          cannot be opened or is empty.
+sub _since_id_read
+    {
+    my ($self, $key) = @_;
+    my $file = $self->_since_id_file($key);
+    # Three-argument open with a lexical handle: the file name can never
+    # be read as a mode, and the handle cannot clash with another LAST.
+    my $fh;
+    if (!open($fh, '<', $file))
+        {
+        warn "can't read $file: $!\n";
+        return;
+        }
+    # Scalar assignment reads exactly one line.  Passing <LAST> straight
+    # to trim evaluated it in list context and slurped the whole file.
+    my $line = <$fh>;
+    close $fh;
+    return defined $line ? $self->trim($line) : undef;
+    }
+# Persist $id as the checkpoint for $key, replacing the file's contents.
+#
+# Params:  $key - checkpoint name, mapped to a path by _since_id_file.
+#          $id  - value to store; written followed by a newline.
+# Returns: true on success, false (after a warning) when the file could
+#          not be opened or closed.
+sub _since_id_write
+    {
+    my ($self, $key, $id) = @_;
+    my $file = $self->_since_id_file($key);
+    my $fh;
+    if (!open($fh, '>', $file))
+        {
+        warn "can't write $file: $!\n";
+        return 0;
+        }
+    print {$fh} $id."\n";
+    # Buffered write errors only surface at close, so check it too.
+    if (!close($fh))
+        {
+        warn "can't finish writing $file: $!\n";
+        return 0;
+        }
+    return 1;
+    }
+# Work out the checkpoint file path for $key:
+# ../tmp/twitter-<user>-<key>, where <user> comes from $self->user.
+#
+# Dies when either ../tmp or the file itself does not exist yet.
+#
+# Params:  $key - checkpoint name.
+# Returns: the path.
+sub _since_id_file
+    {
+    my ($self, $key) = @_;
+    my $tmp_dir = "../tmp";
+    my $file = "$tmp_dir/twitter-" . $self->user . "-" . $key;
+    die ("can't find $tmp_dir") unless -e $tmp_dir;
+    die ("can't find $file") unless -e $file;
+    return $file;
+    }
+1;