summaryrefslogtreecommitdiff
path: root/bucky2/lib/Rest/Topsy.pm
diff options
context:
space:
mode:
Diffstat (limited to 'bucky2/lib/Rest/Topsy.pm')
-rw-r--r--bucky2/lib/Rest/Topsy.pm185
1 files changed, 185 insertions, 0 deletions
diff --git a/bucky2/lib/Rest/Topsy.pm b/bucky2/lib/Rest/Topsy.pm
new file mode 100644
index 0000000..3ef045e
--- /dev/null
+++ b/bucky2/lib/Rest/Topsy.pm
@@ -0,0 +1,185 @@
+package Rest::Topsy;
+use base 'Rest';
+use Data::Dumper;
+use XML::Simple;
+
+# topsy_search($query)
+#
+# Fetches the Topsy search page for $query through rest_get_raw, stores the
+# raw response under ../tmp/nndb/topsy/<initials>/ (initials = the first two
+# word characters of the query), then scrapes the response line by line for
+# <span class="count"> / <span class="label"> pairs.
+#
+# Returns a hashref mapping each lower-cased label to the count that came
+# before it, with thousands separators removed.  A label that is not
+# preceded by a (true) count is skipped.
+sub topsy_search
+    {
+    my ($self, $query) = @_;
+    my $topsy_data = $self->rest_get_raw("http://topsy.com/s", { q => $query });
+
+    my $initials = $query;
+    $initials =~ s/\W//g;
+    $initials = substr($initials, 0, 2);
+
+    my $cache_dir = "../tmp/nndb/topsy/$initials";
+    if (! -e $cache_dir)
+        {
+        # Best effort: a failure is reported but does not abort the search.
+        mkdir $cache_dir or warn "Could not create $cache_dir: $!\n";
+        }
+
+    # A '/' in the query would otherwise be taken as a directory separator
+    # and send the cache file outside (or below) the intended directory.
+    my $file_name = $query;
+    $file_name =~ s{[/\0]}{_}g;
+    $self->write_data("$cache_dir/$file_name.txt", $topsy_data);
+
+    my @lines = split "\n", $topsy_data;
+    my $value = undef;
+    my $rank = {};
+    foreach my $line (@lines)
+        {
+        next unless $line =~ /<span class="(count|label)">(.*)<\/span>/;
+        my ($token, $text) = ($1, $2);
+        if ($token eq "count")
+            {
+            $value = $text;
+            # Strip every thousands separator, not only the first one.
+            $value =~ s/,//g;
+            }
+        elsif ($token eq "label")
+            {
+            $rank->{lc $text} = $value if $value;
+            # A count belongs to the single label that follows it.
+            undef $value;
+            }
+        }
+    return $rank;
+    }
+# topsy_get()
+#
+# Collects every concept entry for the current url.
+#
+# The first page is read from the cached file ../tmp/topsy_call (see the
+# commented-out lines below for how that file was produced).  Entries
+# previously saved in ../tmp/topsy_entries.xml take precedence over the ones
+# parsed from the cached page, so an interrupted run resumes where it
+# stopped.  The remaining pages are then fetched one at a time, with the
+# entry list written back to the XML file after each page and a 10-14 second
+# pause between requests.
+#
+# Returns the list of all entries collected (hashrefs as built by
+# topsy_entries).
+sub topsy_get
+    {
+    my ($self) = @_;
+
+#   my $topsy_data = $self->rest_get_raw($self->topsy_query($page));
+#   $self->write_data("../tmp/topsy_call", $topsy_data);
+#   exit;
+
+    my $topsy_data = $self->read_data("../tmp/topsy_call");
+
+    # topsy_script_data returns undef when it finds no paging information.
+    my $topsy_script_data = $self->topsy_script_data($topsy_data) || {};
+    my $topsy_entries = $self->topsy_entries($topsy_data) || [];
+    my $xml_entries = $self->topsy_load;
+    if ($xml_entries && scalar(@$xml_entries))
+        {
+        $topsy_entries = $xml_entries;
+        }
+
+    # Resume after the last page that is already covered by the entries.
+    my $page = int(scalar(@$topsy_entries) / 10) || 1;
+    my $last_page = $topsy_script_data->{'pages'} || 0;
+
+    print "$last_page pages\n";
+
+    while ($page < $last_page)
+        {
+        $page++;
+        print $page . "...";
+        my $page_data = $self->rest_get_raw($self->topsy_query($page));
+        my $page_entries = $self->topsy_entries($page_data);
+        push @$topsy_entries, @$page_entries;
+        # Checkpoint after every page so that a crash loses nothing.
+        $self->write_xml("../tmp/topsy_entries.xml", $topsy_entries);
+        sleep 10 + (int rand 5);
+        }
+
+    my $expected = defined $topsy_script_data->{'total'} ? $topsy_script_data->{'total'} : "unknown";
+    print "Expected " . $expected . ", got " . scalar(@$topsy_entries)."\n";
+
+    $self->write_xml("../tmp/topsy_entries.xml", $topsy_entries);
+
+    # $page_entries only exists inside the loop above; the result of this
+    # sub is the accumulated list.
+    return @$topsy_entries;
+    }
+# topsy_load()
+#
+# Reads ../tmp/topsy_entries.xml through read_xml, prints how many entries
+# it holds and returns whatever read_xml returned (used as an array
+# reference).
+sub topsy_load
+    {
+    my $self = shift;
+    my $saved = $self->read_xml("../tmp/topsy_entries.xml");
+    my $count = scalar(@$saved);
+    print "Loaded " . $count . " entries\n";
+    return $saved;
+    }
+
+# topsy_entries($data)
+#
+# Splits $data on '<div class="concept-rank">' and turns every chunk into a
+# hash of field => value:
+#   - a class="..." attribute names the field the next value belongs to;
+#   - a url(...) on a line is stored under 'tile';
+#   - the value is the text between '>' and '<' on the line (digits only
+#     while the field is "total"), or, for "description" lines without such
+#     text, the part between &ldquo; and &rdquo; with a trailing " http..."
+#     removed.
+# Chunks containing "concept-list-re" are ignored, and a chunk is abandoned
+# at the first line holding 'script type="text'.
+#
+# Returns an arrayref of the non-empty hashes, in document order.
+sub topsy_entries
+    {
+    my ($self, $data) = @_;
+    my @parsed;
+CHUNK: foreach my $chunk (split '<div class="concept-rank">', $data)
+        {
+        next CHUNK if $chunk =~ /concept-list-re/;
+        my %fields;
+        my $field = "";
+LINE:   foreach my $raw (split "\n", $chunk)
+            {
+            my $text = $self->trim($raw);
+            next LINE unless $text;
+
+            # Remember the most recent class name as the pending field.
+            $field = $1 if $text =~ /class\=\"(\w+)\"/;
+            $fields{'tile'} = $1 if $text =~ /url\((.*)\)/;
+
+            my $found = "";
+            if ($field eq "total" && $text =~ />(\d+)</)
+                {
+                $found = $1;
+                }
+            elsif ($text =~ />(.*)</)
+                {
+                $found = $1;
+                }
+            elsif ($field eq "description" && $text =~ /\&ldquo\;(.*)\&rdquo\;/)
+                {
+                $found = $1;
+                $found =~ s/ http.*$//;
+                }
+
+            # Once a field has received its value it is no longer pending.
+            if ($field && $found)
+                {
+                $fields{$field} = $found;
+                undef $field;
+                }
+            last LINE if $text =~ /script type\=\"text/;
+            }
+        push @parsed, \%fields if scalar keys %fields;
+        }
+    return \@parsed;
+    }
+# Recipes.re= {
+# page: 2,
+# total: 964,
+# perpage: 10,
+# pages: 97
+# };
+# topsy_script_data($data)
+#
+# Scans $data line by line for "key: value" pairs whose key is exactly one
+# of page, total, perpage or pages (whitespace and the first comma on the
+# line are removed before the line is split on ':').
+#
+# Returns a hashref of the pairs found, or undef when there are none.
+sub topsy_script_data
+    {
+    my ($self, $data) = @_;
+    my @lines = split "\n", $data;
+    my $script_data = {};
+    foreach my $line (@lines)
+        {
+        next unless $line =~ /\:/;
+        $line =~ s/\s+//g;
+        $line =~ s/\,//;
+        my ($k, $v) = split ":", $line;
+        # Test for presence, not truth, so that a value of 0 is kept.
+        next unless defined $k && length $k;
+        next unless defined $v && length $v;
+        # Anchored at both ends: "pagesize" or "totals" are not paging keys.
+        next unless $k =~ /^(?:page|total|perpage|pages)$/;
+        $script_data->{$k} = $v;
+        }
+    return scalar keys %$script_data ? $script_data : undef;
+    }
+# topsy_query($page)
+#
+# Builds the request for one page of the concept list of the current url.
+# Returns a two-element list: the endpoint address and a hashref of query
+# parameters (page, sort_method, url, class).
+sub topsy_query
+    {
+    my $self = shift;
+    my $page = shift;
+    my %params =
+        (
+        "page"        => $page,
+        "sort_method" => "",
+        "url"         => $self->url,
+        "class"       => "UB::Concept::List::Re",
+        );
+    return ("http://topsy.com/concept", \%params);
+    }
+# url([$url])
+#
+# Combined getter/setter for the 'url' slot of the object.  A true argument
+# is stored first; the stored value is returned either way.
+sub url
+    {
+    my $self = shift;
+    my $new_url = shift;
+    $self->{'url'} = $new_url if $new_url;
+    return $self->{'url'};
+    }
+1;