diff options
| author | Jules Laplace <carbon@melanarchy.org> | 2013-08-02 17:23:25 -0500 |
|---|---|---|
| committer | Jules Laplace <carbon@melanarchy.org> | 2013-08-02 17:23:25 -0500 |
| commit | e76b691e78e273226cba9284cb8cd22a423319ed (patch) | |
| tree | a58d22f69869fe2bf3885f81bdda4952f87ff6d7 /bucky2/lib/Rest/Topsy.pm | |
| parent | 753f60c7d4769fa72d3b910e491f37db6f130898 (diff) | |
bucky2
Diffstat (limited to 'bucky2/lib/Rest/Topsy.pm')
| -rw-r--r-- | bucky2/lib/Rest/Topsy.pm | 185 |
1 files changed, 185 insertions, 0 deletions
# bucky2/lib/Rest/Topsy.pm
#
# Scraping helpers for topsy.com, layered on the Rest base class.
#
# The methods rest_get_raw, read_data, write_data, read_xml, write_xml and
# trim are called on $self but are not defined in this package; presumably
# they are inherited from Rest -- confirm against that class.
package Rest::Topsy;

use strict;
use warnings;

use base 'Rest';
use Data::Dumper;
use XML::Simple;

# topsy_search($query)
#
# Fetches http://topsy.com/s?q=$query, stores the raw response under
# ../tmp/nndb/topsy/<first two word characters of the query>/<query>.txt,
# and scans the response for <span class="count"> / <span class="label">
# pairs.
#
# Returns a hash reference mapping the lower-cased label text to the count
# that preceded it (with thousands separators removed).  Labels that are not
# preceded by a true count are skipped.
sub topsy_search {
    my ($self, $query) = @_;

    my $topsy_data = $self->rest_get_raw("http://topsy.com/s", { q => $query });

    # Cache directories are keyed by the first two word characters of the query.
    my $initials = $query;
    $initials =~ s/\W//g;
    $initials = substr($initials, 0, 2);

    my $cache_dir = "../tmp/nndb/topsy/$initials";
    if (! -e $cache_dir) {
        # Builtin mkdir instead of spawning /bin/mkdir; a failure is reported
        # but, as before, does not abort the search.
        mkdir($cache_dir) or warn "mkdir $cache_dir: $!\n";
    }

    # A "/" in the query would otherwise change the directory being written to.
    (my $file_name = $query) =~ tr{/}{_};
    $self->write_data("$cache_dir/$file_name.txt", $topsy_data);

    my %rank;
    my $value;
    my @lines = split /\n/, (defined $topsy_data ? $topsy_data : '');
    foreach my $line (@lines) {
        next unless $line =~ m{<span class="(count|label)">(.*)</span>};
        my ($token, $text) = ($1, $2);

        if ($token eq "count") {
            $value = $text;
            # /g so that every separator goes: "1,234,567" -> "1234567".
            $value =~ s/,//g;
        }
        else {
            # A label consumes the most recent count, if there was one.
            $rank{lc $text} = $value if $value;
            undef $value;
        }
    }

    return \%rank;
}

# topsy_get()
#
# Builds the full list of concept entries.  The first page is read from the
# cached file ../tmp/topsy_call; entries previously saved in
# ../tmp/topsy_entries.xml take precedence over the ones parsed from that
# page.  Remaining pages are then fetched one at a time, with the growing
# list written back to the XML file after every page and a 10-14 second
# pause between requests.
#
# Returns the list (not a reference) of all collected entry hashes.
sub topsy_get {
    my ($self) = @_;

    # The cached first page was originally produced with:
    #   my $topsy_data = $self->rest_get_raw($self->topsy_query($page));
    #   $self->write_data("../tmp/topsy_call", $topsy_data);
    #   exit;
    my $topsy_data = $self->read_data("../tmp/topsy_call");

    # topsy_script_data returns undef when nothing was recognised.
    my $topsy_script_data = $self->topsy_script_data($topsy_data) || {};
    my $topsy_entries     = $self->topsy_entries($topsy_data);

    my $xml_entries = $self->topsy_load;
    if (ref $xml_entries && scalar(@$xml_entries)) {
        $topsy_entries = $xml_entries;
    }

    # Resume after the last complete page of ten entries already held.
    my $page      = int(scalar(@$topsy_entries) / 10) || 1;
    my $last_page = defined $topsy_script_data->{'pages'}
                  ? $topsy_script_data->{'pages'}
                  : 0;

    print "$last_page pages\n";

    while ($page < $last_page) {
        $page++;
        print $page . "...";
        my $page_data    = $self->rest_get_raw($self->topsy_query($page));
        my $page_entries = $self->topsy_entries($page_data);
        push @$topsy_entries, @$page_entries;
        # Save after every page so an interrupted run can be resumed.
        $self->write_xml("../tmp/topsy_entries.xml", $topsy_entries);
        sleep(10 + int(rand 5));
    }

    my $expected = defined $topsy_script_data->{'total'}
                 ? $topsy_script_data->{'total'}
                 : 'unknown';
    print "Expected " . $expected . ", got " . scalar(@$topsy_entries) . "\n";

    $self->write_xml("../tmp/topsy_entries.xml", $topsy_entries);

    # The per-page variable is out of scope here; the accumulated list is
    # what callers are meant to receive.
    return @$topsy_entries;
}

# topsy_load()
#
# Reads ../tmp/topsy_entries.xml via read_xml, prints how many entries were
# loaded, and returns whatever read_xml returned (expected to be an array
# reference; a non-reference result is reported as 0 entries).
sub topsy_load {
    my ($self) = @_;
    my $page_entries = $self->read_xml("../tmp/topsy_entries.xml");
    my $count = ref $page_entries ? scalar(@$page_entries) : 0;
    print "Loaded " . $count . " entries\n";
    return $page_entries;
}

# topsy_entries($data)
#
# Splits an HTML page on '<div class="concept-rank">' and turns each chunk
# into a hash.  Within a chunk, a class="..." attribute names the key for the
# next value found; values come from the text between ">" and "<" (digits
# only for the "total" key) or, for "description", from the text between
# &ldquo; and &rdquo; with any trailing " http..." removed.  A url(...) on a
# line is stored under the key 'tile'.  Chunks containing "concept-list-re"
# are skipped, and parsing of a chunk stops at the first inline script tag.
#
# Returns an array reference of the non-empty entry hashes.
sub topsy_entries {
    my ($self, $data) = @_;

    my @entries;
    return \@entries unless defined $data;

    my @raw_entries = split /<div class="concept-rank">/, $data;
    foreach my $entry (@raw_entries) {
        next if $entry =~ /concept-list-re/;

        my %entry_hash;
        my $key   = "";
        my @lines = split /\n/, $entry;

      LINE:
        foreach my $raw_line (@lines) {
            my $line = $self->trim($raw_line);
            next LINE unless $line;

            if ($line =~ /class="(\w+)"/) {
                $key = $1;
            }
            if ($line =~ /url\((.*)\)/) {
                $entry_hash{'tile'} = $1;
            }

            my $value = "";
            if ($key eq "total" && $line =~ />(\d+)</) {
                $value = $1;
            }
            elsif ($line =~ />(.*)</) {
                $value = $1;
            }
            elsif ($key eq "description" && $line =~ /&ldquo;(.*)&rdquo;/) {
                $value = $1;
                $value =~ s/ http.*$//;
            }

            if ($key && $value) {
                $entry_hash{$key} = $value;
                # Each class name labels a single value; reset to the empty
                # string so later comparisons stay defined.
                $key = "";
            }

            last LINE if $line =~ /script type="text/;
        }

        push @entries, \%entry_hash if %entry_hash;
    }

    return \@entries;
}

# topsy_script_data($data)
#
# Extracts the paging information embedded in the page as a script literal
# of the form:
#
#   Recipes.re= {
#   page: 2,
#   total: 964,
#   perpage: 10,
#   pages: 97
#   };
#
# Returns a hash reference holding whichever of the keys page, total,
# perpage and pages were found, or undef when none were.
sub topsy_script_data {
    my ($self, $data) = @_;
    return undef unless defined $data;

    my %script_data;
    foreach my $raw_line (split /\n/, $data) {
        next unless $raw_line =~ /:/;

        my $line = $raw_line;
        $line =~ s/\s+//g;
        $line =~ s/,//;

        my ($k, $v) = split /:/, $line;
        # Test for presence rather than truth so a value of 0 is kept.
        next unless defined $k && length $k;
        next unless defined $v && length $v;
        # Exact key match: a prefix test would also accept e.g. "pagetitle".
        next unless $k =~ /\A(?:page|total|perpage|pages)\z/;

        $script_data{$k} = $v;
    }

    return %script_data ? \%script_data : undef;
}

# topsy_query($page)
#
# Returns the two-element list (url, parameter hash reference) describing the
# request for the given page of the concept list belonging to $self->url.
sub topsy_query {
    my ($self, $page) = @_;
    my $url   = "http://topsy.com/concept";
    my $query = {
        "page"        => $page,
        "sort_method" => "",
        "url"         => $self->url,
        "class"       => "UB::Concept::List::Re",
    };
    return ($url, $query);
}

# url([$url])
#
# Combined getter/setter: stores $url on the object when a true value is
# passed, and always returns the currently stored value.
sub url {
    my ($self, $url) = @_;
    if ($url) {
        $self->{'url'} = $url;
    }
    return $self->{'url'};
}

1;
