package Rest::Topsy;

# Rest::Topsy - scraping helpers for topsy.com built on the Rest base class.
#
# Every method relies on helpers inherited from Rest (rest_get_raw,
# write_data, read_data, read_xml, write_xml, trim); their exact contracts
# are not visible in this file.

use strict;
use warnings;

use base 'Rest';
use Data::Dumper;
use XML::Simple;

# topsy_search($query)
#
# Fetches the topsy search page for $query, stores the raw response under
# ../tmp/nndb/topsy/<first two word characters of the query>/<query>.txt and
# scrapes "count"/"label" spans out of it.
#
# Returns a hash ref mapping lower-cased label text to the count that
# preceded it (all thousands separators removed).
#
# NOTE(review): $query is used verbatim as a file name; confirm callers
# never pass a query containing "/" or "..".
sub topsy_search {
    my ($self, $query) = @_;

    my $topsy_data = $self->rest_get_raw("http://topsy.com/s", { q => $query });

    # Cache bucket: first two word characters of the query.
    my $initials = $query;
    $initials =~ s/\W//g;
    $initials = substr($initials, 0, 2);

    my $cache_dir = "../tmp/nndb/topsy/$initials";
    if (! -e $cache_dir) {
        # Best effort, as before: report the failure but carry on.
        mkdir $cache_dir
            or warn "Could not create $cache_dir: $!\n";
    }
    $self->write_data("$cache_dir/$query.txt", $topsy_data);

    my @lines = defined $topsy_data ? split("\n", $topsy_data) : ();
    my $value;
    my $rank = {};

    foreach my $line (@lines) {
        # The code needs two captures (span class and span text); the pattern
        # found in the file had only one, so the text was always undef.
        # NOTE(review): the opening-tag part of this pattern is a
        # reconstruction - confirm against real topsy markup.
        next unless $line =~ m{<span class="(\w+)">(.*)</span>};
        my ($token, $text) = ($1, $2);

        if ($token eq "count") {
            $value = $text;
            $value =~ tr/,//d;    # drop every thousands separator, not just the first
        }
        elsif ($token eq "label") {
            # A label is only meaningful right after a count span.
            $rank->{lc $text} = $value if $value;
            undef $value;
        }
    }

    return $rank;
}

# topsy_get()
#
# Reads the previously saved first page from ../tmp/topsy_call, resumes from
# any entries already saved in ../tmp/topsy_entries.xml, then fetches the
# remaining pages (sleeping 10-14 seconds between requests) and rewrites the
# XML file after every page so an interrupted run can resume.
#
# Returns the list of all collected entries.  (The original returned
# @$page_entries, a variable that only exists inside the paging loop, and
# therefore always returned an empty list.)
sub topsy_get {
    my ($self) = @_;

    # To refresh the cached first page, fetch and store it once:
    # my $topsy_data = $self->rest_get_raw($self->topsy_query($page));
    # $self->write_data("../tmp/topsy_call", $topsy_data);
    # exit;
    my $topsy_data = $self->read_data("../tmp/topsy_call");

    # topsy_script_data returns undef when the page carries no paging block.
    my $topsy_script_data = $self->topsy_script_data($topsy_data) || {};
    my $topsy_entries     = $self->topsy_entries($topsy_data);

    # Entries saved by an earlier run take precedence over the first page.
    my $xml_entries = $self->topsy_load;
    if (scalar(@$xml_entries)) {
        $topsy_entries = $xml_entries;
    }

    # Resume after the last complete page (10 entries per page assumed).
    my $page      = int(scalar(@$topsy_entries) / 10) || 1;
    my $last_page = $topsy_script_data->{'pages'};
    $last_page = 0 unless defined $last_page;
    print "$last_page pages\n";

    while ($page < $last_page) {
        $page++;
        print $page . "...";
        my $page_data    = $self->rest_get_raw($self->topsy_query($page));
        my $page_entries = $self->topsy_entries($page_data);
        push @$topsy_entries, @$page_entries;

        # Checkpoint after every page.
        $self->write_xml("../tmp/topsy_entries.xml", $topsy_entries);
        sleep 10 + (int rand 5);
    }

    my $expected = $topsy_script_data->{'total'};
    $expected = '' unless defined $expected;
    print "Expected " . $expected . ", got " . scalar(@$topsy_entries) . "\n";
    $self->write_xml("../tmp/topsy_entries.xml", $topsy_entries);

    return @$topsy_entries;
}

# topsy_load()
#
# Loads the entries saved in ../tmp/topsy_entries.xml and prints how many
# were found.  Always returns an array ref; anything read_xml returns that
# is not an array ref is treated as "no saved entries".
sub topsy_load {
    my ($self) = @_;

    my $page_entries = $self->read_xml("../tmp/topsy_entries.xml");
    $page_entries = [] unless ref $page_entries eq 'ARRAY';

    print "Loaded " . scalar(@$page_entries) . " \nentries\n";
    return $page_entries;
}

# topsy_entries($data)
#
# Splits a page into raw entries and turns every entry that yields at least
# one field into a hash ref.  Returns an array ref of those hash refs.
#
# NOTE(review): parts of this sub were damaged in the stored file:
#   * the entry delimiter survives only as a bare newline, which makes each
#     "entry" a single line - the real delimiter was probably a piece of
#     markup; confirm and restore it;
#   * the text between the "total" pattern and the assignment into the
#     entry hash was lost.  The branch below is the minimal code consistent
#     with the fragments that survived; handling for keys other than
#     "total" may be missing.
sub topsy_entries {
    my ($self, $data) = @_;

    my @raw_entries = defined $data ? split("\n", $data) : ();
    my $entries = [];

    foreach my $entry (@raw_entries) {
        # Skip chunks that mention the list container itself.
        next if $entry =~ /concept-list-re/;

        my $entry_hash = {};
        my $key        = "";

        LINE: foreach my $raw_line (split "\n", $entry) {
            my $line = $self->trim($raw_line);
            next LINE unless $line;

            # The most recent class attribute names the field being read.
            if ($line =~ /class\=\"(\w+)\"/) {
                $key = $1;
            }

            my $value = "";
            if ($line =~ /url\((.*)\)/) {
                $entry_hash->{'tile'} = $1;
            }

            if ($key eq "total" && $line =~ />(\d+)(.*)</) {
                $value = $1;
            }

            if ($key ne "" && $value ne "") {
                $entry_hash->{$key} = $value;
                $key = "";    # field consumed; wait for the next class
            }

            # Everything from the inline script onwards is not entry markup.
            last LINE if $line =~ /script type\=\"text/;
        }

        if (scalar keys %$entry_hash) {
            push @$entries, $entry_hash;
        }
    }

    return $entries;
}

# topsy_script_data($data)
#
# Extracts the paging block embedded in the page script, e.g.
#
#   Recipes.re= {
#       page: 2,
#       total: 964,
#       perpage: 10,
#       pages: 97
#   };
#
# Returns a hash ref holding whichever of page/total/perpage/pages were
# found, or undef when none were.
sub topsy_script_data {
    my ($self, $data) = @_;

    my @lines = defined $data ? split("\n", $data) : ();
    my $script_data = {};

    foreach my $raw_line (@lines) {
        next unless $raw_line =~ /\:/;

        (my $line = $raw_line) =~ s/\s+//g;
        $line =~ s/\,//;    # trailing separator after the value

        my ($k, $v) = split ":", $line;
        next unless defined $k && defined $v && length $k && length $v;

        # Exact key match only: the scan covers the whole page, and an
        # unanchored match would also accept keys such as "pageTracker".
        next unless $k =~ /\A(?:page|total|perpage|pages)\z/;

        $script_data->{$k} = $v;
    }

    return scalar keys %$script_data ? $script_data : undef;
}

# topsy_query($page)
#
# Builds the request for one page of the concept list of the configured url.
# Returns the list ($url, \%query), ready to be passed to rest_get_raw.
sub topsy_query {
    my ($self, $page) = @_;

    my $url   = "http://topsy.com/concept";
    my $query = {
        "page"        => $page,
        "sort_method" => "",
        "url"         => $self->url,
        "class"       => "UB::Concept::List::Re",
    };

    return ($url, $query);
}

# url([$url])
#
# Combined getter/setter for the url whose concept list is scraped.  A true
# argument replaces the stored value; the current value is always returned.
sub url {
    my ($self, $url) = @_;

    if ($url) {
        $self->{'url'} = $url;
    }
    return $self->{'url'};
}

1;