diff options
Diffstat (limited to 'DumpSearchScraper')
| -rwxr-xr-x | DumpSearchScraper | 88 |
1 files changed, 88 insertions, 0 deletions
#!/usr/bin/env python3
"""Scrape image URLs from dump.fm's search API for a given search term.

Usage: DumpSearchScraper TERM [true]
Prints the first matching image URL; with a second argument of "true",
also writes a gallery page to dumpselections/index.html.
"""

import json
import os
import re
import sys
import urllib.error
import urllib.parse
import urllib.request


class DumpSearchScrape:
    """Query http://dump.fm/cmd/search/<term> and collect image URLs."""

    def __init__(self, term):
        # term: raw user-supplied search string (may contain spaces).
        self.search_api = 'http://dump.fm/cmd/search'
        self.term = urllib.parse.quote_plus(term)
        # BUG FIX: the original appended the raw, unquoted term to the URL,
        # so any term with spaces/special characters produced a broken request.
        self.url = self.search_api + "/" + self.term

    def showSelection(self, filelist):
        """Write dumpselections/index.html embedding every URL in filelist."""
        # BUG FIX: was os.system("mkdir dumpselections") — shelling out and
        # failing noisily if the directory already existed.
        os.makedirs("dumpselections", exist_ok=True)
        # BUG FIX: the original never closed the file handle; `with` flushes
        # and closes it deterministically.
        with open('dumpselections/index.html', 'w') as f:
            f.write("""
    <html>
    <head>
    <link href="main.css" rel="stylesheet" type="text/css" />
    </head>
    <body>
    """)
            for image_url in filelist:
                f.write("<img class='examplefiles' src='" + image_url + "'/>")
            f.write("""
    </body>
    </html>
    """)

    def makeScrapelist(self, makepage=False, data=None):
        """Fetch search results, filter by the search terms, return the first URL.

        makepage: when True, also write the gallery page via showSelection().
        data:     optional POST body passed straight to the HTTP request.
        Returns the first matching URL, or None on HTTP error / no matches.
        """
        headers = {
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
            'Accept': '*/*',
        }
        try:
            req = urllib.request.Request(self.url, data, headers)
            with urllib.request.urlopen(req) as response:
                payload = json.loads(response.read())
        except IOError as e:
            # BUG FIX: the original's else-branch returned `response`, which is
            # unbound when urlopen() itself raised — a NameError masking the
            # real failure. Report what we can and give the caller None.
            if hasattr(e, 'code'):
                print('%s - ERROR %s' % (self.url, e.code))
            return None

        urllist = []
        for row in payload:
            if "url" not in row:
                continue
            # startswith() is safe on an empty string, where the original
            # row['url'][0] indexing would raise IndexError.
            if row['url'].startswith("/"):
                # Site-relative paths live under dump.fm/images.
                urllist.append("http://dump.fm/images" + row['url'])
            else:
                urllist.append("http://" + row['url'])

        # Keep only URLs whose filename mentions every search term.
        # BUG FIX: the original removed items from urllist while iterating it
        # (skipping elements) via a convoluted re.split() over '(term)*' groups;
        # a substring check on the filename expresses the same intent directly.
        terms = self.term.split('+')
        filtered = [
            u for u in urllist
            if all(t in u.split('/')[-1] for t in terms)
        ]

        if makepage:
            self.showSelection(filtered)
        # BUG FIX: the original did urllist[0] unconditionally — IndexError on
        # an empty result set. None matches the existing error-path contract.
        return filtered[0] if filtered else None


if __name__ == '__main__':
    term = sys.argv[1]
    # Optional second arg: case-insensitive "true" enables gallery output.
    makepage = len(sys.argv) > 2 and sys.argv[2].lower() == 'true'
    scrappy = DumpSearchScrape(term)
    print(scrappy.makeScrapelist(makepage))
