#!/usr/bin/python
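"""Scrape dump.fm image search results.

Queries the dump.fm search API for a term, normalizes the returned
image URLs, filters them so each filename contains every search word,
and prints the first match. Pass "true" as a second command-line
argument to also write the full result list to dumpselections/index.html.
"""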
import urllib
import urllib2
import simplejson
import sys
import re
import os
urlopen = urllib2.urlopen
Request = urllib2.Request

class DumpSearchScrape:
    def __init__(self, term):
        self.search_api = 'http://dump.fm/cmd/search'
        # quote_plus() joins multi-word terms with '+' and escapes the
        # rest, so the quoted form is what belongs in the request URL
        self.term = urllib.quote_plus(term)
        self.url = self.search_api + "/" + self.term

    def showSelection(self, filelist):
        """Write the scraped image URLs into a bare-bones HTML gallery."""
        if not os.path.exists("dumpselections"):
            os.mkdir("dumpselections")
        f = open('dumpselections/index.html', 'w')
        f.write("<html>\n<body>\n")
        for url in filelist:
            f.write('<img src="%s">\n' % url)
        f.write("</body>\n</html>\n")
        f.close()

    def makeScrapelist(self, makepage=False, data=None):
        """Fetch search results and return the first matching image URL."""
        headers = {
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
            'Accept': '*/*'
        }
        try:
            req = Request(self.url, data, headers)
            response = urlopen(req)
            urldata = simplejson.loads(response.read())
        except IOError, e:
            # urllib2 raises HTTPError/URLError, both IOError subclasses;
            # only HTTPError carries an HTTP status code
            if hasattr(e, 'code'):
                print '%s - ERROR %s' % (self.url, e.code)
            else:
                print '%s - ERROR %s' % (self.url, e)
            return None
        # Each result row may carry a 'url' that is either site-relative
        # ("/...") or a bare host path; normalize both to absolute URLs.
        urllist = []
        for row in urldata:
            if "url" not in row:
                continue
            if row['url'][0] == "/":
                urllist.append("http://dump.fm/images" + row['url'])
            else:
                urllist.append("http://" + row['url'])
        # Keep only URLs whose filename mentions every search word.
        # Filtering into a new list avoids mutating urllist while
        # iterating over it.
        terms = self.term.split('+')
        matches = []
        for item in urllist:
            filename = item.split('/')[-1]
            if all(term in filename for term in terms):
                matches.append(item)
        if makepage:
            self.showSelection(matches)
        if matches:
            return matches[0]
        return None

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print 'usage: %s SEARCH_TERM [true]' % sys.argv[0]
        sys.exit(1)
    term = sys.argv[1]
    makepage = False
    if len(sys.argv) > 2:
        # any second argument other than "true" leaves makepage off
        makepage = sys.argv[2].lower() == 'true'
    scrappy = DumpSearchScrape(term)
    result = scrappy.makeScrapelist(makepage)
    print result
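
# Example run (the filename and output URL below are hypothetical; the
# actual result depends on what the dump.fm search API returns):
#     $ python dumpsearch.py cats true
#     http://dump.fm/images/20110101234501/cats.gif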