Diffstat (limited to 'DumpSearchScraper')
-rwxr-xr-x  DumpSearchScraper  88
1 file changed, 88 insertions(+), 0 deletions(-)
diff --git a/DumpSearchScraper b/DumpSearchScraper
new file mode 100755
index 0000000..004aa83
--- /dev/null
+++ b/DumpSearchScraper
@@ -0,0 +1,88 @@
+#!/usr/bin/python
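+# Query dump.fm's search API for image URLs whose filenames contain the
+# given search term(s); optionally write the hits into an HTML gallery.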
+
+import os
+import re
+import sys
+import urllib
+import urllib2
+
+# simplejson predates the stdlib json module; fall back to the stdlib
+# version (same loads() interface) when it is not installed.
+try:
+    import simplejson
+except ImportError:
+    import json as simplejson
+
+urlopen = urllib2.urlopen
+Request = urllib2.Request
+
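+# Each row of the API's JSON response is expected to carry a "url" key,
+# either a path relative to dump.fm/images or a bare external host/path.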
+class DumpSearchScrape:
+    def __init__(self, term):
+        self.search_api = 'http://dump.fm/cmd/search'
+        # quote_plus keeps multi-word terms URL-safe ("two words" -> "two+words").
+        self.term = urllib.quote_plus(term)
+        self.url = self.search_api + "/" + self.term
+    def showSelection(self, filelist):
+        # Write the selected image URLs into dumpselections/index.html
+        # so the results can be previewed in a browser.
+        if not os.path.exists("dumpselections"):
+            os.mkdir("dumpselections")
+        f = open('dumpselections/index.html', 'w')
+        f.write("""
+        <html>
+        <head>
+        <link href="main.css" rel="stylesheet" type="text/css" />
+        </head>
+        <body>
+        """)
+        for src in filelist:
+            f.write("<img class='examplefiles' src='" + src + "'/>")
+        f.write("""
+        </body>
+        </html>
+        """)
+        f.close()
+    def makeScrapelist(self, makepage=False, data=None):
+        # Spoof a browser User-Agent; some hosts reject urllib2's default.
+        headers = {
+            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
+            'Accept': '*/*'
+        }
+        try:
+            req = Request(self.url, data, headers)
+            response = urlopen(req)
+            thejson = response.read()
+            # The search endpoint returns a JSON list of result rows.
+            urldata = simplejson.loads(thejson)
+            urllist = []
+            for row in urldata:
+                if "url" not in row:
+                    continue
+                # Relative paths are served from dump.fm's image host;
+                # anything else is a bare external hostname/path.
+                if row['url'][0] == "/":
+                    url = "http://dump.fm/images" + row['url']
+                else:
+                    url = "http://" + row['url']
+                urllist.append(url)
+            terms = self.term.split('+')
+            # Keep only files whose names contain every search term.
+            # re.split() with capturing groups echoes each matched term
+            # back into the result list, so a term that never matched
+            # stays out of `check`.
+            theRe = ""
+            for term in terms:
+                theRe = theRe + '(' + re.escape(term) + ')*'
+            # Iterate over a copy: removing items from the list being
+            # walked would skip elements.
+            for item in urllist[:]:
+                parts = item.split('/')
+                check = re.split(theRe, parts[-1])
+                for term in terms:
+                    if term not in check:
+                        urllist.remove(item)
+                        break
+            if makepage:
+                self.showSelection(urllist)
+            # Hand back the first surviving hit, or None when nothing matched.
+            return urllist[0] if urllist else None
+        except IOError, e:
+            # urllib2's HTTPError carries a status code, URLError only a
+            # reason; both subclass IOError in Python 2.
+            if hasattr(e, 'code'):
+                print '%s - ERROR %s' % (self.url, e.code)
+            elif hasattr(e, 'reason'):
+                print '%s - failed to reach server: %s' % (self.url, e.reason)
+            return None
+
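+# Example run (assuming the dump.fm endpoint above is still live):
+#   $ ./DumpSearchScraper "cat gif" true
+# prints the first matching URL and writes dumpselections/index.html.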
+if __name__ == '__main__':
+    if len(sys.argv) < 2:
+        print 'usage: %s <search term> [makepage]' % sys.argv[0]
+        sys.exit(1)
+    term = sys.argv[1]
+    # A second argument of "true" also writes the HTML gallery.
+    makepage = len(sys.argv) > 2 and sys.argv[2].lower() == 'true'
+    scraper = DumpSearchScrape(term)
+    print scraper.makeScrapelist(makepage)