path: root/DumpSearchScraper
blob: 004aa8310122062f489466e1e79b92ff01a809ce
#!/usr/bin/python
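"""Scrape dump.fm's image search for URLs matching a search term.

Queries http://dump.fm/cmd/search/<term>, keeps results whose filenames
contain the search terms, optionally writes a small HTML gallery under
dumpselections/, and returns the first matching image URL.
"""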

import urllib
import urllib2
import simplejson
import sys
import re
import os
urlencode = urllib.urlencode
urlopen = urllib2.urlopen
Request = urllib2.Request

class DumpSearchScrape:
  def __init__(self, term):
    self.search_api = 'http://dump.fm/cmd/search'
    # Quote the term so multi-word searches survive as part of the URL.
    self.term = urllib.quote_plus(term)
    self.url = self.search_api + "/" + self.term
  def showSelection(self, filelist):
    # Write a bare-bones gallery page that embeds every scraped image URL.
    if not os.path.exists("dumpselections"):
      os.mkdir("dumpselections")
    f = open('dumpselections/index.html', 'w')
    f.write("""
      <html>
      <head>
      <link href="main.css" rel="stylesheet" type="text/css" />
      </head>
      <body>
      """)
    for url in filelist:
      f.write("<img class='examplefiles' src='" + url + "'/>")
    f.write("""
      </body>
      </html>
      """)
    f.close()
  def makeScrapelist(self, makepage=False, data=None):
    headers = {
      'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
      'Accept': '*/*'
      }
    try:
      req = Request(self.url, data, headers)
      response = urlopen(req)
      thejson = response.read()
      urldata = simplejson.loads(thejson)
      # Build absolute URLs for every search result that has a 'url' field.
      urllist = []
      for row in urldata:
        if "url" not in row:
          continue
        if row['url'][0] == "/":
          urllist.append("http://dump.fm/images" + row['url'])
        else:
          urllist.append("http://" + row['url'])
      # Keep only URLs whose filename actually contains the search terms.
      # Splitting the filename on '(term1)*(term2)*...' leaves each matched
      # term in the split result, so membership is the containment test.
      terms = self.term.split('+')
      theRe = ""
      for term in terms:
        theRe = theRe + '(' + term + ')*'
      filtered = []
      for item in urllist:
        check = re.split(theRe, item.split('/')[-1])
        if all(term in check for term in terms):
          filtered.append(item)
      if makepage:
        self.showSelection(filtered)
      if not filtered:
        return None
      return filtered[0]
    except IOError, e:
      if hasattr(e, 'code'):
        print '%s - ERROR %s' % (self.url, e.code)
      else:
        print '%s - ERROR %s' % (self.url, e)
      return None
  
if __name__ == '__main__':
  if len(sys.argv) < 2:
    print 'usage: %s <search term> [makepage]' % sys.argv[0]
    sys.exit(1)
  term = sys.argv[1]
  makepage = False
  if len(sys.argv) > 2:
    # Any second argument other than 'true' leaves makepage off.
    makepage = sys.argv[2].lower() == 'true'
  scrappy = DumpSearchScrape(term)
  josh = scrappy.makeScrapelist(makepage)
  print josh
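# Example invocation (the search term here is only an illustration):
#   python DumpSearchScraper cats true
# prints the first matching image URL and writes dumpselections/index.html.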