1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
|
#!/usr/bin/python
import browser
import time
import re
# Matches an HTML <title> element and captures its text: one or more
# characters other than '<' between the opening and closing tags.
# NOTE(review): not referenced in the visible part of this file.
HTML_TITLE_RE = re.compile('<title>([^<]+)</title>')
class Scraper:
    """Fetch a page and pull media links out of its HTML.

    Links are taken from anchor ``href`` attributes, ``iframe`` ``src``
    attributes and (only when ``self.images`` is true) ``img`` ``src``
    attributes.  Every accepted URL is recorded in ``self.dupes`` so that
    it is reported at most once over the lifetime of the instance.
    """

    def __init__(self):
        self.browser = browser.Browser()
        # Not read by any method of this class; kept for external users.
        self.videos = True
        # When true, parse() also collects ``.jpg`` image URLs.
        self.images = False
        # URL -> True for every URL that has already been accepted.
        self.dupes = {}

    def scrape(self, url):
        """Download ``url`` and return the media links found in it.

        Sleeps for one second before every request, then hands the body
        returned by ``self.browser.get(url).read()`` to :meth:`parse`.
        """
        time.sleep(1)
        response = self.browser.get(url)
        return self.parse(response.read())

    def parse(self, html):
        """Return the list of new media URLs found in ``html``.

        Accepted entries, in document order:

        * links whose URL contains ``youtube.com``, ``youtu.be``,
          ``vimeo.com`` or ``soundcloud.com`` (``player.vimeo.com``
          embed URLs are rewritten to ``http://vimeo.com/<id>``);
        * links ending in ``mp3``, emitted as ``lastimage + " " + url``
          with spaces in the URL replaced by ``%20``;
        * when ``self.images`` is true, ``img`` sources ending in ``jpg``
          whose tag does not contain ``php``.

        The host/extension checks are independent of one another, so a
        URL that satisfies several of them is appended once per match.
        URLs already present in ``self.dupes`` are skipped.
        """
        urls = []
        # Never reassigned in this method, so every mp3 entry currently
        # starts with a single space; kept so the entry shape is unchanged.
        lastimage = ""
        # Turning every '>' into '<' and splitting on '<' yields the tag
        # bodies (without angle brackets) interleaved with text fragments.
        for t in html.replace(">", "<").split("<"):
            if not t:
                continue
            if t[0] == "a":
                # Deliberately loose: any fragment starting with "a" that
                # carries an href (e.g. <area>) is treated as a link.
                attr = "href"
            elif t.startswith("iframe"):
                # Must compare the whole tag name: a single character can
                # never equal "iframe", which made this branch dead code.
                attr = "src"
            elif self.images and t.startswith("img"):
                if "php" in t:
                    continue
                url = self.getAttribute("src", t)
                if url is None or url in self.dupes:
                    continue
                if not url.endswith("jpg"):
                    continue
                self.dupes[url] = True
                urls.append(url)
                continue
            else:
                continue

            if attr not in t:
                continue
            url = self.getAttribute(attr, t)
            if url is None or url in self.dupes:
                continue

            for host in ("youtube.com", "youtu.be"):
                if host in url:
                    self.dupes[url] = True
                    urls.append(url)
            if "vimeo.com" in url:
                # The embed URL is what gets remembered as seen; the
                # canonical page URL is what gets reported.
                self.dupes[url] = True
                # http://player.vimeo.com/video/23731158
                if "http://player.vimeo.com/video/" in url:
                    url = "http://vimeo.com/" + url.replace(
                        "http://player.vimeo.com/video/", "")
                urls.append(url)
            if "soundcloud.com" in url:
                self.dupes[url] = True
                urls.append(url)
            if url.endswith("mp3"):
                self.dupes[url] = True
                urls.append(lastimage + " " + url.replace(" ", "%20"))
        return urls

    def getAttribute(self, attr, s):
        """Return the quoted value of attribute ``attr`` in tag text ``s``.

        The value must be wrapped in matching single or double quotes and
        must start with ``http``; otherwise ``None`` is returned.  The
        attribute name is matched as a whole name followed by ``=``, so an
        occurrence inside a longer name (``data-href``) or inside another
        attribute's value is ignored, and other attributes may use a
        different quote style.
        """
        pattern = (r"(?<![-\w])" + re.escape(attr) +
                   r"""\s*=\s*(["'])(.*?)\1""")
        # DOTALL so a value that spans a line break is still captured.
        found = re.search(pattern, s, re.DOTALL)
        if found is None:
            return None
        url = found.group(2)
        if not url.startswith("http"):
            return None
        return url

    def getTitle(self, s):
        """Return the text between the first and second ``>`` in ``s``.

        If ``s`` contains a single ``>``, everything after it is returned;
        if it contains none, the result is ``None``.
        """
        if ">" not in s:
            return None
        return s.split(">")[1]
|