#!/usr/bin/python
import browser
import time
import re
# Captures the text that follows an opening <title> tag, up to the next "<".
# NOTE(review): in this copy of the file the pattern literal was broken across
# two lines (an unterminated string, i.e. a SyntaxError) and its leading text
# was missing; "<title>" is reconstructed from the constant's name -- confirm
# against the file's history.
HTML_TITLE_RE = re.compile(r"<title>([^<]+)")
class Scraper:
    """Fetch pages and pull media links (video, audio, optionally images) out of them.

    Attributes:
        browser: ``browser.Browser`` instance used by ``scrape`` to fetch pages.
        videos: Set to True on construction. NOTE(review): nothing in this
            class reads it, so video links are collected regardless -- confirm
            whether callers expect it to gate video extraction.
        images: When true, ``parse`` also collects ``<img>`` sources ending
            in ``jpg``.
        dupes: Dict used as a set (url -> True) of every URL already returned,
            shared across ``parse`` calls so a link is reported only once per
            Scraper instance.
    """

    # Embed-player form of a Vimeo URL, e.g.
    # http://player.vimeo.com/video/23731158, which is rewritten to the
    # canonical page URL http://vimeo.com/23731158.
    _VIMEO_PLAYER_RE = re.compile(r"^(https?)://player\.vimeo\.com/video/")

    def __init__(self):
        self.browser = browser.Browser()
        self.videos = True
        self.images = False
        self.dupes = {}

    def scrape(self, url, delay=1):
        """Fetch ``url`` and return the media links found in the response.

        Args:
            url: Address passed to ``self.browser.get``.
            delay: Seconds to sleep before the request (default 1, the
                previously hard-coded pause). Pass 0 or None to skip it.

        Returns:
            The list produced by ``parse`` for the response body.
        """
        if delay:
            time.sleep(delay)
        response = self.browser.get(url)
        return self.parse(response.read())

    def parse(self, html):
        """Extract not-yet-seen media URLs from an HTML string.

        The markup is cut at every ``<`` and ``>``, so each fragment is either
        the inside of a tag or the text between tags; only fragments whose
        first token is ``a``, ``iframe`` or (when ``self.images`` is set)
        ``img`` are inspected.

        Args:
            html: Page markup as a string.

        Returns:
            A list of strings, in document order:
            * YouTube / youtu.be / SoundCloud links, unchanged;
            * Vimeo links, with player-embed URLs rewritten to the canonical
              ``vimeo.com/<id>`` form;
            * links ending in ``mp3`` as ``lastimage + " " + url`` with spaces
              in the URL percent-encoded;
            * when ``self.images`` is set, ``<img>`` sources ending in ``jpg``
              whose tag does not contain ``php``.
            Every returned URL is recorded in ``self.dupes`` and is skipped on
            later encounters.
        """
        # Never reassigned in this method, so mp3 entries currently start with
        # a single space; kept because callers may split on that separator.
        lastimage = ""
        urls = []
        for fragment in html.replace(">", "<").split("<"):
            tag = self._tagName(fragment)
            if tag == "a":
                url = self.getAttribute("href", fragment)
            elif tag == "iframe":
                url = self.getAttribute("src", fragment)
            elif tag == "img" and self.images:
                # Skip script-generated images (tag mentions "php").
                if "php" in fragment:
                    continue
                url = self.getAttribute("src", fragment)
                if url is None or url in self.dupes or not url.endswith("jpg"):
                    continue
                self.dupes[url] = True
                urls.append(url)
                continue
            else:
                continue

            if url is None or url in self.dupes:
                continue

            # Exactly one rule applies per URL, checked in priority order, so
            # a link that matches several rules is emitted only once.
            if "youtube.com" in url or "youtu.be" in url:
                canonical = entry = url
            elif "vimeo.com" in url:
                canonical = entry = self._VIMEO_PLAYER_RE.sub(r"\1://vimeo.com/", url)
            elif "soundcloud.com" in url:
                canonical = entry = url
            elif url.endswith("mp3"):
                canonical = url
                entry = lastimage + " " + url.replace(" ", "%20")
            else:
                continue

            # A player-embed URL and the plain page URL name the same video;
            # dedupe on the rewritten form as well as the raw one.
            if canonical in self.dupes:
                continue
            self.dupes[url] = True
            self.dupes[canonical] = True
            urls.append(entry)
        return urls

    @staticmethod
    def _tagName(fragment):
        """Return the first whitespace-delimited token of ``fragment``, or "".

        Fragments that start with whitespace are treated as text rather than
        a tag, since a tag name directly follows its ``<``.
        """
        if fragment[:1].isspace():
            return ""
        pieces = fragment.split(None, 1)
        return pieces[0] if pieces else ""

    def getAttribute(self, attr, s):
        """Return the quoted value of attribute ``attr`` in tag text ``s``.

        Args:
            attr: Attribute name, e.g. ``"href"`` or ``"src"``.
            s: Tag text such as ``a class="x" href="http://..."``.

        Returns:
            The attribute value if it is quoted (single or double quotes) and
            starts with ``http``; otherwise None (attribute missing, unquoted,
            or a relative / non-http URL).
        """
        # The lookbehind keeps "href" from matching inside "data-href"; each
        # quote style is matched on its own so a tag may mix both.
        pattern = re.compile(
            r"(?<![\w-])" + re.escape(attr) + r"\s*=\s*(?:\"([^\"]*)\"|'([^']*)')"
        )
        found = pattern.search(s)
        if found is None:
            return None
        url = found.group(1) if found.group(1) is not None else found.group(2)
        if not url.startswith("http"):
            return None
        return url

    def getTitle(self, s):
        """Return the text between the first and second ``>`` in ``s``.

        If ``s`` has only one ``>``, everything after it is returned; if it
        has none, None is returned.
        """
        if '>' in s:
            return s.split(">")[1]
        return None