#!/usr/bin/python
"""Collect media links (video, audio and optionally image URLs) from HTML."""

import re
import time

import browser

# NOTE(review): not referenced by the code in this file. The name suggests it
# was meant to capture a page title, but as written the pattern only matches
# a run of non-'<' characters -- confirm before relying on it.
HTML_TITLE_RE = re.compile('([^<]+)')

# Tag name at the very start of a "<...>" fragment, e.g. "a" in 'a href="x"'.
_TAG_NAME_RE = re.compile(r"[A-Za-z][A-Za-z0-9]*")

# One quoted attribute: its name, then a double- or single-quoted value.
# Scanning attributes left to right keeps text inside an earlier quoted value
# (e.g. title="see href=...") from being mistaken for a real attribute.
_QUOTED_ATTR_RE = re.compile(
    r"""([^\s"'=<>/]+)\s*=\s*(?:"([^"]*)"|'([^']*)')"""
)

_VIMEO_PLAYER_PREFIX = "http://player.vimeo.com/video/"


class Scraper:
    """Fetch pages and extract links to known media hosts.

    Attributes:
        browser: ``browser.Browser`` instance used to fetch pages.
        videos: flag set to True at construction; not consulted by the
            methods in this class.
        images: when True, ``parse`` also returns ``.jpg`` image URLs.
        dupes: dict of URL -> True for every URL already returned, so the
            same link is not reported twice across calls.
    """

    def __init__(self):
        self.browser = browser.Browser()
        self.videos = True
        self.images = False
        self.dupes = {}

    def scrape(self, url):
        """Fetch *url* and return the media URLs found in its body.

        Sleeps for one second before every request.
        """
        time.sleep(1)
        response = self.browser.get(url)
        return self.parse(response.read())

    def parse(self, html):
        """Return the not-yet-seen media URLs referenced by *html*.

        Looks at ``<a href>``, ``<iframe src>`` and, when ``self.images`` is
        set, ``<img src>``. Links are kept when they point at YouTube, Vimeo
        or SoundCloud, or end in ``mp3``; images are kept when they end in
        ``jpg``. Vimeo player URLs are rewritten to ``http://vimeo.com/<id>``.
        Audio (``mp3``) entries are returned as ``"<image> <url>"`` with
        spaces in the URL percent-encoded.

        Args:
            html: page markup as text (bytes are decoded as UTF-8).

        Returns:
            List of URL strings in document order.
        """
        # On Python 3 a raw response body is bytes; on Python 2 ``bytes`` is
        # ``str`` and is left untouched.
        if isinstance(html, bytes) and not isinstance(html, str):
            html = html.decode("utf-8", "replace")

        # Crude tokenizer: treat both angle brackets as separators, so the
        # list interleaves tag bodies and text nodes.
        fragments = html.replace(">", "<").split("<")

        # NOTE(review): never updated anywhere in this method, so every audio
        # entry currently starts with a single space. Kept for output
        # compatibility -- confirm whether it should track the last image.
        lastimage = ""
        urls = []

        for t in fragments:
            name_match = _TAG_NAME_RE.match(t)
            if name_match is None:
                continue
            # Compare the whole tag name; looking only at the first character
            # matched every tag starting with "a" and could never match
            # "iframe".
            tag = name_match.group(0).lower()

            if tag == "a":
                url = self.getAttribute("href", t)
            elif tag == "iframe":
                url = self.getAttribute("src", t)
            elif tag == "img" and self.images:
                if "php" in t:
                    continue
                url = self.getAttribute("src", t)
                if url is None or url in self.dupes:
                    continue
                if url[-3:] != "jpg":
                    continue
                self.dupes[url] = True
                urls.append(url)
                continue
            else:
                continue

            if url is None or url in self.dupes:
                continue

            if "youtube.com" in url:
                self.dupes[url] = True
                urls.append(url)
            if "youtu.be" in url:
                self.dupes[url] = True
                urls.append(url)
            if "vimeo.com" in url:
                self.dupes[url] = True
                # e.g. http://player.vimeo.com/video/23731158
                if _VIMEO_PLAYER_PREFIX in url:
                    url = "http://vimeo.com/" + url.replace(
                        _VIMEO_PLAYER_PREFIX, ""
                    )
                    # The canonical form may already have been returned via a
                    # plain vimeo.com link; record it so it is emitted once.
                    if url in self.dupes:
                        continue
                    self.dupes[url] = True
                urls.append(url)
            if "soundcloud.com" in url:
                self.dupes[url] = True
                urls.append(url)
            if url[-3:] == "mp3":
                self.dupes[url] = True
                u = url.replace(" ", "%20")
                urls.append(lastimage + " " + u)

        return urls

    def getAttribute(self, attr, s):
        """Return the quoted value of attribute *attr* in tag text *s*.

        The attribute name is matched exactly (case-insensitively), and the
        value may be single- or double-quoted regardless of the quoting used
        by other attributes in the same tag.

        Args:
            attr: attribute name, e.g. ``"href"``.
            s: tag contents without the angle brackets.

        Returns:
            The attribute value if it starts with ``"http"``; otherwise None
            (attribute missing, unquoted, or not an absolute http(s) URL).
        """
        wanted = attr.lower()
        for found in _QUOTED_ATTR_RE.finditer(s):
            if found.group(1).lower() != wanted:
                continue
            url = found.group(2)
            if url is None:
                url = found.group(3)
            if not url.startswith("http"):
                return None
            return url
        return None

    def getTitle(self, s):
        """Return the text between the first and second ``>`` in *s*.

        Returns None when *s* contains no ``>``.
        """
        if '>' in s:
            return s.split(">")[1]
        return None