diff options
Diffstat (limited to 'feeder/scraper.py')
| -rwxr-xr-x | feeder/scraper.py | 101 |
1 files changed, 101 insertions, 0 deletions
#!/usr/bin/python
# Provenance: feeder/scraper.py, added as a new file (mode 100755,
# blob 9a1523d); reconstructed here from a flattened diff view.
"""Scrape a web page for links to media.

The Scraper class downloads a page, splits it into tag-like pieces and
returns the YouTube, Vimeo, SoundCloud and .mp3 URLs (and, optionally,
.jpg image URLs) it has not returned before.
"""

import re
import time

HTML_TITLE_RE = re.compile('<title>([^<]+)</title>')


class Scraper:
    """Collect media URLs from HTML pages.

    Attributes:
        browser: ``browser.Browser`` instance used by scrape() to fetch pages.
        videos: set to True by the constructor; not read in this module.
        images: when true, parse() also collects ``.jpg`` ``<img>`` sources.
        dupes: dict whose keys are the URLs already handled.  It lives on
            the instance, so a URL is returned at most once per Scraper,
            across all parse()/scrape() calls.
    """

    def __init__(self):
        # Imported here instead of at module level: `browser` is a
        # project-local module that only the constructor needs, and the
        # parsing helpers below stay usable when it is unavailable.
        import browser

        self.browser = browser.Browser()
        self.videos = True
        self.images = False
        self.dupes = {}

    def scrape(self, url):
        """Fetch *url* and return the new media URLs found on that page.

        Sleeps for one second before every request (presumably to avoid
        hammering the remote site), then hands the response body to
        parse().
        """
        time.sleep(1)
        response = self.browser.get(url)
        return self.parse(response.read())

    def parse(self, html):
        """Return the not-yet-seen media URLs referenced in *html*.

        Only ``<a href>``, ``<iframe src>`` and, when ``self.images`` is
        true, ``<img src>`` are inspected.  Tag names are compared exactly
        and case-sensitively, so ``<abbr>`` or ``<area>`` are ignored.

        Args:
            html: page source as text; ``bytes`` are decoded as UTF-8 with
                undecodable bytes replaced.

        Returns:
            A list of strings in document order.  Plain entries are URLs.
            An ``.mp3`` entry has the form ``lastimage + " " + url`` with
            spaces in the URL replaced by ``%20``.
        """
        # On Python 3 an HTTP body is usually bytes; on Python 2 `bytes`
        # is `str`, so this branch is never taken there.
        if isinstance(html, bytes) and not isinstance(html, str):
            html = html.decode("utf-8", "replace")

        # NOTE(review): lastimage is never assigned anywhere in this
        # method, so every mp3 entry currently starts with a single space.
        # Kept as-is because callers may split on that space.
        lastimage = ""
        urls = []

        # Turning every ">" into "<" and splitting on "<" yields tag
        # bodies and text runs alike; text runs are weeded out by the
        # tag-name test below.  This tolerates unbalanced markup.
        for t in html.replace(">", "<").split("<"):
            if not t or t[0].isspace():
                continue
            # Compare the whole tag name.  The previous code tested
            # t[0] == "iframe", a one-character string that can never
            # equal "iframe", so iframes were silently skipped.
            name = t.split(None, 1)[0]

            if name == "a":
                url = self.getAttribute("href", t)
            elif name == "iframe":
                url = self.getAttribute("src", t)
            elif name == "img" and self.images:
                # Images are handled separately: only direct .jpg links
                # from tags that do not mention "php".
                if "php" in t:
                    continue
                url = self.getAttribute("src", t)
                if url is None or url in self.dupes:
                    continue
                if not url.endswith("jpg"):
                    continue
                self.dupes[url] = True
                urls.append(url)
                continue
            else:
                continue

            if url is None or url in self.dupes:
                continue

            # The rules below are independent on purpose: a URL that
            # satisfies several of them is emitted once per rule, exactly
            # as before.
            if "youtube.com" in url:
                self.dupes[url] = True
                urls.append(url)
            if "youtu.be" in url:
                self.dupes[url] = True
                urls.append(url)
            if "vimeo.com" in url:
                self.dupes[url] = True
                # Embedded players look like
                # http://player.vimeo.com/video/23731158; emit the
                # regular page URL instead.
                if "http://player.vimeo.com/video/" in url:
                    url = "http://vimeo.com/" + url.replace(
                        'http://player.vimeo.com/video/', '')
                urls.append(url)
            if "soundcloud.com" in url:
                self.dupes[url] = True
                urls.append(url)
            if url.endswith("mp3"):
                self.dupes[url] = True
                u = url.replace(" ", "%20")
                urls.append(lastimage + " " + u)
        return urls

    def getAttribute(self, attr, s):
        """Return the quoted value of attribute *attr* in tag text *s*.

        The attribute name must start the string or follow whitespace, so
        looking up ``src`` does not pick up ``data-src``.  The value must
        be enclosed in matching single or double quotes.

        Args:
            attr: attribute name, matched case-sensitively.
            s: the text between ``<`` and ``>`` of one tag.

        Returns:
            The attribute value if it starts with ``"http"``; otherwise
            None (attribute missing, unquoted, empty, or not an absolute
            http(s) URL).
        """
        pattern = (r'(?:^|\s)' + re.escape(attr)
                   + r'\s*=\s*(["\'])(.*?)\1')
        found = re.search(pattern, s, re.DOTALL)
        if found is None:
            return None
        value = found.group(2)
        if not value.startswith("http"):
            return None
        return value

    def getTitle(self, s):
        """Return the text that follows the first ``>`` in *s*.

        The result stops at the next ``>`` if there is one.  Returns None
        when *s* contains no ``>`` at all.
        """
        if '>' not in s:
            return None
        return s.split(">")[1]
