summaryrefslogtreecommitdiff
path: root/feeder/scraper.py
diff options
context:
space:
mode:
Diffstat (limited to 'feeder/scraper.py')
-rwxr-xr-xfeeder/scraper.py101
1 files changed, 101 insertions, 0 deletions
diff --git a/feeder/scraper.py b/feeder/scraper.py
new file mode 100755
index 0000000..9a1523d
--- /dev/null
+++ b/feeder/scraper.py
@@ -0,0 +1,101 @@
+#!/usr/bin/python
+
+import browser
+import time
+import re
+
# Captures the text between <title> and </title>; the text must be non-empty
# and must not itself contain a '<'.
HTML_TITLE_RE = re.compile(r'<title>([^<]+)</title>')
+
class Scraper:
    """Fetch pages and collect media URLs found in their markup.

    Attributes:
        browser: a ``browser.Browser`` instance used to fetch pages.
        videos:  flag set to True by default.
                 NOTE(review): nothing in this class reads it -- confirm
                 whether it was meant to gate the video/audio rules.
        images:  when true, ``parse`` also collects ``<img>`` sources
                 ending in "jpg".
        dupes:   dict used as a set of URLs already returned; it persists
                 across ``parse`` calls so each URL is reported only once
                 per Scraper instance.
    """

    # Leading tag name of a fragment that followed a "<".
    _TAG_NAME_RE = re.compile(r'[A-Za-z][A-Za-z0-9]*')
    # Embedded-player form of a Vimeo URL, e.g.
    # http://player.vimeo.com/video/23731158
    _VIMEO_PLAYER_RE = re.compile(
        r'^(https?)://player\.vimeo\.com/video/(.*)$', re.DOTALL)
    # %s is replaced with the (escaped) attribute name.  The lookbehind
    # keeps "href" from matching inside names such as "data-href"; the
    # back-reference requires the closing quote to match the opening one.
    _ATTR_VALUE_TEMPLATE = r'(?<![\w-])%s\s*=\s*(["\'])(.*?)\1'
    # Substrings that mark a URL as a supported streaming link.
    _STREAM_HOSTS = ("youtube.com", "youtu.be", "vimeo.com", "soundcloud.com")

    def __init__(self):
        self.browser = browser.Browser()
        self.videos = True
        self.images = False
        self.dupes = {}

    def scrape(self, url):
        """Fetch *url* and return the media URLs found in the response body.

        Sleeps for one second before every request (presumably to throttle
        the crawl -- confirm with the caller).
        """
        time.sleep(1)
        response = self.browser.get(url)
        return self.parse(response.read())

    def parse(self, html):
        """Return the not-yet-seen media URLs referenced by *html*.

        The markup is split on "<" and each fragment is treated as a tag:

        * ``<a href>`` and ``<iframe src>`` values are kept when they point
          at YouTube, Vimeo or SoundCloud, or end in "mp3".  Vimeo player
          URLs are rewritten to the canonical ``vimeo.com/<id>`` form.
        * ``<img src>`` values ending in "jpg" are kept only when
          ``self.images`` is true and the tag does not contain "php".

        MP3 entries are returned as ``"<lastimage> <url>"`` with spaces in
        the URL percent-encoded; every other entry is the bare URL.  Each
        returned URL is recorded in ``self.dupes``.
        """
        # NOTE(review): "&gt;" is the entity for ">", not "<"; this
        # replacement may have been meant for "&lt;" (entity-escaped markup
        # in feeds).  Kept as-is because the intent cannot be confirmed here.
        fragments = html.replace("&gt;", "<").split("<")
        # NOTE(review): never reassigned, so MP3 entries always start with
        # a single space.  Kept so the output format stays unchanged.
        lastimage = ""
        urls = []
        for fragment in fragments:
            name = self._TAG_NAME_RE.match(fragment)
            if name is None:
                # Empty fragment, closing tag, comment, text starting with
                # a non-letter, ...
                continue
            tag = name.group(0).lower()

            if tag == "img":
                if not self.images or "php" in fragment:
                    continue
                url = self.getAttribute("src", fragment)
                if url is None or url in self.dupes:
                    continue
                if not url.endswith("jpg"):
                    continue
                self.dupes[url] = True
                urls.append(url)
                continue

            if tag == "a":
                raw = self.getAttribute("href", fragment)
            elif tag == "iframe":
                raw = self.getAttribute("src", fragment)
            else:
                continue
            if raw is None:
                continue

            # De-duplicate on the canonical form so that an embedded player
            # and a plain link to the same video are reported only once.
            url = self._canonical_url(raw)
            if raw in self.dupes or url in self.dupes:
                continue

            # Classify exactly once; a URL matching several rules must not
            # be appended several times.
            if any(host in url for host in self._STREAM_HOSTS):
                entry = url
            elif url.endswith("mp3"):
                entry = lastimage + " " + url.replace(" ", "%20")
            else:
                continue

            self.dupes[raw] = True
            self.dupes[url] = True
            urls.append(entry)
        return urls

    def _canonical_url(self, url):
        """Rewrite a Vimeo player URL to vimeo.com/<id>; return others as-is."""
        player = self._VIMEO_PLAYER_RE.match(url)
        if player is None:
            return url
        return player.group(1) + "://vimeo.com/" + player.group(2)

    def getAttribute(self, attr, s):
        """Return the quoted value of attribute *attr* in tag text *s*.

        Returns None when the attribute is absent, its value is not
        enclosed in a matching pair of single or double quotes, or the
        value does not start with "http".  The attribute name is matched
        case-insensitively.
        """
        pattern = self._ATTR_VALUE_TEMPLATE % re.escape(attr)
        found = re.search(pattern, s, re.IGNORECASE | re.DOTALL)
        if found is None:
            return None
        url = found.group(2)
        if not url.startswith("http"):
            return None
        return url

    def getTitle(self, s):
        """Return the text between the first and second ">" in *s*.

        Returns None when *s* contains no ">".
        """
        if '>' in s:
            return s.split(">")[1]
        return None
+