# feeder/scraper.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101

#!/usr/bin/python

import browser
import time
import re

# Captures the text found between <title> and </title>; the captured text
# must be non-empty and may not contain "<".
HTML_TITLE_RE = re.compile(r"<title>([^<]+)</title>")

class Scraper:
    """Fetch pages and extract media links from their HTML.

    Links are taken from the ``href`` of ``<a>`` tags and the ``src`` of
    ``<iframe>`` tags (and, when ``images`` is true, from ``<img>`` tags).
    Only absolute ``http``/``https`` URLs are considered.  Every URL that is
    returned is remembered in ``dupes`` so that it is returned only once
    during the lifetime of the instance.
    """

    # Tag name at the very start of a segment produced by splitting on "<".
    # Closing tags ("/a>") and plain text do not match.
    _TAG_NAME_RE = re.compile(r'[A-Za-z][A-Za-z0-9]*')

    # Embedded-player form of a Vimeo link,
    # e.g. http://player.vimeo.com/video/23731158
    _VIMEO_PLAYER_RE = re.compile(
        r'(https?)://player\.vimeo\.com/video/(.*)', re.DOTALL)

    # A URL containing any of these substrings is returned unchanged.
    _MEDIA_HOSTS = ("youtube.com", "youtu.be", "vimeo.com", "soundcloud.com")

    def __init__(self):
        self.browser = browser.Browser()
        # NOTE(review): this flag is set but never read inside this class.
        self.videos = True
        # When true, parse() also collects <img> sources that end in "jpg".
        self.images = False
        # URLs already returned by parse(); used as a set (values are True).
        self.dupes = {}

    def scrape(self, url, delay=1):
        """Download *url* and return the media links found in it.

        Args:
            url: address handed to ``self.browser.get``.
            delay: seconds to sleep before the request (default 1, the
                value that used to be hard-coded).  A falsy value skips
                the sleep.

        Returns:
            The list produced by :meth:`parse` for the downloaded body.
        """
        if delay:
            time.sleep(delay)
        response = self.browser.get(url)
        return self.parse(response.read())

    def parse(self, html):
        """Return the not-yet-seen media links contained in *html*.

        The markup is cut at every "<" and each piece is inspected by its
        leading tag name (case-insensitive):

        * ``a``      -> ``href`` attribute
        * ``iframe`` -> ``src`` attribute
        * ``img``    -> ``src`` attribute, only when ``self.images`` is true,
          the piece does not contain "php" and the URL ends in "jpg"

        Anchor/iframe URLs are kept when they point at YouTube, Vimeo or
        SoundCloud, or when they end in "mp3".  Vimeo player links are
        rewritten to the plain ``vimeo.com/<id>`` form *before* the
        duplicate check, so both spellings of one video count as the same
        link.  Each URL yields at most one entry.

        Side effect: every URL that produces an entry is added to
        ``self.dupes``.

        Returns:
            A list of strings.  mp3 entries have the form ``" " + url`` with
            spaces in the URL replaced by ``%20``; all other entries are the
            bare URL.
        """
        # NOTE(review): "&gt;" is the entity for ">", yet it is replaced by
        # "<" here.  The effect is only that text following an escaped ">"
        # starts a new piece.  Kept as-is; confirm whether "&lt;" was meant.
        segments = html.replace("&gt;", "<").split("<")
        urls = []
        for segment in segments:
            name = self._TAG_NAME_RE.match(segment)
            if name is None:
                continue
            tag = name.group(0).lower()

            if tag == "a":
                url = self.getAttribute("href", segment)
            elif tag == "iframe":
                url = self.getAttribute("src", segment)
            elif tag == "img" and self.images:
                url = self.getAttribute("src", segment)
                if url is None or "php" in segment:
                    continue
                if not url.endswith("jpg") or url in self.dupes:
                    continue
                self.dupes[url] = True
                urls.append(url)
                continue
            else:
                continue

            if url is None:
                continue
            url = self._canonical(url)
            if url in self.dupes:
                continue
            entry = self._media_entry(url)
            if entry is None:
                continue
            self.dupes[url] = True
            urls.append(entry)
        return urls

    def _canonical(self, url):
        """Rewrite a Vimeo player URL to ``<scheme>://vimeo.com/<rest>``."""
        player = self._VIMEO_PLAYER_RE.match(url)
        if player is None:
            return url
        return player.group(1) + "://vimeo.com/" + player.group(2)

    def _media_entry(self, url):
        """Return the output entry for *url*, or None if it is not media."""
        for host in self._MEDIA_HOSTS:
            if host in url:
                return url
        if url.endswith("mp3"):
            # Entry layout is "<image> <url>".  No image is ever recorded by
            # parse(), so the image part is always empty and the entry
            # starts with the separating space.
            return " " + url.replace(" ", "%20")
        return None

    def getAttribute(self, attr, s):
        """Return the quoted value of attribute *attr* found in *s*.

        The attribute name is matched case-insensitively and must not be the
        tail of a longer name (``src`` does not match ``data-src``).  The
        value must be enclosed in a matching pair of single or double
        quotes.

        Returns:
            The value when it starts with "http"; otherwise None (also when
            the attribute is missing or its value is not quoted).
        """
        pattern = r'(?<![\w-])' + re.escape(attr) + r'\s*=\s*(["\'])(.*?)\1'
        found = re.search(pattern, s, re.IGNORECASE | re.DOTALL)
        if found is None:
            return None
        value = found.group(2)
        if not value.startswith("http"):
            return None
        return value

    def getTitle(self, s):
        """Return the text between the first and second ">" of *s*.

        When *s* contains a single ">", that is everything after it.
        Returns None when *s* contains no ">".
        """
        if '>' not in s:
            return None
        return s.split(">")[1]