diff options
Diffstat (limited to 'feeder/feeder.py')
| -rwxr-xr-x | feeder/feeder.py | 237 |
1 file changed, 237 insertions, 0 deletions
diff --git a/feeder/feeder.py b/feeder/feeder.py new file mode 100755 index 0000000..39b43e5 --- /dev/null +++ b/feeder/feeder.py @@ -0,0 +1,237 @@ +#!/usr/bin/python + +import commands +#import MySQLdb +import urllib +import sha +import sys +import time +import re +import os +def now (): + return int(time.mktime(time.localtime())) + +SERVER_HOST = 'scannerjammer.com' +SERVER_PORT = 80 + +API_HEADER = "#@scanjam 0.2\n" + +HTML_TITLE_RE = re.compile('<title>([^<]+)</title>') + +DUPE_LIST = "feeder/dupes.test" +#DUPE_LIST = "feeder/dupes.txt" +FEED_LIST = "feeder/feeds.txt" +#FEED_PATH = "feeder/feeds" +FEED_PATH = "feeder/test" +FEED_STALE_TIME = 0; +#FEED_STALE_TIME = 3600 +FEED_ROOM = "feederbleeder" + + +API_LOGIN = "http://"+SERVER_HOST+":"+str(SERVER_PORT)+"/api/auth/sneakin"+"/" +API_POST_VIDEO = "http://"+SERVER_HOST+":"+str(SERVER_PORT)+"/api/room/video"+"/" +API_POST_IMAGE = "http://"+SERVER_HOST+":"+str(SERVER_PORT)+"/api/room/say"+"/" +API_LOGOUT = "http://"+SERVER_HOST+":"+str(SERVER_PORT)+"/api/auth/logout"+"/" + +#{{{ **USE IF YOU ADD APPEND_SLASH +#API_LOGIN = "http://"+SERVER_HOST+":"+str(SERVER_PORT)+"/api/auth/sneakin" +#API_POST_VIDEO = "http://"+SERVER_HOST+":"+str(SERVER_PORT)+"/api/room/video" +#API_POST_IMAGE = "http://"+SERVER_HOST+":"+str(SERVER_PORT)+"/api/room/say" +#API_LOGOUT = "http://"+SERVER_HOST+":"+str(SERVER_PORT)+"/api/auth/logout" +#}}} + +print API_LOGIN +print API_POST_VIDEO +print API_POST_IMAGE +print API_LOGOUT + + +dupes = {} + +class Feed: + def __init__ (self, src, title, userid): + self.src = src + self.title = title + self.userid = userid + self.domain = "http://" + src.split("/")[2] + self.urls = [] + self.images = [] + self.load() + + def load (self): + filename = "/".join([FEED_PATH, self.title]) + refresh = True + + # check last update of feed + if os.path.exists(filename): + stat = os.stat(filename) + if stat.st_mtime > now() - FEED_STALE_TIME: + refresh = False + + # if stale/empty then download + if refresh: + 
print self.title, "loading from web" + feedhtml = urllib.urlopen(self.src).read() + if len(feedhtml): + out = open(filename, 'w') + out.write(feedhtml) + self.parse(feedhtml) + + # otherwise, load from disk + else: + print self.title, "loading from disk" + feed = open (filename, 'r') + feedhtml = feed.read() + feed.close() + self.parse(feedhtml) + + # parse urls out of html files + # display these urls (by date, by source) + def parse (self, html): + tags = html.replace(">","<").split("<") + lastimage = "" + for t in tags: + url = None + if len(t) < 1: + continue + if t[0] == "a": + if "href" not in t: + continue + url = self.getAttribute("href", t) + elif t[0] == "iframe": + if "src" not in t: + continue + url = self.getAttribute("src", t) + elif t[0:3] == "img": + if "src" not in t: + continue + if "php" in t: + continue + url = self.getAttribute("src", t) + if url is None: + continue + if url in dupes: + continue + if url[-3:] != "jpg": + continue + print url + lastimage = url + dupes[url.strip()] = True + self.images.append(url) + continue + else: + continue + + if url is None: + continue + if url in dupes: + continue + if "youtube.com" in url: + dupes[url.strip()] = True + self.urls.append(url) + if "youtu.be" in url: + dupes[url.strip()] = True + self.urls.append(url) + if "vimeo.com" in url: + dupes[url.strip()] = True + # http://player.vimeo.com/video/23731158 + if "http://player.vimeo.com/video/" in url: + url = "http://vimeo.com/" + url.replace('http://player.vimeo.com/video/', '') + self.urls.append(url) + if "soundcloud.com" in url: + dupes[url.strip()] = True + self.urls.append(url) + if url[-3:] == "mp3": + dupes[url.strip()] = True + u = url.replace(" ","%20") + self.urls.append(lastimage+" "+u) + + def getAttribute (self, attr, s): + quote = None + if '\"' in s: + quote = '\"' + elif '\'' in s: + quote = '\'' + else: + return None + + attrpos = s.find(attr) + startquotepos = s.find(quote, attrpos+1) + endquotepos = s.find(quote, startquotepos+1) + 
url = s[startquotepos+1:endquotepos] + #if url[0] == "/": + # url = self.domain + url + if url[0:4] != "http": + return None + return url + def getTitle (self, s): + if '>' in s: + return s.split(">")[1] + return None + def login (self): + print "getting token for", self.title + data = urllib.urlencode({ 'userid': self.userid, 'username': self.title }) + f = urllib.urlopen(API_LOGIN, data) + api = f.read().split("\n") + if api[0] != "#@scanjam 0.3b" or api[1] != "OK": + print "ERROR GETTING TOKEN" + return None + payload = api[2].split("\t") + print "GOT SESSION:", payload[2] + time.sleep(0.5) + return payload[2] + def report (self): + if len(self.urls) == 0 and len(self.images) == 0: + print self.title, "nothing to do" + return + self.session = self.login() + if self.session is None: + print self.title, "error getting session" + return + print "" + print self.title, "reported", len(self.urls), "urls,", len(self.images), "images" + for url in reversed(self.urls): + if "wearesolidgold" in url: + continue + if url == "http://vimeo.com/": + continue + print "URL", url + data = urllib.urlencode({ 'session': self.session, 'room': FEED_ROOM, 'msg': url }) + f = urllib.urlopen(API_POST_VIDEO, data) + print f.read() + print data, API_POST_VIDEO; exit(0); + time.sleep(5) + for url in reversed(self.images): + print "IMG", url + data = urllib.urlencode({ 'session': self.session, 'room': FEED_ROOM, 'msg': url }) + f = urllib.urlopen(API_POST_IMAGE, data) + time.sleep(5) + +def load_dupes (): + dupelist = open (DUPE_LIST, 'r') + for line in dupelist: + dupes[line.strip()] = True + dupelist.close() + +def load_feeds (): + feedlist = open (FEED_LIST, 'r') + feeds = [] + for line in feedlist: + src,title,userid = line.strip().split("\t") + feed = Feed (src,title,userid) + feeds.append(feed) + feedlist.close() + for feed in reversed(feeds): + feed.report() + +def save_dupes (): + dupelist = open (DUPE_LIST+".tmp", 'w') + for k,v in dupes.iteritems(): + 
dupelist.write(k.strip()+"\n") + dupelist.close() + os.rename(DUPE_LIST+".tmp", DUPE_LIST) + +load_dupes() +load_feeds() +save_dupes() + |
