#!/usr/bin/python
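# Feed scraper for scannerjammer: pulls media URLs (YouTube, Vimeo,
# SoundCloud, mp3s, jpgs) out of HTML feeds and posts them into a chat
# room via the site's plain-text API.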
import urllib
import time
import re
import os
def now ():
    return int(time.mktime(time.localtime()))
#so I can't change this host to the IP address to test this
#because we're running a name-based virtual hosts setup, right?
#yeah, but we can change /etc/hosts to set sj.com to 127.0.0.1 and it basically should work
# cool
SERVER_HOST = 'www.scannerjammer.com'
SERVER_PORT = 80
API_HEADER = "#@scanjam 0.2\n"
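# expected first line of every API response (checked in Feed.login below)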
HTML_TITLE_RE = re.compile('<title>([^<]+)')
DUPE_LIST = "feeder/dupes.test"
#DUPE_LIST = "feeder/dupes.txt"
FEED_LIST = "feeder/feeds.txt"
#FEED_PATH = "feeder/feeds"
FEED_PATH = "feeder/test"
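# re-fetch a cached feed once it is older than this many seconds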
FEED_STALE_TIME = 3600
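# chat room that receives every post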
FEED_ROOM = "feederbleeder"
API_LOGIN = "http://"+SERVER_HOST+":"+str(SERVER_PORT)+"/api/auth/sneakin"
API_POST_VIDEO = "http://"+SERVER_HOST+":"+str(SERVER_PORT)+"/api/room/video"
API_POST_IMAGE = "http://"+SERVER_HOST+":"+str(SERVER_PORT)+"/api/room/say"
API_LOGOUT = "http://"+SERVER_HOST+":"+str(SERVER_PORT)+"/api/auth/logout"
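# a successful login response looks roughly like this (inferred from
# Feed.login below; only the third tab-separated field, the session
# token, is actually used):
#   #@scanjam 0.2
#   OK
#   <field>\t<field>\t<session token>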
print API_LOGIN
print API_POST_VIDEO
print API_POST_IMAGE
print API_LOGOUT
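# URLs already posted in previous runs; loaded from DUPE_LIST and
# written back after each run so nothing gets posted twice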
dupes = {}
class Feed:
    def __init__ (self, src, title, userid):
        self.src = src
        self.title = title
        self.userid = userid
        self.domain = "http://" + src.split("/")[2]
        self.urls = []
        self.images = []
        self.load()
    def load (self):
        filename = "/".join([FEED_PATH, self.title])
        refresh = True
        # check last update of feed
        if os.path.exists(filename):
            stat = os.stat(filename)
            if stat.st_mtime > now() - FEED_STALE_TIME:
                refresh = False
        # if stale/empty then download
        if refresh:
            print self.title, "loading from web"
            feedhtml = urllib.urlopen(self.src).read()
            if len(feedhtml):
                out = open(filename, 'w')
                out.write(feedhtml)
                out.close()
            self.parse(feedhtml)
        # otherwise, load from disk
        else:
            print self.title, "loading from disk"
            feed = open (filename, 'r')
            feedhtml = feed.read()
            feed.close()
            self.parse(feedhtml)
    # parse urls out of html files
    # display these urls (by date, by source)
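    # crude tokenizer: replacing ">" with "<" and splitting on "<" leaves
    # each element starting with its tag name, e.g. 'a href="..."'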
    def parse (self, html):
        tags = html.replace(">","<").split("<")
        lastimage = ""
        for t in tags:
            url = None
            if len(t) < 1:
                continue
            if t[0] == "a":
                if "href" not in t:
                    continue
                url = self.getAttribute("href", t)
            elif t[0:6] == "iframe":
                if "src" not in t:
                    continue
                url = self.getAttribute("src", t)
            elif t[0:3] == "img":
                if "src" not in t:
                    continue
                if "php" in t:
                    continue
                url = self.getAttribute("src", t)
                if url is None:
                    continue
                if url in dupes:
                    continue
                if url[-3:] != "jpg":
                    continue
                print url
                lastimage = url
                dupes[url.strip()] = True
                self.images.append(url)
                continue
            else:
                continue
            if url is None:
                continue
            if url in dupes:
                continue
            if "youtube.com" in url:
                dupes[url.strip()] = True
                self.urls.append(url)
            if "youtu.be" in url:
                dupes[url.strip()] = True
                self.urls.append(url)
            if "vimeo.com" in url:
                dupes[url.strip()] = True
                # http://player.vimeo.com/video/23731158
                if "http://player.vimeo.com/video/" in url:
                    url = "http://vimeo.com/" + url.replace('http://player.vimeo.com/video/', '')
                self.urls.append(url)
            if "soundcloud.com" in url:
                dupes[url.strip()] = True
                self.urls.append(url)
            if url[-3:] == "mp3":
                dupes[url.strip()] = True
                u = url.replace(" ","%20")
                # pair the mp3 with the most recent image seen in the feed
                self.urls.append(lastimage+" "+u)
    def getAttribute (self, attr, s):
        quote = None
        if '\"' in s:
            quote = '\"'
        elif '\'' in s:
            quote = '\''
        else:
            return None
        attrpos = s.find(attr)
        startquotepos = s.find(quote, attrpos+1)
        endquotepos = s.find(quote, startquotepos+1)
        url = s[startquotepos+1:endquotepos]
        #if url[0] == "/":
        #    url = self.domain + url
        if url[0:4] != "http":
            return None
        return url
    def getTitle (self, s):
        if '>' in s:
            return s.split(">")[1]
        return None
    def login (self):
        print "getting token for", self.title
        data = urllib.urlencode({ 'userid': self.userid, 'username': self.title })
        f = urllib.urlopen(API_LOGIN, data)
        api = f.read().split("\n")
        if api[0] != API_HEADER.strip() or api[1] != "OK":
            print "ERROR GETTING TOKEN"
            return None
        payload = api[2].split("\t")
        print "GOT SESSION:", payload[2]
        time.sleep(0.5)
        return payload[2]
    def report (self):
        if len(self.urls) == 0 and len(self.images) == 0:
            print self.title, "nothing to do"
            return
        self.session = self.login()
        if self.session is None:
            print self.title, "error getting session"
            return
        print ""
        print self.title, "reported", len(self.urls), "urls,", len(self.images), "images"
        for url in reversed(self.urls):
            if "wearesolidgold" in url:
                continue
            print "URL", url
            data = urllib.urlencode({ 'session': self.session, 'room': FEED_ROOM, 'msg': url })
            f = urllib.urlopen(API_POST_VIDEO, data)
            # print f.read()
            time.sleep(5)
        for url in reversed(self.images):
            print "IMG", url
            data = urllib.urlencode({ 'session': self.session, 'room': FEED_ROOM, 'msg': url })
            f = urllib.urlopen(API_POST_IMAGE, data)
            time.sleep(5)
def load_dupes ():
    dupelist = open (DUPE_LIST, 'r')
    for line in dupelist:
        dupes[line.strip()] = True
    dupelist.close()
def load_feeds ():
    feedlist = open (FEED_LIST, 'r')
    feeds = []
    for line in feedlist:
        line = line.strip()
        if not line:
            continue
        src,title,userid = line.split("\t")
        feed = Feed (src,title,userid)
        feeds.append(feed)
    feedlist.close()
    for feed in reversed(feeds):
        feed.report()
def save_dupes ():
    # write to a temp file, then rename into place, so a crash mid-write
    # can't leave a truncated dupe list behind
    dupelist = open (DUPE_LIST+".tmp", 'w')
    for k,v in dupes.iteritems():
        dupelist.write(k.strip()+"\n")
    dupelist.close()
    os.rename(DUPE_LIST+".tmp", DUPE_LIST)
if __name__ == "__main__":
    load_dupes()
    load_feeds()
    save_dupes()