| | | |
|---|---|---|
| author | Pepper <pepper@scannerjammer.com> | 2015-05-20 11:16:13 -0400 |
| committer | Pepper <pepper@scannerjammer.com> | 2015-05-20 11:16:13 -0400 |
| commit | a4916103efb2d97896c456ff0e83064b21e85d25 (patch) | |
| tree | b3eb529e4b96375109626bbeada35d4f8a2667ee /feeder/feeder.py.bak | |
| parent | 3790eedc2f48c725c586b8c7b924875fedbeb7b4 (diff) | |
first commit in a while
Diffstat (limited to 'feeder/feeder.py.bak')
| | | |
|---|---|---|
| -rwxr-xr-x | feeder/feeder.py.bak | 232 |

1 file changed, 232 insertions, 0 deletions
diff --git a/feeder/feeder.py.bak b/feeder/feeder.py.bak
new file mode 100755
index 0000000..6197ba3
--- /dev/null
+++ b/feeder/feeder.py.bak
@@ -0,0 +1,232 @@
+#!/usr/bin/python
+
+import commands
+import MySQLdb
+import urllib
+import sha
+import sys
+import time
+import re
+import os
+
+def now ():
+    return int(time.mktime(time.localtime()))
+
+#so I can't change this host to the IP address to test this
+#because we're running a name-based virtual hosts setup, right?
+#yeah, but we can change /etc/hosts to set sj.com to 127.0.0.1 and it basically should work
+# cool
+SERVER_HOST = 'www.scannerjammer.com'
+SERVER_PORT = 80
+
+API_HEADER = "#@scanjam 0.2\n"
+
+HTML_TITLE_RE = re.compile('<title>([^<]+)</title>')
+
+DUPE_LIST = "feeder/dupes.test"
+#DUPE_LIST = "feeder/dupes.txt"
+FEED_LIST = "feeder/feeds.txt"
+#FEED_PATH = "feeder/feeds"
+FEED_PATH = "feeder/test"
+FEED_STALE_TIME = 3600
+FEED_ROOM = "feederbleeder"
+
+API_LOGIN = "http://"+SERVER_HOST+":"+str(SERVER_PORT)+"/api/auth/sneakin"
+API_POST_VIDEO = "http://"+SERVER_HOST+":"+str(SERVER_PORT)+"/api/room/video"
+API_POST_IMAGE = "http://"+SERVER_HOST+":"+str(SERVER_PORT)+"/api/room/say"
+API_LOGOUT = "http://"+SERVER_HOST+":"+str(SERVER_PORT)+"/api/auth/logout"
+
+print API_LOGIN
+print API_POST_VIDEO
+print API_POST_IMAGE
+print API_LOGOUT
+
+
+dupes = {}
+
+class Feed:
+    def __init__ (self, src, title, userid):
+        self.src = src
+        self.title = title
+        self.userid = userid
+        self.domain = "http://" + src.split("/")[2]
+        self.urls = []
+        self.images = []
+        self.load()
+
+    def load (self):
+        filename = "/".join([FEED_PATH, self.title])
+        refresh = True
+
+        # check last update of feed
+        if os.path.exists(filename):
+            stat = os.stat(filename)
+            if stat.st_mtime > now() - FEED_STALE_TIME:
+                refresh = False
+
+        # if stale/empty then download
+        if refresh:
+            print self.title, "loading from web"
+            feedhtml = urllib.urlopen(self.src).read()
+            if len(feedhtml):
+                out = open(filename, 'w')
+                out.write(feedhtml)
+            self.parse(feedhtml)
+
+        # otherwise, load from disk
+        else:
+            print self.title, "loading from disk"
+            feed = open (filename, 'r')
+            feedhtml = feed.read()
+            feed.close()
+            self.parse(feedhtml)
+
+    # parse urls out of html files
+    # display these urls (by date, by source)
+    def parse (self, html):
+        tags = html.replace(">","<").split("<")
+        lastimage = ""
+        for t in tags:
+            url = None
+            if len(t) < 1:
+                continue
+            if t[0] == "a":
+                if "href" not in t:
+                    continue
+                url = self.getAttribute("href", t)
+            elif t[0:6] == "iframe":
+                if "src" not in t:
+                    continue
+                url = self.getAttribute("src", t)
+            elif t[0:3] == "img":
+                if "src" not in t:
+                    continue
+                if "php" in t:
+                    continue
+                url = self.getAttribute("src", t)
+                if url is None:
+                    continue
+                if url in dupes:
+                    continue
+                if url[-3:] != "jpg":
+                    continue
+                print url
+                lastimage = url
+                dupes[url.strip()] = True
+                self.images.append(url)
+                continue
+            else:
+                continue
+
+            if url is None:
+                continue
+            if url in dupes:
+                continue
+            if "youtube.com" in url:
+                dupes[url.strip()] = True
+                self.urls.append(url)
+            if "youtu.be" in url:
+                dupes[url.strip()] = True
+                self.urls.append(url)
+            if "vimeo.com" in url:
+                dupes[url.strip()] = True
+                # http://player.vimeo.com/video/23731158
+                if "http://player.vimeo.com/video/" in url:
+                    url = "http://vimeo.com/" + url.replace('http://player.vimeo.com/video/', '')
+                self.urls.append(url)
+            if "soundcloud.com" in url:
+                dupes[url.strip()] = True
+                self.urls.append(url)
+            if url[-3:] == "mp3":
+                dupes[url.strip()] = True
+                u = url.replace(" ","%20")
+                self.urls.append(lastimage+" "+u)
+
+    def getAttribute (self, attr, s):
+        quote = None
+        if '\"' in s:
+            quote = '\"'
+        elif '\'' in s:
+            quote = '\''
+        else:
+            return None
+
+        attrpos = s.find(attr)
+        startquotepos = s.find(quote, attrpos+1)
+        endquotepos = s.find(quote, startquotepos+1)
+        url = s[startquotepos+1:endquotepos]
+        #if url[0] == "/":
+        #    url = self.domain + url
+        if url[0:4] != "http":
+            return None
+        return url
+
+    def getTitle (self, s):
+        if '>' in s:
+            return s.split(">")[1]
+        return None
+
+    def login (self):
+        print "getting token for", self.title
+        data = urllib.urlencode({ 'userid': self.userid, 'username': self.title })
+        f = urllib.urlopen(API_LOGIN, data)
+        api = f.read().split("\n")
+        if api[0] != "#@scanjam 0.2" or api[1] != "OK":
+            print "ERROR GETTING TOKEN"
+            return None
+        payload = api[2].split("\t")
+        print "GOT SESSION:", payload[2]
+        time.sleep(0.5)
+        return payload[2]
+
+    def report (self):
+        if len(self.urls) == 0 and len(self.images) == 0:
+            print self.title, "nothing to do"
+            return
+        self.session = self.login()
+        if self.session is None:
+            print self.title, "error getting session"
+            return
+        print ""
+        print self.title, "reported", len(self.urls), "urls,", len(self.images), "images"
+        for url in reversed(self.urls):
+            if "wearesolidgold" in url:
+                continue
+            print "URL", url
+            data = urllib.urlencode({ 'session': self.session, 'room': FEED_ROOM, 'msg': url })
+            f = urllib.urlopen(API_POST_VIDEO, data)
+            # print f.read()
+            time.sleep(5)
+        for url in reversed(self.images):
+            print "IMG", url
+            data = urllib.urlencode({ 'session': self.session, 'room': FEED_ROOM, 'msg': url })
+            f = urllib.urlopen(API_POST_IMAGE, data)
+            time.sleep(5)
+
+def load_dupes ():
+    dupelist = open (DUPE_LIST, 'r')
+    for line in dupelist:
+        dupes[line.strip()] = True
+    dupelist.close()
+
+def load_feeds ():
+    feedlist = open (FEED_LIST, 'r')
+    feeds = []
+    for line in feedlist:
+        print line
+        src,title,userid = line.strip().split("\t")
+        print src, title, userid
+        #feed = Feed (src,title,userid)
+        #feeds.append(feed)
+    feedlist.close()
+    return  # bail out before reporting while the Feed() calls above stay commented out for testing
+    for feed in reversed(feeds):
+        feed.report()
+
+def save_dupes ():
+    dupelist = open (DUPE_LIST+".tmp", 'w')
+    for k,v in dupes.iteritems():
+        dupelist.write(k.strip()+"\n")
+    dupelist.close()
+    os.rename(DUPE_LIST+".tmp", DUPE_LIST)
+
+load_dupes()
+load_feeds()
+save_dupes()
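The load() method above refreshes a feed from the network only when the on-disk copy is older than FEED_STALE_TIME (an hour); otherwise it reads the cached file. Below is a minimal Python 3 sketch of the same mtime-based cache check; load_cached and its arguments are hypothetical names, not part of the script.

```python
import os
import time
import urllib.request

FEED_STALE_TIME = 3600  # one hour, same constant as the script

def load_cached(filename, src):
    """Return feed HTML, hitting the network only when the cache is stale."""
    if os.path.exists(filename) and \
       os.stat(filename).st_mtime > time.time() - FEED_STALE_TIME:
        with open(filename) as f:   # fresh enough: read from disk
            return f.read()
    body = urllib.request.urlopen(src).read().decode("utf-8", "replace")
    if body:                        # only cache non-empty responses
        with open(filename, "w") as f:
            f.write(body)
    return body
```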
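parse() deliberately avoids a real HTML parser: it rewrites every ">" as "<" and splits on "<", leaving tag bodies and text nodes interleaved in one flat list, where any chunk starting with "a", "iframe", or "img" is treated as a tag. A quick Python 3 demonstration of what that tokenizer yields:

```python
html = '<p>hi <a href="http://example.com/x.jpg">pic</a></p>'
tags = html.replace(">", "<").split("<")
print(tags)
# ['', 'p', 'hi ', 'a href="http://example.com/x.jpg"', 'pic', '/a', '/p', '']
```

Text chunks like 'hi ' simply fail the first-character tests and are skipped, which is why the crude tokenizer works well enough for link scraping.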
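login() and report() spell out the scanjam 0.2 wire protocol: POST form-encoded fields, then read a newline-separated response whose first line must be the header "#@scanjam 0.2", second line "OK", and third line a tab-separated record whose third field is the session token; posting a URL then sends session, room, and msg to /api/room/video or /api/room/say. A hedged Python 3 restatement of that exchange (the endpoints are the ones the script prints; whether the server still answers them is unknown):

```python
import urllib.parse
import urllib.request

SERVER = "http://www.scannerjammer.com:80"  # from SERVER_HOST / SERVER_PORT above

def sneak_in(userid, username):
    """POST credentials, parse the three-line scanjam response,
    and return the session token (third tab-separated field)."""
    data = urllib.parse.urlencode({"userid": userid, "username": username}).encode()
    resp = urllib.request.urlopen(SERVER + "/api/auth/sneakin", data)
    lines = resp.read().decode().split("\n")
    if lines[0] != "#@scanjam 0.2" or lines[1] != "OK":
        return None
    return lines[2].split("\t")[2]

def post_video(session, url, room="feederbleeder"):
    """Post one video URL into a room, as report() does."""
    data = urllib.parse.urlencode({"session": session, "room": room, "msg": url}).encode()
    urllib.request.urlopen(SERVER + "/api/room/video", data)
```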
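save_dupes() writes the whole dupe list to DUPE_LIST + ".tmp" and then os.rename()s it over the real file, so an interrupted run never leaves a truncated list behind. The same pattern in Python 3:

```python
import os

def save_dupes_atomically(path, dupes):
    """Write the dupe set to path + '.tmp', then rename it over the original.
    rename() is atomic on POSIX within one filesystem, so readers never
    observe a half-written list."""
    with open(path + ".tmp", "w") as f:
        for url in dupes:
            f.write(url.strip() + "\n")
    os.rename(path + ".tmp", path)
```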
