#!/usr/bin/python
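# Feed scraper for scannerjammer: pulls media URLs (YouTube, Vimeo,
# SoundCloud, mp3s, jpgs) out of HTML feeds and posts them into a chat
# room via the site's plain-text API.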
import urllib
import time
import re
import os
def now ():
    return int(time.mktime(time.localtime()))
#so I can't change this host to the IP address to test this
#because we're running a name-based virtual hosts setup, right?
#yeah, but we can change /etc/hosts to set sj.com to 127.0.0.1 and it basically should work
# cool
SERVER_HOST = 'www.scannerjammer.com'
SERVER_PORT = 80
API_HEADER = "#@scanjam 0.2\n"
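# expected first line of every API response (checked in Feed.login below)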
HTML_TITLE_RE = re.compile('<title>([^<]+)')
DUPE_LIST = "feeder/dupes.test"
#DUPE_LIST = "feeder/dupes.txt"
FEED_LIST = "feeder/feeds.txt"
#FEED_PATH = "feeder/feeds"
FEED_PATH = "feeder/test"
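# re-fetch a cached feed once it is older than this many seconds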
FEED_STALE_TIME = 3600
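# chat room that receives every post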
FEED_ROOM = "feederbleeder"
API_LOGIN = "http://"+SERVER_HOST+":"+str(SERVER_PORT)+"/api/auth/sneakin"
API_POST_VIDEO = "http://"+SERVER_HOST+":"+str(SERVER_PORT)+"/api/room/video"
API_POST_IMAGE = "http://"+SERVER_HOST+":"+str(SERVER_PORT)+"/api/room/say"
API_LOGOUT = "http://"+SERVER_HOST+":"+str(SERVER_PORT)+"/api/auth/logout"
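# a successful login response looks roughly like this (inferred from
# Feed.login below; only the third tab-separated field, the session
# token, is actually used):
#   #@scanjam 0.2
#   OK
#   <field>\t<field>\t<session token>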
print API_LOGIN
print API_POST_VIDEO
print API_POST_IMAGE
print API_LOGOUT
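# URLs already posted in previous runs; loaded from DUPE_LIST and
# written back after each run so nothing gets posted twice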
dupes = {}
class Feed:
    def __init__ (self, src, title, userid):
        self.src = src
        self.title = title
        self.userid = userid
        self.domain = "http://" + src.split("/")[2]
        self.urls = []
        self.images = []
        self.load()
    def load (self):
        filename = "/".join([FEED_PATH, self.title])
        refresh = True
        # check last update of feed
        if os.path.exists(filename):
            stat = os.stat(filename)
            if stat.st_mtime > now() - FEED_STALE_TIME:
                refresh = False
        # if stale/empty then download
        if refresh:
            print self.title, "loading from web"
            feedhtml = urllib.urlopen(self.src).read()
            if len(feedhtml):
                out = open(filename, 'w')
                out.write(feedhtml)
                out.close()
            self.parse(feedhtml)
        # otherwise, load from disk
        else:
            print self.title, "loading from disk"
            feed = open (filename, 'r')
            feedhtml = feed.read()
            feed.close()
            self.parse(feedhtml)
    # parse urls out of html files
    # display these urls (by date, by source)
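    # crude tokenizer: replacing ">" with "<" and splitting on "<" leaves
    # each element starting with its tag name, e.g. 'a href="..."'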
    def parse (self, html):
        tags = html.replace(">","<").split("<")
        lastimage = ""
        for t in tags:
            url = None
            if len(t) < 1:
                continue
            if t[0] == "a":
                if "href" not in t:
                    continue
                url = self.getAttribute("href", t)
            elif t[0:6] == "iframe":
                if "src" not in t:
                    continue
                url = self.getAttribute("src", t)
            elif t[0:3] == "img":
                if "src" not in t:
                    continue
                if "php" in t:
                    continue
                url = self.getAttribute("src", t)
                if url is None:
                    continue
                if url in dupes:
                    continue
                if url[-3:] != "jpg":
                    continue
                print url
                lastimage = url
                dupes[url.strip()] = True
                self.images.append(url)
                continue
            else:
                continue
            if url is None:
                continue
            if url in dupes:
                continue
            if "youtube.com" in url:
                dupes[url.strip()] = True
                self.urls.append(url)
            if "youtu.be" in url:
                dupes[url.strip()] = True
                self.urls.append(url)
            if "vimeo.com" in url:
                dupes[url.strip()] = True
                # http://player.vimeo.com/video/23731158
                if "http://player.vimeo.com/video/" in url:
                    url = "http://vimeo.com/" + url.replace('http://player.vimeo.com/video/', '')
                self.urls.append(url)
            if "soundcloud.com" in url:
                dupes[url.strip()] = True
                self.urls.append(url)
            if url[-3:] == "mp3":
                dupes[url.strip()] = True
                u = url.replace(" ","%20")
                # pair the mp3 with the most recent image seen in the feed
                self.urls.append(lastimage+" "+u)
    def getAttribute (self, attr, s):
        quote = None
        if '\"' in s:
            quote = '\"'
        elif '\'' in s:
            quote = '\''
        else:
            return None
        attrpos = s.find(attr)
        startquotepos = s.find(quote, attrpos+1)
        endquotepos = s.find(quote, startquotepos+1)
        url = s[startquotepos+1:endquotepos]
        #if url[0] == "/":
        #    url = self.domain + url
        if url[0:4] != "http":
            return None
        return url
    def getTitle (self, s):
        if '>' in s:
            return s.split(">")[1]
        return None
    def login (self):
        print "getting token for", self.title
        data = urllib.urlencode({ 'userid': self.userid, 'username': self.title })
        f = urllib.urlopen(API_LOGIN, data)
        api = f.read().split("\n")
        if api[0] != API_HEADER.strip() or api[1] != "OK":
            print "ERROR GETTING TOKEN"
            return None
        payload = api[2].split("\t")
        print "GOT SESSION:", payload[2]
        time.sleep(0.5)
        return payload[2]
    def report (self):
        if len(self.urls) == 0 and len(self.images) == 0:
            print self.title, "nothing to do"
            return
        self.session = self.login()
        if self.session is None:
            print self.title, "error getting session"
            return
        print ""
        print self.title, "reported", len(self.urls), "urls,", len(self.images), "images"
        for url in reversed(self.urls):
            if "wearesolidgold" in url:
                continue
            print "URL", url
            data = urllib.urlencode({ 'session': self.session, 'room': FEED_ROOM, 'msg': url })
            f = urllib.urlopen(API_POST_VIDEO, data)
            # print f.read()
            time.sleep(5)
        for url in reversed(self.images):
            print "IMG", url
            data = urllib.urlencode({ 'session': self.session, 'room': FEED_ROOM, 'msg': url })
            f = urllib.urlopen(API_POST_IMAGE, data)
            time.sleep(5)
def load_dupes ():
    dupelist = open (DUPE_LIST, 'r')
    for line in dupelist:
        dupes[line.strip()] = True
    dupelist.close()
def load_feeds ():
    feedlist = open (FEED_LIST, 'r')
    feeds = []
    for line in feedlist:
        line = line.strip()
        if not line:
            continue
        src,title,userid = line.split("\t")
        feed = Feed (src,title,userid)
        feeds.append(feed)
    feedlist.close()
    for feed in reversed(feeds):
        feed.report()
def save_dupes ():
    # write to a temp file, then rename into place, so a crash mid-write
    # can't leave a truncated dupe list behind
    dupelist = open (DUPE_LIST+".tmp", 'w')
    for k,v in dupes.iteritems():
        dupelist.write(k.strip()+"\n")
    dupelist.close()
    os.rename(DUPE_LIST+".tmp", DUPE_LIST)
if __name__ == "__main__":
    load_dupes()
    load_feeds()
    save_dupes()