author    Pepper <pepper@scannerjammer.com>    2015-05-20 11:16:13 -0400
committer Pepper <pepper@scannerjammer.com>    2015-05-20 11:16:13 -0400
commit    a4916103efb2d97896c456ff0e83064b21e85d25 (patch)
tree      b3eb529e4b96375109626bbeada35d4f8a2667ee /feeder/feeder.py.bak
parent    3790eedc2f48c725c586b8c7b924875fedbeb7b4 (diff)
first commit in a while
Diffstat (limited to 'feeder/feeder.py.bak')
-rwxr-xr-x  feeder/feeder.py.bak  232
1 file changed, 232 insertions(+), 0 deletions(-)
diff --git a/feeder/feeder.py.bak b/feeder/feeder.py.bak
new file mode 100755
index 0000000..6197ba3
--- /dev/null
+++ b/feeder/feeder.py.bak
@@ -0,0 +1,232 @@
+#!/usr/bin/python
+
+import urllib
+import time
+import re
+import os
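+
+# current time as integer Unix seconds (effectively int(time.time()))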
+def now ():
+ return int(time.mktime(time.localtime()))
+
+#so I can't change this host to the IP address to test this
+#because we're running a name-based virtual hosts setup, right?
+#yeah, but we can change /etc/hosts to set sj.com to 127.0.0.1 and it basically should work
+# cool
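+# e.g. to test locally, point the vhost name at loopback in /etc/hosts:
+#   127.0.0.1   www.scannerjammer.com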
+SERVER_HOST = 'www.scannerjammer.com'
+SERVER_PORT = 80
+
+API_HEADER = "#@scanjam 0.2\n"
+
+HTML_TITLE_RE = re.compile('<title>([^<]+)</title>')  # compiled but currently unused
+
+DUPE_LIST = "feeder/dupes.test"
+#DUPE_LIST = "feeder/dupes.txt"
+FEED_LIST = "feeder/feeds.txt"
+#FEED_PATH = "feeder/feeds"
+FEED_PATH = "feeder/test"
+FEED_STALE_TIME = 3600
+FEED_ROOM = "feederbleeder"
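+
+# on-disk formats (see load_dupes/load_feeds below): the dupe list is one URL
+# per line; the feed list is tab-separated lines of <src url>, <title>, <userid>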
+
+API_LOGIN = "http://"+SERVER_HOST+":"+str(SERVER_PORT)+"/api/auth/sneakin"
+API_POST_VIDEO = "http://"+SERVER_HOST+":"+str(SERVER_PORT)+"/api/room/video"
+API_POST_IMAGE = "http://"+SERVER_HOST+":"+str(SERVER_PORT)+"/api/room/say"
+API_LOGOUT = "http://"+SERVER_HOST+":"+str(SERVER_PORT)+"/api/auth/logout"
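+
+# note: API_LOGOUT is assembled (and printed) but never requested anywhere below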
+
+print API_LOGIN
+print API_POST_VIDEO
+print API_POST_IMAGE
+print API_LOGOUT
+
+
+dupes = {}
+
+class Feed:
+ def __init__ (self, src, title, userid):
+ self.src = src
+ self.title = title
+ self.userid = userid
+        self.domain = "http://" + src.split("/")[2]  # scheme + host of the feed URL (assumes http)
+ self.urls = []
+ self.images = []
+ self.load()
+
+ def load (self):
+ filename = "/".join([FEED_PATH, self.title])
+ refresh = True
+
+ # check last update of feed
+ if os.path.exists(filename):
+ stat = os.stat(filename)
+ if stat.st_mtime > now() - FEED_STALE_TIME:
+ refresh = False
+
+ # if stale/empty then download
+ if refresh:
+ print self.title, "loading from web"
+ feedhtml = urllib.urlopen(self.src).read()
+ if len(feedhtml):
+                out = open(filename, 'w')
+                out.write(feedhtml)
+                out.close()
+ self.parse(feedhtml)
+
+ # otherwise, load from disk
+ else:
+ print self.title, "loading from disk"
+ feed = open (filename, 'r')
+ feedhtml = feed.read()
+ feed.close()
+ self.parse(feedhtml)
+
+ # parse urls out of html files
+ # display these urls (by date, by source)
+ def parse (self, html):
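+        # crude tag scan: split the html on "<" (also treating the escaped
+        # "&gt;" entity as a split point) and key off each token's first chars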
+ tags = html.replace("&gt;","<").split("<")
+ lastimage = ""
+ for t in tags:
+ url = None
+ if len(t) < 1:
+ continue
+ if t[0] == "a":
+ if "href" not in t:
+ continue
+ url = self.getAttribute("href", t)
+            elif t[0:6] == "iframe":
+ if "src" not in t:
+ continue
+ url = self.getAttribute("src", t)
+ elif t[0:3] == "img":
+ if "src" not in t:
+ continue
+ if "php" in t:
+ continue
+ url = self.getAttribute("src", t)
+ if url is None:
+ continue
+ if url in dupes:
+ continue
+ if url[-3:] != "jpg":
+ continue
+ print url
+ lastimage = url
+ dupes[url.strip()] = True
+ self.images.append(url)
+ continue
+ else:
+ continue
+
+ if url is None:
+ continue
+ if url in dupes:
+ continue
+ if "youtube.com" in url:
+ dupes[url.strip()] = True
+ self.urls.append(url)
+ if "youtu.be" in url:
+ dupes[url.strip()] = True
+ self.urls.append(url)
+ if "vimeo.com" in url:
+ dupes[url.strip()] = True
+ # http://player.vimeo.com/video/23731158
+ if "http://player.vimeo.com/video/" in url:
+ url = "http://vimeo.com/" + url.replace('http://player.vimeo.com/video/', '')
+ self.urls.append(url)
+ if "soundcloud.com" in url:
+ dupes[url.strip()] = True
+ self.urls.append(url)
+ if url[-3:] == "mp3":
+ dupes[url.strip()] = True
+ u = url.replace(" ","%20")
+ self.urls.append(lastimage+" "+u)
+
+ def getAttribute (self, attr, s):
+ quote = None
+ if '\"' in s:
+ quote = '\"'
+ elif '\'' in s:
+ quote = '\''
+ else:
+ return None
+
+ attrpos = s.find(attr)
+ startquotepos = s.find(quote, attrpos+1)
+ endquotepos = s.find(quote, startquotepos+1)
+ url = s[startquotepos+1:endquotepos]
+ #if url[0] == "/":
+ # url = self.domain + url
+ if url[0:4] != "http":
+ return None
+ return url
+ def getTitle (self, s):
+ if '>' in s:
+ return s.split(">")[1]
+ return None
+ def login (self):
+ print "getting token for", self.title
+ data = urllib.urlencode({ 'userid': self.userid, 'username': self.title })
+ f = urllib.urlopen(API_LOGIN, data)
+ api = f.read().split("\n")
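+        # expected response is three lines: the "#@scanjam 0.2" header, "OK",
+        # then a tab-separated payload whose third field is the session token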
+        if api[0] != API_HEADER.strip() or api[1] != "OK":
+ print "ERROR GETTING TOKEN"
+ return None
+ payload = api[2].split("\t")
+ print "GOT SESSION:", payload[2]
+ time.sleep(0.5)
+ return payload[2]
+ def report (self):
+ if len(self.urls) == 0 and len(self.images) == 0:
+ print self.title, "nothing to do"
+ return
+ self.session = self.login()
+ if self.session is None:
+ print self.title, "error getting session"
+ return
+ print ""
+ print self.title, "reported", len(self.urls), "urls,", len(self.images), "images"
+ for url in reversed(self.urls):
+ if "wearesolidgold" in url:
+ continue
+ print "URL", url
+ data = urllib.urlencode({ 'session': self.session, 'room': FEED_ROOM, 'msg': url })
+ f = urllib.urlopen(API_POST_VIDEO, data)
+ # print f.read()
+            time.sleep(5)  # crude rate limit between posts
+ for url in reversed(self.images):
+ print "IMG", url
+ data = urllib.urlencode({ 'session': self.session, 'room': FEED_ROOM, 'msg': url })
+ f = urllib.urlopen(API_POST_IMAGE, data)
+ time.sleep(5)
+
+def load_dupes ():
+ dupelist = open (DUPE_LIST, 'r')
+ for line in dupelist:
+ dupes[line.strip()] = True
+ dupelist.close()
+
+def load_feeds ():
+    feedlist = open (FEED_LIST, 'r')
+    feeds = []
+    for line in feedlist:
+        src,title,userid = line.strip().split("\t")
+        feed = Feed (src,title,userid)
+        feeds.append(feed)
+    feedlist.close()
+    for feed in reversed(feeds):
+        feed.report()
+
+def save_dupes ():
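+    # write to a temp file and rename it over the original; on POSIX the
+    # rename is atomic, so a crash mid-write can't truncate the real list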
+ dupelist = open (DUPE_LIST+".tmp", 'w')
+    for k in dupes:
+ dupelist.write(k.strip()+"\n")
+ dupelist.close()
+ os.rename(DUPE_LIST+".tmp", DUPE_LIST)
+
+load_dupes()
+load_feeds()
+save_dupes()
+