author     Pepper <pepper@scannerjammer.com>  2015-05-20 11:16:13 -0400
committer  Pepper <pepper@scannerjammer.com>  2015-05-20 11:16:13 -0400
commit     a4916103efb2d97896c456ff0e83064b21e85d25 (patch)
tree       b3eb529e4b96375109626bbeada35d4f8a2667ee /feeder/fb_feed.py
parent     3790eedc2f48c725c586b8c7b924875fedbeb7b4 (diff)
first commit in a while
Diffstat (limited to 'feeder/fb_feed.py')
-rwxr-xr-x  feeder/fb_feed.py  245
1 file changed, 245 insertions, 0 deletions
diff --git a/feeder/fb_feed.py b/feeder/fb_feed.py
new file mode 100755
index 0000000..2677256
--- /dev/null
+++ b/feeder/fb_feed.py
@@ -0,0 +1,245 @@
+#!/usr/bin/python
+
+import facebook
+import scanjam
+import scraper
+import sys
+import time
+import os
+import codecs
+
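+# Poll each group roughly once an hour; group definitions are read from a
+# tab-separated text file (see load_groups below).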
+POLL_INTERVAL = 60 * 60
+GROUPS_FILE = "fb/groups.txt"
+
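+# One shared scraper for external links, plus the Facebook clients the group
+# feeds are fetched with.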
+tumblr = scraper.Scraper ()
+# ryz = facebook.Facebook ("ryz")
+jono = facebook.Facebook ("daytimetelevision")
+pepper = facebook.Facebook ("pepper")
+
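+# Crude URL classification by file extension / hostname.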
+def is_image (url):
+    return url[-3:].lower() in ["gif","jpg","png"] or url[-4:].lower() == "jpeg"
+def is_video (url):
+    if url[0:4] != "http":
+        return False
+    if "youtube.com" in url:
+        return True
+    if "youtu.be" in url:
+        return True
+    if "soundcloud.com" in url:
+        return True
+    if "vimeo.com" in url:
+        return True
+    return False
+
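+# A FacebookGroup polls a single group's feed, remembers the media URLs it
+# finds, and reposts them to a ScanJam room, using a per-group dupe file on
+# disk so nothing is posted twice.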
+class FacebookGroup:
+    def __init__ (self, name, groupid, userid, username, room):
+        print "loaded", name
+        self.name = name
+        self.groupid = groupid
+        self.scanjam = scanjam.ScanJam (userid, username, room)
+        self.first_time = False
+        self.dupe_list = "fb/feeds/" + name + "_dupes.txt"
+        self.dupes = {}
+        self.urls = []
+        self.load_dupes ()
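+    # Push every newly remembered URL to the room, then persist the dupe list
+    # and clear the queue.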
+    def feed (self):
+        if len(self.urls) < 1:
+            print self.name, "nothing to do"
+            return
+        print self.name, "got", len(self.urls), "urls"
+        for url in reversed(self.urls):
+            print url
+            if is_image(url):
+                self.scanjam.image (url)
+            else:
+                self.scanjam.video (url)
+        self.save_dupes ()
+        self.urls = []
+
+
+    def check (self, fb):
+        print self.name, "FETCHING"
+        feed = fb.get_post (self.groupid)
+        print repr(feed)
+        feed = fb.get_feed (self.groupid)
+        print repr(feed)
+    def report (self, fb):
+        print self.name, "FETCHING"
+        feed = fb.get_feed (self.groupid)
+        return self.report_feed (feed)
+    def parse_date (self, date):
+        return time.mktime(time.strptime(date, "%Y-%m-%dT%H:%M:%S+0000"))
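+    # Flatten one page of the feed into simple summary records
+    # (id, date, likes, comments, author, link, description).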
+    def report_feed (self, feed):
+        recs = []
+        for record in feed['data']:
+            # print repr(record)
+            rec = {
+                'fbid': record['id'],
+                'date': self.parse_date(record['created_time']),
+                'datetime': record['created_time'],
+                'likes': 0,
+                'comments': 0,
+                'from': "",
+                'fromid': 0,
+                'type': "",
+                'linkname': "",
+                'link': "",
+                'img': "",
+                'desc': "",
+            }
+            print record['id']
+            print record['created_time']
+            if "likes" in record and "count" in record['likes']:
+                print 'LIKES\t', record["likes"]["count"]
+                rec['likes'] = record['likes']['count']
+            if "comments" in record and "count" in record['comments']:
+                print 'COMMENTS\t', record["comments"]["count"]
+                rec['comments'] = record['comments']['count']
+            if "from" in record:
+                if "name" in record["from"]:
+                    print 'FROM\t', record['from']['name']
+                    rec['from'] = record['from']['name']
+                    rec['fromid'] = record['from']['id']
+            if record['type'] == "photo":
+                if "picture" in record and record['picture'][-6:] == "_s.jpg":
+                    pic = record['picture'].replace("_s.jpg", "_o.jpg")
+                    rec['link'] = pic
+                    print pic
+            if "link" in record:
+                print 'LINK\t', record['link']
+                rec['link'] = record['link']
+                rec['linkname'] = record.get('name', "")  # link posts don't always carry a name
+            if "description" in record:
+                print 'DESC\t', repr( record['description'] )
+                rec['desc'] = record['description']
+            #if "source" in record:
+            #    print 'SOURCE\t', record['source']
+            #if "message" in record:
+            #    # if "http" in record['message']:
+            #    print 'MESSAGE\t', repr( record['message'] )
+            recs.append(rec)
+            print
+        return recs
+
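+    # Like report, but follow the feed's "next" paging links for a couple of
+    # extra pages.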
+    def deep_report (self, fb):
+        print self.name, "DEEP REPORTING", fb.name
+        feed = fb.get_feed (self.groupid)
+        recs = self.report_feed(feed)
+        page = 0
+        for i in xrange(2):
+            page += 1
+            if "data" in feed and "paging" in feed and "next" in feed['paging']:
+                print "SCRAPING page", page
+                feed = fb.get_feed_page (feed['paging']['next'])
+                try:
+                    newrecs = self.report_feed(feed)
+                    recs.extend(newrecs)
+                except:
+                    continue
+            else:
+                print "done"
+                break
+        return recs
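+    # First-run scrape: walk up to ten pages of the feed and remember
+    # everything that looks like media.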
+    def deep_scrape (self, fb):
+        print self.name, "DEEP SCRAPING", fb.name
+        feed = fb.get_feed (self.groupid)
+        self.scrape_feed(fb, feed)
+        page = 0
+        for i in xrange(10):
+            page += 1
+            if "data" in feed and "paging" in feed and "next" in feed['paging']:
+                print "SCRAPING page", page
+                feed = fb.get_feed_page (feed['paging']['next'])
+                try:
+                    self.scrape_feed(fb, feed)
+                except:
+                    continue
+            else:
+                print "done"
+                return
+    def scrape (self, fb):
+        try:
+            print self.name, "SCRAPING", fb.name
+            feed = fb.get_feed (self.groupid)
+            self.scrape_feed(fb, feed)
+        except:
+            print self.name, "ERROR DURING SCRAPE"
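+    # Pull media out of one page of the feed: photos get their full-size
+    # "_o.jpg" URL, video links are remembered directly, and other external
+    # links are handed to the scraper.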
+    def scrape_feed (self, fb, feed):
+        for record in feed['data']:
+            if record['type'] == "photo":
+                if "picture" in record and record['picture'][-6:] == "_s.jpg":
+                    pic = record['picture'].replace("_s.jpg", "_o.jpg")
+                    self.remember(pic)
+            if "link" in record:
+                link = record['link']
+                if "facebook.com" in link:
+                    continue
+                if is_video(link):
+                    self.remember(link)
+                else:
+                    if link in self.dupes:
+                        continue
+                    self.dupes[link] = True
+                    try:
+                        print "SCRAPING", link
+                        urls_from_link = tumblr.scrape(link)
+                        for url in urls_from_link:
+                            if is_video(url):
+                                # self.remember(url)
+                                continue
+                    except:
+                        print "error scraping link"
+    def remember (self, url):
+        if url in self.dupes:
+            return
+        self.dupes[url] = True
+        self.urls.append(url)
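+    # The dupe list is a plain text file of one URL per line; it is rewritten
+    # via a temp file and rename on every save.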
+    def load_dupes (self):
+        if not os.path.exists(self.dupe_list):
+            self.first_time = True
+            return
+        dupelist = open (self.dupe_list, 'r')
+        for line in dupelist:
+            self.dupes[line.strip()] = True
+        dupelist.close()
+    def save_dupes (self):
+        dupelist = codecs.open (self.dupe_list+".tmp", 'w', "utf-8")
+        for k,v in self.dupes.iteritems():
+            try:
+                # the file is already a utf-8 writer, so hand it unicode
+                if isinstance(k, str):
+                    k = k.decode('utf-8', 'replace')
+                dupelist.write(k.strip()+"\n")
+            except UnicodeDecodeError:
+                continue
+        dupelist.close()
+        os.rename(self.dupe_list+".tmp", self.dupe_list)
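+# groups.txt: one group per line, tab-separated as
+# name <TAB> groupid <TAB> userid <TAB> username <TAB> room;
+# lines starting with "#" are comments.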
+def load_groups ():
+    print "LOADING GROUPS"
+    groups = []
+    f = open (GROUPS_FILE, "r")
+    for line in f.readlines():
+        if len(line) < 2 or line[0] == "#":
+            continue
+        name,groupid,userid,username,room = line.strip().split("\t")
+        group = FacebookGroup (name,groupid,userid,username,room)
+        groups.append(group)
+    f.close()
+    return groups
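+# Main loop: scrape each group with every available session (a deep scrape on
+# the first run), post whatever was found, then sleep until the next poll.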
+def poll_forever (groups):
+    while (1):
+        for group in groups:
+            print "scraping", group.name
+            if group.first_time:
+                group.deep_scrape (pepper)
+                group.deep_scrape (jono)
+                # group.deep_scrape (ryz)
+                group.first_time = False  # only deep-scrape once per run
+            else:
+                group.scrape (pepper)
+                group.scrape (jono)
+                # group.scrape (ryz)
+            group.feed ()
+            time.sleep (1)
+        time.sleep(POLL_INTERVAL)
+
+if __name__ == '__main__':
+    groups = load_groups ()
+    poll_forever (groups)
+