#!/usr/bin/python
# Poll Facebook group feeds, collect image/video links (with de-duping),
# and push anything new into the matching ScanJam room. Non-video links
# are handed to the page scraper so embedded media can be picked up too.
import facebook
import scanjam
import scraper
import sys
import time
import os
import codecs

POLL_INTERVAL = 60 * 60          # seconds between polling passes
GROUPS_FILE = "fb/groups.txt"    # tab-separated list of groups to watch

tumblr = scraper.Scraper()

# Facebook API sessions used for fetching group feeds.
# ryz = facebook.Facebook("ryz")
jono = facebook.Facebook("daytimetelevision")
pepper = facebook.Facebook("pepper")


def is_image(url):
    # Crude extension check on the URL.
    return url[-3:].lower() in ["gif", "jpg", "png"] or url[-4:].lower() == "jpeg"


def is_video(url):
    # Treat links to the common video/audio hosts as "video" posts.
    if url[0:4] != "http":
        return False
    if "youtube.com" in url:
        return True
    if "youtu.be" in url:
        return True
    if "soundcloud.com" in url:
        return True
    if "vimeo.com" in url:
        return True
    return False


class FacebookGroup:
    def __init__(self, name, groupid, userid, username, room):
        print "loaded", name
        self.name = name
        self.groupid = groupid
        self.scanjam = scanjam.ScanJam(userid, username, room)
        self.first_time = False
        self.dupe_list = "fb/feeds/" + name + "_dupes.txt"
        self.dupes = {}      # every URL we have ever seen for this group
        self.urls = []       # new URLs collected since the last feed()
        self.load_dupes()

    def feed(self):
        # Post everything collected by the last scrape to ScanJam, oldest first.
        if len(self.urls) < 1:
            print self.name, "nothing to do"
            return
        print self.name, "got", len(self.urls), "urls"
        for url in reversed(self.urls):
            print url
            if is_image(url):
                self.scanjam.image(url)
            else:
                self.scanjam.video(url)
        self.save_dupes()
        self.urls = []

    def check(self, fb):
        # Debugging helper: dump the raw post and feed responses.
        print self.name, "FETCHING"
        feed = fb.get_post(self.groupid)
        print repr(feed)
        feed = fb.get_feed(self.groupid)
        print repr(feed)

    def report(self, fb):
        print self.name, "FETCHING"
        feed = fb.get_feed(self.groupid)
        return self.report_feed(feed)

    def parse_date(self, date):
        return time.mktime(time.strptime(date, "%Y-%m-%dT%H:%M:%S+0000"))

    def report_feed(self, feed):
        # Flatten one page of the feed into a list of simple record dicts.
        recs = []
        for record in feed['data']:
            # print repr(record)
            rec = {
                'fbid': record['id'],
                'date': self.parse_date(record['created_time']),
                'datetime': record['created_time'],
                'likes': 0,
                'comments': 0,
                'from': "",
                'fromid': 0,
                'type': "",
                'linkname': "",
                'link': "",
                'img': "",
                'desc': "",
            }
            print record['id']
            print record['created_time']
            if "likes" in record and "count" in record['likes']:
                print 'LIKES\t', record["likes"]["count"]
                rec['likes'] = record['likes']['count']
            if "comments" in record and "count" in record['comments']:
                print 'COMMENTS\t', record["comments"]["count"]
                rec['comments'] = record['comments']['count']
            if "from" in record:
                if "name" in record["from"]:
                    print 'FROM\t', record['from']['name']
                    rec['from'] = record['from']['name']
                    rec['fromid'] = record['from']['id']
            if record['type'] == "photo":
                # Swap the small thumbnail for the original-size image.
                if "picture" in record and record['picture'][-6:] == "_s.jpg":
                    pic = record['picture'].replace("_s.jpg", "_o.jpg")
                    rec['link'] = pic
                    print pic
            if "link" in record:
                print 'LINK\t', record['link']
                rec['link'] = record['link']
                rec['linkname'] = record['name']
            if "description" in record:
                print 'DESC\t', repr(record['description'])
                rec['desc'] = record['description']
            # if "source" in record:
            #     print 'SOURCE\t', record['source']
            # if "message" in record:
            #     # if "http" in record['message']:
            #     print 'MESSAGE\t', repr(record['message'])
            recs.append(rec)
            print
        return recs

    def deep_report(self, fb):
        # Like report(), but follow up to two "next" paging links.
        print self.name, "DEEP REPORTING", fb.name
        feed = fb.get_feed(self.groupid)
        recs = self.report_feed(feed)
        page = 0
        for i in xrange(2):
            page += 1
            if "data" in feed and "paging" in feed and "next" in feed['paging']:
                print "SCRAPING page", page
                feed = fb.get_feed_page(feed['paging']['next'])
                try:
                    newrecs = self.report_feed(feed)
                    recs.extend(newrecs)
                except:
                    continue
            else:
                print "done"
                break
        return recs

    def deep_scrape(self, fb):
        # First-run scrape: walk back through up to ten pages of the feed.
        print self.name, "DEEP SCRAPING", fb.name
        feed = fb.get_feed(self.groupid)
        self.scrape_feed(fb, feed)
        page = 0
        for i in xrange(10):
            page += 1
            if "data" in feed and "paging" in feed and "next" in feed['paging']:
                print "SCRAPING page", page
                feed = fb.get_feed_page(feed['paging']['next'])
                try:
                    self.scrape_feed(fb, feed)
                except:
                    continue
            else:
                print "done"
                return

    def scrape(self, fb):
        # Normal polling pass: just the first page of the feed.
        try:
            print self.name, "SCRAPING", fb.name
            feed = fb.get_feed(self.groupid)
            self.scrape_feed(fb, feed)
        except:
            print self.name, "ERROR DURING SCRAPE"

    def scrape_feed(self, fb, feed):
        # Pull image and video URLs out of one page of a group feed.
        for record in feed['data']:
            if record['type'] == "photo":
                if "picture" in record and record['picture'][-6:] == "_s.jpg":
                    # Swap the small thumbnail for the original-size image.
                    pic = record['picture'].replace("_s.jpg", "_o.jpg")
                    self.remember(pic)
            if "link" in record:
                link = record['link']
                if "facebook.com" in link:
                    continue
                if is_video(link):
                    self.remember(link)
                else:
                    # Non-video external link: scrape the page itself, once.
                    if link in self.dupes:
                        continue
                    self.dupes[link] = True
                    try:
                        print "SCRAPING", link
                        urls_from_link = tumblr.scrape(link)
                        # URLs found on the linked page are currently discarded;
                        # remembering them is left commented out.
                        for url in urls_from_link:
                            if is_video(link):
                                # self.remember(url)
                                continue
                    except:
                        print "error scraping link"

    def remember(self, url):
        # Queue a URL for posting unless we have already seen it.
        if url in self.dupes:
            return
        self.dupes[url] = True
        self.urls.append(url)

    def load_dupes(self):
        if not os.path.exists(self.dupe_list):
            self.first_time = True
            return
        dupelist = open(self.dupe_list, 'r')
        for line in dupelist:
            self.dupes[line.strip()] = True
        dupelist.close()

    def save_dupes(self):
        # Write the dupe list to a temp file first, then atomically swap it in.
        dupelist = codecs.open(self.dupe_list + ".tmp", 'w', "utf-8")
        for k, v in self.dupes.iteritems():
            try:
                dupelist.write(k.encode('utf-8', 'replace').strip() + "\n")
            except UnicodeDecodeError:
                continue
        dupelist.close()
        os.rename(self.dupe_list + ".tmp", self.dupe_list)


def load_groups():
    # Each non-comment line of GROUPS_FILE is tab-separated:
    # name, groupid, userid, username, room (see the sketch at the end of this file).
    print "LOADING GROUPS"
    groups = []
    f = open(GROUPS_FILE, "r")
    for line in f.readlines():
        if line[0] == "#" or len(line) < 2:
            continue
        name, groupid, userid, username, room = line.strip().split("\t")
        group = FacebookGroup(name, groupid, userid, username, room)
        groups.append(group)
    f.close()
    return groups


def poll_forever(groups):
    # Scrape every group, post what was found, then sleep until the next pass.
    while True:
        for group in groups:
            print "scraping", group.name
            if group.first_time:
                group.deep_scrape(pepper)
                group.deep_scrape(jono)
                # group.deep_scrape(ryz)
            else:
                group.scrape(pepper)
                group.scrape(jono)
                # group.scrape(ryz)
            group.feed()
            time.sleep(1)
        time.sleep(POLL_INTERVAL)


if __name__ == '__main__':
    groups = load_groups()
    poll_forever(groups)
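
# A sketch of the line format load_groups() expects in fb/groups.txt, based only
# on the tab-separated fields parsed above (name, groupid, userid, username, room).
# The values below are hypothetical placeholders, not real group or user IDs:
#
#   catpics<TAB>123456789012345<TAB>100000000000001<TAB>catbot<TAB>lobby
#
# Lines starting with "#" and lines shorter than two characters are skipped.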