Diffstat (limited to 'feeder/fb_feed.py')
-rwxr-xr-x  feeder/fb_feed.py  245
1 files changed, 245 insertions, 0 deletions
diff --git a/feeder/fb_feed.py b/feeder/fb_feed.py
new file mode 100755
index 0000000..2677256
--- /dev/null
+++ b/feeder/fb_feed.py
@@ -0,0 +1,245 @@
+#!/usr/bin/python
+
+import facebook
+import scanjam
+import scraper
+import sys
+import time
+import os
+import codecs
+
+POLL_INTERVAL = 60 * 60
+GROUPS_FILE = "fb/groups.txt"
+
+tumblr = scraper.Scraper ()
+# ryz = facebook.Facebook ("ryz")
+jono = facebook.Facebook ("daytimetelevision")
+pepper = facebook.Facebook ("pepper")
+
+def is_image (url):
+    return url[-3:].lower() in ["gif", "jpg", "png"] or url[-4:].lower() == "jpeg"
+def is_video (url):
+    if url[0:4] != "http":
+        return False
+    if "youtube.com" in url:
+        return True
+    if "youtu.be" in url:
+        return True
+    if "soundcloud.com" in url:
+        return True
+    if "vimeo.com" in url:
+        return True
+    return False
+
+class FacebookGroup:
+    def __init__ (self, name, groupid, userid, username, room):
+        print "loaded", name
+        self.name = name
+        self.groupid = groupid
+        self.scanjam = scanjam.ScanJam (userid, username, room)
+        self.first_time = False
+        self.dupe_list = "fb/feeds/" + name + "_dupes.txt"
+        self.dupes = {}
+        self.urls = []
+        self.load_dupes ()
+    def feed (self):
+        if len(self.urls) < 1:
+            print self.name, "nothing to do"
+            return
+        print self.name, "got", len(self.urls), "urls"
+        for url in reversed(self.urls):
+            print url
+            if is_image(url):
+                self.scanjam.image (url)
+            else:
+                self.scanjam.video (url)
+        self.save_dupes ()
+        self.urls = []
+
+
+    def check (self, fb):
+        print self.name, "FETCHING"
+        feed = fb.get_post (self.groupid)
+        print repr(feed)
+        feed = fb.get_feed (self.groupid)
+        print repr(feed)
+    def report (self, fb):
+        print self.name, "FETCHING"
+        feed = fb.get_feed (self.groupid)
+        return self.report_feed (feed)
+    def parse_date (self, date):
+        return time.mktime(time.strptime(date, "%Y-%m-%dT%H:%M:%S+0000"))
+    def report_feed (self, feed):
+        recs = []
+        for record in feed['data']:
+            # print repr(record)
+            rec = {
+                'fbid': record['id'],
+                'date': self.parse_date(record['created_time']),
+                'datetime': record['created_time'],
+                'likes': 0,
+                'comments': 0,
+                'from': "",
+                'fromid': 0,
+                'type': "",
+                'linkname': "",
+                'link': "",
+                'img': "",
+                'desc': "",
+            }
+            print record['id']
+            print record['created_time']
+            if "likes" in record and "count" in record['likes']:
+                print 'LIKES\t', record["likes"]["count"]
+                rec['likes'] = record['likes']['count']
+            if "comments" in record and "count" in record['comments']:
+                print 'COMMENTS\t', record["comments"]["count"]
+                rec['comments'] = record['comments']['count']
+            if "from" in record:
+                if "name" in record["from"]:
+                    print 'FROM\t', record['from']['name']
+                    rec['from'] = record['from']['name']
+                    rec['fromid'] = record['from']['id']
+            if record['type'] == "photo":
+                if "picture" in record and record['picture'][-6:] == "_s.jpg":
+                    pic = record['picture'].replace("_s.jpg", "_o.jpg")
+                    rec['link'] = pic
+                    print pic
+            if "link" in record:
+                print 'LINK\t', record['link']
+                rec['link'] = record['link']
+                rec['linkname'] = record['name']
+            if "description" in record:
+                print 'DESC\t', repr( record['description'] )
+                rec['desc'] = record['description']
+            #if "source" in record:
+            #    print 'SOURCE\t', record['source']
+            #if "message" in record:
+            #    # if "http" in record['message']:
+            #    print 'MESSAGE\t', repr( record['message'] )
+            recs.append(rec)
+            print
+        return recs
+
+    def deep_report (self, fb):
+        print self.name, "DEEP REPORTING", fb.name
+        feed = fb.get_feed (self.groupid)
+        recs = self.report_feed(feed)
+        page = 0
+        for i in xrange(2):
+            page += 1
+            if "data" in feed and "paging" in feed and "next" in feed['paging']:
+                print "SCRAPING page", page
+                feed = fb.get_feed_page (feed['paging']['next'])
+                try:
+                    newrecs = self.report_feed(feed)
+                    recs.extend(newrecs)
+                except:
+                    continue
+            else:
+                print "done"
+                break
+        return recs
+    def deep_scrape (self, fb):
+        print self.name, "DEEP SCRAPING", fb.name
+        feed = fb.get_feed (self.groupid)
+        self.scrape_feed(fb, feed)
+        page = 0
+        for i in xrange(10):
+            page += 1
+            if "data" in feed and "paging" in feed and "next" in feed['paging']:
+                print "SCRAPING page", page
+                feed = fb.get_feed_page (feed['paging']['next'])
+                try:
+                    self.scrape_feed(fb, feed)
+                except:
+                    continue
+            else:
+                print "done"
+                return
+    def scrape (self, fb):
+        try:
+            print self.name, "SCRAPING", fb.name
+            feed = fb.get_feed (self.groupid)
+            self.scrape_feed(fb, feed)
+        except:
+            print self.name, "ERROR DURING SCRAPE"
+    def scrape_feed (self, fb, feed):
+        for record in feed['data']:
+            if record['type'] == "photo":
+                if "picture" in record and record['picture'][-6:] == "_s.jpg":
+                    pic = record['picture'].replace("_s.jpg", "_o.jpg")
+                    self.remember(pic)
+            if "link" in record:
+                link = record['link']
+                if "facebook.com" in link:
+                    continue
+                if is_video(link):
+                    self.remember(link)
+                else:
+                    if link in self.dupes:
+                        continue
+                    self.dupes[link] = True
+                    try:
+                        print "SCRAPING", link
+                        urls_from_link = tumblr.scrape(link)
+                        for url in urls_from_link:
+                            if is_video(url):
+                                # self.remember(url)
+                                continue
+                    except:
+                        print "error scraping link"
+    def remember (self, url):
+        if url in self.dupes:
+            return
+        self.dupes[url] = True
+        self.urls.append(url)
+    def load_dupes (self):
+        if not os.path.exists(self.dupe_list):
+            self.first_time = True
+            return
+        dupelist = open (self.dupe_list, 'r')
+        for line in dupelist:
+            self.dupes[line.strip()] = True
+        dupelist.close()
+    def save_dupes (self):
+        dupelist = codecs.open (self.dupe_list+".tmp", 'w', "utf-8")
+        for k,v in self.dupes.iteritems():
+            try:
+                dupelist.write(k.encode('utf-8', 'replace').strip()+"\n")
+            except (UnicodeDecodeError):
+                continue
+        dupelist.close()
+        os.rename(self.dupe_list+".tmp", self.dupe_list)
+def load_groups ():
+    print "LOADING GROUPS"
+    groups = []
+    f = file (GROUPS_FILE, "r")
+    for line in f.readlines():
+        if line[0] == "#" or len(line) < 2:
+            continue
+        name,groupid,userid,username,room = line.strip().split("\t")
+        group = FacebookGroup (name,groupid,userid,username,room)
+        groups.append(group)
+    f.close()
+    return groups
+def poll_forever (groups):
+    while (1):
+        for group in groups:
+            print "scraping", group.name
+            if group.first_time:
+                group.deep_scrape (pepper)
+                group.deep_scrape (jono)
+                # group.deep_scrape (ryz)
+            else:
+                group.scrape (pepper)
+                group.scrape (jono)
+                # group.scrape (ryz)
+            group.feed ()
+            time.sleep (1)
+        time.sleep(POLL_INTERVAL)
+
+if __name__ == '__main__':
+    groups = load_groups ()
+    poll_forever (groups)
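Note on the config file: load_groups() reads GROUPS_FILE (fb/groups.txt) as one tab-separated entry per line with five fields (name, groupid, userid, username, room) and skips lines starting with "#". The snippet below is only a sketch of that parsing; the sample values are made up for illustration, and real ids would come from Facebook and ScanJam.

    # Hypothetical fb/groups.txt entry: name, groupid, userid, username, room (tab-separated).
    sample = "beats\t123456789\t42\texamplebot\tmainroom"
    name, groupid, userid, username, room = sample.strip().split("\t")
    print name, groupid, room   # -> beats 123456789 mainroom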
