# this needs python 3 import re import sys import postgresql from urllib.parse import urlparse db = postgresql.open("pq://postgres:root@localhost/dumpfm") db.execute("SET CLIENT_ENCODING to 'UNICODE'") def get_num_images_in_db(): ps = db.prepare("SELECT COUNT(*) FROM image_urls") return int(ps()[0][0]) def add_url(url): try: print(url) except UnicodeEncodeError: print("i thought python 3 fixed the unicode shit. yet i still get unicode errors everywhere. GOOD JOB FUCKHEADS") ps = db.prepare("DELETE FROM image_urls WHERE url=$1") ps(url) ps2 = db.prepare("INSERT INTO image_urls(url) VALUES($1)") ps2(url) # NOTE. hardcoded room numbers to index here... only indexing DUMPFM (1) and GIF (8) currently. def get_messages(lower, upper): ps = db.prepare("SELECT content FROM messages WHERE message_id >= $1 AND message_id <= $2 AND room_id IN (1,8) ORDER BY message_id ASC") rows = ps(lower, upper) return rows def is_url_an_image(url): image_types = {"jpg", "bmp", "gif", "png"} url = urlparse(url) filetype = url.path[-3:].lower() return filetype in image_types def get_images_from_messages(message): images = [] if message[0:6] == "": # skip html messages return images tokens = message.split(" ") for token in tokens: if token[0:7] == "http://" and is_url_an_image(token): images.append(token) return images def get_urls_from_messages(messages): urls = [] for message in messages: urls.extend(get_images_from_messages(message[0])) return urls # this does 3 things... # convert 'http://dumpfm.s3.amazonaws.com' to 'http://dump.fm' # drops 'http://' from urls # drops 'dump.fm/images' from urls # the client is expected to rebuild urls based on this heuristic: # if the url starts with '/', prepend 'http://dump.fm' # otherwise, prepend 'http://' def make_url_smaller(url): if url[:37] == 'http://dumpfm.s3.amazonaws.com/images': url = 'http://dump.fm/images' + url[37:] if url[:21] == 'http://dump.fm/images': url = url[21:] else: url = url[7:] return url if __name__ == "__main__": if not len(sys.argv) == 3: print('usage: fill.image_urls.py message_id_start message_id_end') sys.exit(1) lower = int(sys.argv[1]) upper = int(sys.argv[2]) num_existing_images = get_num_images_in_db() messages = get_messages(lower, upper) urls = get_urls_from_messages(messages) for url in urls: add_url(make_url_smaller(url)) print("added ", len(urls), " images to db") num_new_images = get_num_images_in_db() - num_existing_images percent_new_images = num_new_images / len(urls) * 100 print(num_new_images, " were new (", percent_new_images ,"%)")