# this needs python 3 import sys import postgresql from urllib.parse import urlparse db = postgresql.open("pq://postgres:root@localhost/dumpfm") db.execute("SET CLIENT_ENCODING to 'UNICODE'") def get_num_images_in_db(): ps = db.prepare("SELECT COUNT(*) FROM image_urls") return int(ps()[0][0]) def add_url(url): print(url) ps = db.prepare("DELETE FROM image_urls WHERE url=$1") ps(url) ps2 = db.prepare("INSERT INTO image_urls(url) VALUES($1)") ps2(url) def get_messages(lower, upper): ps = db.prepare("SELECT content FROM messages WHERE message_id >= $1 AND message_id <= $2 ORDER BY message_id ASC") rows = ps(lower, upper) return rows def is_url_an_image(url): image_types = {"jpg", "bmp", "gif", "png"} url = urlparse(url) filetype = url.path[-3:].lower() return filetype in image_types def get_images_from_messages(message): images = [] if message[0:6] == "": # skip html messages return images tokens = message.split(" ") for token in tokens: if token[0:7] == "http://" and is_url_an_image(token): images.append(token) return images def get_urls_from_messages(messages): urls = [] for message in messages: urls.extend(get_images_from_messages(message[0])) return urls if __name__ == "__main__": if not len(sys.argv) == 3: print('usage: fill.image_urls.py message_id_start message_id_end') sys.exit(1) lower = int(sys.argv[1]) upper = int(sys.argv[2]) num_existing_images = get_num_images_in_db() messages = get_messages(lower, upper) urls = get_urls_from_messages(messages) for url in urls: add_url(url) print("added ", len(urls), " images to db") num_new_images = get_num_images_in_db() - num_existing_images percent_new_images = num_new_images / len(urls) * 100 print(num_new_images, " were new (", percent_new_images ,"%)")