# this needs python 3
"""Scan chat messages for "image altars" and record them in image_altars.

An image altar is a message made only of image URLs arranged as a
palindrome of odd length (aba, abcba, ...) whose middle image does not
also appear on the sides.

Usage: fill.image_altars.py message_id_end
"""

import re
import sys
from urllib.parse import urlparse

import postgresql

# NOTE(review): connection string (including credentials) is hard-coded and the
# connection is opened at import time; consider moving both to configuration.
db = postgresql.open("pq://postgres:root@localhost/dumpfm")
db.execute("SET CLIENT_ENCODING to 'UNICODE'")


def get_highest_message_id_in_db():
    """Return the largest message_id stored in image_altars, or 0 if empty."""
    ps = db.prepare(
        "SELECT message_id FROM image_altars ORDER BY message_id DESC LIMIT 1"
    )
    try:
        highest = int(ps()[0][0])
    except IndexError:
        # No rows yet: start from the beginning.
        highest = 0
    return highest


def add_altar(message_id, user_id, content):
    """Insert one altar row and log it.

    `content` is only used for the log line; it is not stored.

    Returns True when a row was inserted, False when the message was
    already present (unique violation) and the insert was skipped.
    """
    try:
        print(message_id, content)
    except UnicodeEncodeError:
        # stdout cannot encode this text; log an ASCII-escaped form instead
        # of losing the line.
        print(message_id, ascii(content))

    ps = db.prepare("INSERT INTO image_altars(message_id, user_id) VALUES($1, $2)")
    try:
        ps(message_id, user_id)
    except postgresql.exceptions.UniqueError:
        print("skipped adding a dupe")
        return False
    return True


# NOTE. hardcoded room numbers to index here... only indexing DUMPFM (1) and GIF (8) currently.
def get_messages():
    """Return the prepared statement's `chunks` callable.

    Call the result as chunks(lower, upper) to iterate over row sets of
    (message_id, user_id, content) with lower <= message_id <= upper,
    ordered by message_id ascending.
    """
    ps = db.prepare(
        "SELECT message_id, user_id, content FROM messages WHERE message_id >= $1 AND message_id <= $2 AND room_id IN (1,8) ORDER BY message_id ASC"
    )
    return ps.chunks


def is_url_an_image(url):
    """Return True if the URL path ends in one of jpg/bmp/gif/png.

    The check is case-insensitive and looks only at the last three
    characters of the path.
    """
    # NOTE(review): no "." is required before the extension and 4-letter
    # extensions such as "jpeg" never match -- confirm this is intended.
    image_types = {"jpg", "bmp", "gif", "png"}
    url = urlparse(url)
    filetype = url.path[-3:].lower()
    return filetype in image_types


def is_altar(content):
    """Return True if `content` is an image altar.

    Requirements, checked in order:
      * an odd number (>= 3) of space-separated tokens,
      * every token starts with "http://" and is an image URL,
      * the token list reads the same forwards and backwards,
      * the middle token does not also appear on either side.
    """
    # NOTE(review): comparing a 6-character prefix against "" only matches
    # empty content, so this "skip html messages" guard is effectively dead.
    # The intended prefix literal appears to have been lost -- restore it if
    # known.
    if content[0:6] == "":  # skip html messages
        return False

    tokens = content.split(" ")
    if is_bad_sized_array(tokens):  # no even sized arrays
        return False

    # everything must be an image
    for token in tokens:
        if not token.startswith("http://"):
            return False
        if not is_url_an_image(token):
            return False

    # Length is odd here, so integer division gives the middle index.
    half = len(tokens) // 2
    middle_image = tokens[half]
    for i in range(half):
        if tokens[i] != tokens[-1 - i]:  # must be symmetric
            return False
        if tokens[i] == middle_image:  # middle image must be unique
            return False
    return True


def process_messages(chunks, lower, upper):
    """Scan messages in [lower, upper] and store every altar found.

    `chunks` is a callable returning an iterable of row sets, each row
    being (message_id, user_id, content).

    Returns the number of altars actually inserted; altars that were
    already stored (skipped as duplicates) are not counted.
    """
    num_added = 0
    processed = 0
    for rowset in chunks(lower, upper):
        for message in rowset:
            if processed % 1000 == 0:
                print(processed, " processed so far")
            processed += 1
            if is_altar(message[2]):
                # The lower bound is inclusive, so the highest stored message
                # is seen again on every run; count only real inserts.
                if add_altar(message[0], message[1], message[2]):
                    num_added += 1
    return num_added


def get_urls_from_messages(messages):
    """Collect the image URLs from the first column of every message row."""
    # NOTE(review): get_images_from_messages is not defined in the visible
    # source; unless it is provided elsewhere this raises NameError when called.
    urls = []
    for message in messages:
        urls.extend(get_images_from_messages(message[0]))
    return urls


# image altars look like aba or abcba but not a or abba
def is_bad_sized_array(a):
    """Return True when `a` has an even length or fewer than 3 items."""
    return len(a) % 2 == 0 or len(a) < 3


def main():
    """Index altars from the highest stored message_id up to argv[1]."""
    if len(sys.argv) != 2:
        print('usage: fill.image_altars.py message_id_end')
        sys.exit(1)

    upper = int(sys.argv[1])
    highest = get_highest_message_id_in_db()
    chunks = get_messages()
    num_added = process_messages(chunks, highest, upper)
    print("added ", num_added, " altars to db")


if __name__ == "__main__":
    main()