diff options
Diffstat (limited to 'scripts')
| -rw-r--r-- | scripts/fill.image_altars.py | 105 |
1 files changed, 105 insertions, 0 deletions
# this needs python 3
#
# Scans chat messages for "image altars" and records the ids of the
# messages that qualify in the image_altars table.

import re
import sys
from urllib.parse import urlparse

import postgresql

# NOTE(review): the connection string (including credentials) is hard-coded;
# consider moving it to configuration before running this anywhere shared.
db = postgresql.open("pq://postgres:root@localhost/dumpfm")
db.execute("SET CLIENT_ENCODING to 'UNICODE'")


def get_highest_message_id_in_db():
    """Return the largest message_id stored in image_altars, or 0 if empty."""
    ps = db.prepare("SELECT message_id FROM image_altars ORDER BY message_id DESC LIMIT 1")
    rows = ps()
    if not rows:
        # Nothing has been indexed yet.
        return 0
    return int(rows[0][0])


def add_altar(message_id, user_id, content):
    """Record a message as an image altar.

    ``content`` is only echoed to stdout for progress reporting; it is not
    stored.  Returns True when a row was inserted and False when the
    message was already present (unique constraint violation).
    """
    try:
        print(message_id, content)
    except UnicodeEncodeError:
        # stdout's encoding cannot represent the message; fall back to an
        # ASCII-escaped rendering so the progress line is still useful.
        print(message_id, ascii(content))
    ps = db.prepare("INSERT INTO image_altars(message_id, user_id) VALUES($1, $2)")
    try:
        ps(message_id, user_id)
    except postgresql.exceptions.UniqueError:
        print("skipped adding a dupe")
        return False
    return True


# NOTE. hardcoded room numbers to index here... only indexing DUMPFM (1) and GIF (8) currently.
def get_messages(lower, upper):
    """Fetch messages with lower <= message_id <= upper from rooms 1 and 8.

    Returns rows of (message_id, user_id, content) ordered by message_id.
    The room ids are hard-coded in the query.
    """
    ps = db.prepare("SELECT message_id, user_id, content FROM messages WHERE message_id >= $1 AND message_id <= $2 AND room_id IN (1,8) ORDER BY message_id ASC")
    return ps(lower, upper)


# File extensions (with the dot) that count as images.
_IMAGE_EXTENSIONS = (".jpg", ".bmp", ".gif", ".png")


def is_url_an_image(url):
    """Return True if the URL's path ends in a known image extension.

    Only the path component is inspected, so query strings are ignored.
    The dot is required: a path such as ``/foogif`` is not an image.
    """
    path = urlparse(url).path.lower()
    return path.endswith(_IMAGE_EXTENSIONS)


def is_altar(content):
    """Return True if a message is an image altar.

    An altar is an odd number (at least three) of space-separated
    ``http://`` image URLs that read the same forwards and backwards,
    where the middle image does not appear anywhere else (e.g. ``a b a``
    or ``a b c b a``).
    """
    if not content:  # NULL / empty messages can never be altars
        return False
    if content.startswith("<safe>"):  # skip html messages
        return False

    tokens = content.split(" ")
    if is_bad_sized_array(tokens):  # no even sized arrays
        return False

    for token in tokens:  # everything must be an image
        if not token.startswith("http://"):
            return False
        if not is_url_an_image(token):
            return False

    mid = len(tokens) // 2
    left = tokens[:mid]
    right_mirrored = tokens[mid + 1:][::-1]
    if left != right_mirrored:  # must be symmetric
        return False
    # Middle image must be unique; by symmetry checking one half suffices.
    return tokens[mid] not in left


def process_messages(messages):
    """Store every altar found in ``messages`` and return how many were found.

    Each message is a (message_id, user_id, content) row.  A progress line
    is printed every 1000 messages.
    """
    num_added = 0
    for processed, message in enumerate(messages):
        if processed % 1000 == 0:
            print(processed, " processed so far")
        if is_altar(message[2]):
            add_altar(message[0], message[1], message[2])
            num_added += 1
    return num_added


def get_urls_from_messages(messages):
    """Collect every ``http://`` image URL mentioned in the given messages.

    Each message is a (message_id, user_id, content) row; the content is
    split on spaces and tokens that look like image URLs are kept, in
    order of appearance.
    """
    urls = []
    for message in messages:
        content = message[2]
        if not content:
            continue
        urls.extend(
            token
            for token in content.split(" ")
            if token.startswith("http://") and is_url_an_image(token)
        )
    return urls


# image altars look like aba or abcba but not a or abba
def is_bad_sized_array(a):
    """Return True if ``a`` has an even length or fewer than three items."""
    return len(a) % 2 == 0 or len(a) < 3


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print('usage: fill.image_altars.py message_id_end')
        sys.exit(1)

    try:
        upper = int(sys.argv[1])
    except ValueError:
        print('usage: fill.image_altars.py message_id_end')
        sys.exit(1)

    highest = get_highest_message_id_in_db()
    # The highest stored message was already indexed; resume just past it so
    # it is not re-inserted (and re-counted) as a duplicate on every run.
    lower = highest + 1 if highest > 0 else 0
    messages = get_messages(lower, upper)
    num_added = process_messages(messages)
    print("added ", num_added, " altars to db")
