# Fills the image_altars table from dumpfm chat messages.
# Requires Python 3.
import re
import sys
import postgresql
from urllib.parse import urlparse
# Module-level connection used by the query helpers below. The credentials,
# host and database name are hard-coded in the connection URI.
db = postgresql.open("pq://postgres:root@localhost/dumpfm")
# Set the client encoding for this session to Unicode.
db.execute("SET CLIENT_ENCODING to 'UNICODE'")
def get_highest_message_id_in_db():
    """Return the largest message_id already stored in image_altars.

    Returns:
        The highest message_id as an int, or 0 when the lookup raises
        IndexError (presumably because the table has no rows yet).
    """
    newest_first = db.prepare(
        "SELECT message_id FROM image_altars ORDER BY message_id DESC LIMIT 1"
    )
    try:
        # First column of the first row; indexing fails when there is no row.
        return int(newest_first()[0][0])
    except IndexError:
        return 0
def add_altar(message_id, user_id, content):
    """Echo an altar message to stdout and insert it into image_altars.

    Args:
        message_id: id of the message; stored in the table.
        user_id: id of the posting user; stored in the table.
        content: message text; only printed, never stored.

    A row that violates a uniqueness constraint is reported on stdout and
    skipped instead of raising. Nothing is returned.
    """
    # The echo is best-effort: a console that cannot encode the text must
    # not prevent the insert below.
    try:
        print(message_id, content)
    except UnicodeEncodeError:
        print("i thought python 3 fixed the unicode shit. yet i still get unicode errors everywhere. GOOD JOB FUCKHEADS")

    insert_altar = db.prepare(
        "INSERT INTO image_altars(message_id, user_id) VALUES($1, $2)"
    )
    try:
        insert_altar(message_id, user_id)
    except postgresql.exceptions.UniqueError:
        print("skipped adding a dupe")
# NOTE. hardcoded room numbers to index here... only indexing DUMPFM (1) and GIF (8) currently.
def get_messages():
    """Prepare the message-range query and return its ``chunks`` attribute.

    The query takes two parameters, the inclusive lower and upper
    message_id bounds, and is restricted to room_id 1 and 8. Rows are
    ordered by ascending message_id and carry message_id, user_id and
    content, in that order.

    Returns:
        The prepared statement's ``chunks`` attribute. It is not called
        here; the caller invokes it with the two bounds.
    """
    range_query = db.prepare(
        "SELECT message_id, user_id, content FROM messages "
        "WHERE message_id >= $1 AND message_id <= $2 AND room_id IN (1,8) "
        "ORDER BY message_id ASC"
    )
    return range_query.chunks
def is_url_an_image(url):
    """Return True when the URL's path ends in a known image extension.

    Only the path component is inspected, so a query string or fragment
    after the file name does not affect the result. Matching is
    case-insensitive.

    Args:
        url: the URL to test, as a string.

    Returns:
        True if the text after the last "." in the path is one of
        jpg, jpeg, bmp, gif or png; False otherwise, including when the
        path contains no "." at all.
    """
    image_types = {"jpg", "jpeg", "bmp", "gif", "png"}
    path = urlparse(url).path
    # Split on the last dot instead of slicing a fixed three characters:
    # this accepts four-letter extensions such as "jpeg" and rejects paths
    # that merely end in the letters of an extension (e.g. "/notajpg").
    _, dot, extension = path.rpartition(".")
    return bool(dot) and extension.lower() in image_types
def is_altar(content):
    """Return True when a message's content forms an "image altar".

    An altar is a space-separated sequence of image URLs that is
    symmetric around a single centre image, such as ``a b a`` or
    ``a b c b a``. The centre image must not appear anywhere else.

    Args:
        content: the message text.

    Returns:
        True if the content is an altar, False otherwise.
    """
    # Messages starting with "<safe>" are HTML and are never altars.
    if content.startswith("<safe>"):
        return False

    tokens = content.split(" ")
    # Only odd lengths of three or more have a single centre element.
    if is_bad_sized_array(tokens):
        return False

    # Every token has to be an http:// link that points at an image.
    for token in tokens:
        if not (token.startswith("http://") and is_url_an_image(token)):
            return False

    # The length is odd here, so integer division gives the centre index.
    last = len(tokens) - 1
    half = last // 2
    centre = tokens[half]
    for i in range(half):
        # Each token must equal its mirror image and differ from the centre;
        # symmetry means checking the left half covers the right half too.
        if tokens[i] != tokens[last - i] or tokens[i] == centre:
            return False
    return True
def process_messages(chunks, lower, upper):
    """Scan a range of messages and store every altar that is found.

    Args:
        chunks: callable invoked as ``chunks(lower, upper)``; it must
            yield batches of rows, each row indexable as
            (message_id, user_id, content).
        lower: first argument passed to ``chunks``.
        upper: second argument passed to ``chunks``.

    Returns:
        The number of rows whose content passed is_altar. The outcome of
        add_altar is not checked, so rows it skips are counted as well.
    """
    rows = (row for batch in chunks(lower, upper) for row in batch)
    altars_found = 0
    for seen, row in enumerate(rows):
        # Progress report before every 1000th row, starting with the first.
        if seen % 1000 == 0:
            print(seen, " processed so far")
        content = row[2]
        if is_altar(content):
            add_altar(row[0], row[1], content)
            altars_found += 1
    return altars_found
def get_urls_from_messages(messages):
    """Flatten get_images_from_messages() results into a single list.

    The helper is called with the first element of each message, and
    everything it yields is collected in order.

    NOTE(review): get_images_from_messages is not defined in the part of
    the file visible here - confirm it exists before calling this function.

    Args:
        messages: iterable of indexable message rows.

    Returns:
        A list with the helper's results for all messages, concatenated.
    """
    return [
        url
        for message in messages
        for url in get_images_from_messages(message[0])
    ]
# image altars look like aba or abcba but not a or abba
def is_bad_sized_array(a):
    """Return True when ``a`` cannot have the shape of an image altar.

    An altar needs one centre element with at least one element on each
    side, so only odd lengths of three or more are acceptable.

    Args:
        a: any sized collection.

    Returns:
        True if the length is even or smaller than 3, False otherwise.
    """
    size = len(a)
    return size < 3 or size % 2 == 0
if __name__ == "__main__":
    # Exactly one argument is expected: the last message_id to scan.
    usage = 'usage: fill.image_altars.py message_id_end'
    if len(sys.argv) != 2:
        print(usage)
        sys.exit(1)
    try:
        upper = int(sys.argv[1])
    except ValueError:
        # A non-numeric bound gets the usage hint instead of a traceback.
        print(usage)
        sys.exit(1)

    # Resume from the newest message already recorded. The range query is
    # inclusive at both ends, so that message is scanned again and
    # add_altar reports it as a duplicate.
    highest = get_highest_message_id_in_db()
    chunks = get_messages()
    num_added = process_messages(chunks, highest, upper)
    print("added ", num_added, " altars to db")