path: root/scripts/fill.image_altars.py
# this needs python 3

import sys
import postgresql
from urllib.parse import urlparse

db = postgresql.open("pq://postgres:root@localhost/dumpfm")
db.execute("SET CLIENT_ENCODING to 'UNICODE'")

def get_highest_message_id_in_db():
  # id of the newest message already indexed, or 0 if image_altars is empty
  ps = db.prepare("SELECT message_id FROM image_altars ORDER BY message_id DESC LIMIT 1")
  try:
    highest = int(ps()[0][0])
  except IndexError:
    highest = 0
  return highest

def add_altar(message_id, user_id, content):
  # the content is only printed for logging; the table itself stores just the ids
  try:
    print(message_id, content)
  except UnicodeEncodeError:
    print(message_id, "(content could not be printed with this terminal's encoding)")
  ps = db.prepare("INSERT INTO image_altars(message_id, user_id) VALUES($1, $2)")
  try:
    ps(message_id, user_id)
  except postgresql.exceptions.UniqueError:
    print("skipped adding a dupe")

# NOTE: room ids are hardcoded below; only DUMPFM (1) and GIF (8) are indexed currently.
def get_messages():
  # returns the prepared statement's chunks callable; call it as chunks(lower, upper)
  # to iterate the matching rows in batches (see process_messages below)
  ps = db.prepare("SELECT message_id, user_id, content FROM messages WHERE message_id >= $1 AND message_id <= $2 AND room_id IN (1,8) ORDER BY message_id ASC")
  return ps.chunks

def is_url_an_image(url):
  image_types = {"jpg", "bmp", "gif", "png"}
  url = urlparse(url)
  filetype = url.path[-3:].lower()
  return filetype in image_types
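# e.g. is_url_an_image("http://example.com/pic.jpg") -> True  (hypothetical URL)
# note: only the last three characters of the path are checked, so a ".jpeg"
# extension would not be recognized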

def is_altar(content):
  if content[0:6] == "<safe>": # skip html messages
    return False
  tokens = content.split(" ")
  if is_bad_sized_array(tokens): # must be an odd number of tokens, at least 3
#    print("array not oddly sized")
    return False
  for token in tokens: # everything must be an image
    if token[0:7] != "http://":
#      print("contains stuff thats not urls")
      return False
    elif not is_url_an_image(token):
#      print("contains stuff thats not images")
      return False
  middleImage = tokens[int((len(tokens)-1)/2)]
  i = 0
  while i < (len(tokens)-1)/2:
    if tokens[i] != tokens[len(tokens) - 1 - i]: # must be symmetric
#      print("not symmetric")
      return False
    if tokens[i] == middleImage: # middle image must be unique
#      print("middle image not unique")
      return False
    i += 1
  return True
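# a minimal illustration of what counts as an altar (hypothetical URLs):
#   is_altar("http://x/a.gif http://x/b.gif http://x/a.gif")  -> True   (aba)
#   is_altar("http://x/a.gif http://x/a.gif http://x/a.gif")  -> False  (middle image not unique)
#   is_altar("http://x/a.gif http://x/b.gif")                 -> False  (even number of images)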

def process_messages(chunks, lower, upper):
  num_added = 0
  processed = 0
  for rowset in chunks(lower, upper):
    for message in rowset:
      if processed % 1000 == 0:
        print(processed, "processed so far")
      processed += 1  # count every message, not just the ones that hit the print
      if is_altar(message[2]):
        add_altar(message[0], message[1], message[2])
        num_added += 1
  return num_added

# currently unused by the main flow; get_images_from_messages was never defined,
# so the filtering is inlined here using is_url_an_image
def get_urls_from_messages(messages):
  urls = []
  for message in messages:
    content = message[2]
    urls.extend(token for token in content.split(" ")
                if token[0:7] == "http://" and is_url_an_image(token))
  return urls

# image altars look like aba or abcba but not a or abba
def is_bad_sized_array(a):
  # reject even-length token lists and anything shorter than 3
  return len(a) % 2 == 0 or len(a) < 3

if __name__ == "__main__":
  if len(sys.argv) != 2:
    print("usage: fill.image_altars.py message_id_end")
    sys.exit(1)

  upper = int(sys.argv[1])

  # resume from the highest message already indexed; re-checking that id is
  # harmless because add_altar skips duplicates
  highest = get_highest_message_id_in_db()
  chunks = get_messages()
  num_added = process_messages(chunks, highest, upper)
  print("added", num_added, "altars to db")
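# example invocation (the end id is hypothetical):
#   python3 fill.image_altars.py 1000000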