summaryrefslogtreecommitdiff
path: root/scripts/fill.image_urls.py
blob: 7686ecf21cd2a2211184770c3bbfa3f8600f957b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# this needs python 3

import sys
import postgresql
from urllib.parse import urlparse

# Module-level database handle shared by every function below. The connection
# is opened as a side effect of importing/running this file.
# NOTE(review): the credentials and database name are hard-coded in the
# connection URI — consider reading them from configuration.
db = postgresql.open("pq://postgres:root@localhost/dumpfm")
# Ask the server to talk to this client in Unicode.
# NOTE(review): 'UNICODE' appears to be PostgreSQL's legacy alias for UTF8 —
# confirm it is still accepted by the server version in use.
db.execute("SET CLIENT_ENCODING to 'UNICODE'")

def get_num_images_in_db():
  """Return the number of rows currently stored in the image_urls table."""
  count_query = db.prepare("SELECT COUNT(*) FROM image_urls")
  # The statement yields a single row whose only column is the count.
  first_row = count_query()[0]
  return int(first_row[0])

def add_url(url):
  """Echo url to stdout, then store it in image_urls.

  Any existing rows with the same url are deleted before the insert, so the
  url is present once after the call.
  """
  print(url)
  # Remove earlier copies first; the insert below then re-adds the url.
  db.prepare("DELETE FROM image_urls WHERE url=$1")(url)
  db.prepare("INSERT INTO image_urls(url) VALUES($1)")(url)

def get_messages(lower, upper):
  """Fetch the content of messages whose message_id is in [lower, upper].

  Both bounds are inclusive. Rows come back ordered by ascending message_id,
  each row holding a single column: the message content.
  """
  range_query = db.prepare("SELECT content FROM messages WHERE message_id >= $1 AND message_id <= $2 ORDER BY message_id ASC")
  return range_query(lower, upper)

def is_url_an_image(url):
  """Return True if the path of url ends in a known image file extension.

  Only the path component is inspected, so a query string or fragment after
  the file name does not affect the result. The comparison is
  case-insensitive.

  Returns False (rather than raising) when url cannot be parsed.
  """
  image_types = {"jpg", "jpeg", "bmp", "gif", "png"}
  try:
    path = urlparse(url).path
  except ValueError:
    # urlparse rejects some malformed inputs (e.g. an unbalanced "[" in the
    # host); such a token is simply not an image URL.
    return False
  # Look at the last path segment only, and require a real ".ext" suffix so
  # that a path like "/notajpg" is not mistaken for a .jpg file.
  filename = path.rpartition("/")[2]
  _, dot, extension = filename.rpartition(".")
  return bool(dot) and extension.lower() in image_types

def get_images_from_messages(message):
  """Return the image URLs found in a single message's text.

  The message is split on any whitespace (spaces, tabs, newlines) and every
  token that starts with "http://" or "https://" and passes
  is_url_an_image() is returned, in order of appearance.

  Empty/None messages and HTML messages (those starting with "<safe>")
  yield an empty list.
  """
  # None guards against NULL content coming back from the database.
  if not message or message.startswith("<safe>"): # skip html messages
    return []
  # split() with no argument also breaks on newlines/tabs, so a URL that ends
  # a line is not glued to the first word of the next line.
  return [
    token for token in message.split()
    if token.startswith(("http://", "https://")) and is_url_an_image(token)
  ]

def get_urls_from_messages(messages):
  """Collect the image URLs from every row in messages into one flat list.

  Each row is indexed at position 0 to obtain the message text, which is
  handed to get_images_from_messages(). Order is preserved and duplicates
  are kept.
  """
  return [
    image_url
    for row in messages
    for image_url in get_images_from_messages(row[0])
  ]

# Script entry point: scan messages in an inclusive message_id range, store
# every image URL found, and report how many of them were new to the table.
if __name__ == "__main__":
  usage = 'usage: fill.image_urls.py message_id_start message_id_end'
  if len(sys.argv) != 3:
    print(usage)
    sys.exit(1)

  try:
    lower = int(sys.argv[1])
    upper = int(sys.argv[2])
  except ValueError:
    # Non-numeric ids get the usage message instead of a raw traceback.
    print(usage)
    sys.exit(1)

  # Count rows before and after so the difference is the number of URLs that
  # were not already present (add_url replaces existing rows, so re-adding a
  # known URL leaves the count unchanged).
  num_existing_images = get_num_images_in_db()
  messages = get_messages(lower, upper)
  urls = get_urls_from_messages(messages)
  for url in urls:
    add_url(url)
  print("added ", len(urls), " images to db")
  num_new_images = get_num_images_in_db() - num_existing_images
  # Avoid ZeroDivisionError when the range contained no image URLs at all.
  if urls:
    percent_new_images = num_new_images / len(urls) * 100
  else:
    percent_new_images = 0.0
  print(num_new_images, " were new (", percent_new_images ,"%)")