blob: 7686ecf21cd2a2211184770c3bbfa3f8600f957b (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
|
# Requires Python 3 (uses urllib.parse) and the py-postgresql driver.
import os
import sys
from urllib.parse import urlparse

import postgresql
# Module-level connection shared by every helper below.
# The connection URI can be overridden with the DUMPFM_DB_URI environment
# variable so that credentials do not have to live in source control; when the
# variable is unset the original local connection string is used unchanged.
db = postgresql.open(
    os.environ.get("DUMPFM_DB_URI", "pq://postgres:root@localhost/dumpfm")
)
# Ask the server to exchange text with this client as Unicode.
db.execute("SET CLIENT_ENCODING to 'UNICODE'")
def get_num_images_in_db():
    """Return the number of rows currently in the image_urls table as an int."""
    count_query = db.prepare("SELECT COUNT(*) FROM image_urls")
    result_rows = count_query()
    # COUNT(*) produces a single row with a single column.
    first_row = result_rows[0]
    return int(first_row[0])
def add_url(url):
    """Store *url* in the image_urls table, replacing any existing row for it.

    The URL is echoed to stdout, any existing row with the same url is
    deleted, and a fresh row is inserted, so the table ends up with exactly
    one row for this URL.

    Args:
        url: The image URL to record.
    """
    print(url)
    delete_existing = db.prepare("DELETE FROM image_urls WHERE url=$1")
    insert_url = db.prepare("INSERT INTO image_urls(url) VALUES($1)")
    # Run the delete and the insert in one transaction: if the INSERT fails,
    # the DELETE is rolled back instead of silently dropping a stored URL.
    with db.xact():
        delete_existing(url)
        insert_url(url)
def get_messages(lower, upper):
    """Fetch message contents for message ids in the inclusive range [lower, upper].

    Returns the result rows of the query, ordered by ascending message_id;
    each row carries a single column, the message content.
    """
    range_query = db.prepare("SELECT content FROM messages WHERE message_id >= $1 AND message_id <= $2 ORDER BY message_id ASC")
    return range_query(lower, upper)
def is_url_an_image(url):
    """Return True if the path of *url* ends in a known image file extension.

    Only the path component is inspected, so query strings and fragments
    (e.g. ``pic.png?size=large``) do not affect the result. The comparison is
    case-insensitive.

    Args:
        url: The URL string to classify.

    Returns:
        True when the text after the last "." in the path is one of
        jpg, jpeg, bmp, gif or png; False otherwise (including when the path
        contains no "." at all).
    """
    image_types = {"jpg", "jpeg", "bmp", "gif", "png"}
    path = urlparse(url).path
    # Require a real "." separator: looking only at the last three characters
    # would accept paths such as "/notapng" and could never match ".jpeg".
    _, dot, extension = path.rpartition(".")
    return bool(dot) and extension.lower() in image_types
def get_images_from_messages(message):
    """Extract the image URLs contained in the text of a single message.

    The message is split on whitespace and every token that starts with an
    http:// or https:// scheme and is classified as an image by
    ``is_url_an_image`` is returned, in order of appearance.

    Args:
        message: The message text, or None/empty when there is no content.

    Returns:
        A list of image URL strings; empty for missing content and for HTML
        messages (those starting with "<safe>"), which are skipped entirely.
    """
    # Missing content has nothing to scan; "<safe>" marks html messages.
    if not message or message.startswith("<safe>"):
        return []
    url_schemes = ("http://", "https://")
    # split() with no argument breaks on any run of whitespace, so URLs that
    # are separated by newlines or tabs are found as well.
    return [
        token
        for token in message.split()
        if token.startswith(url_schemes) and is_url_an_image(token)
    ]
def get_urls_from_messages(messages):
    """Collect the image URLs found in every row of *messages*.

    Each row's first column is taken as the message text. URLs are returned
    in row order, and within a row in the order they appear in the text.
    """
    return [
        image_url
        for row in messages
        for image_url in get_images_from_messages(row[0])
    ]
if __name__ == "__main__":
    # Command line: <message_id_start> <message_id_end>, both inclusive.
    if len(sys.argv) != 3:
        print('usage: fill.image_urls.py message_id_start message_id_end')
        sys.exit(1)
    lower = int(sys.argv[1])
    upper = int(sys.argv[2])
    # Snapshot the row count first so the number of genuinely new URLs can be
    # derived afterwards (re-added URLs replace their existing row).
    num_existing_images = get_num_images_in_db()
    messages = get_messages(lower, upper)
    urls = get_urls_from_messages(messages)
    for url in urls:
        add_url(url)
    print("added ", len(urls), " images to db")
    num_new_images = get_num_images_in_db() - num_existing_images
    # An id range without any image URLs must not crash with a division by
    # zero; report 0% in that case.
    percent_new_images = num_new_images / len(urls) * 100 if urls else 0.0
    print(num_new_images, " were new (", percent_new_images ,"%)")
|