diff options
| -rw-r--r-- | scripts/fill.image_urls.py | 63 |
1 files changed, 63 insertions, 0 deletions
# scripts/fill.image_urls.py
# this needs python 3
"""Collect image URLs posted in a range of messages into the image_urls table.

usage: fill.image_urls.py message_id_start message_id_end

Every message whose message_id lies in the inclusive range is scanned for
space-separated "http://" tokens that point at an image file; each match is
(re)inserted into image_urls.
"""

import sys
from urllib.parse import urlparse

import postgresql

db = postgresql.open("pq://postgres:root@localhost/dumpfm")

# Path suffixes (compared case-insensitively) that mark a URL as an image.
IMAGE_SUFFIXES = (".jpg", ".bmp", ".gif", ".png")


def get_num_images_in_db() -> int:
    """Return the current number of rows in image_urls."""
    ps = db.prepare("SELECT COUNT(*) FROM image_urls")
    return int(ps()[0][0])


def add_url(url: str) -> None:
    """Echo *url* and store it in image_urls, replacing any existing row.

    The delete and the insert run in one transaction so a failed insert
    cannot leave a previously stored URL deleted.
    """
    print(url)
    with db.xact():
        db.prepare("DELETE FROM image_urls WHERE url=$1")(url)
        db.prepare("INSERT INTO image_urls(url) VALUES($1)")(url)


def get_messages(lower: int, upper: int):
    """Return the content rows of messages with lower <= message_id <= upper.

    Rows are ordered by ascending message_id; each row holds a single
    column, the message content.
    """
    ps = db.prepare("SELECT content FROM messages WHERE message_id >= $1 AND message_id <= $2 ORDER BY message_id ASC")
    return ps(lower, upper)


def is_url_an_image(url: str) -> bool:
    """Return True if the path of *url* ends in a known image extension.

    Only the path component is inspected, so query strings and fragments
    are ignored. The dot is part of the match: a path such as "/foopng"
    is not an image.
    """
    return urlparse(url).path.lower().endswith(IMAGE_SUFFIXES)


def get_images_from_messages(message):
    """Return the image URLs found in one message's content.

    Content starting with "<safe>" is an html message and is skipped, as is
    empty or missing (None) content. Otherwise the content is split on
    single spaces and every "http://" token that is an image URL is kept,
    in order of appearance.
    """
    if not message or message.startswith("<safe>"):  # skip html messages
        return []
    return [
        token
        for token in message.split(" ")
        if token.startswith("http://") and is_url_an_image(token)
    ]


def get_urls_from_messages(messages):
    """Return all image URLs from *messages*, an iterable of one-column rows."""
    return [
        url
        for message in messages
        for url in get_images_from_messages(message[0])
    ]


def main(argv) -> int:
    """Run the script with *argv* (sys.argv layout) and return an exit status."""
    usage = 'usage: fill.image_urls.py message_id_start message_id_end'
    if len(argv) != 3:
        print(usage)
        return 1
    try:
        lower = int(argv[1])
        upper = int(argv[2])
    except ValueError:
        # Non-numeric ids are a usage error, not a crash.
        print(usage)
        return 1

    num_existing_images = get_num_images_in_db()
    messages = get_messages(lower, upper)
    urls = get_urls_from_messages(messages)
    for url in urls:
        add_url(url)
    print("added ", len(urls), " images to db")
    num_new_images = get_num_images_in_db() - num_existing_images
    # Guard the ratio: a range without any image URL must not divide by zero.
    percent_new_images = num_new_images / len(urls) * 100 if urls else 0.0
    print(num_new_images, " were new (", percent_new_images ,"%)")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
\ No newline at end of file |
