summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--scripts/fill.image_urls.py63
1 files changed, 63 insertions, 0 deletions
diff --git a/scripts/fill.image_urls.py b/scripts/fill.image_urls.py
new file mode 100644
index 0000000..0b1c2e3
--- /dev/null
+++ b/scripts/fill.image_urls.py
@@ -0,0 +1,63 @@
+# this needs python 3
+
+import sys
+import postgresql
+from urllib.parse import urlparse
+
+db = postgresql.open("pq://postgres:root@localhost/dumpfm")
+
+def get_num_images_in_db():
+ ps = db.prepare("SELECT COUNT(*) FROM image_urls")
+ return int(ps()[0][0])
+
+def add_url(url):
+ print(url)
+ ps = db.prepare("DELETE FROM image_urls WHERE url=$1")
+ ps(url)
+ ps2 = db.prepare("INSERT INTO image_urls(url) VALUES($1)")
+ ps2(url)
+
+def get_messages(lower, upper):
+ ps = db.prepare("SELECT content FROM messages WHERE message_id >= $1 AND message_id <= $2 ORDER BY message_id ASC")
+ rows = ps(lower, upper)
+ return rows
+
+def is_url_an_image(url):
+ image_types = {"jpg", "bmp", "gif", "png"}
+ url = urlparse(url)
+ filetype = url.path[-3:].lower()
+ return filetype in image_types
+
+def get_images_from_messages(message):
+ images = []
+ if message[0:6] == "<safe>": # skip html messages
+ return images
+ tokens = message.split(" ")
+ for token in tokens:
+ if token[0:7] == "http://" and is_url_an_image(token):
+ images.append(token)
+ return images
+
+def get_urls_from_messages(messages):
+ urls = []
+ for message in messages:
+ urls.extend(get_images_from_messages(message[0]))
+ return urls
+
+if __name__ == "__main__":
+ if not len(sys.argv) == 3:
+ print('usage: fill.image_urls.py message_id_start message_id_end')
+ sys.exit(1)
+
+ lower = int(sys.argv[1])
+ upper = int(sys.argv[2])
+
+ num_existing_images = get_num_images_in_db()
+ messages = get_messages(lower, upper)
+ urls = get_urls_from_messages(messages)
+ for url in urls:
+ add_url(url)
+ print("added ", len(urls), " images to db")
+ num_new_images = get_num_images_in_db() - num_existing_images
+ percent_new_images = num_new_images / len(urls) * 100
+ print(num_new_images, " were new (", percent_new_images ,"%)") \ No newline at end of file