diff options
Diffstat (limited to 'scripts')
| -rw-r--r-- | scripts/fill.image_urls.py | 19 |
1 files changed, 18 insertions, 1 deletions
diff --git a/scripts/fill.image_urls.py b/scripts/fill.image_urls.py
index 952b7ea..535d6f8 100644
--- a/scripts/fill.image_urls.py
+++ b/scripts/fill.image_urls.py
@@ -1,5 +1,6 @@
 # this needs python 3
 
+import re
 import sys
 import postgresql
 from urllib.parse import urlparse
@@ -49,6 +50,22 @@ def get_urls_from_messages(messages):
         urls.extend(get_images_from_messages(message[0]))
     return urls
 
+# shrinks a url before it is handed to add_url. this does 3 things...
+# convert 'http://dumpfm.s3.amazonaws.com/images' to 'http://dump.fm/images'
+# drops a leading 'http://' from urls (urls without that prefix are kept as-is)
+# drops a leading 'http://dump.fm/images' from urls
+# the client is expected to rebuild urls based on this heuristic:
+# if the url starts with '/', prepend 'http://dump.fm' (TODO confirm: '/images' was dropped too)
+# otherwise, prepend 'http://'
+def make_url_smaller(url):
+    if url.startswith('http://dumpfm.s3.amazonaws.com/images'):
+        url = 'http://dump.fm/images' + url[37:]
+    if url.startswith('http://dump.fm/images'):
+        url = url[21:]
+    elif url.startswith('http://'):
+        url = url[7:]
+    return url
+
 if __name__ == "__main__":
     if not len(sys.argv) == 3:
         print('usage: fill.image_urls.py message_id_start message_id_end')
@@ -61,7 +78,7 @@ if __name__ == "__main__":
     messages = get_messages(lower, upper)
     urls = get_urls_from_messages(messages)
     for url in urls:
-        add_url(url)
+        add_url(make_url_smaller(url))
     print("added ", len(urls), " images to db")
     num_new_images = get_num_images_in_db() - num_existing_images
     percent_new_images = num_new_images / len(urls) * 100
