summary | refs | log | tree | commit | diff
path: root/scraper/list-embassies.py
diff options
context:
space:
mode:
Diffstat (limited to 'scraper/list-embassies.py')
-rw-r--r-- scraper/list-embassies.py 35
1 files changed, 35 insertions, 0 deletions
diff --git a/scraper/list-embassies.py b/scraper/list-embassies.py
new file mode 100644
index 00000000..41d64084
--- /dev/null
+++ b/scraper/list-embassies.py
@@ -0,0 +1,35 @@
+from util import *
+import glob
+
+# Print the saved embassy list, then account URLs from the per-country dumps not yet in it.
+old_urls = []  # stripped lines of the existing list, printed first
+new_urls = []  # 'url title' entries that are not in the existing list
+url_list = {}  # account URL -> title; only the keys are used, for de-duplication
+
+with open('embassy-list-4.txt', 'r') as f:
+    for line in f:
+        entry = line.strip()
+        old_urls.append(entry)
+        # Take the URL from the stripped text so that a line without a title
+        # does not leave its trailing newline inside the lookup key.
+        url_list[entry.split(' ')[0]] = ''
+
+for f in glob.iglob('./embassy/by_country/*.txt'):
+    for link in read_json(f):
+        # e.g. https://www.flickr.com/photos/us_mission_canada/37316818631
+        # -> keep only the first five '/'-separated parts (up to the account).
+        url_partz = link['url'].split('/')[:5]
+        if len(url_partz) < 5:
+            continue  # URL too short to contain an account segment
+        if '@' in url_partz[4]:
+            continue  # skip accounts whose path segment contains '@'
+        url_fix = '/'.join(url_partz)
+        if url_fix not in url_list:
+            new_urls.append(url_fix + ' ' + link['title'])
+            url_list[url_fix] = link['title']
+
+for line in old_urls:
+    print(line)
+
+for url in sorted(new_urls):
+    print(url)