diff options
Diffstat (limited to 'scraper/list-embassies.py')
| -rw-r--r-- | scraper/list-embassies.py | 35 |
1 files changed, 35 insertions, 0 deletions
# scraper/list-embassies.py
"""Merge newly scraped Flickr profile URLs into the existing embassy list.

Reads the current list from ``embassy-list-4.txt`` (one entry per line, the
URL first, optionally followed by a space and a title), scans every JSON
file matching ``./embassy/by_country/*.txt`` for photo links, and prints:

1. every line of the existing list, unchanged and in its original order;
2. one ``"<profile url> <title>"`` line per profile URL that was not already
   in the list, sorted alphabetically.
"""
import glob

EMBASSY_LIST_PATH = 'embassy-list-4.txt'
BY_COUNTRY_GLOB = './embassy/by_country/*.txt'


def parse_known_lines(lines):
    """Split the existing list into printable lines and a set of known URLs.

    Args:
        lines: iterable of raw text lines (trailing newlines allowed).

    Returns:
        A ``(old_lines, known_urls)`` tuple. ``old_lines`` holds every input
        line stripped of surrounding whitespace, in input order (blank lines
        are kept so the list is echoed back as it was). ``known_urls`` is the
        set of first space-separated tokens of the non-blank lines.
    """
    old_lines = []
    known_urls = set()
    for raw in lines:
        entry = raw.strip()
        old_lines.append(entry)
        # Take the key from the *stripped* line: a line holding only a URL
        # would otherwise keep its trailing newline in the key and never
        # match a scraped URL, so the profile would be listed twice.
        url = entry.split(' ')[0]
        if url:
            known_urls.add(url)
    return old_lines, known_urls


def profile_url(photo_url):
    """Reduce a photo URL to its profile prefix, or return None to skip it.

    Example:
        https://www.flickr.com/photos/us_mission_canada/37316818631
        -> https://www.flickr.com/photos/us_mission_canada

    Returns None when the URL has fewer than five ``/``-separated parts, or
    when the fifth part (the username) contains ``@``.
    """
    parts = photo_url.split('/')[:5]
    if len(parts) < 5:
        return None
    # NOTE(review): '@' usernames are presumably Flickr's raw account ids
    # rather than vanity names -- confirm that skipping them is still wanted.
    if '@' in parts[4]:
        return None
    return '/'.join(parts)


def collect_new_entries(links, known_urls):
    """Return ``"<profile url> <title>"`` entries for profiles not yet known.

    Args:
        links: iterable of mappings with ``'url'`` and ``'title'`` keys.
        known_urls: set of profile URLs already listed. It is updated in
            place so each new profile is reported only once, using the title
            of the first link that mentions it.

    Returns:
        List of new entries in the order they were first encountered.
    """
    entries = []
    for link in links:
        url = profile_url(link['url'])
        if url is None or url in known_urls:
            continue
        entries.append(url + ' ' + link['title'])
        known_urls.add(url)
    return entries


def main():
    """Print the existing list followed by the sorted newly found entries."""
    # Imported here so the helpers above stay importable without the
    # project-local ``util`` module; the script previously obtained
    # ``read_json`` through ``from util import *``.
    from util import read_json

    with open(EMBASSY_LIST_PATH, 'r') as f:
        old_lines, known_urls = parse_known_lines(f)

    new_entries = []
    for path in glob.iglob(BY_COUNTRY_GLOB):
        new_entries.extend(collect_new_entries(read_json(path), known_urls))

    for line in old_lines:
        print(line)

    for entry in sorted(new_entries):
        print(entry)


if __name__ == '__main__':
    main()
