blob: 41d64084cd8c142253a6280d3bbb96c0a8f98fef (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
|
from util import *
import glob
def _profile_url(photo_url):
    """Return the account-level URL for *photo_url*, or None to skip it.

    A photo URL looks like
    https://www.flickr.com/photos/us_mission_canada/37316818631
    and its first five '/'-separated parts form the account URL
    (https://www.flickr.com/photos/us_mission_canada).

    None is returned when the URL has fewer than five parts, or when the
    fifth part (the account segment) contains '@'.
    """
    parts = photo_url.split('/')[:5]
    if len(parts) < 5:
        return None
    # NOTE(review): '@' presumably marks a raw Flickr account id rather than
    # a chosen username -- confirm; such accounts are deliberately skipped.
    if '@' in parts[4]:
        return None
    return '/'.join(parts)


# Lines of the existing list, echoed back unchanged (minus surrounding
# whitespace) before any new entries.
old_urls = []
# "<account url> <title>" lines that are not in the existing list yet.
new_urls = []
# Every account URL seen so far -> its title ('' for entries that came
# from the existing list). Only membership is consulted below.
url_list = {}

with open('embassy-list-4.txt', 'r') as list_file:
    for line in list_file:
        entry = line.strip()
        old_urls.append(entry)
        # Take the key from the *stripped* line: a URL-only line (no title)
        # would otherwise keep its trailing newline in the key and never
        # match, causing the same account to be reported as new again.
        if entry:
            url_list[entry.split(' ', 1)[0]] = ''

# glob yields paths in filesystem-dependent order; sorting makes the
# "first title seen wins" rule below reproducible between runs.
for path in sorted(glob.iglob('./embassy/by_country/*.txt')):
    # read_json is not defined in this file; presumably it is supplied by
    # `from util import *` and returns an iterable of dicts with 'url' and
    # 'title' keys -- verify against util.
    for link in read_json(path):
        url_fix = _profile_url(link['url'])
        if url_fix is None or url_fix in url_list:
            continue
        new_urls.append(url_fix + ' ' + link['title'])
        url_list[url_fix] = link['title']

# Existing entries keep their original order; new ones follow, sorted.
for line in old_urls:
    print(line)
for url in sorted(new_urls):
    print(url)
|