1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
|
from os.path import join
import click
from app.utils.logger_utils import Logger
# Module-level logger obtained from the project's Logger helper.
log = Logger.getLogger()

# Dataset keys to cross reference. Each key is used both as the dataset's
# directory name and as the filename prefix of its metadata CSV files.
dataset_keys = [
    'pipa',
    'megaface',
    'helen',
    'ibm_dif',
    'adience',
    'who_goes_there',
    'vgg_face',
]
@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
              help='Input file for embassies')
@click.option('-o', '--output', 'opt_fp_out', required=True,
              help='Output file')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
              help='Slice list of files')
@click.option('-f', '--force', 'opt_force', is_flag=True,
              help='Force overwrite')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force):
    """Cross reference"""
    # (The docstring above is rendered by click as the --help text, so the
    # detailed description lives in comments instead.)
    #
    # Reads the embassy CSV at ``opt_fp_in`` and, for every dataset listed in
    # ``dataset_keys``, looks for rows of that dataset's ``*_counts.csv`` whose
    # ``nsid`` also appears in the embassy CSV. Three CSV files are written:
    #   * ``opt_fp_out``: one row per (dataset, matched nsid) with its count
    #   * ``<opt_fp_out stem>_images<ext>``: one row per image of a match
    #   * ``<opt_fp_out stem>_images_malta<ext>``: images of one specific nsid
    #
    # NOTE(review): ``ctx``, ``opt_slice`` and ``opt_force`` are accepted but
    # not used by this command; existing output files are always overwritten.
    from os.path import join, splitext

    import pandas as pd
    from tqdm import tqdm

    log = Logger.getLogger()
    log.info('Cross reference embassy list')

    # nsid whose images are additionally collected into the "malta" output
    malta_nsid = '51226353@N03'

    # Explicit column lists keep the header row (and a stable column order)
    # even when no matches are found and the record lists are empty.
    match_columns = [
        'count', 'path_alias', 'name', 'dataset_key',
        'nsid', 'page_url', 'embassy_type', 'username',
    ]
    image_columns = [
        'nsid', 'url', 'photo_id', 'dataset_key', 'path_alias',
        'name', 'page_url', 'username', 'filepath',
    ]

    # Locate the per-dataset metadata CSVs
    fp_dataset_base = '/data_store/datasets/people/'
    fp_counts = {}
    fp_filepaths = {}
    for dk in dataset_keys:
        fp_counts[dk] = join(fp_dataset_base, dk, f'metadata/{dk}_counts.csv')
        fp_filepaths[dk] = join(fp_dataset_base, dk, f'metadata/{dk}_filepaths.csv')

    df_embassies = pd.read_csv(opt_fp_in)
    df_embassies.fillna('', inplace=True)

    # Index the embassy rows by nsid once (first row wins for duplicated
    # nsids) so each lookup is O(1) instead of re-filtering the DataFrame
    # several times per match.
    embassy_by_nsid = {}
    for embassy in df_embassies.to_dict('records'):
        embassy_by_nsid.setdefault(embassy['nsid'], embassy)

    match_items = []
    embassy_images = []
    malta_images = []

    for dataset_key, fp_dataset in tqdm(fp_counts.items()):
        df_counts = pd.read_csv(fp_dataset)
        log.debug(f'loading: {fp_filepaths[dataset_key]}')
        df_filepaths = pd.read_csv(fp_filepaths[dataset_key])

        for nsid in df_counts['nsid']:
            embassy = embassy_by_nsid.get(nsid)
            if embassy is None:
                continue

            # add to matches, and count
            count = df_counts[df_counts['nsid'] == nsid]['count'].values[0]
            path_alias = embassy['path_alias']
            page_url = f'https://flickr.com/photos/{path_alias}'
            embassy_name = f"{embassy['first_name']} {embassy['last_name']}"
            username = embassy['username']
            match_items.append({
                'count': count,
                'path_alias': path_alias,
                'name': embassy_name,
                'dataset_key': dataset_key,
                'nsid': nsid,
                'page_url': page_url,
                # key access: attribute access on a row can be shadowed by
                # pandas' own attribute names
                'embassy_type': embassy['type'],
                'username': username,
            })

            # add photo ids or url
            df_nsids = df_filepaths[df_filepaths['nsid'] == nsid]
            for nsid_record in df_nsids.to_dict('records'):
                photo_id = nsid_record.get('photo_id')
                im_obj = {
                    'nsid': nsid,
                    'url': nsid_record.get('url'),
                    'photo_id': photo_id,
                    'dataset_key': dataset_key,
                    'path_alias': path_alias,
                    'name': embassy_name,
                    'page_url': page_url,
                    'username': username,
                    'filepath': f'{photo_id}.jpg',
                }
                embassy_images.append(im_obj)
                if nsid == malta_nsid:
                    malta_images.append(im_obj)

    # Derive the secondary output paths from the extension only. A blanket
    # str.replace('.csv', ...) would also rewrite '.csv' inside directory
    # names, and would return the unchanged path (overwriting the matches
    # file) when the output name does not contain '.csv'.
    fp_root, fp_ext = splitext(opt_fp_out)

    # Save embassy matches
    df = pd.DataFrame(match_items, columns=match_columns)
    df.to_csv(opt_fp_out, index=False)
    log.debug(f'wrote {opt_fp_out}')

    # Save image matches
    df = pd.DataFrame(embassy_images, columns=image_columns)
    fp_out = f'{fp_root}_images{fp_ext}'
    df.to_csv(fp_out, index=False)
    total = len(embassy_images)
    log.debug(f'wrote {fp_out}')
    log.debug(f'Found {total:,} embassy images')

    # Save malta images
    df = pd.DataFrame(malta_images, columns=image_columns)
    fp_out = f'{fp_root}_images_malta{fp_ext}'
    df.to_csv(fp_out, index=False)
    total = len(malta_images)
    log.debug(f'wrote {fp_out}')
    log.debug(f'Found {total:,} malta embassy images')
|