1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
|
from os.path import join
import click
from app.utils.logger_utils import Logger

# Module-level logger for import-time messages (cli() fetches its own copy).
log = Logger.getLogger()

# datasets
# Keys of the face datasets to cross-reference. Each key <dk> is expected to
# have two metadata CSVs under the dataset base directory used in cli():
#   metadata/<dk>_counts.csv     (per-nsid image counts)
#   metadata/<dk>_filepaths.csv  (per-image rows: photo_id, url, ...)
dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there', 'vgg_face']
@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
  help='Input file for embassies')
@click.option('-o', '--output', 'opt_fp_out', required=True,
  help='Output file')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
  help='Slice list of files')
@click.option('-f', '--force', 'opt_force', is_flag=True,
  help='Force overwrite')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force):
  """Cross reference"""
  """
  Cross-reference an embassy account list against face-dataset metadata.

  Reads a CSV of Flickr accounts with columns:
    first_name last_name nsid path_alias url username skip
  e.g.
    Query_01  98022916@N00  range_of_light  https://www.flickr.com/photos/range_of_light/  range_of_light

  For every nsid found in a dataset's counts CSV, writes:
    - <output>:                              one row per (account, dataset) match
    - <output>_images.csv:                   one row per matched image
    - <output>_counts_summary_dataset.csv:   image totals per dataset
  """
  import pandas as pd
  from tqdm import tqdm

  log = Logger.getLogger()
  log.info('Cross reference embassy list')

  # NOTE(review): opt_slice and opt_force are accepted but never used below;
  # outputs are always (over)written. TODO: honor or drop these options.

  # Build per-dataset metadata CSV paths (dataset_keys is module-level).
  fp_dataset_base = '/data_store/datasets/people/'
  fp_counts = {}
  fp_filepaths = {}
  for dk in dataset_keys:
    fp_counts[dk] = join(fp_dataset_base, dk, f'metadata/{dk}_counts.csv')
    fp_filepaths[dk] = join(fp_dataset_base, dk, f'metadata/{dk}_filepaths.csv')

  # Load the query accounts; drop rows explicitly flagged skip=True.
  df_queries = pd.read_csv(opt_fp_in)
  df_queries.fillna('', inplace=True)
  len_orig = len(df_queries)
  df_queries = df_queries[df_queries.skip != True]
  log.debug(f'Skipping {len_orig - len(df_queries)} embassies')

  # Set, not list: O(1) membership tests inside the per-nsid loop below.
  query_nsids = set(df_queries['nsid'])

  match_items = []
  images = []
  for dataset_key, fp_dataset in tqdm(fp_counts.items()):
    df_counts = pd.read_csv(fp_dataset)
    log.debug(f'loading: {fp_filepaths[dataset_key]}')
    df_filepaths = pd.read_csv(fp_filepaths[dataset_key])
    for nsid in df_counts['nsid']:
      if nsid not in query_nsids:
        continue
      # Fetch the matching query row once and reuse it (previously the
      # frame was re-filtered separately for every column).
      meta = df_queries[df_queries['nsid'] == nsid].iloc[0]
      count = df_counts[df_counts['nsid'] == nsid]['count'].values[0]
      path_alias = meta.path_alias
      page_url = f'https://flickr.com/photos/{path_alias}'
      name = f'{meta.first_name} {meta.last_name}'
      match_items.append({
        'count': count,
        'path_alias': path_alias,
        'name': name,
        'dataset_key': dataset_key,
        'nsid': nsid,
        'page_url': page_url,
        'username': meta.username
      })
      # One image record per filepath row for this account in this dataset.
      for nsid_record in df_filepaths[df_filepaths['nsid'] == nsid].to_dict('records'):
        photo_id = nsid_record.get('photo_id')
        images.append({
          'nsid': nsid,
          'url': nsid_record.get('url'),
          'photo_id': photo_id,
          'dataset_key': dataset_key,
          'path_alias': path_alias,
          'name': name,
          'page_url': page_url,
          'username': meta.username,
          'filepath': f'{photo_id}.jpg'
        })

  # Save embassy (account-level) matches.
  df_matches = pd.DataFrame.from_dict(match_items)
  df_matches.to_csv(opt_fp_out, index=False)
  if df_matches.empty:
    # Guard: column access and groupby below would raise on an empty frame.
    log.error('No matching accounts found; skipping image and summary outputs')
    return
  # BUG FIX: this sum was previously assigned to `total` and immediately
  # overwritten by len(images) before ever being logged.
  log.debug(f"Found {df_matches['count'].sum():,} photos across matched accounts")

  # Save image-level matches.
  df_images = pd.DataFrame.from_dict(images)
  fp_out = opt_fp_out.replace('.csv', '_images.csv')
  df_images.to_csv(fp_out, index=False)
  log.debug(f'Found {len(images):,} embassy images')

  # Save summary count per dataset.
  summary_counts = []
  for group_dataset, df_dataset in df_matches.groupby('dataset_key'):
    log.debug(f'{group_dataset}')
    summary_counts.append({'dataset': group_dataset, 'images': df_dataset['count'].sum()})
  df_dataset_counts = pd.DataFrame.from_dict(summary_counts)
  fp_out = opt_fp_out.replace('.csv', '_counts_summary_dataset.csv')
  df_dataset_counts.to_csv(fp_out, index=False)
  log.debug(f'wrote {fp_out}')
|