"""Cross-reference embassy Flickr accounts against face-dataset metadata.

Reads a CSV of embassy Flickr accounts (keyed by ``nsid``) and, for every
dataset listed in ``dataset_keys``, looks up which of those accounts appear in
the dataset's ``<key>_flickr_meta.csv`` file.  The matches, with the per-dataset
``count`` value, are written to a single CSV.
"""

from os.path import join

import click

from app.utils.logger_utils import Logger

log = Logger.getLogger()

# source file for Embassy NSIDs
fp_in_embassies = '/data_store/datasets/msc/embassies/embassies_on_flickr_ext.csv'

# list of datasets to cross reference
dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there']
fp_dataset_base = '/data_store/datasets/people/'

# maps dataset key -> path of that dataset's Flickr metadata CSV
fp_datasets = {}
for dk in dataset_keys:
    fp_datasets[dk] = join(fp_dataset_base, dk, f'research/{dk}_flickr_meta.csv')

# output file
fp_out = '/data_store/datasets/msc/embassies/embassies_scores.csv'


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True, default=fp_in_embassies,
              help='Input file for embassies')
@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_out,
              help='Output file')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
              help='Slice list of files')
@click.option('-f', '--force', 'opt_force', is_flag=True,
              help='Force overwrite')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force):
    """Cross reference"""
    # NOTE: click renders the docstring above as this command's --help text,
    # so the longer description lives in these comments instead.
    #
    # Parameters (all supplied by click):
    #   ctx         click context; not used here.
    #   opt_fp_in   CSV of embassy accounts; must provide the columns
    #               nsid, first_name, last_name and path_alias.
    #   opt_fp_out  destination CSV for the matches.
    #   opt_slice   accepted for CLI compatibility; not applied by this command.
    #   opt_force   accepted for CLI compatibility; the output file is always
    #               (over)written regardless of this flag.
    #
    # Each dataset CSV in `fp_datasets` must provide the columns nsid and count.

    # pandas is imported lazily so that loading the command module stays cheap
    import pandas as pd

    log = Logger.getLogger()
    log.info('Cross reference embassy list')

    df_embassies = pd.read_csv(opt_fp_in)
    df_embassies.fillna('', inplace=True)

    # nsid -> {'first_name', 'last_name', 'path_alias'}.  The first row wins
    # when an nsid is listed more than once.  A dict gives O(1) membership
    # tests instead of scanning a list for every dataset row.
    profile_columns = ['first_name', 'last_name', 'path_alias']
    embassies = (
        df_embassies.drop_duplicates('nsid')
        .set_index('nsid')[profile_columns]
        .to_dict('index')
    )

    match_items = []
    for dataset_key, fp_dataset in fp_datasets.items():
        df_dataset = pd.read_csv(fp_dataset)

        # nsid -> count, taking the first row for a repeated nsid
        counts = (
            df_dataset.drop_duplicates('nsid')
            .set_index('nsid')['count']
            .to_dict()
        )

        # Walk the rows (not the unique nsids) so that every dataset row that
        # belongs to an embassy yields one output row.
        for nsid in df_dataset['nsid']:
            embassy = embassies.get(nsid)
            if embassy is None:
                continue

            # add to matches, and count
            count = counts[nsid]
            first_name = embassy['first_name']
            last_name = embassy['last_name']
            path_alias = embassy['path_alias']
            log.debug(f'{first_name} {last_name}, {path_alias} count: {count}, in {dataset_key}')
            match_items.append({
                'count': count,
                'path_alias': path_alias,
                'name': f'{first_name} {last_name}',
                'dataset_key': dataset_key,
                'nsid': nsid,
            })

    # Explicit columns keep the CSV header (and the 'count' column used below)
    # present even when nothing matched; without them an empty result raised
    # KeyError on df['count'].
    match_columns = ['count', 'path_alias', 'name', 'dataset_key', 'nsid']
    df = pd.DataFrame(match_items, columns=match_columns)
    df.to_csv(opt_fp_out, index=False)

    total = df['count'].sum()
    log.debug(f'Found {total} embassy photos')