from os.path import join

import click

from app.utils.logger_utils import Logger

log = Logger.getLogger()

# Datasets whose metadata CSVs ({dk}_counts.csv / {dk}_filepaths.csv) are
# cross-referenced against the query file.
dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there', 'vgg_face']
FP_DATASET_ROOT = '/data_store/datasets/people/'


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
              help='Input file for embassies')
@click.option('-o', '--output', 'opt_fp_out', required=True,
              help='Output file')
@click.option('--datasets-dir', 'opt_dataset_dir', default=FP_DATASET_ROOT)
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
              help='Slice list of files')
@click.option('-f', '--force', 'opt_force', is_flag=True,
              help='Force overwrite')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset_dir, opt_slice, opt_force):
    """Cross reference

    Matches Flickr account IDs (``nsid``) from the input CSV against each
    dataset's counts/filepaths metadata CSVs. Writes two outputs:
    the per-account matches to --output, and the per-image matches to
    the same path with an ``_images`` suffix.
    """
    from pathlib import Path

    import pandas as pd
    from tqdm import tqdm

    # NOTE: intentionally shadows the module-level logger, matching the
    # original file's convention of fetching a logger inside the command.
    log = Logger.getLogger()
    log.info('Finding Flickr matches')

    # FIX: honor -f/--force. Previously the flag was accepted but never
    # checked, so existing outputs were always overwritten.
    if Path(opt_fp_out).exists() and not opt_force:
        log.error(f'File exists: {opt_fp_out}. Use -f/--force to overwrite')
        return

    # TODO(review): --slice is declared but never applied; original code
    # ignored it too. Confirm intended target (datasets? nsids?) before wiring.

    # Per-dataset metadata CSV locations.
    fp_counts = {}
    fp_filepaths = {}
    for dk in dataset_keys:
        fp_counts[dk] = join(opt_dataset_dir, dk, f'metadata/{dk}_counts.csv')
        fp_filepaths[dk] = join(opt_dataset_dir, dk, f'metadata/{dk}_filepaths.csv')

    df_query = pd.read_csv(opt_fp_in)
    df_query.fillna('', inplace=True)
    # FIX: use a set for O(1) membership; the original tested
    # `nsid in <list>` inside a nested per-dataset loop (O(n) per test).
    query_nsids = set(df_query['nsid'])

    match_items = []      # one record per matched Flickr account per dataset
    matched_images = []   # one record per matched image

    for dataset_key, fp_dataset in tqdm(fp_counts.items()):
        df_counts = pd.read_csv(fp_dataset)
        log.debug(f'loading: {fp_filepaths[dataset_key]}')
        df_filepaths = pd.read_csv(fp_filepaths[dataset_key])

        for nsid in df_counts['nsid']:
            if nsid not in query_nsids:
                continue
            # Account-level match: photo count plus Flickr user metadata.
            count = df_counts[df_counts['nsid'] == nsid]['count'].values[0]
            flickr_user_meta = df_query[df_query['nsid'] == nsid].iloc[0]
            path_alias = flickr_user_meta['path_alias']
            page_url = f'https://flickr.com/photos/{path_alias}'
            match_items.append({
                'count': count,
                'path_alias': path_alias,
                'dataset_key': dataset_key,
                'nsid': nsid,
                'page_url': page_url,
                'username': flickr_user_meta.username,
            })

            # Image-level matches: every filepath row for this nsid.
            df_nsids = df_filepaths[df_filepaths['nsid'] == nsid]
            for nsid_record in df_nsids.to_dict('records'):
                photo_id = nsid_record.get('photo_id')
                matched_images.append({
                    'nsid': nsid,
                    'url': nsid_record.get('url'),
                    'photo_id': photo_id,
                    'dataset_key': dataset_key,
                    'path_alias': path_alias,
                    'page_url': page_url,
                    'filepath': f'{photo_id}.jpg',
                })

    # Save account (embassy) matches
    df_matches = pd.DataFrame.from_dict(match_items)
    df_matches.to_csv(opt_fp_out, index=False)
    # FIX: this total was previously computed and then immediately
    # overwritten by the image count before it was ever logged.
    # Guard the empty case: an empty frame has no 'count' column.
    total_count = df_matches['count'].sum() if not df_matches.empty else 0
    log.debug(f'wrote {opt_fp_out}')
    log.debug(f'Matched accounts report {total_count:,} photos')

    # Save image matches
    df_images = pd.DataFrame.from_dict(matched_images)
    fp_out = opt_fp_out.replace('.csv', '_images.csv')
    df_images.to_csv(fp_out, index=False)
    log.debug(f'wrote {fp_out}')
    log.debug(f'Found {len(matched_images):,} images')