diff options
Diffstat (limited to 'megapixels/commands/msc/cross_reference.py')
| -rw-r--r-- | megapixels/commands/msc/cross_reference.py | 78 |
1 files changed, 78 insertions, 0 deletions
# megapixels/commands/msc/cross_reference.py
#
# Cross-reference the embassy Flickr account list (NSIDs) against the
# per-dataset Flickr metadata CSVs and write every match to a CSV file.

from os.path import join

import click

from app.utils.logger_utils import Logger

log = Logger.getLogger()

# source file for Embassy NSIDs
fp_in_embassies = '/data_store/datasets/msc/embassies/embassies_on_flickr_ext.csv'

# list of datasets to cross reference
dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there']
fp_dataset_base = '/data_store/datasets/people/'
# dataset key -> path of that dataset's Flickr metadata CSV
fp_datasets = {
    dk: join(fp_dataset_base, dk, f'research/{dk}_flickr_meta.csv')
    for dk in dataset_keys
}

# output file
fp_out = '/data_store/datasets/msc/embassies/embassies_scores.csv'

# Column order of the output CSV. Passing it explicitly to the DataFrame
# constructor keeps the columns present even when there are no matches.
_MATCH_COLUMNS = ['count', 'path_alias', 'name', 'dataset_key', 'nsid']


def _match_dataset(embassies_by_nsid, df_dataset, dataset_key):
    """Return one match record per row of *df_dataset* whose NSID is an embassy.

    :param embassies_by_nsid: dict mapping an NSID to a dict holding that
        embassy's 'first_name', 'last_name' and 'path_alias'
    :param df_dataset: DataFrame with 'nsid' and 'count' columns
    :param dataset_key: dataset name, copied into every record
    :returns: list of dicts keyed by the names in ``_MATCH_COLUMNS``
    """
    # 'count' of the first row seen for each NSID. One dict lookup per match
    # replaces a full-column boolean scan per match.
    first_counts = (
        df_dataset.drop_duplicates(subset='nsid', keep='first')
        .set_index('nsid')['count']
        .to_dict()
    )

    matches = []
    for nsid in df_dataset['nsid']:
        embassy = embassies_by_nsid.get(nsid)
        if embassy is None:
            continue
        matches.append({
            'count': first_counts[nsid],
            'path_alias': embassy['path_alias'],
            'name': f"{embassy['first_name']} {embassy['last_name']}",
            'dataset_key': dataset_key,
            'nsid': nsid,
        })
    return matches


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True, default=fp_in_embassies,
              help='Input file for embassies')
@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_out,
              help='Output file')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
              help='Slice list of files')
@click.option('-f', '--force', 'opt_force', is_flag=True,
              help='Force overwrite')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force):
    """Cross reference"""
    # The docstring above is the command's --help text, so it is kept short;
    # the details live in these comments instead.
    #
    # Reads the embassy CSV (columns used: nsid, first_name, last_name,
    # path_alias) and every CSV listed in `fp_datasets` (columns used: nsid,
    # count). For each dataset row whose nsid also appears in the embassy
    # list, one record is appended to the output CSV written to `opt_fp_out`.
    #
    # NOTE(review): `opt_slice` and `opt_force` are accepted but never
    # consulted; the output file is always (over)written. Confirm whether
    # --force is meant to guard an existing output file.

    # Imported lazily so that loading the command module stays cheap.
    import pandas as pd

    log = Logger.getLogger()
    log.info('Cross reference embassy list')

    df_embassies = pd.read_csv(opt_fp_in)
    df_embassies.fillna('', inplace=True)

    # NSID -> name fields of the first embassy row carrying that NSID.
    # Dict membership is O(1), unlike scanning a list of NSIDs per row.
    embassies_by_nsid = (
        df_embassies.drop_duplicates(subset='nsid', keep='first')
        .set_index('nsid')[['first_name', 'last_name', 'path_alias']]
        .to_dict(orient='index')
    )

    match_items = []
    for dataset_key, fp_dataset in fp_datasets.items():
        df_dataset = pd.read_csv(fp_dataset)
        dataset_matches = _match_dataset(embassies_by_nsid, df_dataset, dataset_key)
        for match in dataset_matches:
            log.debug(f"{match['name']}, {match['path_alias']} count: {match['count']}, in {match['dataset_key']}")
        match_items.extend(dataset_matches)

    # Explicit columns: with zero matches the frame would otherwise have no
    # 'count' column, and the sum below would raise KeyError.
    df = pd.DataFrame(match_items, columns=_MATCH_COLUMNS)
    df.to_csv(opt_fp_out, index=False)

    total = df['count'].sum()

    log.debug(f'Found {total} embassy photos')
\ No newline at end of file |
