summary | refs | log | tree | commit | diff
path: root/megapixels/commands/msc/cross_reference.py
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/commands/msc/cross_reference.py')
-rw-r--r-- megapixels/commands/msc/cross_reference.py 78
1 files changed, 78 insertions, 0 deletions
diff --git a/megapixels/commands/msc/cross_reference.py b/megapixels/commands/msc/cross_reference.py
new file mode 100644
index 00000000..d4457945
--- /dev/null
+++ b/megapixels/commands/msc/cross_reference.py
@@ -0,0 +1,78 @@
+from os.path import join
+
+import click
+
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+# CSV of embassy Flickr NSIDs; default for the --input option
+fp_in_embassies = '/data_store/datasets/msc/embassies/embassies_on_flickr_ext.csv'
+
+# per-dataset Flickr metadata CSVs to cross reference, keyed by dataset name
+fp_dataset_base = '/data_store/datasets/people/'
+dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there']
+fp_datasets = {
+    key: join(fp_dataset_base, key, f'research/{key}_flickr_meta.csv')
+    for key in dataset_keys
+}
+
+# CSV of matches; default for the --output option
+fp_out = '/data_store/datasets/msc/embassies/embassies_scores.csv'
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True, default=fp_in_embassies,
+    help='Input file for embassies')
+@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_out,
+    help='Output file')
+@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
+    help='Slice list of files')
+@click.option('-f', '--force', 'opt_force', is_flag=True,
+    help='Force overwrite')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force):
+    """Cross reference embassy NSIDs with the dataset Flickr metadata.
+
+    Reads the embassy CSV (`nsid`, `first_name`, `last_name`, `path_alias`
+    columns) and every CSV in `fp_datasets` (`nsid`, `count` columns), writes
+    one row per dataset row whose NSID is an embassy NSID to the output CSV,
+    and logs the sum of the matched counts.
+
+    NOTE: `ctx`, `opt_slice` and `opt_force` are accepted but never read, so
+    the output file is always overwritten.
+    """
+
+    import pandas as pd
+
+    log = Logger.getLogger()
+    log.info('Cross reference embassy list')
+
+    df_embassies = pd.read_csv(opt_fp_in)
+    df_embassies.fillna('', inplace=True)
+    # index embassy rows by NSID (first row wins) so each lookup is O(1)
+    embassies = {}
+    for row in df_embassies.to_dict('records'):
+        embassies.setdefault(row['nsid'], row)
+
+    match_items = []
+    for dataset_key, fp_dataset in fp_datasets.items():
+        first_rows = {}  # first row seen per NSID in this dataset
+        for row in pd.read_csv(fp_dataset).to_dict('records'):
+            nsid = row['nsid']
+            embassy = embassies.get(nsid)
+            if embassy is None:
+                continue
+            # a repeated NSID reuses the count of its first row
+            count = first_rows.setdefault(nsid, row)['count']
+            name = f"{embassy['first_name']} {embassy['last_name']}"
+            path_alias = embassy['path_alias']
+            log.debug(f'{name}, {path_alias} count: {count}, in {dataset_key}')
+            match_items.append({'count': count, 'path_alias': path_alias,
+                'name': name, 'dataset_key': dataset_key, 'nsid': nsid})
+
+    # explicit columns keep the header and `count` present when nothing matched
+    columns = ['count', 'path_alias', 'name', 'dataset_key', 'nsid']
+    df = pd.DataFrame(match_items, columns=columns)
+    df.to_csv(opt_fp_out, index=False)
+
+    log.debug(f'Found {df["count"].sum()} embassy photos')
\ No newline at end of file