| author | adamhrv <adam@ahprojects.com> | 2019-06-03 03:33:06 +0200 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2019-06-03 03:33:06 +0200 |
| commit | 1d8162a47bda87b38feef95cf3d5903831b6f4d6 | |
| tree | 86c37309ff5bcb62716638562489ddb747c16159 | |
| parent | e5773e7fffc11265c86bf1dcfa05df236193f4a1 | |
add msc working utils
Diffstat (limited to 'megapixels/commands/msc')
| mode | path | lines |
|---|---|---|
| -rw-r--r-- | megapixels/commands/msc/count.py | 123 |
| -rw-r--r-- | megapixels/commands/msc/cross_reference.py | 78 |
| -rw-r--r-- | megapixels/commands/msc/summarize.py | 5 |
3 files changed, 126 insertions, 80 deletions
```diff
diff --git a/megapixels/commands/msc/count.py b/megapixels/commands/msc/count.py
new file mode 100644
index 00000000..3c242bc6
--- /dev/null
+++ b/megapixels/commands/msc/count.py
@@ -0,0 +1,123 @@
+from os.path import join
+
+import click
+
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+# datasets
+dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there', 'vgg_face']
+
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+    help='Input file for embassies')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+    help='Output file')
+@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
+    help='Slice list of files')
+@click.option('-f', '--force', 'opt_force', is_flag=True,
+    help='Force overwrite')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force):
+    """Cross reference embassy accounts and count matching images"""
+
+    import sys
+    from os.path import join
+    from glob import glob
+    from pathlib import Path
+    import time
+
+    import pandas as pd
+    from tqdm import tqdm
+
+    log = Logger.getLogger()
+    log.info('Cross reference embassy list')
+
+    # build per-dataset metadata file paths
+    fp_counts = {}
+    fp_filepaths = {}
+    fp_dataset_base = '/data_store/datasets/people/'
+
+    for dk in dataset_keys:
+        fp_counts[dk] = join(fp_dataset_base, dk, f'metadata/{dk}_counts.csv')
+        fp_filepaths[dk] = join(fp_dataset_base, dk, f'metadata/{dk}_filepaths.csv')
+
+    df_embassies = pd.read_csv(opt_fp_in)
+    df_embassies.fillna('', inplace=True)
+    embassy_nsids = list(df_embassies['nsid'])
+
+    match_items = []
+    embassy_images = []
+    malta_images = []
+
+    for dataset_key, fp_dataset in tqdm(fp_counts.items()):
+        df_counts = pd.read_csv(fp_dataset)
+        log.debug(f'loading: {fp_filepaths[dataset_key]}')
+        df_filepaths = pd.read_csv(fp_filepaths[dataset_key])
+        nsids = list(df_counts['nsid'])
+        for nsid in nsids:
+            if nsid in embassy_nsids:
+                # add to matches, and count
+                count = df_counts[df_counts['nsid'] == nsid]['count'].values[0]
+                first_name = df_embassies[df_embassies['nsid'] == nsid]['first_name'].values[0]
+                last_name = df_embassies[df_embassies['nsid'] == nsid]['last_name'].values[0]
+                path_alias = df_embassies[df_embassies['nsid'] == nsid]['path_alias'].values[0]
+                page_url = f'https://flickr.com/photos/{path_alias}'
+                embassy_name = f'{first_name} {last_name}'
+                embassy_meta = df_embassies[df_embassies['nsid'] == nsid].iloc[0]
+
+                match_obj = {
+                    'count': count,
+                    'path_alias': path_alias,
+                    'name': embassy_name,
+                    'dataset_key': dataset_key,
+                    'nsid': nsid,
+                    'page_url': page_url,
+                    'embassy_type': embassy_meta.type,
+                    'username': embassy_meta.username
+                }
+                match_items.append(match_obj)
+
+                # add photo ids or url
+                df_nsids = df_filepaths[df_filepaths['nsid'] == nsid]
+                nsid_records = df_nsids.to_dict('records')
+                for nsid_record in nsid_records:
+                    photo_id = nsid_record.get('photo_id')
+                    im_obj = {
+                        'nsid': nsid,
+                        'url': nsid_record.get('url'),
+                        'photo_id': photo_id,
+                        'dataset_key': dataset_key,
+                        'path_alias': path_alias,
+                        'name': embassy_name,
+                        'page_url': page_url,
+                        'username': embassy_meta.username,
+                        'filepath': f'{photo_id}.jpg'
+                    }
+
+                    embassy_images.append(im_obj)
+                    if nsid == '51226353@N03':
+                        malta_images.append(im_obj)
+
+    # Save embassy matches
+    df = pd.DataFrame.from_dict(match_items)
+    df.to_csv(opt_fp_out, index=False)
+    total = df['count'].sum()
+
+    # Save image matches
+    df = pd.DataFrame.from_dict(embassy_images)
+    fp_out = opt_fp_out.replace('.csv', '_images.csv')
+    df.to_csv(fp_out, index=False)
+    total = len(embassy_images)
+    log.debug(f'wrote {fp_out}')
+    log.debug(f'Found {total:,} embassy images')
+
+    # Save Malta images
+    df = pd.DataFrame.from_dict(malta_images)
+    fp_out = opt_fp_out.replace('.csv', '_images_malta.csv')
+    df.to_csv(fp_out, index=False)
+    total = len(malta_images)
+    log.debug(f'wrote {fp_out}')
+    log.debug(f'Found {total:,} Malta embassy images')
\ No newline at end of file
```
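For reference, a minimal invocation sketch for the new `count` command. This commit only adds the module, so the import path and input filenames below are assumptions; the input CSV would need the columns the script reads ('nsid', 'first_name', 'last_name', 'path_alias', 'type', 'username'). Note the heavy imports (pandas, tqdm) happen inside `cli()`, which keeps the command group fast to load.

```python
# Usage sketch via click's test runner; paths are hypothetical.
from click.testing import CliRunner

from megapixels.commands.msc.count import cli

runner = CliRunner()
result = runner.invoke(cli, [
    '-i', 'embassies_on_flickr_ext.csv',  # assumed embassy account list
    '-o', 'embassies_scores.csv',         # *_images.csv outputs are derived from this
])
print(result.output)
```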
```diff
diff --git a/megapixels/commands/msc/cross_reference.py b/megapixels/commands/msc/cross_reference.py
deleted file mode 100644
index d4457945..00000000
--- a/megapixels/commands/msc/cross_reference.py
+++ /dev/null
@@ -1,78 +0,0 @@
-from os.path import join
-
-import click
-
-from app.utils.logger_utils import Logger
-
-log = Logger.getLogger()
-
-# source file for Embassy NSIDs
-fp_in_embassies = '/data_store/datasets/msc/embassies/embassies_on_flickr_ext.csv'
-
-# list of datasets to cross reference
-dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there']
-fp_dataset_base = '/data_store/datasets/people/'
-fp_datasets = {}
-for dk in dataset_keys:
-    fp_datasets[dk] = join(fp_dataset_base, dk, f'research/{dk}_flickr_meta.csv')
-
-
-# output file
-fp_out = '/data_store/datasets/msc/embassies/embassies_scores.csv'
-
-@click.command()
-@click.option('-i', '--input', 'opt_fp_in', required=True, default=fp_in_embassies,
-    help='Input file for embassies')
-@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_out,
-    help='Output file')
-@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
-    help='Slice list of files')
-@click.option('-f', '--force', 'opt_force', is_flag=True,
-    help='Force overwrite')
-@click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force):
-    """Cross reference"""
-
-    import sys
-    from os.path import join
-    from glob import glob
-    from pathlib import Path
-    import time
-
-    import pandas as pd
-    from tqdm import tqdm
-
-    log = Logger.getLogger()
-    log.info('Cross reference embassy list')
-
-    df_embassies = pd.read_csv(opt_fp_in)
-    df_embassies.fillna('', inplace=True)
-    embassy_nsids = list(df_embassies['nsid'])
-
-    match_items = []
-    for dataset_key, fp_dataset in fp_datasets.items():
-        df_dataset = pd.read_csv(fp_dataset)
-        nsids = list(df_dataset['nsid'])
-        for nsid in nsids:
-            if nsid in embassy_nsids:
-                # add to matches, and count
-                count = df_dataset[df_dataset['nsid'] == nsid]['count'].values[0]
-                first_name = df_embassies[df_embassies['nsid'] == nsid]['first_name'].values[0]
-                last_name = df_embassies[df_embassies['nsid'] == nsid]['last_name'].values[0]
-                path_alias = df_embassies[df_embassies['nsid'] == nsid]['path_alias'].values[0]
-                log.debug(f'{first_name} {last_name}, {path_alias} count: {count}, in {dataset_key}')
-                match_obj = {
-                    'count': count,
-                    'path_alias': path_alias,
-                    'name': f'{first_name} {last_name}',
-                    'dataset_key': dataset_key,
-                    'nsid': nsid
-                }
-                match_items.append(match_obj)
-
-    df = pd.DataFrame.from_dict(match_items)
-    df.to_csv(opt_fp_out, index=False)
-
-    total = df['count'].sum()
-
-    log.debug(f'Found {total} embassy photos')
\ No newline at end of file
```
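The deleted script scanned each dataset's NSID list with nested loops and repeated boolean-mask lookups. The same cross reference can be expressed as a single pandas merge; a sketch under the column names used above ('nsid', 'count', 'first_name', 'last_name', 'path_alias'), with placeholder file paths:

```python
# Editor's sketch: vectorized equivalent of the nested-loop cross reference.
# File paths are hypothetical; column names follow the deleted script.
import pandas as pd

df_embassies = pd.read_csv('embassies_on_flickr_ext.csv')
df_dataset = pd.read_csv('pipa_flickr_meta.csv')  # one dataset's flickr meta

# inner join keeps only NSIDs present in both tables
matches = df_dataset.merge(
    df_embassies[['nsid', 'first_name', 'last_name', 'path_alias']],
    on='nsid', how='inner')
matches['name'] = matches['first_name'] + ' ' + matches['last_name']
print(f"Found {matches['count'].sum()} embassy photos")
```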
```diff
diff --git a/megapixels/commands/msc/summarize.py b/megapixels/commands/msc/summarize.py
index d5d251db..045e3b69 100644
--- a/megapixels/commands/msc/summarize.py
+++ b/megapixels/commands/msc/summarize.py
@@ -29,7 +29,7 @@ def cli(ctx, opt_fp_in, opt_fp_out):
 
     log = Logger.getLogger()
 
-    dataset_names = ['helen', 'megaface', 'adience', 'pipa', 'lfpw', 'duke_mtmc', 'brainwash', 'msceleb', 'uccs']
+    dataset_names = ['helen', 'megaface', 'adience', 'pipa', 'lfpw', 'brainwash', 'msceleb', 'duke_mtmc', 'uccs']
 
     df = pd.DataFrame()
     fp_out = opt_fp_out.replace('.csv', '_citations.csv')
@@ -37,10 +37,11 @@ def cli(ctx, opt_fp_in, opt_fp_out):
         fp_csv = join(opt_fp_in, f'{dataset_name}.csv')
         _df = pd.read_csv(fp_csv)
         _df = _df[_df.lat != 0]
+        _df.drop('id', axis=1, inplace=True)
         print(dataset_name, len(_df))
         df = df.append(_df, ignore_index=True)
 
-    df.to_csv(opt_fp_out, index=False)
+    df.to_csv(fp_out, index=False)
 
     # create country summary
     fp_out = opt_fp_out.replace('.csv', '_countries.csv')
```
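One portability note on the summarize.py hunk: `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0. A sketch of the same accumulation loop using `pd.concat`; the input directory is a placeholder, since the real value comes from the CLI option:

```python
import pandas as pd
from os.path import join

opt_fp_in = '/path/to/summaries'  # placeholder input directory
dataset_names = ['helen', 'megaface', 'adience', 'pipa', 'lfpw',
                 'brainwash', 'msceleb', 'duke_mtmc', 'uccs']

frames = []
for dataset_name in dataset_names:
    _df = pd.read_csv(join(opt_fp_in, f'{dataset_name}.csv'))
    _df = _df[_df.lat != 0]        # keep rows with a valid latitude
    _df = _df.drop('id', axis=1)   # same column drop as in the diff
    frames.append(_df)

df = pd.concat(frames, ignore_index=True)  # replaces repeated df.append
```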
