diff options
| author | adamhrv <adam@ahprojects.com> | 2019-07-04 02:18:03 +0200 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2019-07-04 02:18:03 +0200 |
| commit | 14113102b827e08f32024e99e9ecc9a388bc11d0 (patch) | |
| tree | aae82a1bf36a210cd808dae49472e0705c734370 /megapixels/commands/datasets/flickr_matches.py | |
| parent | b0134234faf869ebcc323c634f247ea11d77cf4c (diff) | |
add generalized flickr counter
Diffstat (limited to 'megapixels/commands/datasets/flickr_matches.py')
| -rw-r--r-- | megapixels/commands/datasets/flickr_matches.py | 106 |
1 files changed, 106 insertions, 0 deletions
# megapixels/commands/datasets/flickr_matches.py
"""Cross reference a list of Flickr users against per-dataset Flickr metadata.

For every dataset in `dataset_keys` this command reads
`<datasets-dir>/<key>/metadata/<key>_counts.csv` and
`<datasets-dir>/<key>/metadata/<key>_filepaths.csv`, keeps the rows whose
`nsid` appears in the input CSV, and writes two CSV files: one row per
matched user and one row per matched image.
"""

from os.path import join, splitext

import click

from app.utils.logger_utils import Logger

log = Logger.getLogger()

# datasets
dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there', 'vgg_face']
FP_DATASET_ROOT = '/data_store/datasets/people/'


def _images_filepath(fp_out):
    """Return the path of the per-image CSV derived from the matches CSV path.

    The `_images` suffix is inserted before the file extension. Using
    `splitext` (rather than replacing the substring '.csv') guarantees the
    result differs from `fp_out` even when the path has no '.csv' extension,
    and leaves directory names that happen to contain '.csv' untouched.
    """
    root, ext = splitext(fp_out)
    return f'{root}_images{ext}'


def _collect_dataset_matches(dataset_key, df_counts, df_filepaths, query_users):
    """Match one dataset's Flickr users against the queried users.

    Args:
        dataset_key: dataset name stored in every emitted row.
        df_counts: DataFrame with `nsid` and `count` columns.
        df_filepaths: DataFrame with an `nsid` column; `url` and `photo_id`
            are read from each row when present.
        query_users: dict mapping nsid to the first query row for that nsid;
            each row must provide `path_alias` and `username`.

    Returns:
        Tuple `(match_items, matched_images)` of lists of dict rows.
    """
    match_items = []
    matched_images = []

    # Group the image rows of queried users once, instead of filtering the
    # whole DataFrame again for every matched nsid.
    is_queried = df_filepaths['nsid'].isin(list(query_users))
    records_by_nsid = {
        nsid: df_group.to_dict('records')
        for nsid, df_group in df_filepaths[is_queried].groupby('nsid')
    }

    first_counts = {}
    for nsid, count in zip(df_counts['nsid'], df_counts['count']):
        if nsid not in query_users:
            continue

        # If an nsid is listed more than once, always report the count of
        # its first row.
        count = first_counts.setdefault(nsid, count)
        user = query_users[nsid]
        path_alias = user['path_alias']
        page_url = f'https://flickr.com/photos/{path_alias}'

        # add to matches, and count
        match_items.append({
            'count': count,
            'path_alias': path_alias,
            'dataset_key': dataset_key,
            'nsid': nsid,
            'page_url': page_url,
            'username': user['username'],
        })

        # add photo ids or url
        for nsid_record in records_by_nsid.get(nsid, []):
            photo_id = nsid_record.get('photo_id')
            matched_images.append({
                'nsid': nsid,
                'url': nsid_record.get('url'),
                'photo_id': photo_id,
                'dataset_key': dataset_key,
                'path_alias': path_alias,
                'page_url': page_url,
                'filepath': f'{photo_id}.jpg',
            })

    return match_items, matched_images


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
              help='Input file for embassies')
@click.option('-o', '--output', 'opt_fp_out', required=True,
              help='Output file')
@click.option('--datasets-dir', 'opt_dataset_dir', default=FP_DATASET_ROOT)
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
              help='Slice list of files')
@click.option('-f', '--force', 'opt_force', is_flag=True,
              help='Force overwrite')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset_dir, opt_slice, opt_force):
    """Cross reference"""
    # The docstring above is the text click shows for --help; keep it short.
    #
    # opt_fp_in:       CSV with `nsid`, `path_alias` and `username` columns.
    # opt_fp_out:      CSV of matched users; the matched images are written
    #                  next to it with an `_images` suffix.
    # opt_dataset_dir: root directory containing one folder per dataset key.
    #
    # NOTE(review): `ctx`, `opt_slice` and `opt_force` are accepted but not
    # used; existing output files are always overwritten.

    # Heavy third-party imports stay local to the command body.
    import pandas as pd
    from tqdm import tqdm

    log = Logger.getLogger()
    log.info('Finding Flickr matches')

    fp_counts = {}
    fp_filepaths = {}

    for dk in dataset_keys:
        fp_counts[dk] = join(opt_dataset_dir, dk, f'metadata/{dk}_counts.csv')
        fp_filepaths[dk] = join(opt_dataset_dir, dk, f'metadata/{dk}_filepaths.csv')

    df_query = pd.read_csv(opt_fp_in)
    df_query.fillna('', inplace=True)

    # Index the query rows by nsid for constant-time membership tests and
    # lookups; the first row wins when an nsid is listed more than once.
    query_users = {}
    for query_record in df_query.to_dict('records'):
        query_users.setdefault(query_record['nsid'], query_record)

    match_items = []
    matched_images = []

    for dataset_key, fp_dataset in tqdm(fp_counts.items()):
        df_counts = pd.read_csv(fp_dataset)
        log.debug(f'loading: {fp_filepaths[dataset_key]}')
        df_filepaths = pd.read_csv(fp_filepaths[dataset_key])
        dataset_matches, dataset_images = _collect_dataset_matches(
            dataset_key, df_counts, df_filepaths, query_users)
        match_items.extend(dataset_matches)
        matched_images.extend(dataset_images)

    # Save user matches. With no matches the DataFrame has no columns, so
    # nothing here may index a column such as 'count'.
    df = pd.DataFrame.from_dict(match_items)
    df.to_csv(opt_fp_out, index=False)

    # Save image matches
    df = pd.DataFrame.from_dict(matched_images)
    fp_out = _images_filepath(opt_fp_out)
    df.to_csv(fp_out, index=False)
    total = len(matched_images)
    log.debug(f'wrote {fp_out}')
    log.debug(f'Found {total:,} images')
\ No newline at end of file |
