diff options
| author | adamhrv <adam@ahprojects.com> | 2019-09-08 21:32:39 +0200 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2019-09-08 21:32:39 +0200 |
| commit | 5489c9960e7fc89453a657ab4cf27b4c4e74c61b (patch) | |
| tree | f7d8aa3b42313a00a86e33dba709786c45e5a42a /megapixels/commands/search/flickr_nsid.py | |
| parent | aed339c29abce0fad683736622c20427da8ad3a6 (diff) | |
remove/merge flickr
Diffstat (limited to 'megapixels/commands/search/flickr_nsid.py')
| -rw-r--r-- | megapixels/commands/search/flickr_nsid.py | 133 |
1 file changed, 0 insertions, 133 deletions
diff --git a/megapixels/commands/search/flickr_nsid.py b/megapixels/commands/search/flickr_nsid.py deleted file mode 100644 index d9997090..00000000 --- a/megapixels/commands/search/flickr_nsid.py +++ /dev/null @@ -1,133 +0,0 @@ -from os.path import join - -import click - -from app.utils.logger_utils import Logger - -log = Logger.getLogger() - -# datasets -dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there', 'vgg_face'] - - -@click.command() -@click.option('-i', '--input', 'opt_fp_in', required=True, - help='Input file for embassies') -@click.option('-o', '--output', 'opt_fp_out', required=True, - help='Output file') -@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), - help='Slice list of files') -@click.option('-f', '--force', 'opt_force', is_flag=True, - help='Force overwrite') -@click.pass_context -def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force): - """Cross reference""" - """ - # data input example - first_name last_name nsid path_alias url username skip - Query_01 98022916@N00 range_of_light https://www.flickr.com/photos/range_of_light/ range_of_light - """ - - import sys - from os.path import join - from glob import glob - from pathlib import Path - import time - - import pandas as pd - from tqdm import tqdm - - log = Logger.getLogger() - log.info('Cross reference embassy list') - - - fp_counts = {} - fp_filepaths = {} - fp_dataset_base = '/data_store/datasets/people/' - - for dk in dataset_keys: - fp_counts[dk] = join(fp_dataset_base, dk, f'metadata/{dk}_counts.csv') - fp_filepaths[dk] = join(fp_dataset_base, dk, f'metadata/{dk}_filepaths.csv') - - df_queries = pd.read_csv(opt_fp_in) - df_queries.fillna('', inplace=True) - len_orig = len(df_queries) - df_queries = df_queries[df_queries.skip != True] - log.debug(f'Skipping {len_orig - len(df_queries)} embassies') - - query_nsids = list(df_queries['nsid']) - - match_items = [] - images = [] - - for dataset_key, fp_dataset in 
tqdm(fp_counts.items()): - df_counts = pd.read_csv(fp_dataset) - log.debug(f'loading: {fp_filepaths[dataset_key]}') - df_filepaths = pd.read_csv(fp_filepaths[dataset_key]) - nsids = list(df_counts['nsid']) - for nsid in nsids: - if nsid in query_nsids: - # add to matches, and count - count = df_counts[df_counts['nsid'] == nsid]['count'].values[0] - first_name = df_queries[df_queries['nsid'] == nsid]['first_name'].values[0] - last_name = df_queries[df_queries['nsid'] == nsid]['last_name'].values[0] - path_alias = df_queries[df_queries['nsid'] == nsid]['path_alias'].values[0] - page_url = f'https://flickr.com/photos/{path_alias}' - name = f'{first_name} {last_name}' - meta = df_queries[df_queries['nsid'] == nsid].iloc[0] - - match_obj = { - 'count': count, - 'path_alias': path_alias, - 'name': name, - 'dataset_key': dataset_key, - 'nsid': nsid, - 'page_url': page_url, - 'username': meta.username - } - match_items.append(match_obj) - - # add photo ids or url - df_nsids = df_filepaths[df_filepaths['nsid'] == nsid] - nsid_records = df_nsids.to_dict('records') - for nsid_record in nsid_records: - photo_id = nsid_record.get('photo_id') - im_obj = { - 'nsid': nsid, - 'url': nsid_record.get('url'), - 'photo_id': photo_id, - 'dataset_key': dataset_key, - 'path_alias': path_alias, - 'name': name, - 'page_url': page_url, - 'username': meta.username, - 'filepath': f'{photo_id}.jpg' - } - - images.append(im_obj) - - # Save embassy matches - df_matches = pd.DataFrame.from_dict(match_items) - df_matches.to_csv(opt_fp_out, index=False) - total = df_matches['count'].sum() - - # Save image matches - df_images = pd.DataFrame.from_dict(images) - fp_out = opt_fp_out.replace('.csv', '_images.csv') - df_images.to_csv(fp_out, index=False) - total = len(images) - log.debug(f'Found {total:,} embassy images') - - # save summary count per dataset - groups_datasets = df_matches.groupby('dataset_key') - summary_counts = [] - for group_dataset, df_dataset in groups_datasets: - 
log.debug(f'{group_dataset}') - summary_counts.append({'dataset': group_dataset, 'images': df_dataset['count'].sum()}) - df_dataset_counts = pd.DataFrame.from_dict(summary_counts) - fp_out = opt_fp_out.replace('.csv', '_counts_summary_dataset.csv') - df_dataset_counts.to_csv(fp_out, index=False) - - - log.debug(f'wrote {fp_out}') - log.debug(f'Found {len(images):,} embassy images') |
