diff options
| author | adamhrv <adam@ahprojects.com> | 2019-09-08 21:32:10 +0200 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2019-09-08 21:32:10 +0200 |
| commit | aed339c29abce0fad683736622c20427da8ad3a6 (patch) | |
| tree | d1999418ddf276f36473c828561e02279a809e9b | |
| parent | 48d181774888206434d8096770022532bd09af87 (diff) | |
add flickr search
| -rw-r--r-- | megapixels/app/settings/app_cfg.py | 1 |
| -rw-r--r-- | megapixels/cli_search.py | 36 |
| -rw-r--r-- | megapixels/commands/datasets/download_images.py | 8 |
| -rw-r--r-- | megapixels/commands/search/flickr.py | 71 |
| -rw-r--r-- | megapixels/commands/search/flickr_nsid.py | 133 |
| -rw-r--r-- | megapixels/commands/search/flickr_url.py | 47 |
6 files changed, 293 insertions, 3 deletions
diff --git a/megapixels/app/settings/app_cfg.py b/megapixels/app/settings/app_cfg.py index 5ce0a678..0041c295 100644 --- a/megapixels/app/settings/app_cfg.py +++ b/megapixels/app/settings/app_cfg.py @@ -96,6 +96,7 @@ DIR_COMMANDS_MISC = 'commands/misc' DIR_COMMANDS_SITE = 'commands/site' DIR_COMMANDS_DEMO = 'commands/demo' DIR_COMMANDS_MSC = 'commands/msc' +DIR_COMMANDS_SEARCH = 'commands/search' # ----------------------------------------------------------------------------- # Filesystem settings diff --git a/megapixels/cli_search.py b/megapixels/cli_search.py new file mode 100644 index 00000000..9597e70b --- /dev/null +++ b/megapixels/cli_search.py @@ -0,0 +1,36 @@ +# -------------------------------------------------------- +# MSC Project +# -------------------------------------------------------- + +import click + +from app.settings import app_cfg as cfg +from app.utils import logger_utils +from app.models.click_factory import ClickSimple + +# click cli factory +cc = ClickSimple.create(cfg.DIR_COMMANDS_SEARCH) + +# -------------------------------------------------------- +# CLI +# -------------------------------------------------------- +@click.group(cls=cc, chain=False) +@click.option('-v', '--verbose', 'verbosity', count=True, default=4, + show_default=True, + help='Verbosity: -v DEBUG, -vv INFO, -vvv WARN, -vvvv ERROR, -vvvvv CRITICAL') +@click.pass_context +def cli(ctx, **kwargs): + """\033[1m\033[94mMegaPixels: Search Scripts\033[0m + """ + ctx.opts = {} + # init logger + logger_utils.Logger.create(verbosity=kwargs['verbosity']) + + + +# -------------------------------------------------------- +# Entrypoint +# -------------------------------------------------------- +if __name__ == '__main__': + cli() + diff --git a/megapixels/commands/datasets/download_images.py b/megapixels/commands/datasets/download_images.py index 45ca8f6e..f2fdfb8f 100644 --- a/megapixels/commands/datasets/download_images.py +++ b/megapixels/commands/datasets/download_images.py @@ -10,8 
+10,10 @@ import click help='Number of threads') @click.option('--wayback', 'opt_wayback', is_flag=True, default=False, help='Check Wayback archive for URL and download cached image') +@click.option('--url', 'opt_key_url', default='url', help='Field name for URL', show_default=True) +@click.option('--filepath', 'opt_key_filepath', default='filepath', help='Field name for filepath', show_default=True) @click.pass_context -def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback): +def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_key_filepath, opt_key_url, opt_wayback): """Threaded image downloader""" """ @@ -69,11 +71,11 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback): pool_items = [] log.debug(f'Initializing multithreaded pool...') for x in tqdm(records): - fp_dst = join(opt_fp_out, x['filepath']) + fp_dst = join(opt_fp_out, x[opt_key_filepath]) fp_dst_is_file = Path(fp_dst).is_file() fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file() if not fp_dst_is_file and not fp_dst_is_err: - pool_items.append({'url':x['url'], 'filepath': fp_dst, 'opt_wayback': opt_wayback}) + pool_items.append({'url':x[opt_key_url], 'filepath': fp_dst, 'opt_wayback': opt_wayback}) num_items = len(pool_items) log.info(f'Going to download {num_items:,} files') diff --git a/megapixels/commands/search/flickr.py b/megapixels/commands/search/flickr.py new file mode 100644 index 00000000..ef3515bf --- /dev/null +++ b/megapixels/commands/search/flickr.py @@ -0,0 +1,71 @@ +""" +# Examples: +-q https://farm2.staticflickr.com/1252/1366994858_d4a2b377cc_o.jpg +-q 48876008@N05 +-q 1366994858 +""" + +import click + +# datasets +dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there', 'vgg_face'] + +@click.command() +@click.option('-q', '--query', 'opt_query', type=str, required=True, + help='Photo URL, photo id, or NSID') +@click.option('-o', '--output', 'opt_fp_out', required=True, + help='Output file CSV') +@click.pass_context +def cli(ctx, 
opt_query, opt_fp_out): + """Locate image by Flickr identifier""" + + import sys + from os.path import join + from glob import glob + from pathlib import Path + import time + + import pandas as pd + from tqdm import tqdm + + from app.utils.logger_utils import Logger + + log = Logger.getLogger() + log.debug(f'Search query: "{opt_query}"') + + fp_dataset_base = '/data_store/datasets/people/' + + matches = [] + + if '@' in opt_query: + # process NSID format + qk = 'nsid' + elif 'staticflickr.com' in opt_query: + # process URL to photo id + opt_query = Path(opt_query).name.split('_')[0] + qk = 'photo_id' + else: + # process as photo id + qk = 'photo_id' + + log.debug(f'Searching Flickr data using "{qk}"') + + for dk in dataset_keys: + # read dataset metadata + fp_filepaths = join(fp_dataset_base, dk, f'metadata/{dk}_filepaths.csv') + log.debug(f'loading: {fp_filepaths}') + df = pd.read_csv(fp_filepaths, dtype={'photo_id': str}) + photo_ids = list(df[qk]) + + # search for match + if opt_query in photo_ids: + log.info(f'Found "{qk} = {opt_query}" id in "{dk}"') + df_match = df[df[qk] == opt_query] + records = df_match.to_dict('records') + for record in records: + record['dataset'] = dk + matches.append(record) + + # Write file + log.debug(f'Found {len(matches)} matches') + pd.DataFrame.from_dict(matches).to_csv(opt_fp_out, index=False)
\ No newline at end of file diff --git a/megapixels/commands/search/flickr_nsid.py b/megapixels/commands/search/flickr_nsid.py new file mode 100644 index 00000000..d9997090 --- /dev/null +++ b/megapixels/commands/search/flickr_nsid.py @@ -0,0 +1,133 @@ +from os.path import join + +import click + +from app.utils.logger_utils import Logger + +log = Logger.getLogger() + +# datasets +dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there', 'vgg_face'] + + +@click.command() +@click.option('-i', '--input', 'opt_fp_in', required=True, + help='Input file for embassies') +@click.option('-o', '--output', 'opt_fp_out', required=True, + help='Output file') +@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), + help='Slice list of files') +@click.option('-f', '--force', 'opt_force', is_flag=True, + help='Force overwrite') +@click.pass_context +def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force): + """Cross reference""" + """ + # data input example + first_name last_name nsid path_alias url username skip + Query_01 98022916@N00 range_of_light https://www.flickr.com/photos/range_of_light/ range_of_light + """ + + import sys + from os.path import join + from glob import glob + from pathlib import Path + import time + + import pandas as pd + from tqdm import tqdm + + log = Logger.getLogger() + log.info('Cross reference embassy list') + + + fp_counts = {} + fp_filepaths = {} + fp_dataset_base = '/data_store/datasets/people/' + + for dk in dataset_keys: + fp_counts[dk] = join(fp_dataset_base, dk, f'metadata/{dk}_counts.csv') + fp_filepaths[dk] = join(fp_dataset_base, dk, f'metadata/{dk}_filepaths.csv') + + df_queries = pd.read_csv(opt_fp_in) + df_queries.fillna('', inplace=True) + len_orig = len(df_queries) + df_queries = df_queries[df_queries.skip != True] + log.debug(f'Skipping {len_orig - len(df_queries)} embassies') + + query_nsids = list(df_queries['nsid']) + + match_items = [] + images = [] + + for dataset_key, 
fp_dataset in tqdm(fp_counts.items()): + df_counts = pd.read_csv(fp_dataset) + log.debug(f'loading: {fp_filepaths[dataset_key]}') + df_filepaths = pd.read_csv(fp_filepaths[dataset_key]) + nsids = list(df_counts['nsid']) + for nsid in nsids: + if nsid in query_nsids: + # add to matches, and count + count = df_counts[df_counts['nsid'] == nsid]['count'].values[0] + first_name = df_queries[df_queries['nsid'] == nsid]['first_name'].values[0] + last_name = df_queries[df_queries['nsid'] == nsid]['last_name'].values[0] + path_alias = df_queries[df_queries['nsid'] == nsid]['path_alias'].values[0] + page_url = f'https://flickr.com/photos/{path_alias}' + name = f'{first_name} {last_name}' + meta = df_queries[df_queries['nsid'] == nsid].iloc[0] + + match_obj = { + 'count': count, + 'path_alias': path_alias, + 'name': name, + 'dataset_key': dataset_key, + 'nsid': nsid, + 'page_url': page_url, + 'username': meta.username + } + match_items.append(match_obj) + + # add photo ids or url + df_nsids = df_filepaths[df_filepaths['nsid'] == nsid] + nsid_records = df_nsids.to_dict('records') + for nsid_record in nsid_records: + photo_id = nsid_record.get('photo_id') + im_obj = { + 'nsid': nsid, + 'url': nsid_record.get('url'), + 'photo_id': photo_id, + 'dataset_key': dataset_key, + 'path_alias': path_alias, + 'name': name, + 'page_url': page_url, + 'username': meta.username, + 'filepath': f'{photo_id}.jpg' + } + + images.append(im_obj) + + # Save embassy matches + df_matches = pd.DataFrame.from_dict(match_items) + df_matches.to_csv(opt_fp_out, index=False) + total = df_matches['count'].sum() + + # Save image matches + df_images = pd.DataFrame.from_dict(images) + fp_out = opt_fp_out.replace('.csv', '_images.csv') + df_images.to_csv(fp_out, index=False) + total = len(images) + log.debug(f'Found {total:,} embassy images') + + # save summary count per dataset + groups_datasets = df_matches.groupby('dataset_key') + summary_counts = [] + for group_dataset, df_dataset in groups_datasets: + 
log.debug(f'{group_dataset}') + summary_counts.append({'dataset': group_dataset, 'images': df_dataset['count'].sum()}) + df_dataset_counts = pd.DataFrame.from_dict(summary_counts) + fp_out = opt_fp_out.replace('.csv', '_counts_summary_dataset.csv') + df_dataset_counts.to_csv(fp_out, index=False) + + + log.debug(f'wrote {fp_out}') + log.debug(f'Found {len(images):,} embassy images') diff --git a/megapixels/commands/search/flickr_url.py b/megapixels/commands/search/flickr_url.py new file mode 100644 index 00000000..bc205604 --- /dev/null +++ b/megapixels/commands/search/flickr_url.py @@ -0,0 +1,47 @@ +""" +# data input example +--url https://farm2.staticflickr.com/1252/1366994858_d4a2b377cc_o.jpg +""" + +import click + +# datasets +dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there', 'vgg_face'] +#dataset_keys = ['pipa'] + +@click.command() +@click.option('-u', '--url', 'opt_url', required=True, + help='Photo URL') +@click.option('-o', '--output', 'opt_fp_out', + help='Output file CSV') +@click.pass_context +def cli(ctx, opt_url, opt_fp_out): + """Locate image by URL""" + + import sys + from os.path import join + from glob import glob + from pathlib import Path + import time + + import pandas as pd + from tqdm import tqdm + + from app.utils.logger_utils import Logger + + log = Logger.getLogger() + + fp_dataset_base = '/data_store/datasets/people/' + + matches = [] + + for dk in dataset_keys: + + fp_filepaths = join(fp_dataset_base, dk, f'metadata/{dk}_filepaths.csv') + log.debug(f'loading: {fp_filepaths}') + df = pd.read_csv(fp_filepaths) + urls = list(df['url']) + if opt_url in urls: + log.info(f'Found image in {dk}') + matches.append(dk) + |
