-rw-r--r--   megapixels/app/settings/app_cfg.py                |  1
-rw-r--r--   megapixels/cli_search.py                          | 36
-rw-r--r--   megapixels/commands/datasets/download_images.py   |  8
-rw-r--r--   megapixels/commands/search/flickr.py              | 71
4 files changed, 113 insertions, 3 deletions
diff --git a/megapixels/app/settings/app_cfg.py b/megapixels/app/settings/app_cfg.py
index 5ce0a678..0041c295 100644
--- a/megapixels/app/settings/app_cfg.py
+++ b/megapixels/app/settings/app_cfg.py
@@ -96,6 +96,7 @@
 DIR_COMMANDS_MISC = 'commands/misc'
 DIR_COMMANDS_SITE = 'commands/site'
 DIR_COMMANDS_DEMO = 'commands/demo'
 DIR_COMMANDS_MSC = 'commands/msc'
+DIR_COMMANDS_SEARCH = 'commands/search'
 # -----------------------------------------------------------------------------
 # Filesystem settings
diff --git a/megapixels/cli_search.py b/megapixels/cli_search.py
new file mode 100644
index 00000000..9597e70b
--- /dev/null
+++ b/megapixels/cli_search.py
@@ -0,0 +1,36 @@
+# --------------------------------------------------------
+# MSC Project
+# --------------------------------------------------------
+
+import click
+
+from app.settings import app_cfg as cfg
+from app.utils import logger_utils
+from app.models.click_factory import ClickSimple
+
+# click cli factory
+cc = ClickSimple.create(cfg.DIR_COMMANDS_SEARCH)
+
+# --------------------------------------------------------
+# CLI
+# --------------------------------------------------------
+@click.group(cls=cc, chain=False)
+@click.option('-v', '--verbose', 'verbosity', count=True, default=4,
+  show_default=True,
+  help='Verbosity: -v DEBUG, -vv INFO, -vvv WARN, -vvvv ERROR, -vvvvv CRITICAL')
+@click.pass_context
+def cli(ctx, **kwargs):
+  """\033[1m\033[94mMegaPixels: Search Scripts\033[0m
+  """
+  ctx.opts = {}
+  # init logger
+  logger_utils.Logger.create(verbosity=kwargs['verbosity'])
+
+
+
+# --------------------------------------------------------
+# Entrypoint
+# --------------------------------------------------------
+if __name__ == '__main__':
+  cli()
+
diff --git a/megapixels/commands/datasets/download_images.py b/megapixels/commands/datasets/download_images.py
index 45ca8f6e..f2fdfb8f 100644
--- a/megapixels/commands/datasets/download_images.py
+++ b/megapixels/commands/datasets/download_images.py
@@ -10,8 +10,10 @@ import click
   help='Number of threads')
 @click.option('--wayback', 'opt_wayback', is_flag=True, default=False,
   help='Check Wayback archive for URL and download cached image')
+@click.option('--url', 'opt_key_url', default='url', help='Field name for URL', show_default=True)
+@click.option('--filepath', 'opt_key_filepath', default='filepath', help='Field name for filepath', show_default=True)
 @click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback):
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_key_filepath, opt_key_url, opt_wayback):
   """Threaded image downloader"""
   """
 
@@ -69,11 +71,11 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback):
   pool_items = []
   log.debug(f'Initializing multithreaded pool...')
   for x in tqdm(records):
-    fp_dst = join(opt_fp_out, x['filepath'])
+    fp_dst = join(opt_fp_out, x[opt_key_filepath])
     fp_dst_is_file = Path(fp_dst).is_file()
     fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
     if not fp_dst_is_file and not fp_dst_is_err:
-      pool_items.append({'url':x['url'], 'filepath': fp_dst, 'opt_wayback': opt_wayback})
+      pool_items.append({'url':x[opt_key_url], 'filepath': fp_dst, 'opt_wayback': opt_wayback})
 
   num_items = len(pool_items)
   log.info(f'Going to download {num_items:,} files')
diff --git a/megapixels/commands/search/flickr.py b/megapixels/commands/search/flickr.py
new file mode 100644
index 00000000..ef3515bf
--- /dev/null
+++ b/megapixels/commands/search/flickr.py
@@ -0,0 +1,71 @@
+"""
+# Examples:
+-q https://farm2.staticflickr.com/1252/1366994858_d4a2b377cc_o.jpg
+-q 48876008@N05
+-q 1366994858
+"""
+
+import click
+
+# datasets
+dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there', 'vgg_face']
+
+@click.command()
+@click.option('-q', '--query', 'opt_query', type=str, required=True,
+  help='Photo URL, photo id, or NSID')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+  help='Output file CSV')
+@click.pass_context
+def cli(ctx, opt_query, opt_fp_out):
+  """Locate image by Flickr identifier"""
+
+  import sys
+  from os.path import join
+  from glob import glob
+  from pathlib import Path
+  import time
+
+  import pandas as pd
+  from tqdm import tqdm
+
+  from app.utils.logger_utils import Logger
+
+  log = Logger.getLogger()
+  log.debug(f'Search query: "{opt_query}"')
+
+  fp_dataset_base = '/data_store/datasets/people/'
+
+  matches = []
+
+  if '@' in opt_query:
+    # process NSID format
+    qk = 'nsid'
+  elif 'staticflickr.com' in opt_query:
+    # process URL to photo id
+    opt_query = Path(opt_query).name.split('_')[0]
+    qk = 'photo_id'
+  else:
+    # process as photo id
+    qk = 'photo_id'
+
+  log.debug(f'Searching Flickr data using "{qk}"')
+
+  for dk in dataset_keys:
+    # read dataset metadata
+    fp_filepaths = join(fp_dataset_base, dk, f'metadata/{dk}_filepaths.csv')
+    log.debug(f'loading: {fp_filepaths}')
+    df = pd.read_csv(fp_filepaths, dtype={'photo_id': str})
+    photo_ids = list(df[qk])
+
+    # search for match
+    if opt_query in photo_ids:
+      log.info(f'Found "{qk} = {opt_query}" id in "{dk}"')
+      df_match = df[df[qk] == opt_query]
+      records = df_match.to_dict('records')
+      for record in records:
+        record['dataset'] = dk
+        matches.append(record)
+
+  # Write file
+  log.debug(f'Found {len(matches)} matches')
+  pd.DataFrame.from_dict(matches).to_csv(opt_fp_out, index=False)
\ No newline at end of file
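
For orientation, a hypothetical usage sketch of what this commit enables. It assumes cli_search.py is run like the repo's other cli_*.py entrypoints (from inside megapixels/ so the app.* imports resolve) and that the ClickSimple factory registers flickr.py under the subcommand name "flickr"; the subcommand name and the output filenames below are assumptions, not shown in this diff:

  # hypothetical: look up a Flickr NSID across the dataset metadata CSVs
  python cli_search.py flickr -q 48876008@N05 -o matches_nsid.csv

  # hypothetical: a staticflickr.com URL is reduced to its photo id before matching
  python cli_search.py flickr -q https://farm2.staticflickr.com/1252/1366994858_d4a2b377cc_o.jpg -o matches_url.csv

Either invocation writes any matching records from the per-dataset *_filepaths.csv metadata files to the CSV passed with -o. The new --url and --filepath options on download_images only change which CSV column names are read (the defaults remain "url" and "filepath"), so existing invocations are unaffected.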
