-rw-r--r--  megapixels/app/settings/app_cfg.py              |  1 +
-rw-r--r--  megapixels/cli_search.py                        | 36 ++++++++
-rw-r--r--  megapixels/commands/datasets/download_images.py |  8 +++--
-rw-r--r--  megapixels/commands/search/flickr.py            | 71 ++++++++++++
4 files changed, 113 insertions(+), 3 deletions(-)
diff --git a/megapixels/app/settings/app_cfg.py b/megapixels/app/settings/app_cfg.py
index 5ce0a678..0041c295 100644
--- a/megapixels/app/settings/app_cfg.py
+++ b/megapixels/app/settings/app_cfg.py
@@ -96,6 +96,7 @@ DIR_COMMANDS_MISC = 'commands/misc'
 DIR_COMMANDS_SITE = 'commands/site'
 DIR_COMMANDS_DEMO = 'commands/demo'
 DIR_COMMANDS_MSC = 'commands/msc'
+DIR_COMMANDS_SEARCH = 'commands/search'
 
 # -----------------------------------------------------------------------------
 # Filesystem settings
diff --git a/megapixels/cli_search.py b/megapixels/cli_search.py
new file mode 100644
index 00000000..9597e70b
--- /dev/null
+++ b/megapixels/cli_search.py
@@ -0,0 +1,36 @@
+# --------------------------------------------------------
+# MSC Project
+# --------------------------------------------------------
+
+import click
+
+from app.settings import app_cfg as cfg
+from app.utils import logger_utils
+from app.models.click_factory import ClickSimple
+
+# click cli factory
+cc = ClickSimple.create(cfg.DIR_COMMANDS_SEARCH)
+
+# --------------------------------------------------------
+# CLI
+# --------------------------------------------------------
+@click.group(cls=cc, chain=False)
+@click.option('-v', '--verbose', 'verbosity', count=True, default=4,
+ show_default=True,
+ help='Verbosity: -v DEBUG, -vv INFO, -vvv WARN, -vvvv ERROR, -vvvvv CRITICAL')
+@click.pass_context
+def cli(ctx, **kwargs):
+ """\033[1m\033[94mMegaPixels: Search Scripts\033[0m
+ """
+ ctx.opts = {}
+ # init logger
+ logger_utils.Logger.create(verbosity=kwargs['verbosity'])
+
+
+
+# --------------------------------------------------------
+# Entrypoint
+# --------------------------------------------------------
+if __name__ == '__main__':
+ cli()
+
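A minimal usage sketch for the new entrypoint, assuming it is run from the megapixels/ directory like the other cli_*.py scripts and that ClickSimple registers each module in commands/search/ as a subcommand (both are assumptions, not shown in this diff):

    # list the subcommands discovered in commands/search/  (assumed invocation)
    python cli_search.py --help

    # group options such as verbosity come before the subcommand name
    python cli_search.py -vv <subcommand> --help
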
diff --git a/megapixels/commands/datasets/download_images.py b/megapixels/commands/datasets/download_images.py
index 45ca8f6e..f2fdfb8f 100644
--- a/megapixels/commands/datasets/download_images.py
+++ b/megapixels/commands/datasets/download_images.py
@@ -10,8 +10,10 @@ import click
               help='Number of threads')
 @click.option('--wayback', 'opt_wayback', is_flag=True, default=False,
               help='Check Wayback archive for URL and download cached image')
+@click.option('--url', 'opt_key_url', default='url', help='Field name for URL', show_default=True)
+@click.option('--filepath', 'opt_key_filepath', default='filepath', help='Field name for filepath', show_default=True)
 @click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback):
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_key_filepath, opt_key_url, opt_wayback):
   """Threaded image downloader"""
   """
@@ -69,11 +71,11 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback):
   pool_items = []
   log.debug(f'Initializing multithreaded pool...')
   for x in tqdm(records):
-    fp_dst = join(opt_fp_out, x['filepath'])
+    fp_dst = join(opt_fp_out, x[opt_key_filepath])
     fp_dst_is_file = Path(fp_dst).is_file()
     fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
     if not fp_dst_is_file and not fp_dst_is_err:
-      pool_items.append({'url':x['url'], 'filepath': fp_dst, 'opt_wayback': opt_wayback})
+      pool_items.append({'url':x[opt_key_url], 'filepath': fp_dst, 'opt_wayback': opt_wayback})
 
   num_items = len(pool_items)
   log.info(f'Going to download {num_items:,} files')
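A hedged example of the two new field-name options; the input/output flags for this command sit outside the hunk and are elided here, and the cli_datasets.py entrypoint and download_images subcommand names are assumed from the directory layout:

    # read URLs from a column named photo_url and write to paths from a column named local_path
    python cli_datasets.py download_images ... --url photo_url --filepath local_path
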
diff --git a/megapixels/commands/search/flickr.py b/megapixels/commands/search/flickr.py
new file mode 100644
index 00000000..ef3515bf
--- /dev/null
+++ b/megapixels/commands/search/flickr.py
@@ -0,0 +1,71 @@
+"""
+# Examples:
+-q https://farm2.staticflickr.com/1252/1366994858_d4a2b377cc_o.jpg
+-q 48876008@N05
+-q 1366994858
+"""
+
+import click
+
+# datasets
+dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there', 'vgg_face']
+
+@click.command()
+@click.option('-q', '--query', 'opt_query', type=str, required=True,
+ help='Photo URL, photo id, or NSID')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+ help='Output file CSV')
+@click.pass_context
+def cli(ctx, opt_query, opt_fp_out):
+ """Locate image by Flickr identifier"""
+
+ import sys
+ from os.path import join
+ from glob import glob
+ from pathlib import Path
+ import time
+
+ import pandas as pd
+ from tqdm import tqdm
+
+ from app.utils.logger_utils import Logger
+
+ log = Logger.getLogger()
+ log.debug(f'Search query: "{opt_query}"')
+
+ fp_dataset_base = '/data_store/datasets/people/'
+
+ matches = []
+
+ if '@' in opt_query:
+ # process NSID format
+ qk = 'nsid'
+ elif 'staticflickr.com' in opt_query:
+ # process URL to photo id
+ opt_query = Path(opt_query).name.split('_')[0]
+ qk = 'photo_id'
+ else:
+ # process as photo id
+ qk = 'photo_id'
+
+ log.debug(f'Searching Flickr data using "{qk}"')
+
+ for dk in dataset_keys:
+ # read dataset metadata
+ fp_filepaths = join(fp_dataset_base, dk, f'metadata/{dk}_filepaths.csv')
+ log.debug(f'loading: {fp_filepaths}')
+ df = pd.read_csv(fp_filepaths, dtype={'photo_id': str})
+ photo_ids = list(df[qk])
+
+ # search for match
+ if opt_query in photo_ids:
+ log.info(f'Found "{qk} = {opt_query}" id in "{dk}"')
+ df_match = df[df[qk] == opt_query]
+ records = df_match.to_dict('records')
+ for record in records:
+ record['dataset'] = dk
+ matches.append(record)
+
+ # Write file
+ log.debug(f'Found {len(matches)} matches')
+  pd.DataFrame.from_dict(matches).to_csv(opt_fp_out, index=False)
\ No newline at end of file
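
A usage sketch mirroring the examples in the module docstring, assuming the subcommand is registered under the filename flickr and that /data_store/datasets/people/<dataset>/metadata/<dataset>_filepaths.csv exists for each dataset key:

    # search by photo id and write all matches to a CSV
    python cli_search.py flickr -q 1366994858 -o matches.csv

    # the same lookup by NSID or by a staticflickr.com URL
    python cli_search.py flickr -q 48876008@N05 -o matches.csv
    python cli_search.py flickr -q https://farm2.staticflickr.com/1252/1366994858_d4a2b377cc_o.jpg -o matches.csv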