author    adamhrv <adam@ahprojects.com>    2019-09-08 21:32:10 +0200
committer adamhrv <adam@ahprojects.com>    2019-09-08 21:32:10 +0200
commit    aed339c29abce0fad683736622c20427da8ad3a6 (patch)
tree      d1999418ddf276f36473c828561e02279a809e9b /megapixels/commands/search
parent    48d181774888206434d8096770022532bd09af87 (diff)
add flickr search
Diffstat (limited to 'megapixels/commands/search')
-rw-r--r--  megapixels/commands/search/flickr.py        71
-rw-r--r--  megapixels/commands/search/flickr_nsid.py  133
-rw-r--r--  megapixels/commands/search/flickr_url.py    47
3 files changed, 251 insertions(+), 0 deletions(-)
diff --git a/megapixels/commands/search/flickr.py b/megapixels/commands/search/flickr.py
new file mode 100644
index 00000000..ef3515bf
--- /dev/null
+++ b/megapixels/commands/search/flickr.py
@@ -0,0 +1,71 @@
+"""
+# Examples:
+-q https://farm2.staticflickr.com/1252/1366994858_d4a2b377cc_o.jpg
+-q 48876008@N05
+-q 1366994858
+"""
+
+import click
+
+# datasets
+dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there', 'vgg_face']
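+# each key resolves to /data_store/datasets/people/{key}/metadata/{key}_filepaths.csv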
+
+@click.command()
+@click.option('-q', '--query', 'opt_query', type=str, required=True,
+ help='Photo URL, photo id, or NSID')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+ help='Output CSV file')
+@click.pass_context
+def cli(ctx, opt_query, opt_fp_out):
+ """Locate image by Flickr identifier"""
+
+ import sys
+ from os.path import join
+ from glob import glob
+ from pathlib import Path
+ import time
+
+ import pandas as pd
+ from tqdm import tqdm
+
+ from app.utils.logger_utils import Logger
+
+ log = Logger.getLogger()
+ log.debug(f'Search query: "{opt_query}"')
+
+ fp_dataset_base = '/data_store/datasets/people/'
+
+ matches = []
+
+ if '@' in opt_query:
+ # process NSID format
+ qk = 'nsid'
+ elif 'staticflickr.com' in opt_query:
+ # extract photo id from the URL filename ({photo_id}_{secret}_{size}.jpg)
+ opt_query = Path(opt_query).name.split('_')[0]
+ qk = 'photo_id'
+ else:
+ # process as photo id
+ qk = 'photo_id'
+
+ log.debug(f'Searching Flickr data using "{qk}"')
+
+ for dk in dataset_keys:
+ # read dataset metadata
+ fp_filepaths = join(fp_dataset_base, dk, f'metadata/{dk}_filepaths.csv')
+ log.debug(f'loading: {fp_filepaths}')
+ df = pd.read_csv(fp_filepaths, dtype={'photo_id': str})
+ ids = list(df[qk])
+
+ # search for match on the selected id column
+ if opt_query in ids:
+ log.info(f'Found "{qk} = {opt_query}" id in "{dk}"')
+ df_match = df[df[qk] == opt_query]
+ records = df_match.to_dict('records')
+ for record in records:
+ record['dataset'] = dk
+ matches.append(record)
+
+ # Write file
+ log.debug(f'Found {len(matches)} matches')
+ pd.DataFrame.from_dict(matches).to_csv(opt_fp_out, index=False)
\ No newline at end of file
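
A minimal sketch of exercising flickr.py with Click's test runner, reusing the photo id from its docstring examples; it assumes the package is importable as megapixels.commands.search.flickr and that the dataset CSVs under /data_store/datasets/people/ exist:

    from click.testing import CliRunner
    from megapixels.commands.search.flickr import cli

    runner = CliRunner()
    # search every dataset's filepaths CSV for the photo id, write matches out
    result = runner.invoke(cli, ['-q', '1366994858', '-o', '/tmp/matches.csv'])
    print(result.output)
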
diff --git a/megapixels/commands/search/flickr_nsid.py b/megapixels/commands/search/flickr_nsid.py
new file mode 100644
index 00000000..d9997090
--- /dev/null
+++ b/megapixels/commands/search/flickr_nsid.py
@@ -0,0 +1,133 @@
+from os.path import join
+
+import click
+
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+# datasets
+dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there', 'vgg_face']
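+# each key has {key}_counts.csv and {key}_filepaths.csv under /data_store/datasets/people/{key}/metadata/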
+
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+ help='Input CSV of embassy Flickr accounts')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+ help='Output file')
+@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
+ help='Slice list of files')
+@click.option('-f', '--force', 'opt_force', is_flag=True,
+ help='Force overwrite')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force):
+ """Cross reference"""
+ """
+ # data input example
+ first_name last_name nsid path_alias url username skip
+ Query_01 98022916@N00 range_of_light https://www.flickr.com/photos/range_of_light/ range_of_light
+ """
+
+ import sys
+ from os.path import join
+ from glob import glob
+ from pathlib import Path
+ import time
+
+ import pandas as pd
+ from tqdm import tqdm
+
+ log = Logger.getLogger()
+ log.info('Cross reference embassy list')
+
+ fp_counts = {}
+ fp_filepaths = {}
+ fp_dataset_base = '/data_store/datasets/people/'
+
+ for dk in dataset_keys:
+ fp_counts[dk] = join(fp_dataset_base, dk, f'metadata/{dk}_counts.csv')
+ fp_filepaths[dk] = join(fp_dataset_base, dk, f'metadata/{dk}_filepaths.csv')
+
+ df_queries = pd.read_csv(opt_fp_in)
+ df_queries.fillna('', inplace=True)
+ len_orig = len(df_queries)
+ df_queries = df_queries[df_queries.skip != True]
+ log.debug(f'Skipping {len_orig - len(df_queries)} embassies')
+
+ query_nsids = list(df_queries['nsid'])
+
+ match_items = []
+ images = []
+
+ for dataset_key, fp_dataset in tqdm(fp_counts.items()):
+ df_counts = pd.read_csv(fp_dataset)
+ log.debug(f'loading: {fp_filepaths[dataset_key]}')
+ df_filepaths = pd.read_csv(fp_filepaths[dataset_key])
+ nsids = list(df_counts['nsid'])
+ for nsid in nsids:
+ if nsid in query_nsids:
+ # add to matches, and count
+ count = df_counts[df_counts['nsid'] == nsid]['count'].values[0]
+ first_name = df_queries[df_queries['nsid'] == nsid]['first_name'].values[0]
+ last_name = df_queries[df_queries['nsid'] == nsid]['last_name'].values[0]
+ path_alias = df_queries[df_queries['nsid'] == nsid]['path_alias'].values[0]
+ page_url = f'https://flickr.com/photos/{path_alias}'
+ name = f'{first_name} {last_name}'
+ meta = df_queries[df_queries['nsid'] == nsid].iloc[0]
+
+ match_obj = {
+ 'count': count,
+ 'path_alias': path_alias,
+ 'name': name,
+ 'dataset_key': dataset_key,
+ 'nsid': nsid,
+ 'page_url': page_url,
+ 'username': meta.username
+ }
+ match_items.append(match_obj)
+
+ # collect photo ids and urls for this account
+ df_nsids = df_filepaths[df_filepaths['nsid'] == nsid]
+ nsid_records = df_nsids.to_dict('records')
+ for nsid_record in nsid_records:
+ photo_id = nsid_record.get('photo_id')
+ im_obj = {
+ 'nsid': nsid,
+ 'url': nsid_record.get('url'),
+ 'photo_id': photo_id,
+ 'dataset_key': dataset_key,
+ 'path_alias': path_alias,
+ 'name': name,
+ 'page_url': page_url,
+ 'username': meta.username,
+ 'filepath': f'{photo_id}.jpg'
+ }
+
+ images.append(im_obj)
+
+ # Save embassy matches
+ df_matches = pd.DataFrame.from_dict(match_items)
+ df_matches.to_csv(opt_fp_out, index=False)
+ total = df_matches['count'].sum()
+ log.debug(f'Matched accounts cover {total:,} images')
+
+ # Save image matches
+ df_images = pd.DataFrame.from_dict(images)
+ fp_out = opt_fp_out.replace('.csv', '_images.csv')
+ df_images.to_csv(fp_out, index=False)
+ log.debug(f'Found {len(images):,} embassy images')
+
+ # save summary count per dataset
+ groups_datasets = df_matches.groupby('dataset_key')
+ summary_counts = []
+ for group_dataset, df_dataset in groups_datasets:
+ log.debug(f'{group_dataset}')
+ summary_counts.append({'dataset': group_dataset, 'images': df_dataset['count'].sum()})
+ df_dataset_counts = pd.DataFrame.from_dict(summary_counts)
+ fp_out = opt_fp_out.replace('.csv', '_counts_summary_dataset.csv')
+ df_dataset_counts.to_csv(fp_out, index=False)
+
+ log.debug(f'wrote {fp_out}')
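
Each match in flickr_nsid.py filters df_queries once per field (first_name, last_name, path_alias, then meta); a sketch of the same lookup through a one-time index, assuming one row per nsid in the query file ('embassies.csv' is a hypothetical path):

    import pandas as pd

    df_queries = pd.read_csv('embassies.csv').fillna('')  # hypothetical input path
    queries_by_nsid = df_queries.set_index('nsid')  # assumes unique nsid per row

    meta = queries_by_nsid.loc['98022916@N00']  # one lookup replaces four scans
    name = f"{meta['first_name']} {meta['last_name']}"
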
diff --git a/megapixels/commands/search/flickr_url.py b/megapixels/commands/search/flickr_url.py
new file mode 100644
index 00000000..bc205604
--- /dev/null
+++ b/megapixels/commands/search/flickr_url.py
@@ -0,0 +1,47 @@
+"""
+# Example:
+--url https://farm2.staticflickr.com/1252/1366994858_d4a2b377cc_o.jpg
+"""
+
+import click
+
+# datasets
+dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there', 'vgg_face']
+# dataset_keys = ['pipa']  # debug: restrict search to a single dataset
+
+@click.command()
+@click.option('-u', '--url', 'opt_url', required=True,
+ help='Photo URL')
+@click.option('-o', '--output', 'opt_fp_out',
+ help='Output CSV file')
+@click.pass_context
+def cli(ctx, opt_url, opt_fp_out):
+ """Locate image by URL"""
+
+ import sys
+ from os.path import join
+ from glob import glob
+ from pathlib import Path
+ import time
+
+ import pandas as pd
+ from tqdm import tqdm
+
+ from app.utils.logger_utils import Logger
+
+ log = Logger.getLogger()
+
+ fp_dataset_base = '/data_store/datasets/people/'
+
+ matches = []
+
+ for dk in dataset_keys:
+
+ fp_filepaths = join(fp_dataset_base, dk, f'metadata/{dk}_filepaths.csv')
+ log.debug(f'loading: {fp_filepaths}')
+ df = pd.read_csv(fp_filepaths)
+ urls = list(df['url'])
+ if opt_url in urls:
+ log.info(f'Found image in {dk}')
+ matches.append(dk)
+
+ # write matched dataset keys to CSV
+ log.debug(f'Found {len(matches)} matching datasets')
+ if opt_fp_out:
+ pd.DataFrame({'dataset': matches}).to_csv(opt_fp_out, index=False)
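
The URL search only hits on an exact string match, so a different size variant of the same photo would miss; a sketch of normalizing a staticflickr URL to its photo id using the same filename convention flickr.py relies on ({photo_id}_{secret}_{size}.jpg), which could serve as a fallback to the photo-id search:

    from pathlib import Path

    def photo_id_from_url(url: str) -> str:
        # staticflickr filenames follow {photo_id}_{secret}[_{size}].jpg
        return Path(url).name.split('_')[0]

    assert photo_id_from_url(
        'https://farm2.staticflickr.com/1252/1366994858_d4a2b377cc_o.jpg'
    ) == '1366994858'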