Diffstat (limited to 'megapixels')
-rw-r--r--  megapixels/commands/search/flickr_nsid.py  133
-rw-r--r--  megapixels/commands/search/flickr_url.py     47
2 files changed, 0 insertions(+), 180 deletions(-)
diff --git a/megapixels/commands/search/flickr_nsid.py b/megapixels/commands/search/flickr_nsid.py
deleted file mode 100644
index d9997090..00000000
--- a/megapixels/commands/search/flickr_nsid.py
+++ /dev/null
@@ -1,133 +0,0 @@
-from os.path import join
-
-import click
-
-from app.utils.logger_utils import Logger
-
-log = Logger.getLogger()
-
-# datasets
-dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there', 'vgg_face']
-
-
-@click.command()
-@click.option('-i', '--input', 'opt_fp_in', required=True,
-  help='Input file for embassies')
-@click.option('-o', '--output', 'opt_fp_out', required=True,
-  help='Output file')
-@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
-  help='Slice list of files')
-@click.option('-f', '--force', 'opt_force', is_flag=True,
-  help='Force overwrite')
-@click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force):
- """Cross reference"""
- """
- # data input example
- first_name last_name nsid path_alias url username skip
- Query_01 98022916@N00 range_of_light https://www.flickr.com/photos/range_of_light/ range_of_light
- """
-
-    import pandas as pd
-    from tqdm import tqdm
-
-    log.info('Cross reference embassy list')
-
-    fp_counts = {}
-    fp_filepaths = {}
-    fp_dataset_base = '/data_store/datasets/people/'
-
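-    # assumed layout: each dataset dir carries <dk>_counts.csv (photos per NSID)
-    # and <dk>_filepaths.csv (one row per photo with url and photo_id)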
-    for dk in dataset_keys:
-        fp_counts[dk] = join(fp_dataset_base, dk, f'metadata/{dk}_counts.csv')
-        fp_filepaths[dk] = join(fp_dataset_base, dk, f'metadata/{dk}_filepaths.csv')
-
-    df_queries = pd.read_csv(opt_fp_in)
-    df_queries.fillna('', inplace=True)
-    len_orig = len(df_queries)
-    df_queries = df_queries[df_queries.skip != True]
-    log.debug(f'Skipping {len_orig - len(df_queries)} embassies')
-
-    query_nsids = list(df_queries['nsid'])
-
-    match_items = []
-    images = []
-
-    for dataset_key, fp_dataset in tqdm(fp_counts.items()):
-        df_counts = pd.read_csv(fp_dataset)
-        log.debug(f'loading: {fp_filepaths[dataset_key]}')
-        df_filepaths = pd.read_csv(fp_filepaths[dataset_key])
-        nsids = list(df_counts['nsid'])
-        for nsid in nsids:
-            if nsid in query_nsids:
-                # add to matches, and count
-                count = df_counts[df_counts['nsid'] == nsid]['count'].values[0]
-                first_name = df_queries[df_queries['nsid'] == nsid]['first_name'].values[0]
-                last_name = df_queries[df_queries['nsid'] == nsid]['last_name'].values[0]
-                path_alias = df_queries[df_queries['nsid'] == nsid]['path_alias'].values[0]
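-                # note: path_alias can be empty after fillna(''), which leaves a
-                # truncated page_url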
-                page_url = f'https://flickr.com/photos/{path_alias}'
-                name = f'{first_name} {last_name}'
-                meta = df_queries[df_queries['nsid'] == nsid].iloc[0]
-
-                match_obj = {
-                    'count': count,
-                    'path_alias': path_alias,
-                    'name': name,
-                    'dataset_key': dataset_key,
-                    'nsid': nsid,
-                    'page_url': page_url,
-                    'username': meta.username
-                }
-                match_items.append(match_obj)
-
-                # add photo ids or url
-                df_nsids = df_filepaths[df_filepaths['nsid'] == nsid]
-                nsid_records = df_nsids.to_dict('records')
-                for nsid_record in nsid_records:
-                    photo_id = nsid_record.get('photo_id')
-                    im_obj = {
-                        'nsid': nsid,
-                        'url': nsid_record.get('url'),
-                        'photo_id': photo_id,
-                        'dataset_key': dataset_key,
-                        'path_alias': path_alias,
-                        'name': name,
-                        'page_url': page_url,
-                        'username': meta.username,
-                        'filepath': f'{photo_id}.jpg'
-                    }
-
-                    images.append(im_obj)
-
-    # Save embassy matches
-    df_matches = pd.DataFrame.from_dict(match_items)
-    df_matches.to_csv(opt_fp_out, index=False)
-
-    # Save image matches
-    df_images = pd.DataFrame.from_dict(images)
-    fp_out = opt_fp_out.replace('.csv', '_images.csv')
-    df_images.to_csv(fp_out, index=False)
-    log.debug(f'Found {len(images):,} embassy images')
-
-    # save summary count per dataset
-    groups_datasets = df_matches.groupby('dataset_key')
-    summary_counts = []
-    for group_dataset, df_dataset in groups_datasets:
-        log.debug(f'{group_dataset}')
-        summary_counts.append({'dataset': group_dataset, 'images': df_dataset['count'].sum()})
-    df_dataset_counts = pd.DataFrame.from_dict(summary_counts)
-    fp_out = opt_fp_out.replace('.csv', '_counts_summary_dataset.csv')
-    df_dataset_counts.to_csv(fp_out, index=False)
-
-    log.debug(f'wrote {fp_out}')
diff --git a/megapixels/commands/search/flickr_url.py b/megapixels/commands/search/flickr_url.py
deleted file mode 100644
index bc205604..00000000
--- a/megapixels/commands/search/flickr_url.py
+++ /dev/null
@@ -1,47 +0,0 @@
-"""
-# data input example
---url https://farm2.staticflickr.com/1252/1366994858_d4a2b377cc_o.jpg
-"""
-
-import click
-
-# datasets
-dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there', 'vgg_face']
-
-@click.command()
-@click.option('-u', '--url', 'opt_url', required=True,
-  help='Photo URL')
-@click.option('-o', '--output', 'opt_fp_out',
-  help='Output file CSV')
-@click.pass_context
-def cli(ctx, opt_url, opt_fp_out):
- """Locate image by URL"""
-
-    from os.path import join
-
-    import pandas as pd
-
-    from app.utils.logger_utils import Logger
-
-    log = Logger.getLogger()
-
-    fp_dataset_base = '/data_store/datasets/people/'
-
-    matches = []
-
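-    # brute-force lookup: scan each dataset's filepaths CSV for an exact URL match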
-    for dk in dataset_keys:
-
-        fp_filepaths = join(fp_dataset_base, dk, f'metadata/{dk}_filepaths.csv')
-        log.debug(f'loading: {fp_filepaths}')
-        df = pd.read_csv(fp_filepaths)
-        urls = list(df['url'])
-        if opt_url in urls:
-            log.info(f'Found image in {dk}')
-            matches.append(dk)
-
-    # write matched dataset keys if an output file was given (-o was otherwise unused)
-    if opt_fp_out:
-        pd.DataFrame({'dataset': matches}).to_csv(opt_fp_out, index=False)
-        log.info(f'wrote {opt_fp_out}')