summaryrefslogtreecommitdiff
path: root/megapixels/commands/msc
diff options
context:
space:
mode:
authoradamhrv <adam@ahprojects.com>2019-06-03 03:33:06 +0200
committeradamhrv <adam@ahprojects.com>2019-06-03 03:33:06 +0200
commit1d8162a47bda87b38feef95cf3d5903831b6f4d6 (patch)
tree86c37309ff5bcb62716638562489ddb747c16159 /megapixels/commands/msc
parente5773e7fffc11265c86bf1dcfa05df236193f4a1 (diff)
add msc working utils
Diffstat (limited to 'megapixels/commands/msc')
-rw-r--r--megapixels/commands/msc/count.py123
-rw-r--r--megapixels/commands/msc/cross_reference.py78
-rw-r--r--megapixels/commands/msc/summarize.py5
3 files changed, 126 insertions, 80 deletions
diff --git a/megapixels/commands/msc/count.py b/megapixels/commands/msc/count.py
new file mode 100644
index 00000000..3c242bc6
--- /dev/null
+++ b/megapixels/commands/msc/count.py
@@ -0,0 +1,123 @@
+from os.path import join
+
+import click
+
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+# datasets
+dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there', 'vgg_face']
+
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+ help='Input file for embassies')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+ help='Output file')
+@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
+ help='Slice list of files')
+@click.option('-f', '--force', 'opt_force', is_flag=True,
+ help='Force overwrite')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force):
+ """Cross reference"""
+
+ import sys
+ from os.path import join
+ from glob import glob
+ from pathlib import Path
+ import time
+
+ import pandas as pd
+ from tqdm import tqdm
+
+ log = Logger.getLogger()
+ log.info('Cross reference embassy list')
+
+
+ fp_counts = {}
+ fp_filepaths = {}
+ fp_dataset_base = '/data_store/datasets/people/'
+
+ for dk in dataset_keys:
+ fp_counts[dk] = join(fp_dataset_base, dk, f'metadata/{dk}_counts.csv')
+ fp_filepaths[dk] = join(fp_dataset_base, dk, f'metadata/{dk}_filepaths.csv')
+
+ df_embassies = pd.read_csv(opt_fp_in)
+ df_embassies.fillna('', inplace=True)
+ embassy_nsids = list(df_embassies['nsid'])
+
+ match_items = []
+ embassy_images = []
+ malta_images = []
+
+ for dataset_key, fp_dataset in tqdm(fp_counts.items()):
+ df_counts = pd.read_csv(fp_dataset)
+ log.debug(f'loading: {fp_filepaths[dataset_key]}')
+ df_filepaths = pd.read_csv(fp_filepaths[dataset_key])
+ nsids = list(df_counts['nsid'])
+ for nsid in nsids:
+ if nsid in embassy_nsids:
+ # add to matches, and count
+ count = df_counts[df_counts['nsid'] == nsid]['count'].values[0]
+ first_name = df_embassies[df_embassies['nsid'] == nsid]['first_name'].values[0]
+ last_name = df_embassies[df_embassies['nsid'] == nsid]['last_name'].values[0]
+ path_alias = df_embassies[df_embassies['nsid'] == nsid]['path_alias'].values[0]
+ page_url = f'https://flickr.com/photos/{path_alias}'
+ embassy_name = f'{first_name} {last_name}'
+ embassy_meta = df_embassies[df_embassies['nsid'] == nsid].iloc[0]
+
+ match_obj = {
+ 'count': count,
+ 'path_alias': path_alias,
+ 'name': embassy_name,
+ 'dataset_key': dataset_key,
+ 'nsid': nsid,
+ 'page_url': page_url,
+ 'embassy_type': embassy_meta.type,
+ 'username': embassy_meta.username
+ }
+ match_items.append(match_obj)
+
+ # add photo ids or url
+ df_nsids = df_filepaths[df_filepaths['nsid'] == nsid]
+ nsid_records = df_nsids.to_dict('records')
+ for nsid_record in nsid_records:
+ photo_id = nsid_record.get('photo_id')
+ im_obj = {
+ 'nsid': nsid,
+ 'url': nsid_record.get('url'),
+ 'photo_id': photo_id,
+ 'dataset_key': dataset_key,
+ 'path_alias': path_alias,
+ 'name': embassy_name,
+ 'page_url': page_url,
+ 'username': embassy_meta.username,
+ 'filepath': f'{photo_id}.jpg'
+ }
+
+ embassy_images.append(im_obj)
+ if nsid == '51226353@N03':
+ malta_images.append(im_obj)
+
+ # Save embassy matches
+ df = pd.DataFrame.from_dict(match_items)
+ df.to_csv(opt_fp_out, index=False)
+ total = df['count'].sum()
+
+ # Save image matches
+ df = pd.DataFrame.from_dict(embassy_images)
+ fp_out = opt_fp_out.replace('.csv', '_images.csv')
+ df.to_csv(fp_out, index=False)
+ total = len(embassy_images)
+ log.debug(f'wrote {fp_out}')
+ log.debug(f'Found {total:,} embassy images')
+
+ # Save malta images
+ df = pd.DataFrame.from_dict(malta_images)
+ fp_out = opt_fp_out.replace('.csv', '_images_malta.csv')
+ df.to_csv(fp_out, index=False)
+  total = len(malta_images)
+ log.debug(f'wrote {fp_out}')
+ log.debug(f'Found {total:,} malta embassy images') \ No newline at end of file
diff --git a/megapixels/commands/msc/cross_reference.py b/megapixels/commands/msc/cross_reference.py
deleted file mode 100644
index d4457945..00000000
--- a/megapixels/commands/msc/cross_reference.py
+++ /dev/null
@@ -1,78 +0,0 @@
-from os.path import join
-
-import click
-
-from app.utils.logger_utils import Logger
-
-log = Logger.getLogger()
-
-# source file for Embassy NSIDs
-fp_in_embassies = '/data_store/datasets/msc/embassies/embassies_on_flickr_ext.csv'
-
-# list of datasets to cross reference
-dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there']
-fp_dataset_base = '/data_store/datasets/people/'
-fp_datasets = {}
-for dk in dataset_keys:
- fp_datasets[dk] = join(fp_dataset_base, dk, f'research/{dk}_flickr_meta.csv')
-
-
-# output file
-fp_out = '/data_store/datasets/msc/embassies/embassies_scores.csv'
-
-@click.command()
-@click.option('-i', '--input', 'opt_fp_in', required=True, default=fp_in_embassies,
- help='Input file for embassies')
-@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_out,
- help='Output file')
-@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
- help='Slice list of files')
-@click.option('-f', '--force', 'opt_force', is_flag=True,
- help='Force overwrite')
-@click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force):
- """Cross reference"""
-
- import sys
- from os.path import join
- from glob import glob
- from pathlib import Path
- import time
-
- import pandas as pd
- from tqdm import tqdm
-
- log = Logger.getLogger()
- log.info('Cross reference embassy list')
-
- df_embassies = pd.read_csv(opt_fp_in)
- df_embassies.fillna('', inplace=True)
- embassy_nsids = list(df_embassies['nsid'])
-
- match_items = []
- for dataset_key, fp_dataset in fp_datasets.items():
- df_dataset = pd.read_csv(fp_dataset)
- nsids = list(df_dataset['nsid'])
- for nsid in nsids:
- if nsid in embassy_nsids:
- # add to matches, and count
- count = df_dataset[df_dataset['nsid'] == nsid]['count'].values[0]
- first_name = df_embassies[df_embassies['nsid'] == nsid]['first_name'].values[0]
- last_name = df_embassies[df_embassies['nsid'] == nsid]['last_name'].values[0]
- path_alias = df_embassies[df_embassies['nsid'] == nsid]['path_alias'].values[0]
- log.debug(f'{first_name} {last_name}, {path_alias} count: {count}, in {dataset_key}')
- match_obj = {
- 'count': count,
- 'path_alias': path_alias,
- 'name': f'{first_name} {last_name}',
- 'dataset_key': dataset_key,
- 'nsid': nsid
- }
- match_items.append(match_obj)
-
- df = pd.DataFrame.from_dict(match_items)
- df.to_csv(opt_fp_out, index=False)
-
- total = df['count'].sum()
-
- log.debug(f'Found {total} embassy photos') \ No newline at end of file
diff --git a/megapixels/commands/msc/summarize.py b/megapixels/commands/msc/summarize.py
index d5d251db..045e3b69 100644
--- a/megapixels/commands/msc/summarize.py
+++ b/megapixels/commands/msc/summarize.py
@@ -29,7 +29,7 @@ def cli(ctx, opt_fp_in, opt_fp_out):
log = Logger.getLogger()
- dataset_names = ['helen', 'megaface', 'adience', 'pipa', 'lfpw', 'duke_mtmc', 'brainwash', 'msceleb', 'uccs']
+ dataset_names = ['helen', 'megaface', 'adience', 'pipa', 'lfpw', 'brainwash', 'msceleb', 'duke_mtmc', 'uccs']
df = pd.DataFrame()
fp_out = opt_fp_out.replace('.csv', '_citations.csv')
@@ -37,10 +37,11 @@ def cli(ctx, opt_fp_in, opt_fp_out):
fp_csv = join(opt_fp_in, f'{dataset_name}.csv')
_df = pd.read_csv(fp_csv)
_df = _df[_df.lat != 0]
+ _df.drop('id', axis=1, inplace=True)
print(dataset_name, len(_df))
df = df.append(_df, ignore_index=True)
- df.to_csv(opt_fp_out, index=False)
+ df.to_csv(fp_out, index=False)
# create country summary
fp_out = opt_fp_out.replace('.csv', '_countries.csv')