diff options
Diffstat (limited to 'megapixels/commands/msc/summarize.py')
| -rw-r--r-- | megapixels/commands/msc/summarize.py | 67 |
1 files changed, 67 insertions, 0 deletions
# Source: megapixels/commands/msc/summarize.py (new file, mode 100644, index 00000000..d5d251db)
import click

from app.settings import types
from app.models.dataset import Dataset
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils.logger_utils import Logger

log = Logger.getLogger()


def _write_group_counts(df, column, label, fp_out):
  """Count the rows of ``df`` per distinct value of ``column`` and save them.

  The result has two columns, ``label`` (the group value) and ``citations``
  (the number of rows in that group), ordered by ``citations`` descending.
  It is written to ``fp_out`` as CSV without the index column.

  :param df: table to summarize; must contain ``column``
  :param column: name of the column to group by
  :param label: header used for the group-value column in the output
  :param fp_out: path of the CSV file to write
  :returns: the sorted summary DataFrame
  """
  # imported lazily, like the other heavy imports of this command
  import pandas as pd

  rows = [{label: key, 'citations': len(group)} for key, group in df.groupby(column)]
  # Passing the column names explicitly keeps the sort below from raising
  # KeyError when there are no groups at all (empty input table).
  df_counts = pd.DataFrame(rows, columns=[label, 'citations'])
  df_counts = df_counts.sort_values(by='citations', ascending=False)
  df_counts.to_csv(fp_out, index=False)
  return df_counts


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True)
@click.option('-o', '--output', 'opt_fp_out', required=True)
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out):
  """Merge per-dataset citation CSVs and write per-country and per-sector counts.

  Reads ``<dataset>.csv`` for a fixed list of datasets from the input
  directory, drops rows whose ``lat`` is 0, and writes the merged rows to the
  output path. Two summaries are written next to it, with ``.csv`` in the
  output path replaced by ``_countries.csv`` (rows per ``country``) and
  ``_sector.csv`` (rows per ``loc_type``). Both summaries are also printed.
  """

  from os.path import join
  from pprint import pprint

  import pandas as pd

  dataset_names = ['helen', 'megaface', 'adience', 'pipa', 'lfpw', 'duke_mtmc', 'brainwash', 'msceleb', 'uccs']

  frames = []
  for dataset_name in dataset_names:
    fp_csv = join(opt_fp_in, f'{dataset_name}.csv')
    _df = pd.read_csv(fp_csv)
    # keep only rows with a non-zero latitude
    # NOTE(review): presumably lat == 0 marks rows without a location - confirm
    _df = _df[_df.lat != 0]
    print(dataset_name, len(_df))
    frames.append(_df)

  # DataFrame.append was removed in pandas 2.0; a single concat also avoids
  # re-copying the accumulated table on every iteration.
  df = pd.concat(frames, ignore_index=True)

  # NOTE(review): the original computed a '_citations.csv' path here but never
  # used it; the merged table is written to opt_fp_out itself, as before.
  df.to_csv(opt_fp_out, index=False)

  # create country summary
  fp_out = opt_fp_out.replace('.csv', '_countries.csv')
  df_summary = _write_group_counts(df, 'country', 'country', fp_out)
  pprint(df_summary)

  # summary sector
  fp_out = opt_fp_out.replace('.csv', '_sector.csv')
  df_types = _write_group_counts(df, 'loc_type', 'type', fp_out)
  pprint(df_types)
