from os.path import splitext

import click

from app.settings import types
from app.models.dataset import Dataset
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils.logger_utils import Logger

log = Logger.getLogger()

# Datasets whose "<name>.csv" citation files are merged by the command.
DATASET_NAMES = (
    'helen',
    'megaface',
    'adience',
    'pipa',
    'lfpw',
    'brainwash',
    'msceleb',
    'duke_mtmc',
    'uccs',
)


def _tagged_csv_path(fp_base, tag):
    """Return *fp_base* with *tag* inserted before its ``.csv`` extension.

    Only the final extension is touched, so a ``.csv`` occurring elsewhere in
    the path (e.g. in a directory name) is left alone.  When *fp_base* does
    not end in ``.csv`` the tag and extension are appended instead, so that
    each report still gets its own distinct file.
    """
    root, ext = splitext(fp_base)
    if ext == '.csv':
        return f'{root}{tag}{ext}'
    return f'{fp_base}{tag}.csv'


def _count_by(df, column, label):
    """Count rows of *df* per distinct value of *column*.

    Returns a DataFrame with the columns *label* (the group key) and
    ``citations`` (the row count), sorted by ``citations`` descending.
    Rows whose key is NaN are not counted (pandas ``groupby`` default).
    """
    import pandas as pd

    counts = df.groupby(column).size()
    # Explicit columns keep the frame well-formed (and sortable) even when
    # there are no groups at all.
    df_counts = pd.DataFrame({label: counts.index, 'citations': counts.values})
    return df_counts.sort_values(by='citations', ascending=False)


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
              help='Directory containing one "<dataset>.csv" file per dataset')
@click.option('-o', '--output', 'opt_fp_out', required=True,
              help='Base output CSV path; report names are derived from it')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out):
    """Merge per-dataset citation CSVs and write summary reports.

    Reads "<dataset>.csv" for every known dataset from the input directory,
    drops rows whose "lat" is 0 as well as the "id" column, and writes four
    files derived from the output path: "_citations" (merged rows), "_years"
    (rows per year, years > 0 only), "_countries" (rows per country) and
    "_sector" (rows per "loc_type").
    """
    # Heavy imports stay function-local so loading the command stays cheap.
    from os.path import join
    from pprint import pprint

    import pandas as pd

    # Merge all datasets. Frames are collected and concatenated once:
    # DataFrame.append was removed in pandas 2.0 and copied the accumulated
    # frame on every iteration.
    frames = []
    for dataset_name in DATASET_NAMES:
        fp_csv = join(opt_fp_in, f'{dataset_name}.csv')
        _df = pd.read_csv(fp_csv)
        # Non-inplace drop avoids mutating a filtered slice of the frame.
        _df = _df[_df.lat != 0].drop('id', axis=1)
        print(dataset_name, len(_df))
        frames.append(_df)
    df = pd.concat(frames, ignore_index=True)
    df.to_csv(_tagged_csv_path(opt_fp_out, '_citations'), index=False)

    # Rows per year, skipping non-positive years.
    year_counts = df.groupby('year').size()
    results_year = [
        {'year': year, 'count': int(count)}
        for year, count in year_counts.items()
        if int(year) > 0
    ]
    df_years = pd.DataFrame(results_year, columns=['year', 'count'])
    df_years.to_csv(_tagged_csv_path(opt_fp_out, '_years'), index=False)
    pprint(df_years)

    # Rows per country, most cited first.
    df_summary = _count_by(df, 'country', 'country')
    df_summary.to_csv(_tagged_csv_path(opt_fp_out, '_countries'), index=False)
    pprint(df_summary)

    # Rows per sector ("loc_type"), most cited first.
    df_types = _count_by(df, 'loc_type', 'type')
    df_types.to_csv(_tagged_csv_path(opt_fp_out, '_sector'), index=False)
    pprint(df_types)