diff options
Diffstat (limited to 'megapixels/commands/datasets/citations_to_csv.py')
| -rw-r--r-- | megapixels/commands/datasets/citations_to_csv.py | 53 |
1 file changed, 32 insertions, 21 deletions
diff --git a/megapixels/commands/datasets/citations_to_csv.py b/megapixels/commands/datasets/citations_to_csv.py
index d96748e5..e54d0dac 100644
--- a/megapixels/commands/datasets/citations_to_csv.py
+++ b/megapixels/commands/datasets/citations_to_csv.py
@@ -8,11 +8,11 @@ log = Logger.getLogger()

 @click.command()
 @click.option('-i', '--input', 'opt_fp_in', required=True,
-  help='Input license data CSV')
-@click.option('-o', '--output', 'opt_fp_out',
+  help='Input citation data file or folder')
+@click.option('-o', '--output', 'opt_dir_out',
   help='Output directory')
 @click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out):
+def cli(ctx, opt_fp_in, opt_dir_out):
   """Convert JSON to CSV"""

   import sys
@@ -30,27 +30,38 @@ def cli(ctx, opt_fp_in, opt_fp_out):
   log.info('Convert JSON to CSV')

   # load
-  with open(opt_fp_in, 'r') as fp:
-    json_data = json.load(fp)
+  if Path(opt_fp_in).is_dir():
+    fps_in = glob(join(opt_fp_in, '*.json'))
+  else:
+    fps_in = [opt_fp_in]

-  # parse
-  papers = []
-  dataset_key = json_data['paper']['key']
-  dataset_name = json_data['paper']['name']
-  papers_main = get_orig_paper(json_data)
-  papers += papers_main
-  papers_citations = get_citations(dataset_key, dataset_name, json_data)
-  papers += papers_citations
-  papers = [p.to_dict() for p in papers]
+  log.info(f'{fps_in}')
+
+  for fp_in in fps_in:
+    with open(fp_in, 'r') as fp:
+      json_data = json.load(fp)

-  # save
-  if not opt_fp_out:
-    fp_out = opt_fp_in.replace('.json','.csv')
-    log.info(fp_out)
+    # parse
+    papers = []
+    dataset_key = json_data['paper']['key']
+    dataset_name = json_data['paper']['name']
+    papers_main = get_orig_paper(json_data)
+    papers += papers_main
+    papers_citations = get_citations(dataset_key, dataset_name, json_data)
+    papers += papers_citations
+    papers = [p.to_dict() for p in papers]
+
+    # save
+    if not opt_dir_out:
+      # save to same directory replacing ext
+      fp_out = fp_in.replace('.json','.csv')
+    else:
+      fp_out = join(opt_dir_out, Path(fp_in).name)

-  df_papers = pd.DataFrame.from_dict(papers)
-  df_papers.index.name = 'index'
-  df_papers.to_csv(fp_out)
+    df_papers = pd.DataFrame.from_dict(papers)
+    df_papers.index.name = 'id'
+    df_papers.to_csv(fp_out)
+    log.info(f'Wrote {len(df_papers):,} lines to {fp_out}')
