summary refs log tree commit diff
path: root/megapixels/commands/datasets
diff options
context:
space:
mode:
author adamhrv <adam@ahprojects.com> 2019-02-14 14:45:18 +0100
committer adamhrv <adam@ahprojects.com> 2019-02-14 14:45:18 +0100
commit 3a3a89f2c58eceee07b2cfcfb1700a61b34619e5 (patch)
tree 436347b8466422a1019209f9f04937ea1ce0e4eb /megapixels/commands/datasets
parent 41247c08ea359d0a72a247992d2019ae2120536c (diff)
updates
Diffstat (limited to 'megapixels/commands/datasets')
-rw-r--r-- megapixels/commands/datasets/citations_to_csv.py 53
1 files changed, 32 insertions, 21 deletions
diff --git a/megapixels/commands/datasets/citations_to_csv.py b/megapixels/commands/datasets/citations_to_csv.py
index d96748e5..e54d0dac 100644
--- a/megapixels/commands/datasets/citations_to_csv.py
+++ b/megapixels/commands/datasets/citations_to_csv.py
@@ -8,11 +8,11 @@ log = Logger.getLogger()
@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
- help='Input license data CSV')
-@click.option('-o', '--output', 'opt_fp_out',
+ help='Input citation data file or folder')
+@click.option('-o', '--output', 'opt_dir_out',
help='Output directory')
@click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out):
+def cli(ctx, opt_fp_in, opt_dir_out):
"""Convert JSON to CSV"""
import sys
@@ -30,27 +30,38 @@ def cli(ctx, opt_fp_in, opt_fp_out):
log.info('Convert JSON to CSV')
# load
- with open(opt_fp_in, 'r') as fp:
- json_data = json.load(fp)
+ if Path(opt_fp_in).is_dir():
+ fps_in = glob(join(opt_fp_in, '*.json'))
+ else:
+ fps_in = [opt_fp_in]
- # parse
- papers = []
- dataset_key = json_data['paper']['key']
- dataset_name = json_data['paper']['name']
- papers_main = get_orig_paper(json_data)
- papers += papers_main
- papers_citations = get_citations(dataset_key, dataset_name, json_data)
- papers += papers_citations
- papers = [p.to_dict() for p in papers]
+ log.info(f'{fps_in}')
+
+ for fp_in in fps_in:
+ with open(fp_in, 'r') as fp:
+ json_data = json.load(fp)
- # save
- if not opt_fp_out:
- fp_out = opt_fp_in.replace('.json','.csv')
- log.info(fp_out)
+ # parse
+ papers = []
+ dataset_key = json_data['paper']['key']
+ dataset_name = json_data['paper']['name']
+ papers_main = get_orig_paper(json_data)
+ papers += papers_main
+ papers_citations = get_citations(dataset_key, dataset_name, json_data)
+ papers += papers_citations
+ papers = [p.to_dict() for p in papers]
+
+ # save
+ if not opt_dir_out:
+ # save to same directory replacing ext
+ fp_out = fp_in.replace('.json','.csv')
+ else:
+ fp_out = join(opt_dir_out, Path(fp_in).name)
- df_papers = pd.DataFrame.from_dict(papers)
- df_papers.index.name = 'index'
- df_papers.to_csv(fp_out)
+ df_papers = pd.DataFrame.from_dict(papers)
+ df_papers.index.name = 'id'
+ df_papers.to_csv(fp_out)
+ log.info(f'Wrote {len(df_papers):,} lines to {fp_out}')