import os

import click

from app.utils import click_utils
from app.utils.logger_utils import Logger
from app.models.citations import Paper

log = Logger.getLogger()


# Command entry point: reads a citation JSON file (top-level keys 'paper' and
# 'citations'), flattens it into one row per paper/address pair, and writes
# the rows to a CSV file.
#
# The docstring below is deliberately kept to a single line because click
# prints it verbatim as the command's --help text.
@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
              help='Input citation data JSON')
@click.option('-o', '--output', 'opt_fp_out',
              help='Output directory')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out):
    """Convert JSON to CSV"""
    # Kept function-local, as in the original command body.
    import json
    import pandas as pd

    log = Logger.getLogger()
    log.info('Convert JSON to CSV')

    # load
    with open(opt_fp_in, 'r') as fp:
        json_data = json.load(fp)

    # parse: the dataset's own paper first, then every citing paper
    dataset_key = json_data['paper']['key']
    dataset_name = json_data['paper']['name']
    papers = get_orig_paper(json_data)
    papers += get_citations(dataset_key, dataset_name, json_data)
    rows = [p.to_dict() for p in papers]

    # save
    fp_out = _resolve_output_path(opt_fp_in, opt_fp_out)
    log.info(fp_out)

    df_papers = pd.DataFrame.from_dict(rows)
    df_papers.index.name = 'index'
    df_papers.to_csv(fp_out)


# ----------------------------------------------------------------------------
# Helpers
# ----------------------------------------------------------------------------

def _resolve_output_path(fp_in, fp_out):
    """Return the CSV path to write for the given input and --output values.

    :param fp_in: path of the input JSON file
    :param fp_out: value of --output; may be None/empty, an existing
        directory, or a file path
    :returns: the input path with its extension swapped for '.csv' when
        fp_out is empty; '<fp_out>/<input name>.csv' when fp_out is an
        existing directory; otherwise fp_out unchanged
    """
    # splitext only touches the final extension, so a '.json' elsewhere in the
    # path is left alone and an input without a '.json' suffix can never map
    # back onto itself (which would overwrite the input file).
    fp_default = os.path.splitext(fp_in)[0] + '.csv'
    if not fp_out:
        return fp_default
    if os.path.isdir(fp_out):
        return os.path.join(fp_out, os.path.basename(fp_default))
    return fp_out


def _normalize_year(entry):
    """Return entry['year'], mapping a missing or empty-string year to 0."""
    year = entry.get('year', 0)
    return 0 if year == '' else year


def _build_papers(key, name, paper_id, entry, d_type, addresses):
    """Build the Paper rows for one JSON entry.

    :param key: dataset key stored on every row
    :param name: dataset name stored on every row
    :param paper_id: identifier of the paper described by entry
    :param entry: dict providing 'title', 'pdf' and optionally 'year'
    :param d_type: row type label ('main' or 'citation')
    :param addresses: iterable of dicts with 'address', 'type', 'lat' and
        'lng' keys, a single such dict, or any falsy value for "no address"
    :returns: one Paper per address, or a single address-less Paper
    """
    year = _normalize_year(entry)
    title = entry['title']
    pdf = entry['pdf']

    # Iterating a bare dict would yield its keys and fail on a['address'],
    # so a single address object is treated as a one-element list.
    if isinstance(addresses, dict):
        addresses = [addresses]

    if not addresses:
        return [Paper(key, name, paper_id, title, d_type, year, pdf)]

    return [
        Paper(key, name, paper_id, title, d_type, year, pdf,
              a['address'], a['type'], a['lat'], a['lng'])
        for a in addresses
    ]


def get_citations(dataset_key, dataset_name, json_data):
    """Return Paper rows for every entry in json_data['citations'].

    Each citation yields one row per address, or a single row without
    address fields when it has none. Every row carries the dataset key/name
    passed in and the type 'citation'.

    :param dataset_key: key of the dataset being cited
    :param dataset_name: name of the dataset being cited
    :param json_data: parsed JSON with a 'citations' list; each item needs
        'id', 'title' and 'pdf', and may have 'year' and 'addresses'
    :returns: list of Paper objects
    :raises KeyError: if 'citations' or a required item key is missing
    """
    d_type = 'citation'
    papers = []
    for p in json_data['citations']:
        papers += _build_papers(dataset_key, dataset_name, p['id'], p,
                                d_type, p.get('addresses', ''))
    return papers


def get_orig_paper(json_data):
    """Return Paper rows for the dataset's own paper, json_data['paper'].

    Yields one row per address, or a single row without address fields when
    there is none. Rows are typed 'main'.

    :param json_data: parsed JSON with a 'paper' dict providing 'key',
        'name', 'paper_id', 'title' and 'pdf', and optionally 'year' and
        'address'
    :returns: list of Paper objects
    :raises KeyError: if 'paper' or one of its required keys is missing
    """
    p = json_data['paper']
    d_type = 'main'
    # NOTE(review): this reads 'address' while get_citations reads
    # 'addresses' -- confirm against the JSON schema which key the main
    # paper actually uses.
    addresses = p.get('address', '')
    return _build_papers(p['key'], p['name'], p['paper_id'], p, d_type,
                         addresses)