import click

from app.utils import click_utils
from app.utils.logger_utils import Logger
from app.models.citations import Paper

log = Logger.getLogger()

fp_in_default = '../site/datasets/verified/'
fp_out_default = '../site/datasets/verified/'


@click.command()
@click.option('-i', '--input', 'opt_fp_in', default=fp_in_default,
              help='Input citation data file or folder')
@click.option('-o', '--output', 'opt_dir_out', default=fp_out_default,
              help='Output directory')
@click.pass_context
def cli(ctx, opt_fp_in, opt_dir_out):
    """Convert JSON to CSV"""
    from glob import glob
    from os.path import join
    from pathlib import Path
    import json

    import pandas as pd

    log.info('Convert JSON to CSV')

    # Collect inputs: every *.json in a directory, or the single given file.
    if Path(opt_fp_in).is_dir():
        fps_in = glob(join(opt_fp_in, '*.json'))
    else:
        fps_in = [opt_fp_in]

    log.info(f'Converting {len(fps_in)} JSON files to CSV')

    for fp_in in fps_in:
        log.info(f'Processing: {Path(fp_in).name}')
        with open(fp_in, 'r') as fp:
            json_data = json.load(fp)

        # Parse: rows for the original paper first, then one row per
        # citation address. A file that fails to parse is logged and skipped.
        papers = []
        dataset_key = json_data['paper']['key']
        dataset_name = json_data['paper']['name']
        try:
            papers += get_orig_paper(json_data)
            papers += get_citations(dataset_key, dataset_name, json_data)
            papers = [p.to_dict() for p in papers]
        except Exception as e:
            log.error(f'{e} on {Path(fp_in).name}')
            continue

        # Save: same directory with .csv extension when no output dir given,
        # otherwise <opt_dir_out>/<stem>.csv.
        if not opt_dir_out:
            fp_out = fp_in.replace('.json', '.csv')
        else:
            fp_out = join(opt_dir_out, f'{Path(fp_in).stem}.csv')

        df_papers = pd.DataFrame.from_dict(papers)
        df_papers.index.name = 'id'
        df_papers.to_csv(fp_out)
        log.info(f'Wrote {len(df_papers):,} lines to {fp_out}')


# ----------------------------------------------------------------------------
# Helpers
# ----------------------------------------------------------------------------

def get_citations(dataset_key, dataset_name, json_data):
    """Build `Paper` rows for each citation in `json_data`.

    Emits one row per (citation, address) pair; a citation with no
    addresses still gets one location-less row.

    :param dataset_key: key of the dataset the citations belong to
    :param dataset_name: display name of that dataset
    :param json_data: parsed JSON document with a 'citations' list
    :returns: list of Paper objects
    """
    papers = []
    d_type = 'citation'
    for p in json_data['citations']:
        # Empty-string years are normalized to 0; missing years default to 0.
        year = 0 if p.get('year', 0) == '' else p.get('year', 0)
        # 'pdf' is a list when present; take the first URL. Hoisted out of
        # the address loop since it does not vary per address.
        pdf_url = '' if not p.get('pdf') else p.get('pdf')[0]
        addresses = p.get('addresses', '')
        if addresses:
            for a in addresses:
                paper = Paper(dataset_key, dataset_name, p['id'], p['title'],
                              d_type, year, pdf_url,
                              a['name'], a['type'], a['lat'], a['lng'],
                              a['country'])
                papers.append(paper)
        else:
            # FIX: the original referenced an undefined name `d` here
            # (NameError whenever a citation had no addresses) and mislabeled
            # the row with p['key']/p['name'] and type 'main'. An address-less
            # citation is still a citation of this dataset, so mirror the
            # branch above minus the location fields.
            paper = Paper(dataset_key, dataset_name, p['id'], p['title'],
                          d_type, year, pdf_url)
            papers.append(paper)
    return papers


def get_orig_paper(json_data):
    """Build `Paper` rows for the dataset's original paper.

    Emits one row per address dict (skipping malformed string/None entries),
    or a single location-less row when no addresses are present.

    :param json_data: parsed JSON document with a 'paper' dict
    :returns: list of Paper objects
    """
    papers = []
    p = json_data['paper']
    d_type = 'main'
    # Empty-string years are normalized to 0; missing years default to 0.
    year = 0 if p.get('year', 0) == '' else p.get('year', 0)
    # NOTE: the main paper stores its locations under 'address' (singular),
    # unlike citations which use 'addresses'.
    addresses = p.get('address', '')
    pdf_url = '' if not p.get('pdf') else p.get('pdf')[0]
    if addresses:
        for a in addresses:
            # Skip entries that are not address dicts (bad source data).
            if isinstance(a, str) or a is None:
                continue
            paper = Paper(p.get('key'), p.get('name'), p.get('paper_id'),
                          p.get('title'), d_type, year, pdf_url,
                          a.get('name'), a.get('type'), a.get('lat'),
                          a.get('lng'), a.get('country'))
            papers.append(paper)
    else:
        paper = Paper(p.get('key'), p.get('name'), p.get('paper_id'),
                      p.get('title'), d_type, year, pdf_url)
        papers.append(paper)
    return papers