# megapixels/commands/datasets/citations_to_csv.py
# Reconstructed from commit 14727041 ("add csv converter for citations",
# adamhrv, 2019-02-10).
import click

from app.utils import click_utils
from app.utils.logger_utils import Logger
from app.models.citations import Paper

log = Logger.getLogger()


# Command-line entry point.
#   opt_fp_in:  path to the citation JSON file; it must contain a top-level
#               'paper' object (with 'key' and 'name') and a 'citations' list.
#   opt_fp_out: optional path of the CSV file to write. When omitted, the CSV
#               is written next to the input, with the suffix swapped to .csv.
# The click help text is taken from the function docstring, so the docstring
# is kept short and the details live in this comment.
@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
              help='Input citation data JSON')
@click.option('-o', '--output', 'opt_fp_out',
              help='Output CSV filepath (default: input path with .csv suffix)')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out):
    """Convert JSON to CSV"""

    from pathlib import Path

    import json
    import pandas as pd

    log = Logger.getLogger()
    log.info('Convert JSON to CSV')

    # load
    with open(opt_fp_in, 'r', encoding='utf-8') as fp:
        json_data = json.load(fp)

    # parse: the dataset's own paper first, then every paper citing it
    dataset_key = json_data['paper']['key']
    dataset_name = json_data['paper']['name']
    papers = []
    papers += get_orig_paper(json_data)
    papers += get_citations(dataset_key, dataset_name, json_data)
    papers = [p.to_dict() for p in papers]

    # save
    # Honour an explicit --output; previously fp_out was only bound when the
    # option was omitted, so passing -o crashed with UnboundLocalError.
    # with_suffix() only touches the final extension, unlike str.replace(),
    # which rewrote every '.json' occurrence in the path and silently returned
    # the input path itself when there was none.
    fp_out = opt_fp_out or str(Path(opt_fp_in).with_suffix('.csv'))
    log.info(fp_out)

    df_papers = pd.DataFrame.from_dict(papers)
    df_papers.index.name = 'index'
    df_papers.to_csv(fp_out)
# ----------------------------------------------------------------------------
# Helpers
# ----------------------------------------------------------------------------
# Includes the change from commit bb422426 ("fix multi addr", adamhrv,
# 2019-02-10): one Paper row is appended per address instead of only the last.
def _paper_year(record):
    """Return the record's 'year', or 0 when it is missing, empty or None."""
    return record.get('year') or 0


def get_citations(dataset_key, dataset_name, json_data):
    """Build Paper rows for every entry in ``json_data['citations']``.

    A citation with a non-empty 'addresses' list yields one row per address;
    a citation without addresses yields a single row with no address fields.

    :param dataset_key: key of the dataset the citations refer to
    :param dataset_name: name of the dataset the citations refer to
    :param json_data: parsed JSON holding a 'citations' list; each citation
      must provide 'id', 'title' and 'pdf'
    :returns: list of Paper objects of type 'citation'
    """
    papers = []
    d_type = 'citation'
    for p in json_data['citations']:
        year = _paper_year(p)
        addresses = p.get('addresses', '')
        if addresses:
            for a in addresses:
                papers.append(Paper(
                    dataset_key, dataset_name, p['id'], p['title'], d_type,
                    year, p['pdf'],
                    a['address'], a['type'], a['lat'], a['lng']))
        else:
            # This branch used to reference an undefined name `d` (NameError)
            # and to read 'key'/'name' from the citation while tagging it
            # 'main'. A citation without addresses is still a citation of
            # the same dataset.
            papers.append(Paper(
                dataset_key, dataset_name, p['id'], p['title'], d_type,
                year, p['pdf']))
    return papers


def get_orig_paper(json_data):
    """Build Paper rows for the dataset's own paper, ``json_data['paper']``.

    Yields one row per entry in the paper's 'address' value, or a single row
    without address fields when that value is missing or empty.

    :param json_data: parsed JSON holding a 'paper' object that provides
      'key', 'name', 'paper_id', 'title' and 'pdf'
    :returns: list of Paper objects of type 'main'
    """
    p = json_data['paper']
    d_type = 'main'
    year = _paper_year(p)
    # NOTE(review): the key here is 'address' (singular) while citations use
    # 'addresses'; it is iterated as a list of address dicts - confirm this
    # against the JSON schema.
    addresses = p.get('address', '')
    if not addresses:
        return [Paper(p['key'], p['name'], p['paper_id'], p['title'], d_type,
                      year, p['pdf'])]
    return [
        Paper(p['key'], p['name'], p['paper_id'], p['title'], d_type, year,
              p['pdf'],
              a['address'], a['type'], a['lat'], a['lng'])
        for a in addresses
    ]