diff options
| author | adamhrv <adam@ahprojects.com> | 2019-02-10 20:39:03 +0100 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2019-02-10 20:39:03 +0100 |
| commit | 14727041f2b54dea9a37ff6e2dfef161b6243556 (patch) | |
| tree | 62ba1aef6eb80900e67c5cc344300eefbf720ced /megapixels/commands/datasets/citations_to_csv.py | |
| parent | d213702d4baf7a8c776ef71383346c0d6402106a (diff) | |
add csv converter for citations
Diffstat (limited to 'megapixels/commands/datasets/citations_to_csv.py')
| -rw-r--r-- | megapixels/commands/datasets/citations_to_csv.py | 92 |
1 files changed, 92 insertions, 0 deletions
"""Convert a dataset citations JSON file into a CSV of papers.

Contents of megapixels/commands/datasets/citations_to_csv.py, reconstructed
from the diff that adds the file. The JSON is expected to hold a ``paper``
object (the dataset's original paper) and a ``citations`` list; one CSV row
is written per paper/address combination.
"""
import click

from app.utils import click_utils
from app.utils.logger_utils import Logger
from app.models.citations import Paper

log = Logger.getLogger()


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
              help='Input citations JSON file')
@click.option('-o', '--output', 'opt_fp_out',
              help='Output CSV file (default: input path with a .csv suffix)')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out):
    """Convert JSON to CSV"""
    # NOTE: click prints the docstring above as the command's help text, so
    # it is kept short; details live in these comments instead.
    #
    # ctx        -- click context (unused, required by @click.pass_context)
    # opt_fp_in  -- path of the citations JSON file to read
    # opt_fp_out -- path of the CSV to write; when omitted it is derived from
    #               opt_fp_in by swapping the final suffix for ".csv"
    #
    # Raises KeyError when the JSON lacks the "paper"/"citations" structure
    # that the helpers below index into.

    # Heavy imports stay function-local so that importing this command module
    # (e.g. for CLI discovery) remains cheap.
    import json
    from pathlib import Path

    import pandas as pd

    log = Logger.getLogger()
    log.info('Convert JSON to CSV')

    # load
    with open(opt_fp_in, 'r') as fp:
        json_data = json.load(fp)

    # parse: the dataset's own paper first, then every citing paper
    dataset_key = json_data['paper']['key']
    dataset_name = json_data['paper']['name']
    papers = get_orig_paper(json_data)
    papers += get_citations(dataset_key, dataset_name, json_data)
    rows = [p.to_dict() for p in papers]

    # save
    # Previously fp_out was only bound when -o was omitted, so passing -o
    # raised NameError. The default also used str.replace, which returned the
    # input path unchanged (and so overwrote the input) when it did not
    # contain ".json"; with_suffix always yields a distinct ".csv" path.
    fp_out = opt_fp_out or str(Path(opt_fp_in).with_suffix('.csv'))
    log.info(fp_out)

    df_papers = pd.DataFrame.from_dict(rows)
    df_papers.index.name = 'index'
    df_papers.to_csv(fp_out)


# ----------------------------------------------------------------------------
# Helpers
# ----------------------------------------------------------------------------
def _get_year(entry):
    """Return ``entry['year']``, mapping a missing or empty-string year to 0."""
    year = entry.get('year', 0)
    return 0 if year == '' else year


def _make_papers(key, name, paper_id, title, d_type, year, pdf, addresses):
    """Build one ``Paper`` per address, or one address-less ``Paper``.

    ``addresses`` may be a list of address dicts (each providing ``address``,
    ``type``, ``lat`` and ``lng``), a single such dict, or any falsy value
    when no address is known.
    """
    if isinstance(addresses, dict):
        # A lone address object: wrap it so it is not iterated key by key.
        addresses = [addresses]
    if not addresses:
        return [Paper(key, name, paper_id, title, d_type, year, pdf)]
    return [
        Paper(key, name, paper_id, title, d_type, year, pdf,
              a['address'], a['type'], a['lat'], a['lng'])
        for a in addresses
    ]


def get_citations(dataset_key, dataset_name, json_data):
    """Return ``Paper`` rows for every entry in ``json_data['citations']``.

    Each citation is tagged with the dataset's key and name and the type
    ``'citation'``. A citation with several addresses yields one ``Paper``
    per address; a citation without addresses yields a single ``Paper``.

    Raises KeyError if ``citations`` or a citation's ``id``, ``title`` or
    ``pdf`` field is missing.
    """
    papers = []
    d_type = 'citation'
    for p in json_data['citations']:
        # The address-less case used to reference an undefined name ``d`` and
        # per-citation ``key``/``name`` fields; it now uses the same dataset
        # key, name and type as the addressed case.
        papers.extend(_make_papers(
            dataset_key, dataset_name, p['id'], p['title'], d_type,
            _get_year(p), p['pdf'], p.get('addresses', '')))
    return papers


def get_orig_paper(json_data):
    """Return ``Paper`` rows (type ``'main'``) for ``json_data['paper']``.

    Yields one ``Paper`` per address, or a single address-less ``Paper``.

    Raises KeyError if ``paper`` or its ``key``, ``name``, ``paper_id``,
    ``title`` or ``pdf`` field is missing.
    """
    p = json_data['paper']
    # NOTE(review): the main paper is read from the singular "address" key
    # while citations use "addresses" -- confirm against the JSON schema.
    return _make_papers(
        p['key'], p['name'], p['paper_id'], p['title'], 'main',
        _get_year(p), p['pdf'], p.get('address', ''))
\ No newline at end of file |
