summary | refs | log | tree | commit | diff
path: root/megapixels/commands/datasets/citations_to_csv.py
diff options
context:
space:
mode:
author adamhrv <adam@ahprojects.com> 2019-02-10 20:39:03 +0100
committer adamhrv <adam@ahprojects.com> 2019-02-10 20:39:03 +0100
commit 14727041f2b54dea9a37ff6e2dfef161b6243556 (patch)
tree 62ba1aef6eb80900e67c5cc344300eefbf720ced /megapixels/commands/datasets/citations_to_csv.py
parent d213702d4baf7a8c776ef71383346c0d6402106a (diff)
add csv converter for citations
Diffstat (limited to 'megapixels/commands/datasets/citations_to_csv.py')
-rw-r--r-- megapixels/commands/datasets/citations_to_csv.py 92
1 files changed, 92 insertions, 0 deletions
diff --git a/megapixels/commands/datasets/citations_to_csv.py b/megapixels/commands/datasets/citations_to_csv.py
new file mode 100644
index 00000000..431ee4cd
--- /dev/null
+++ b/megapixels/commands/datasets/citations_to_csv.py
@@ -0,0 +1,92 @@
+import click
+
+from app.utils import click_utils
+from app.utils.logger_utils import Logger
+from app.models.citations import Paper
+
+log = Logger.getLogger()
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+ help='Input license data CSV')
+@click.option('-o', '--output', 'opt_fp_out',
+ help='Output directory')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out):
+ """Convert JSON to CSV"""
+
+ import sys
+ from glob import glob
+ from os.path import join
+ from pathlib import Path
+
+ import json
+ import pandas as pd
+ from tqdm import tqdm
+
+ from app.utils import file_utils, im_utils
+
+ log = Logger.getLogger()
+ log.info('Convert JSON to CSV')
+
+ # load
+ with open(opt_fp_in, 'r') as fp:
+ json_data = json.load(fp)
+
+ # parse
+ papers = []
+ dataset_key = json_data['paper']['key']
+ dataset_name = json_data['paper']['name']
+ papers_main = get_orig_paper(json_data)
+ papers += papers_main
+ papers_citations = get_citations(dataset_key, dataset_name, json_data)
+ papers += papers_citations
+ papers = [p.to_dict() for p in papers]
+
+ # save
+ if not opt_fp_out:
+ fp_out = opt_fp_in.replace('.json','.csv')
+ log.info(fp_out)
+
+ df_papers = pd.DataFrame.from_dict(papers)
+ df_papers.index.name = 'index'
+ df_papers.to_csv(fp_out)
+
+
+
+# ----------------------------------------------------------------------------
+# Helpers
+# ----------------------------------------------------------------------------
+def get_citations(dataset_key, dataset_name, json_data):
+ papers = []
+ d_type = 'citation'
+ for p in json_data['citations']:
+ year = 0 if p.get('year', 0) == '' else p.get('year', 0)
+ addresses = p.get('addresses', '')
+ if addresses:
+ for a in addresses:
+ paper = Paper(dataset_key, dataset_name, p['id'], p['title'], d_type,
+ year, p['pdf'],
+ a['address'], a['type'], a['lat'], a['lng'])
+ else:
+ paper = Paper(p['key'], p['name'], d['id'], p['title'], 'main', year, p['pdf'])
+
+ papers.append(paper)
+ return papers
+
+def get_orig_paper(json_data):
+ papers = []
+ d = json_data
+ p = d['paper']
+ d_type = 'main'
+ year = 0 if p.get('year', 0) == '' else p.get('year', 0)
+ addresses = p.get('address','')
+ if addresses:
+ for a in addresses:
+ paper = Paper(p['key'], p['name'], p['paper_id'], p['title'], d_type, year,
+ p['pdf'],
+ a['address'], a['type'], a['lat'], a['lng'])
+ else:
+ paper = Paper(p['key'], p['name'], p['paper_id'], p['title'], d_type, year, p['pdf'])
+ papers.append(paper)
+ return papers \ No newline at end of file