import click
from app.utils import click_utils
from app.utils.logger_utils import Logger
from app.models.citations import Paper
log = Logger.getLogger()
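# Expected input structure (inferred from how this script reads the JSON below;
# these field names are assumptions based on the parsing code, not a formal schema):
#   {
#     "paper": {"key", "name", "paper_id", "title", "year", "pdf",
#               "address": [{"name", "type", "lat", "lng", "country"}, ...]},
#     "citations": [{"id", "title", "year", "pdf",
#                    "addresses": [{"name", "type", "lat", "lng", "country"}, ...]}, ...]
#   }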
@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
help='Input citation data file or folder')
@click.option('-o', '--output', 'opt_dir_out',
help='Output directory')
@click.pass_context
def cli(ctx, opt_fp_in, opt_dir_out):
"""Convert JSON to CSV"""
  from glob import glob
  from os.path import join
  from pathlib import Path
  import json
  import pandas as pd
  from tqdm import tqdm
log = Logger.getLogger()
log.info('Convert JSON to CSV')
# load
if Path(opt_fp_in).is_dir():
fps_in = glob(join(opt_fp_in, '*.json'))
else:
fps_in = [opt_fp_in]
log.info(f'{fps_in}')
  for fp_in in tqdm(fps_in):
with open(fp_in, 'r') as fp:
json_data = json.load(fp)
# parse
papers = []
dataset_key = json_data['paper']['key']
dataset_name = json_data['paper']['name']
papers_main = get_orig_paper(json_data)
papers += papers_main
papers_citations = get_citations(dataset_key, dataset_name, json_data)
papers += papers_citations
    # convert Paper objects to plain dicts for the DataFrame
    records = []
    for p in papers:
      try:
        records.append(p.to_dict())
      except Exception as e:
        log.error(f'Could not serialize paper {p}: {e}')
# save
if not opt_dir_out:
# save to same directory replacing ext
fp_out = fp_in.replace('.json','.csv')
else:
fp_out = join(opt_dir_out, Path(fp_in).name)
    df_papers = pd.DataFrame(records)
df_papers.index.name = 'id'
df_papers.to_csv(fp_out)
log.info(f'Wrote {len(df_papers):,} lines to {fp_out}')
# ----------------------------------------------------------------------------
# Helpers
# ----------------------------------------------------------------------------
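# Note: the Paper constructor signature assumed below is inferred from the calls
# in this file; the actual definition lives in app.models.citations and may differ:
#   Paper(dataset_key, dataset_name, paper_id, title, d_type, year, pdf,
#         addr_name=None, addr_type=None, lat=None, lng=None, country=None)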
def get_citations(dataset_key, dataset_name, json_data):
  """Build one Paper row per citation (one row per geocoded address)"""
  papers = []
  d_type = 'citation'
for p in json_data['citations']:
year = 0 if p.get('year', 0) == '' else p.get('year', 0)
addresses = p.get('addresses', '')
if addresses:
for a in addresses:
paper = Paper(dataset_key, dataset_name, p['id'], p['title'], d_type,
year, p['pdf'],
a['name'], a['type'], a['lat'], a['lng'], a['country'])
papers.append(paper)
else:
      paper = Paper(dataset_key, dataset_name, p['id'], p['title'], d_type, year, p['pdf'])
papers.append(paper)
return papers
def get_orig_paper(json_data):
  """Build Paper rows for the dataset's original (main) paper"""
  papers = []
  p = json_data['paper']
  d_type = 'main'
year = 0 if p.get('year', 0) == '' else p.get('year', 0)
addresses = p.get('address','')
if addresses:
for a in addresses:
if type(a) == str or a is None:
continue
paper = Paper(p['key'], p['name'], p['paper_id'], p['title'], d_type, year,
p['pdf'],
a['name'], a['type'], a['lat'], a['lng'], a['country'])
papers.append(paper)
else:
paper = Paper(p['key'], p['name'], p['paper_id'], p['title'], d_type, year, p['pdf'])
papers.append(paper)
return papers
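
if __name__ == '__main__':
  # Allow running the command directly; in the app it may instead be registered
  # as a subcommand of a parent click group (the exact entry point is an assumption).
  # e.g. python path/to/this_script.py -i data/citations/ -o data/csv/
  cli()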