diff options
Diffstat (limited to 's2-citation-report.py')
| -rw-r--r-- | s2-citation-report.py | 136 |
1 files changed, 136 insertions, 0 deletions
"""Build one HTML citation report per Semantic Scholar paper.

For every paper JSON file under ``datasets/s2/papers/`` the script resolves
the institutions of each citing paper against the address table in
``reports/all_institutions.csv`` and writes ``reports/papers/<paperId>.html``.
"""
import os
import gzip
import glob
import html
import json
import math
import operator
import click
from util import *


@click.command()
def s2_citation_report():
    """Generate a citation report for every paper JSON file in the dataset."""
    addresses = load_addresses()
    for fn in glob.iglob('datasets/s2/papers/**/*.json', recursive=True):
        process_paper(fn, addresses)


def _esc(value):
    """Return ``value`` rendered as text and escaped for inclusion in HTML."""
    return html.escape(str(value))


def _lookup_address(institution, addresses):
    """Return the address row for ``institution``, or ``None`` if unknown.

    The full institution string is tried first. Failing that, each
    ``', '``-separated part is tried; when several parts match, the last
    matching part wins (the precedence the script has always used).
    """
    if institution in addresses:
        return addresses[institution]
    match = None
    for part in institution.split(', '):
        if part in addresses:
            match = addresses[part]
    return match


def _write_report(paper, geocoded_citations, display_geocoded_citations,
                  unknown_citations):
    """Write the HTML report for ``paper`` to ``reports/papers/<id>.html``.

    ``geocoded_citations`` is embedded as JSON for ``map.js``; the two other
    lists are rendered as tables through ``write_table``.
    """
    total_citations = len(geocoded_citations) + len(unknown_citations)
    # A paper whose citations are all missing has nothing to divide by.
    if total_citations:
        percent = math.floor(len(geocoded_citations) / total_citations * 100)
    else:
        percent = 0

    os.makedirs('reports/papers/', exist_ok=True)
    report_path = 'reports/papers/{}.html'.format(paper.paper_id)
    # The page declares charset utf-8, so the file must really be UTF-8
    # regardless of the platform's default encoding.
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("<!doctype html>")
        f.write("<html>")
        f.write("<head>")
        f.write('<meta charset="utf-8">')
        f.write("<title>{}</title>".format(_esc(paper.title)))
        f.write("<link rel='stylesheet' href='../reports.css'>")
        f.write('<link rel="stylesheet" href="https://unpkg.com/leaflet@1.3.4/dist/leaflet.css" integrity="sha512-puBpdR0798OZvTTbP4A8Ix/l+A4dHDD0DGqYW6RQ+9jxkRFclaxxQb/SJAWZfWAkuyeQUytO7+7N4QKrDh+drA==" crossorigin=""/>')
        f.write("</head>")
        f.write("<body>")
        f.write("<div id='mapid'></div>")
        f.write("<h2>{}</h2>".format(_esc(paper.title)))
        f.write('<ul>')
        if paper.journal:
            f.write('<li>{}</li>'.format(_esc(paper.journal)))
        f.write('<li>{}</li>'.format(_esc(paper.year)))
        f.write('<li>{} / {} citations ({} %)</li>'.format(
            len(geocoded_citations), total_citations, percent))
        f.write('</ul>')
        # NOTE(review): rows are handed to write_table unescaped because it is
        # not visible here whether write_table escapes cells itself - confirm.
        f.write('<h3>{}</h3>'.format('Geocoded Citations'))
        write_table(f, keys=None,
                    rows=sorted(display_geocoded_citations,
                                key=operator.itemgetter(0)))
        f.write('<h3>{}</h3>'.format('Other Citations'))
        write_table(f, keys=None,
                    rows=sorted(unknown_citations, key=operator.itemgetter(0)))
        f.write("</body>")
        f.write('<script src="https://unpkg.com/leaflet@1.3.4/dist/leaflet.js" integrity="sha512-nMMmRyTVoLYqjP9hrbed9S+FzjZHW5gY1TWCHA5ckwXZBadntCNs8kEqAWdrb9O7rxbCaA4lKTIWjDXZxflOcA==" crossorigin=""></script>')
        f.write('<script type="text/json">')
        # "<\/" is still valid JSON but keeps a "</script>" inside a title
        # from terminating the element early.
        f.write(json.dumps(geocoded_citations).replace('</', '<\\/'))
        f.write('</script>')
        f.write('<script src="../map.js"></script>')
        f.write("</html>")


def process_paper(fn, addresses):
    """Build the citation report for the paper stored in JSON file ``fn``.

    Args:
        fn: path to a JSON file providing ``paperId`` and ``citations``.
        addresses: mapping from institution name to its address row, as
            returned by ``load_addresses``.

    Returns:
        A dict with the paper's id, title, journal, authors and one entry per
        resolvable citation, or ``None`` when the paper itself cannot be
        loaded. As a side effect the HTML report is written to disk.
    """
    with open(fn, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print('>> {}'.format(data['paperId']))
    paper = load_paper(data['paperId'])
    if paper.data is None:
        print("Paper missing! {}".format(data['paperId']))
        return

    res = {
        'paperId': paper.paper_id,
        'title': paper.title,
        'journal': paper.journal,
        'authors': paper.authors,
        'citations': [],
    }
    geocoded_citations = []
    unknown_citations = []
    display_geocoded_citations = []

    for cite in data['citations']:
        citation_id = cite['paperId']
        citation = load_paper(citation_id)
        if citation.data is None:
            print("Citation missing! {}".format(cite['paperId']))
            continue

        institutions = load_institutions(citation_id)
        geocoded_institutions = []
        last_institution = ''
        matched_institution = ''
        address = None
        for inst in sorted(institutions, key=operator.itemgetter(1)):
            last_institution = inst[1]
            found = _lookup_address(last_institution, addresses)
            if found is not None:
                # Keep the name and the address together so the report never
                # pairs one institution with another institution's address.
                address = found
                matched_institution = last_institution
                geocoded_institutions.append(last_institution)

        res['citations'].append({
            'title': citation.title,
            'journal': citation.journal,
            'authors': citation.authors,
            'institutions': [inst[1] for inst in institutions],
            'geocoded': geocoded_institutions,
        })

        if geocoded_institutions:
            geocoded_citations.append([
                citation.title,
                matched_institution,
                address,
            ])
            display_geocoded_citations.append([
                citation.title,
                matched_institution,
                # presumably every cell of the CSV row is a string - verify
                # against read_csv
                ', '.join(address),
            ])
        else:
            unknown_citations.append([
                citation.title,
                last_institution,
            ])

    _write_report(paper, geocoded_citations, display_geocoded_citations,
                  unknown_citations)
    return res


def load_addresses():
    """Return a dict mapping the first column of each CSV row to the row.

    Rows come from ``reports/all_institutions.csv``; when a name occurs more
    than once the later row replaces the earlier one.
    """
    data = read_csv('reports/all_institutions.csv', keys=None)
    return {row[0]: row for row in data}


def load_institutions(paperId):
    """Return the ``institutions`` list stored for ``paperId``.

    The ``pdf`` data directory is preferred over the ``doi`` one; an empty
    list is returned when neither holds an ``institutions.json``.
    """
    for key in ('pdf', 'doi'):
        path = os.path.join(data_path(key, paperId), 'institutions.json')
        if os.path.exists(path):
            return read_json(path)['institutions']
    return []


def data_path(key, paper_id):
    """Return the dataset directory for ``paper_id`` under section ``key``.

    Directories are sharded by the first two characters of the paper id.
    """
    return 'datasets/s2/{}/{}/{}'.format(key, paper_id[0:2], paper_id)


if __name__ == '__main__':
    s2_citation_report()
