summaryrefslogtreecommitdiff
path: root/s2-citation-report.py
diff options
context:
space:
mode:
authorJules Laplace <julescarbon@gmail.com>2018-11-25 22:19:15 +0100
committerJules Laplace <julescarbon@gmail.com>2018-11-25 22:19:15 +0100
commitee3d0d98e19f1d8177d85af1866fd0ee431fe9ea (patch)
tree41372528e78d4328bc2a47bbbabac7e809c58894 /s2-citation-report.py
parent255b8178af1e25a71fd23703d30c0d1f74911f47 (diff)
moving stuff
Diffstat (limited to 's2-citation-report.py')
-rw-r--r--s2-citation-report.py282
1 file changed, 0 insertions, 282 deletions
diff --git a/s2-citation-report.py b/s2-citation-report.py
deleted file mode 100644
index 0d1712b6..00000000
--- a/s2-citation-report.py
+++ /dev/null
@@ -1,282 +0,0 @@
-import os
-import re
-import glob
-import json
-import math
-import operator
-import click
-from util import *
-
@click.command()
def s2_citation_report():
    """Build HTML citation-coverage reports for every cached S2 paper.

    Walks datasets/s2/papers/**/*.json, geocodes each paper's citations,
    writes one report page per paper plus two index pages, then prints an
    overall geocoding-success summary.
    """
    addresses = AddressBook()
    megapixels = load_megapixels_queries()
    successful_geocodes = {}  # citationId -> bool, filled by process_paper
    papers = []
    for fn in glob.iglob('datasets/s2/papers/**/*.json', recursive=True):
        paper_data = process_paper(fn, addresses, megapixels, successful_geocodes)
        # process_paper returns None when the paper record is missing from
        # the local cache; appending None would crash the sorted() call in
        # write_papers_report, so skip those.
        if paper_data is not None:
            papers.append(paper_data)
    write_papers_report('reports/report_index.html', 'All Papers', papers, 'title')
    write_papers_report('reports/report_coverage.html', 'Coverage', papers, 'citations_geocoded', reverse=True)

    # Summarize geocoding success across all citations seen.
    paper_count = 0
    geocode_count = 0
    for key, value in successful_geocodes.items():
        if value:
            geocode_count += 1
        paper_count += 1
    print("citations: {}".format(paper_count))
    print("geocoded: {} ({}%)".format(geocode_count, percent(geocode_count, paper_count)))
-
def write_papers_report(fn, title, papers, key, reverse=False):
    """Render a summary table of paper stats to the HTML report *fn*.

    Papers are ordered by the stats field *key* (descending when
    *reverse* is true) and passed to write_report with a fixed column
    layout matching the dicts produced by process_paper.
    """
    columns = [
        'Paper ID',
        'Megapixels Key',
        'Report Link',
        'PDF Link',
        'Journal',
        'Type',
        'Address',
        'Lat',
        'Lng',
        'Coverage',
        'Total Citations',
        'Geocoded Citations',
        'Unknown Citations',
        'Empty Citations',
        'With PDF',
        'With DOI',
    ]
    ordered = sorted(papers, key=lambda p: p[key], reverse=reverse)
    rows = [
        [
            p['paperId'],
            p['key'],
            LinkLine(p['report_link'], p['title']),
            LinkLine(p['pdf_link'], '[pdf]'),
            p['journal'],
            p['address_type'],
            p['address'],
            p['lat'],
            p['lng'],
            str(percent(p['citations_geocoded'], p['citation_count'])) + '%',
            p['citation_count'],
            p['citations_geocoded'],
            p['citations_unknown'],
            p['citations_empty'],
            p['citations_pdf'],
            p['citations_doi'],
        ]
        for p in ordered
    ]
    write_report(fn, title=title, keys=columns, rows=rows)
-
def process_paper(fn, addresses, megapixels, success):
    """Geocode one paper's citations and write its per-paper HTML report.

    fn         -- path to a cached S2 paper JSON file
    addresses  -- AddressBook used to resolve institution names to addresses
    megapixels -- lookup of paper title -> citations-CSV record
    success    -- shared dict, mutated in place: citationId -> True/False
                  depending on whether that citation could be geocoded

    Returns a flat stats dict consumed by write_papers_report, or None
    when the paper record itself is missing from the local cache.
    """
    res = {
        'paperId': '',
        'key': '',
        'title': '',
        'journal': '',
        'address': '',
        'address_type': '',
        'lat': '',
        'lng': '',
        'pdf_link': '',
        'report_link': '',
        'citation_count': 0,
        'citations_geocoded': 0,
        'citations_unknown': 0,
        'citations_empty': 0,
        'citations_pdf': 0,
        'citations_doi': 0,
    }

    geocoded_citations = []          # [title, institution] + address tuple rows
    unknown_citations = []           # citations with no geocodable institution
    display_geocoded_citations = []  # HTML-table rows for the report page
    empty_citations = []             # citations whose PDF text had no headings
    pdf_count = 0
    doi_count = 0

    with open(fn, 'r') as f:
        data = json.load(f)
    print('>> {}'.format(data['paperId']))
    paper = load_paper(data['paperId'])
    if paper.data is None:
        print("Paper missing! {}".format(data['paperId']))
        return

    res['paperId'] = paper.paper_id
    res['title'] = paper.title
    res['journal'] = paper.journal
    res['report_link'] = 'papers/{}.html'.format(paper.paper_id)
    res['pdf_link'] = paper.pdf_link

    if res['title'] in megapixels:
        res['key'] = megapixels[res['title']]['Database Name']

    # Geocode the paper itself: the first institution (sorted by name)
    # that resolves in the address book wins.
    paper_institutions = load_institutions(paper.paper_id)
    paper_address = None
    for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
        institution = inst[1]
        if paper_address is None:
            paper_address = addresses.find(institution)

    if paper_address:
        print(paper_address)
        res['address'] = paper_address[0]
        res['lat'] = paper_address[3]
        res['lng'] = paper_address[4]
        res['address_type'] = paper_address[5]

    for cite in data['citations']:
        citationId = cite['paperId']
        citation = load_paper(citationId)
        has_pdf = os.path.exists(file_path('pdf', citationId, 'paper.txt'))
        has_doi = os.path.exists(file_path('doi', citationId, 'paper.doi'))
        if has_pdf:
            pdf_count += 1
        if has_doi:
            doi_count += 1
        if citation.data is None:
            print("Citation missing! {}".format(cite['paperId']))
            continue
        # First pass: try the citation's extracted institution list.
        institutions = load_institutions(citationId)
        geocoded_institutions = []
        unknown_institutions = []
        institution = ''
        address = None
        for inst in sorted(institutions, key=operator.itemgetter(1)):
            institution = inst[1]
            next_address = addresses.find(institution)
            if next_address:
                address = next_address
                geocoded_institutions.append(institution)
            else:
                unknown_institutions.append(institution)
        # NOTE(review): `institution` ends up holding the LAST institution
        # examined, which may not be the one whose address was kept —
        # preserved as-is to keep report output unchanged; verify intent.
        if not address:
            # Second pass: scan the heading lines of the citation's PDF text.
            if has_pdf:
                headings, found_abstract = read_headings(file_path('pdf', citationId, 'paper.txt'), citation)
                if len(headings):
                    for heading in headings:
                        l = heading.lower().strip()
                        if l:
                            next_address = addresses.find(l)
                            if next_address:
                                address = next_address
                                geocoded_institutions.append(heading)
                            else:
                                unknown_institutions.append(heading)
                else:
                    empty_citations.append([
                        citationId,
                        citation.title,
                    ])

        if address:
            success[citationId] = True
            geocoded_citations.append([
                citation.title,
                institution,
            ] + address)
            display_geocoded_citations.append([
                citationId,
                LinkLine(citation.pdf_link, '[pdf]'),
                citation.title,
            ] + address[0:5])
        else:
            success[citationId] = False
            unknown_citations.append([
                citationId,
                LinkLine(citation.pdf_link, '[pdf]'),
                citation.title,
                '<br>'.join(unknown_institutions),
            ])

    res['citation_count'] = len(data['citations'])
    res['citations_geocoded'] = len(geocoded_citations)
    res['citations_unknown'] = len(unknown_citations)
    res['citations_empty'] = len(empty_citations)
    res['citations_pdf'] = pdf_count
    res['citations_doi'] = doi_count

    total_citations = len(geocoded_citations) + len(unknown_citations)
    # Guard against papers with no resolvable citations: the original
    # divided by total_citations unconditionally (ZeroDivisionError).
    if total_citations:
        coverage = math.floor(len(geocoded_citations) / total_citations * 100)
    else:
        coverage = 0
    os.makedirs('reports/papers/', exist_ok=True)
    with open('reports/papers/{}.html'.format(paper.paper_id), 'w') as f:
        f.write("<!doctype html>")
        f.write("<html>")
        f.write("<head>")
        f.write('<meta charset="utf-8">')
        f.write("<title>{}</title>".format(paper.title))
        f.write("<link rel='stylesheet' href='../reports.css'>")
        f.write('<link rel="stylesheet" href="https://unpkg.com/leaflet@1.3.4/dist/leaflet.css" integrity="sha512-puBpdR0798OZvTTbP4A8Ix/l+A4dHDD0DGqYW6RQ+9jxkRFclaxxQb/SJAWZfWAkuyeQUytO7+7N4QKrDh+drA==" crossorigin=""/>')
        f.write("</head>")
        f.write("<body>")
        f.write("<div id='mapid'></div>")
        f.write("<h2>{}</h2>".format(paper.title))
        f.write('<ul>')
        if paper.journal:
            f.write('<li>Journal: {}</li>'.format(paper.journal))
        if paper_address:
            f.write('<li>Research institution: {}</li>'.format(paper_address[0]))
            f.write('<li>Address: {}</li>'.format(paper_address[2]))
            f.write('<li>Lat/Lng: {}, {}</li>'.format(paper_address[3], paper_address[4]))
        f.write('<li>Year: {}</li>'.format(paper.year))
        f.write('<li>Coverage: {} / {} citations were located ({} %).</li>'.format(len(geocoded_citations), total_citations, coverage))
        f.write('</ul>')
        f.write('<h3>{}</h3>'.format('Geocoded Citations'))
        write_table(f, keys=None, rows=sorted(display_geocoded_citations, key=operator.itemgetter(0)))
        f.write('<h3>{}</h3>'.format('Other Citations'))
        write_table(f, keys=None, rows=sorted(unknown_citations, key=operator.itemgetter(0)))
        f.write("</body>")
        f.write('<script src="../snap.svg-min.js"></script>')
        f.write('<script src="https://unpkg.com/leaflet@1.3.4/dist/leaflet.js" integrity="sha512-nMMmRyTVoLYqjP9hrbed9S+FzjZHW5gY1TWCHA5ckwXZBadntCNs8kEqAWdrb9O7rxbCaA4lKTIWjDXZxflOcA==" crossorigin=""></script>')
        f.write('<script src="../leaflet.arc.js"></script>')
        f.write('<script src="../leaflet.bezier.js"></script>')
        f.write('<script type="text/json" id="address">')
        json.dump(paper_address, f)
        f.write('</script>')
        f.write('<script type="text/json" id="citations">')
        json.dump(geocoded_citations, f)
        f.write('</script>')
        f.write('<script src="../map.js"></script>')
        f.write("</html>")
    return res
-
def load_megapixels_queries():
    """Map each paper title (stripped) to its row from the citations CSV.

    Rows are returned as dicts keyed by the CSV header names; later rows
    with a duplicate Title overwrite earlier ones.
    """
    keys, rows = read_csv('datasets/citations-2018310.csv')
    records = ({key: row[i] for i, key in enumerate(keys)} for row in rows)
    return {rec['Title'].strip(): rec for rec in records}
-
def load_institutions(paperId):
    """Return the cached institution list for a paper, or [] if none exists.

    The pdf-derived extraction is preferred over the doi-derived one.
    """
    for source in ('pdf', 'doi'):
        path = file_path(source, paperId, 'institutions.json')
        if os.path.exists(path):
            return read_json(path)['institutions']
    return []
-
def data_path(key, paper_id):
    """Return 'datasets/s2/<key>/<shard>/<paper_id>' where shard is the
    first two characters of the paper id."""
    shard = paper_id[0:2]
    return '/'.join(['datasets/s2', key, shard, paper_id])
def file_path(key, paper_id, fn):
    """Return the path of file *fn* inside the paper's data directory."""
    directory = data_path(key, paper_id)
    return os.path.join(directory, fn)
-
# CLI entry point: click parses argv and invokes the report builder.
if __name__ == '__main__':
    s2_citation_report()