Diffstat (limited to 'scraper/s2-citation-report.py')
-rw-r--r--   scraper/s2-citation-report.py   282
1 file changed, 282 insertions(+), 0 deletions(-)
diff --git a/scraper/s2-citation-report.py b/scraper/s2-citation-report.py
new file mode 100644
index 00000000..0d1712b6
--- /dev/null
+++ b/scraper/s2-citation-report.py
@@ -0,0 +1,282 @@
+import os
+import glob
+import json
+import math
+import operator
+import click
+from util import (AddressBook, LinkLine, load_paper, percent, read_csv,
+                  read_headings, read_json, write_report, write_table)
+
+@click.command()
+def s2_citation_report():
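+    """Build HTML geocoding reports for every cached Semantic Scholar paper."""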
+    addresses = AddressBook()
+    megapixels = load_megapixels_queries()
+    successful_geocodes = {}
+    papers = []
+    for fn in glob.iglob('datasets/s2/papers/**/*.json', recursive=True):
+        paper_data = process_paper(fn, addresses, megapixels, successful_geocodes)
+        # process_paper() returns None when the cached paper record is
+        # missing; skip those so the report sort below doesn't hit None.
+        if paper_data is not None:
+            papers.append(paper_data)
+    write_papers_report('reports/report_index.html', 'All Papers', papers, 'title')
+    write_papers_report('reports/report_coverage.html', 'Coverage', papers, 'citations_geocoded', reverse=True)
+
+    citation_count = len(successful_geocodes)
+    geocoded_count = sum(1 for geocoded in successful_geocodes.values() if geocoded)
+    print("citations: {}".format(citation_count))
+    print("geocoded: {} ({}%)".format(geocoded_count, percent(geocoded_count, citation_count)))
+
+def write_papers_report(fn, title, papers, key, reverse=False):
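+    """Write a summary table of all papers to `fn`, sorted by column `key`."""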
+    sorted_papers = []
+    for paper in sorted(papers, key=lambda x: x[key], reverse=reverse):
+        sorted_papers.append([
+            paper['paperId'],
+            paper['key'],
+            LinkLine(paper['report_link'], paper['title']),
+            LinkLine(paper['pdf_link'], '[pdf]'),
+            paper['journal'],
+            paper['address_type'],
+            paper['address'],
+            paper['lat'],
+            paper['lng'],
+            str(percent(paper['citations_geocoded'], paper['citation_count'])) + '%',
+            paper['citation_count'],
+            paper['citations_geocoded'],
+            paper['citations_unknown'],
+            paper['citations_empty'],
+            paper['citations_pdf'],
+            paper['citations_doi'],
+        ])
+    sorted_paper_keys = [
+        'Paper ID',
+        'Megapixels Key',
+        'Report Link',
+        'PDF Link',
+        'Journal',
+        'Type',
+        'Address',
+        'Lat',
+        'Lng',
+        'Coverage',
+        'Total Citations',
+        'Geocoded Citations',
+        'Unknown Citations',
+        'Empty Citations',
+        'With PDF',
+        'With DOI',
+    ]
+    write_report(fn, title=title, keys=sorted_paper_keys, rows=sorted_papers)
+
+def process_paper(fn, addresses, megapixels, success):
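+    """Geocode one paper and all of its citations.
+
+    Returns a stats dict for the summary tables, or None when the cached
+    paper record is missing. `success` maps citation ids to whether a
+    geocode was found, aggregated across papers.
+    """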
+    res = {
+        'paperId': '',
+        'key': '',
+        'title': '',
+        'journal': '',
+        'address': '',
+        'address_type': '',
+        'lat': '',
+        'lng': '',
+        'pdf_link': '',
+        'report_link': '',
+        'citation_count': 0,
+        'citations_geocoded': 0,
+        'citations_unknown': 0,
+        'citations_empty': 0,
+        'citations_pdf': 0,
+        'citations_doi': 0,
+    }
+
+    geocoded_citations = []
+    unknown_citations = []
+    display_geocoded_citations = []
+    empty_citations = []
+    pdf_count = 0
+    doi_count = 0
+
+    with open(fn, 'r') as f:
+        data = json.load(f)
+    print('>> {}'.format(data['paperId']))
+    paper = load_paper(data['paperId'])
+    if paper.data is None:
+        print("Paper missing! {}".format(data['paperId']))
+        return None
+
+    res['paperId'] = paper.paper_id
+    res['title'] = paper.title
+    res['journal'] = paper.journal
+    res['report_link'] = 'papers/{}.html'.format(paper.paper_id)
+    res['pdf_link'] = paper.pdf_link
+
+    if res['title'] in megapixels:
+        res['key'] = megapixels[res['title']]['Database Name']
+
+    # Use the first institution (sorted alphabetically) that resolves to a
+    # known address. AddressBook entries are used positionally below:
+    # [0] name, [2] street address, [3] lat, [4] lng, [5] address type.
+    paper_institutions = load_institutions(paper.paper_id)
+    paper_address = None
+    for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
+        paper_address = addresses.find(inst[1])
+        if paper_address:
+            break
+
+    if paper_address:
+        print(paper_address)
+        res['address'] = paper_address[0]
+        res['lat'] = paper_address[3]
+        res['lng'] = paper_address[4]
+        res['address_type'] = paper_address[5]
+
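+    # Bucket each citation as geocoded, unknown (no address matched), or
+    # empty (its PDF text yielded no usable headings).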
+    for cite in data['citations']:
+        citationId = cite['paperId']
+        citation = load_paper(citationId)
+        has_pdf = os.path.exists(file_path('pdf', citationId, 'paper.txt'))
+        has_doi = os.path.exists(file_path('doi', citationId, 'paper.doi'))
+        if has_pdf:
+            pdf_count += 1
+        if has_doi:
+            doi_count += 1
+        if citation.data is None:
+            print("Citation missing! {}".format(citationId))
+            continue
+
+        institutions = load_institutions(citationId)
+        geocoded_institutions = []
+        unknown_institutions = []
+        institution = ''
+        address = None
+        for inst in sorted(institutions, key=operator.itemgetter(1)):
+            name = inst[1]
+            next_address = addresses.find(name)
+            if next_address:
+                address = next_address
+                institution = name
+                geocoded_institutions.append(name)
+            else:
+                unknown_institutions.append(name)
+
+        # Fall back to scanning the headings of the citation's PDF text when
+        # none of the parsed institutions matched a known address.
+        if not address and has_pdf:
+            headings, found_abstract = read_headings(file_path('pdf', citationId, 'paper.txt'), citation)
+            if len(headings):
+                for heading in headings:
+                    line = heading.lower().strip()
+                    if line:
+                        next_address = addresses.find(line)
+                        if next_address:
+                            address = next_address
+                            institution = heading
+                            geocoded_institutions.append(heading)
+                        else:
+                            unknown_institutions.append(heading)
+            else:
+                empty_citations.append([
+                    citationId,
+                    citation.title,
+                ])
+
+        if address:
+            success[citationId] = True
+            geocoded_citations.append([
+                citation.title,
+                institution,
+            ] + address)
+            display_geocoded_citations.append([
+                citationId,
+                LinkLine(citation.pdf_link, '[pdf]'),
+                citation.title,
+            ] + address[0:5])
+        else:
+            success[citationId] = False
+            unknown_citations.append([
+                citationId,
+                LinkLine(citation.pdf_link, '[pdf]'),
+                citation.title,
+                '<br>'.join(unknown_institutions),
+            ])
+
+    res['citation_count'] = len(data['citations'])
+    res['citations_geocoded'] = len(geocoded_citations)
+    res['citations_unknown'] = len(unknown_citations)
+    res['citations_empty'] = len(empty_citations)
+    res['citations_pdf'] = pdf_count
+    res['citations_doi'] = doi_count
+
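+    # Render the per-paper report: a Leaflet map plus tables of geocoded and
+    # unresolved citations, with map data embedded as JSON for ../map.js.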
+    total_citations = len(geocoded_citations) + len(unknown_citations)
+    coverage = math.floor(len(geocoded_citations) / total_citations * 100) if total_citations else 0
+    os.makedirs('reports/papers/', exist_ok=True)
+    with open('reports/papers/{}.html'.format(paper.paper_id), 'w') as f:
+        f.write("<!doctype html>")
+        f.write("<html>")
+        f.write("<head>")
+        f.write('<meta charset="utf-8">')
+        f.write("<title>{}</title>".format(paper.title))
+        f.write("<link rel='stylesheet' href='../reports.css'>")
+        f.write('<link rel="stylesheet" href="https://unpkg.com/leaflet@1.3.4/dist/leaflet.css" integrity="sha512-puBpdR0798OZvTTbP4A8Ix/l+A4dHDD0DGqYW6RQ+9jxkRFclaxxQb/SJAWZfWAkuyeQUytO7+7N4QKrDh+drA==" crossorigin=""/>')
+        f.write("</head>")
+        f.write("<body>")
+        f.write("<div id='mapid'></div>")
+        f.write("<h2>{}</h2>".format(paper.title))
+        f.write('<ul>')
+        if paper.journal:
+            f.write('<li>Journal: {}</li>'.format(paper.journal))
+        if paper_address:
+            f.write('<li>Research institution: {}</li>'.format(paper_address[0]))
+            f.write('<li>Address: {}</li>'.format(paper_address[2]))
+            f.write('<li>Lat/Lng: {}, {}</li>'.format(paper_address[3], paper_address[4]))
+        f.write('<li>Year: {}</li>'.format(paper.year))
+        f.write('<li>Coverage: {} / {} citations were located ({}%).</li>'.format(len(geocoded_citations), total_citations, coverage))
+        f.write('</ul>')
+        f.write('<h3>Geocoded Citations</h3>')
+        write_table(f, keys=None, rows=sorted(display_geocoded_citations, key=operator.itemgetter(0)))
+        f.write('<h3>Other Citations</h3>')
+        write_table(f, keys=None, rows=sorted(unknown_citations, key=operator.itemgetter(0)))
+        f.write('<script src="../snap.svg-min.js"></script>')
+        f.write('<script src="https://unpkg.com/leaflet@1.3.4/dist/leaflet.js" integrity="sha512-nMMmRyTVoLYqjP9hrbed9S+FzjZHW5gY1TWCHA5ckwXZBadntCNs8kEqAWdrb9O7rxbCaA4lKTIWjDXZxflOcA==" crossorigin=""></script>')
+        f.write('<script src="../leaflet.arc.js"></script>')
+        f.write('<script src="../leaflet.bezier.js"></script>')
+        f.write('<script type="text/json" id="address">')
+        json.dump(paper_address, f)
+        f.write('</script>')
+        f.write('<script type="text/json" id="citations">')
+        json.dump(geocoded_citations, f)
+        f.write('</script>')
+        f.write('<script src="../map.js"></script>')
+        f.write("</body>")
+        f.write("</html>")
+    return res
+
+def load_megapixels_queries():
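+    """Return a lookup from paper title to its row in the citations CSV."""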
+    keys, rows = read_csv('datasets/citations-2018310.csv')
+    lookup = {}
+    for row in rows:
+        rec = dict(zip(keys, row))
+        lookup[rec['Title'].strip()] = rec
+    return lookup
+
+def load_institutions(paperId):
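+    """Load the institutions parsed for a paper, preferring the PDF-derived
+    list over the DOI-derived one. Returns an empty list if neither exists."""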
+    if os.path.exists(file_path('pdf', paperId, 'institutions.json')):
+        return read_json(file_path('pdf', paperId, 'institutions.json'))['institutions']
+    elif os.path.exists(file_path('doi', paperId, 'institutions.json')):
+        return read_json(file_path('doi', paperId, 'institutions.json'))['institutions']
+    else:
+        return []
+
+def data_path(key, paper_id):
+    # Paper records are sharded by the first two characters of the paper id.
+    return 'datasets/s2/{}/{}/{}'.format(key, paper_id[0:2], paper_id)
+
+def file_path(key, paper_id, fn):
+    return os.path.join(data_path(key, paper_id), fn)
+
+if __name__ == '__main__':
+    s2_citation_report()