diff options
Diffstat (limited to 'scraper/s2-citation-report.py')
| -rw-r--r-- | scraper/s2-citation-report.py | 282 |
1 file changed, 282 insertions(+), 0 deletions(-)
import os
import re
import glob
import json
import math
import operator
import click
from util import *


@click.command()
def s2_citation_report():
    """Build HTML reports showing how well each paper's citations were geocoded.

    Walks every cached Semantic Scholar paper JSON under datasets/s2/papers/,
    geocodes each citation's institutions via the shared AddressBook, writes a
    per-paper HTML report plus two index reports, then prints summary totals.
    """
    addresses = AddressBook()
    megapixels = load_megapixels_queries()
    successful_geocodes = {}
    papers = []
    for fn in glob.iglob('datasets/s2/papers/**/*.json', recursive=True):
        paper_data = process_paper(fn, addresses, megapixels, successful_geocodes)
        # FIX: process_paper() returns None when the paper record is missing;
        # appending None used to crash write_papers_report()'s x[key] sort.
        if paper_data is not None:
            papers.append(paper_data)
    write_papers_report('reports/report_index.html', 'All Papers', papers, 'title')
    write_papers_report('reports/report_coverage.html', 'Coverage', papers, 'citations_geocoded', reverse=True)

    # Tally individual citations across all papers (True => geocoded).
    citation_count = 0
    geocode_count = 0
    for _citation_id, geocoded in successful_geocodes.items():
        if geocoded:
            geocode_count += 1
        citation_count += 1
    print("citations: {}".format(citation_count))
    print("geocoded: {} ({}%)".format(geocode_count, percent(geocode_count, citation_count)))


def write_papers_report(fn, title, papers, key, reverse=False):
    """Write one index report (HTML table) of all papers, sorted by `key`.

    fn      -- output path, e.g. 'reports/report_index.html'
    title   -- page title passed through to write_report()
    papers  -- list of dicts produced by process_paper()
    key     -- dict key to sort the rows by
    reverse -- sort descending when True
    """
    rows = []
    for paper in sorted(papers, key=lambda x: x[key], reverse=reverse):
        # NOTE(review): assumes util.percent() tolerates citation_count == 0
        # -- confirm, otherwise a paper with no citations breaks this report.
        rows.append([
            paper['paperId'],
            paper['key'],
            LinkLine(paper['report_link'], paper['title']),
            LinkLine(paper['pdf_link'], '[pdf]'),
            paper['journal'],
            paper['address_type'],
            paper['address'],
            paper['lat'],
            paper['lng'],
            str(percent(paper['citations_geocoded'], paper['citation_count'])) + '%',
            paper['citation_count'],
            paper['citations_geocoded'],
            paper['citations_unknown'],
            paper['citations_empty'],
            paper['citations_pdf'],
            paper['citations_doi'],
        ])
    # Column headers; must stay in lockstep with the row layout above.
    column_keys = [
        'Paper ID',
        'Megapixels Key',
        'Report Link',
        'PDF Link',
        'Journal',
        'Type',
        'Address',
        'Lat',
        'Lng',
        'Coverage',
        'Total Citations',
        'Geocoded Citations',
        'Unknown Citations',
        'Empty Citations',
        'With PDF',
        'With DOI',
    ]
    write_report(fn, title=title, keys=column_keys, rows=rows)


def process_paper(fn, addresses, megapixels, success):
    """Geocode one paper and its citations; write its per-paper HTML report.

    fn         -- path to the cached Semantic Scholar paper JSON
    addresses  -- AddressBook used for institution -> address lookups
    megapixels -- title -> CSV record map from load_megapixels_queries()
    success    -- dict mutated in place: citationId -> True/False (geocoded?)

    Returns a summary dict for write_papers_report(), or None when the paper
    record itself is missing from the local cache.
    """
    res = {
        'paperId': '',
        'key': '',
        'title': '',
        'journal': '',
        'address': '',
        'address_type': '',
        'lat': '',
        'lng': '',
        'pdf_link': '',
        'report_link': '',
        'citation_count': 0,
        'citations_geocoded': 0,
        'citations_unknown': 0,
        'citations_empty': 0,
        'citations_pdf': 0,
        'citations_doi': 0,
    }

    geocoded_citations = []
    unknown_citations = []
    display_geocoded_citations = []
    empty_citations = []
    pdf_count = 0
    doi_count = 0
    address_count = 0

    with open(fn, 'r') as f:
        data = json.load(f)
    print('>> {}'.format(data['paperId']))
    paper = load_paper(data['paperId'])
    if paper.data is None:
        print("Paper missing! {}".format(data['paperId']))
        return None

    res['paperId'] = paper.paper_id
    res['title'] = paper.title
    res['journal'] = paper.journal
    res['report_link'] = 'papers/{}.html'.format(paper.paper_id)
    res['pdf_link'] = paper.pdf_link

    # Map back to the Megapixels dataset name when the title matches a query.
    if res['title'] in megapixels:
        res['key'] = megapixels[res['title']]['Database Name']

    # Resolve the paper's own institution: first address-book hit wins.
    paper_institutions = load_institutions(paper.paper_id)
    paper_address = None
    for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
        institution = inst[1]
        if paper_address is None:
            paper_address = addresses.find(institution)

    if paper_address:
        print(paper_address)
        res['address'] = paper_address[0]
        res['lat'] = paper_address[3]
        res['lng'] = paper_address[4]
        res['address_type'] = paper_address[5]

    for cite in data['citations']:
        citationId = cite['paperId']
        citation = load_paper(citationId)
        has_pdf = os.path.exists(file_path('pdf', citationId, 'paper.txt'))
        has_doi = os.path.exists(file_path('doi', citationId, 'paper.doi'))
        if has_pdf:
            pdf_count += 1
        if has_doi:
            doi_count += 1
        if citation.data is None:
            print("Citation missing! {}".format(cite['paperId']))
            continue

        institutions = load_institutions(citationId)
        geocoded_institutions = []
        unknown_institutions = []
        # NOTE(review): `institution` below keeps the LAST institution seen,
        # which may not be the one whose address was found -- confirm intent.
        institution = ''
        address = None
        for inst in sorted(institutions, key=operator.itemgetter(1)):
            address_count += 1
            institution = inst[1]
            next_address = addresses.find(institution)
            if next_address:
                address = next_address
                geocoded_institutions.append(institution)
            else:
                unknown_institutions.append(institution)

        if not address:
            if has_pdf:
                # Fall back to scanning the extracted PDF headings for an
                # address-book match.
                headings, found_abstract = read_headings(file_path('pdf', citationId, 'paper.txt'), citation)
                if len(headings):
                    for heading in headings:
                        l = heading.lower().strip()
                        if l:
                            next_address = addresses.find(l)
                            if next_address:
                                address = next_address
                                geocoded_institutions.append(heading)
                            else:
                                unknown_institutions.append(heading)
            else:
                # No institutions matched and no PDF text to fall back on.
                empty_citations.append([
                    citationId,
                    citation.title,
                ])

        if address:
            success[citationId] = True
            geocoded_citations.append([
                citation.title,
                institution,
            ] + address)
            display_geocoded_citations.append([
                citationId,
                LinkLine(citation.pdf_link, '[pdf]'),
                citation.title,
            ] + address[0:5])
        else:
            success[citationId] = False
            unknown_citations.append([
                citationId,
                LinkLine(citation.pdf_link, '[pdf]'),
                citation.title,
                '<br>'.join(unknown_institutions),
            ])

    res['citation_count'] = len(data['citations'])
    res['citations_geocoded'] = len(geocoded_citations)
    res['citations_unknown'] = len(unknown_citations)
    res['citations_empty'] = len(empty_citations)
    res['citations_pdf'] = pdf_count
    res['citations_doi'] = doi_count

    total_citations = len(geocoded_citations) + len(unknown_citations)
    # FIX: guard the coverage percentage against total_citations == 0.
    coverage_pct = math.floor(len(geocoded_citations) / total_citations * 100) if total_citations else 0
    os.makedirs('reports/papers/', exist_ok=True)
    with open('reports/papers/{}.html'.format(paper.paper_id), 'w') as f:
        f.write("<!doctype html>")
        f.write("<html>")
        f.write("<head>")
        f.write('<meta charset="utf-8">')
        f.write("<title>{}</title>".format(paper.title))
        f.write("<link rel='stylesheet' href='../reports.css'>")
        f.write('<link rel="stylesheet" href="https://unpkg.com/leaflet@1.3.4/dist/leaflet.css" integrity="sha512-puBpdR0798OZvTTbP4A8Ix/l+A4dHDD0DGqYW6RQ+9jxkRFclaxxQb/SJAWZfWAkuyeQUytO7+7N4QKrDh+drA==" crossorigin=""/>')
        f.write("</head>")
        f.write("<body>")
        f.write("<div id='mapid'></div>")
        f.write("<h2>{}</h2>".format(paper.title))
        f.write('<ul>')
        if paper.journal:
            f.write('<li>Journal: {}</li>'.format(paper.journal))
        if paper_address:
            f.write('<li>Research institution: {}</li>'.format(paper_address[0]))
            f.write('<li>Address: {}</li>'.format(paper_address[2]))
            f.write('<li>Lat/Lng: {}, {}</li>'.format(paper_address[3], paper_address[4]))
        f.write('<li>Year: {}</li>'.format(paper.year))
        f.write('<li>Coverage: {} / {} citations were located ({} %).</li>'.format(len(geocoded_citations), total_citations, coverage_pct))
        f.write('</ul>')
        f.write('<h3>{}</h3>'.format('Geocoded Citations'))
        write_table(f, keys=None, rows=sorted(display_geocoded_citations, key=operator.itemgetter(0)))
        f.write('<h3>{}</h3>'.format('Other Citations'))
        write_table(f, keys=None, rows=sorted(unknown_citations, key=operator.itemgetter(0)))
        f.write('<script src="../snap.svg-min.js"></script>')
        f.write('<script src="https://unpkg.com/leaflet@1.3.4/dist/leaflet.js" integrity="sha512-nMMmRyTVoLYqjP9hrbed9S+FzjZHW5gY1TWCHA5ckwXZBadntCNs8kEqAWdrb9O7rxbCaA4lKTIWjDXZxflOcA==" crossorigin=""></script>')
        f.write('<script src="../leaflet.arc.js"></script>')
        f.write('<script src="../leaflet.bezier.js"></script>')
        f.write('<script type="text/json" id="address">')
        json.dump(paper_address, f)
        f.write('</script>')
        f.write('<script type="text/json" id="citations">')
        json.dump(geocoded_citations, f)
        f.write('</script>')
        f.write('<script src="../map.js"></script>')
        # FIX: close <body> AFTER the scripts so the document is well-formed
        # (the original emitted </body> before the <script> tags).
        f.write("</body>")
        f.write("</html>")
    return res


def load_megapixels_queries():
    """Load the Megapixels query CSV into a dict keyed by stripped Title."""
    keys, rows = read_csv('datasets/citations-2018310.csv')
    lookup = {}
    for row in rows:
        rec = {}
        for index, key in enumerate(keys):
            rec[key] = row[index]
        lookup[rec['Title'].strip()] = rec
    return lookup


def load_institutions(paperId):
    """Return the institutions list for a paper, preferring PDF over DOI data.

    Returns [] when neither extraction produced an institutions.json.
    """
    if os.path.exists(file_path('pdf', paperId, 'institutions.json')):
        return read_json(file_path('pdf', paperId, 'institutions.json'))['institutions']
    elif os.path.exists(file_path('doi', paperId, 'institutions.json')):
        return read_json(file_path('doi', paperId, 'institutions.json'))['institutions']
    else:
        return []


def data_path(key, paper_id):
    """Directory for a paper's cached data, sharded by the ID's first 2 chars."""
    return 'datasets/s2/{}/{}/{}'.format(key, paper_id[0:2], paper_id)


def file_path(key, paper_id, fn):
    """Full path to file `fn` inside a paper's data_path() directory."""
    return os.path.join(data_path(key, paper_id), fn)


if __name__ == '__main__':
    s2_citation_report()
