diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2018-11-25 22:19:15 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-11-25 22:19:15 +0100 |
| commit | ee3d0d98e19f1d8177d85af1866fd0ee431fe9ea (patch) | |
| tree | 41372528e78d4328bc2a47bbbabac7e809c58894 /s2-citation-report.py | |
| parent | 255b8178af1e25a71fd23703d30c0d1f74911f47 (diff) | |
moving stuff
Diffstat (limited to 's2-citation-report.py')
| -rw-r--r-- | s2-citation-report.py | 282 |
1 files changed, 0 insertions, 282 deletions
import os
import re
import glob
import json
import math
import operator
import click
from util import *


@click.command()
def s2_citation_report():
    """Build HTML reports summarizing how many citations of each S2 paper
    could be geocoded to a research-institution address.

    Walks every cached paper JSON under datasets/s2/papers/, geocodes the
    paper and its citations, writes per-paper report pages plus two index
    pages, and prints overall geocoding coverage.
    """
    addresses = AddressBook()
    megapixels = load_megapixels_queries()
    successful_geocodes = {}  # citationId -> bool, shared across all papers
    papers = []
    for fn in glob.iglob('datasets/s2/papers/**/*.json', recursive=True):
        paper_data = process_paper(fn, addresses, megapixels, successful_geocodes)
        # BUG FIX: process_paper() returns None when the paper's cached data
        # is missing; appending None made write_papers_report() crash when
        # sorting on paper_data[key].
        if paper_data is not None:
            papers.append(paper_data)
    write_papers_report('reports/report_index.html', 'All Papers', papers, 'title')
    write_papers_report('reports/report_coverage.html', 'Coverage', papers,
                        'citations_geocoded', reverse=True)

    # Overall tally: every citation seen vs. those that geocoded.
    paper_count = 0
    geocode_count = 0
    for key, value in successful_geocodes.items():
        if value:
            geocode_count += 1
        paper_count += 1
    print("citations: {}".format(paper_count))
    print("geocoded: {} ({}%)".format(geocode_count, percent(geocode_count, paper_count)))


def write_papers_report(fn, title, papers, key, reverse=False):
    """Write one HTML summary table of all papers to *fn*, sorted by *key*.

    Each row links to the per-paper report page produced by process_paper().
    """
    sorted_papers = []
    for paper in sorted(papers, key=lambda x: x[key], reverse=reverse):
        sorted_papers.append([
            paper['paperId'],
            paper['key'],
            LinkLine(paper['report_link'], paper['title']),
            LinkLine(paper['pdf_link'], '[pdf]'),
            paper['journal'],
            paper['address_type'],
            paper['address'],
            paper['lat'],
            paper['lng'],
            str(percent(paper['citations_geocoded'], paper['citation_count'])) + '%',
            paper['citation_count'],
            paper['citations_geocoded'],
            paper['citations_unknown'],
            paper['citations_empty'],
            paper['citations_pdf'],
            paper['citations_doi'],
        ])
    sorted_paper_keys = [
        'Paper ID',
        'Megapixels Key',
        'Report Link',
        'PDF Link',
        'Journal',
        'Type',
        'Address',
        'Lat',
        'Lng',
        'Coverage',
        'Total Citations',
        'Geocoded Citations',
        'Unknown Citations',
        'Empty Citations',
        'With PDF',
        'With DOI',
    ]
    write_report(fn, title=title, keys=sorted_paper_keys, rows=sorted_papers)


def process_paper(fn, addresses, megapixels, success):
    """Geocode one paper and each of its citations; write its report page.

    Reads the S2 paper JSON at *fn*, resolves institution strings through
    *addresses* (an AddressBook), records per-citation geocoding success in
    the shared *success* dict (citationId -> bool), and writes an HTML page
    with a Leaflet map under reports/papers/.

    Returns a summary dict for the index tables, or None when the paper's
    cached data is missing.
    """
    res = {
        'paperId': '',
        'key': '',
        'title': '',
        'journal': '',
        'address': '',
        'address_type': '',
        'lat': '',
        'lng': '',
        'pdf_link': '',
        'report_link': '',
        'citation_count': 0,
        'citations_geocoded': 0,
        'citations_unknown': 0,
        'citations_empty': 0,
        'citations_pdf': 0,
        'citations_doi': 0,
    }

    geocoded_citations = []          # [title, institution] + full address row
    unknown_citations = []           # citations with institutions we couldn't place
    display_geocoded_citations = []  # table rows for the HTML report
    empty_citations = []             # citations with no institutions and no PDF
    pdf_count = 0
    doi_count = 0

    with open(fn, 'r') as f:
        data = json.load(f)
    print('>> {}'.format(data['paperId']))
    paper = load_paper(data['paperId'])
    if paper.data is None:
        print("Paper missing! {}".format(data['paperId']))
        return None

    res['paperId'] = paper.paper_id
    res['title'] = paper.title
    res['journal'] = paper.journal
    res['report_link'] = 'papers/{}.html'.format(paper.paper_id)
    res['pdf_link'] = paper.pdf_link

    if res['title'] in megapixels:
        res['key'] = megapixels[res['title']]['Database Name']

    # Geocode the paper itself: the first institution (sorted by name) that
    # resolves in the address book wins.
    paper_institutions = load_institutions(paper.paper_id)
    paper_address = None
    for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
        institution = inst[1]
        if paper_address is None:
            paper_address = addresses.find(institution)

    if paper_address:
        print(paper_address)
        # Address rows are positional: [name, ?, address, lat, lng, type]
        # — presumably produced by AddressBook; verify against util.
        res['address'] = paper_address[0]
        res['lat'] = paper_address[3]
        res['lng'] = paper_address[4]
        res['address_type'] = paper_address[5]

    for cite in data['citations']:
        citationId = cite['paperId']
        citation = load_paper(citationId)
        has_pdf = os.path.exists(file_path('pdf', citationId, 'paper.txt'))
        has_doi = os.path.exists(file_path('doi', citationId, 'paper.doi'))
        if has_pdf:
            pdf_count += 1
        if has_doi:
            doi_count += 1
        if citation.data is None:
            print("Citation missing! {}".format(cite['paperId']))
            continue

        institutions = load_institutions(citationId)
        geocoded_institutions = []
        unknown_institutions = []
        institution = ''
        address = None
        for inst in sorted(institutions, key=operator.itemgetter(1)):
            institution = inst[1]
            next_address = addresses.find(institution)
            if next_address:
                # NOTE(review): a later hit overwrites an earlier one, so the
                # last geocoded institution wins — kept as-is.
                address = next_address
                geocoded_institutions.append(institution)
            else:
                unknown_institutions.append(institution)

        if not address:
            if has_pdf:
                # Fall back to scanning the extracted PDF text headings for a
                # line the address book recognizes.
                headings, _found_abstract = read_headings(
                    file_path('pdf', citationId, 'paper.txt'), citation)
                for heading in headings:
                    l = heading.lower().strip()
                    if l:
                        next_address = addresses.find(l)
                        if next_address:
                            address = next_address
                            geocoded_institutions.append(heading)
                        else:
                            unknown_institutions.append(heading)
            else:
                empty_citations.append([
                    citationId,
                    citation.title,
                ])

        if address:
            success[citationId] = True
            geocoded_citations.append([
                citation.title,
                institution,
            ] + address)
            display_geocoded_citations.append([
                citationId,
                LinkLine(citation.pdf_link, '[pdf]'),
                citation.title,
            ] + address[0:5])
        else:
            success[citationId] = False
            unknown_citations.append([
                citationId,
                LinkLine(citation.pdf_link, '[pdf]'),
                citation.title,
                '<br>'.join(unknown_institutions),
            ])

    res['citation_count'] = len(data['citations'])
    res['citations_geocoded'] = len(geocoded_citations)
    res['citations_unknown'] = len(unknown_citations)
    res['citations_empty'] = len(empty_citations)
    res['citations_pdf'] = pdf_count
    res['citations_doi'] = doi_count

    total_citations = len(geocoded_citations) + len(unknown_citations)
    # BUG FIX: guard the coverage percentage against ZeroDivisionError when
    # no citation could be classified at all.
    if total_citations:
        coverage_pct = math.floor(len(geocoded_citations) / total_citations * 100)
    else:
        coverage_pct = 0

    os.makedirs('reports/papers/', exist_ok=True)
    with open('reports/papers/{}.html'.format(paper.paper_id), 'w') as f:
        f.write("<!doctype html>")
        f.write("<html>")
        f.write("<head>")
        f.write('<meta charset="utf-8">')
        f.write("<title>{}</title>".format(paper.title))
        f.write("<link rel='stylesheet' href='../reports.css'>")
        f.write('<link rel="stylesheet" href="https://unpkg.com/leaflet@1.3.4/dist/leaflet.css" integrity="sha512-puBpdR0798OZvTTbP4A8Ix/l+A4dHDD0DGqYW6RQ+9jxkRFclaxxQb/SJAWZfWAkuyeQUytO7+7N4QKrDh+drA==" crossorigin=""/>')
        f.write("</head>")
        f.write("<body>")
        f.write("<div id='mapid'></div>")
        f.write("<h2>{}</h2>".format(paper.title))
        f.write('<ul>')
        if paper.journal:
            f.write('<li>Journal: {}</li>'.format(paper.journal))
        if paper_address:
            f.write('<li>Research institution: {}</li>'.format(paper_address[0]))
            f.write('<li>Address: {}</li>'.format(paper_address[2]))
            f.write('<li>Lat/Lng: {}, {}</li>'.format(paper_address[3], paper_address[4]))
        f.write('<li>Year: {}</li>'.format(paper.year))
        f.write('<li>Coverage: {} / {} citations were located ({} %).</li>'.format(len(geocoded_citations), total_citations, coverage_pct))
        f.write('</ul>')
        f.write('<h3>{}</h3>'.format('Geocoded Citations'))
        write_table(f, keys=None, rows=sorted(display_geocoded_citations, key=operator.itemgetter(0)))
        f.write('<h3>{}</h3>'.format('Other Citations'))
        write_table(f, keys=None, rows=sorted(unknown_citations, key=operator.itemgetter(0)))
        f.write("</body>")
        f.write('<script src="../snap.svg-min.js"></script>')
        f.write('<script src="https://unpkg.com/leaflet@1.3.4/dist/leaflet.js" integrity="sha512-nMMmRyTVoLYqjP9hrbed9S+FzjZHW5gY1TWCHA5ckwXZBadntCNs8kEqAWdrb9O7rxbCaA4lKTIWjDXZxflOcA==" crossorigin=""></script>')
        f.write('<script src="../leaflet.arc.js"></script>')
        f.write('<script src="../leaflet.bezier.js"></script>')
        # Embed the geocode data as JSON islands read by ../map.js.
        f.write('<script type="text/json" id="address">')
        json.dump(paper_address, f)
        f.write('</script>')
        f.write('<script type="text/json" id="citations">')
        json.dump(geocoded_citations, f)
        f.write('</script>')
        f.write('<script src="../map.js"></script>')
        f.write("</html>")
    return res


def load_megapixels_queries():
    """Load the Megapixels citations CSV as {stripped title: row-dict}."""
    keys, rows = read_csv('datasets/citations-2018310.csv')
    lookup = {}
    for row in rows:
        rec = dict(zip(keys, row))
        lookup[rec['Title'].strip()] = rec
    return lookup


def load_institutions(paperId):
    """Return the cached institution list for a paper.

    Prefers the PDF-derived extraction over the DOI-derived one; returns []
    when neither cache file exists.
    """
    if os.path.exists(file_path('pdf', paperId, 'institutions.json')):
        return read_json(file_path('pdf', paperId, 'institutions.json'))['institutions']
    elif os.path.exists(file_path('doi', paperId, 'institutions.json')):
        return read_json(file_path('doi', paperId, 'institutions.json'))['institutions']
    else:
        return []


def data_path(key, paper_id):
    """Dataset directory for a paper, sharded by the id's first two chars."""
    return 'datasets/s2/{}/{}/{}'.format(key, paper_id[0:2], paper_id)


def file_path(key, paper_id, fn):
    """Path of file *fn* inside a paper's dataset directory."""
    return os.path.join(data_path(key, paper_id), fn)


if __name__ == '__main__':
    s2_citation_report()
