Diffstat (limited to 'scraper/s2-citation-report.py')
| -rw-r--r-- | scraper/s2-citation-report.py | 378 |
1 file changed, 378 insertions, 0 deletions
diff --git a/scraper/s2-citation-report.py b/scraper/s2-citation-report.py
new file mode 100644
index 00000000..580312ff
--- /dev/null
+++ b/scraper/s2-citation-report.py
@@ -0,0 +1,378 @@
+import os
+import re
+import glob
+import simplejson as json
+import math
+import operator
+import click
+import subprocess
+from util import *
+
+DIR_PUBLIC_CITATIONS = "../site/datasets/citations"
+
+@click.command()
+def s2_citation_report():
+    addresses = AddressBook()
+    megapixels = load_megapixels_queries()
+    successful_geocodes = {}
+    papers = []
+    for row in megapixels:
+        paper_data = process_paper(row, addresses, successful_geocodes)
+        if paper_data is not None:
+            papers.append(paper_data)
+    write_papers_report('reports/report_index.html', 'All Papers', papers, 'title')
+    write_papers_report('reports/report_coverage.html', 'Coverage', papers, 'citations_geocoded', reverse=True)
+
+    paper_count = 0
+    geocode_count = 0
+    for key, value in successful_geocodes.items():
+        if value:
+            geocode_count += 1
+        paper_count += 1
+    print("citations: {}".format(paper_count))
+    print("geocoded: {} ({}%)".format(geocode_count, percent(geocode_count, paper_count)))
+
+    write_master_report('{}/{}'.format(DIR_PUBLIC_CITATIONS, "datasets.csv"), papers)
+
+    sts = subprocess.call([
+        "s3cmd", "put", "-P", "--recursive",
+        DIR_PUBLIC_CITATIONS + '/',
+        "s3://megapixels/v1/citations/",
+    ])
+
+def write_master_report(fn, papers):
+    # first make a lookup of the keys that have papers
+    paper_key_lookup = {}
+    for paper in papers:
+        if paper['key'] not in paper_key_lookup:
+            paper_key_lookup[paper['key']] = paper
+
+    # then fetch the statistics csv which has things like "year"
+    fields, rows = fetch_google_sheet('statistics')
+    master_papers = []
+    statistics = {}
+
+    def clean(n):
+        if type(n) is int:
+            return n
+        if type(n) is str and n:
+            s = str(n).replace(',','').replace('.','').replace('?','').strip()
+            try:
+                return int(s)
+            except ValueError:
+                return s
+        if n:
+            return n
+        return None
+
+    for row in rows:
+        key = row[0]
+        if key not in paper_key_lookup:
+            continue
+        paper = paper_key_lookup[key]
+        stats = {}
+        for index, field in enumerate(fields):
+            stats[field] = row[index]
+        report_fn = '../site/content/datasets/{}/index.md'.format(key)
+        has_report = os.path.exists(report_fn)
+        statistics[key] = stats
+        search_result = read_json('./datasets/s2/entries/{}.json'.format(paper['paperId']))
+
+        image_count = stats['images']
+        if type(image_count) is str:
+            if len(image_count):
+                image_count = clean(image_count)
+            else:
+                image_count = None
+        master_papers.append([
+            stats['key'],
+            stats['name'],
+            '/datasets/{}/'.format(key) if has_report else '',
+            image_count,
+            clean(stats['faces_unique']) or None,
+            stats['year_published'],
+            clean(paper['citation_count']) or 0,
+            clean(search_result['citationStats']['numKeyCitations']) or 0,
+            # origin
+        ])
+    master_paper_keys = [
+        'key',
+        'title',
+        'link',
+        'images',
+        'people',
+        'year',
+        'citations',
+        'influenced',
+        # 'origin'
+    ]
+    write_csv(fn, keys=master_paper_keys, rows=master_papers)
+
+def write_papers_report(fn, title, papers, key, reverse=False):
+    sorted_papers = []
+    for paper in sorted(papers, key=lambda x: x[key], reverse=reverse):
+        sorted_papers.append([
+            paper['paperId'],
+            paper['key'],
+            paper['name'],
+            LinkLine(paper['report_link'], paper['title']),
+            LinkLine(paper['pdf_link'], '[pdf]'),
+            paper['journal'],
+            paper['address_type'],
+            paper['address'],
+            paper['lat'],
+            paper['lng'],
+            str(percent(paper['citations_geocoded'], paper['citation_count'])) + '%',
+            paper['citation_count'],
+            paper['citations_geocoded'],
+            paper['citations_unknown'],
+            paper['citations_empty'],
+            paper['citations_pdf'],
+            paper['citations_doi'],
+        ])
+    sorted_paper_keys = [
+        'Paper ID',
+        'Megapixels Key',
+        'Megapixels Name',
+        'Report Link',
+        'PDF Link',
+        'Journal',
+        'Type',
+        'Address',
+        'Lat',
+        'Lng',
+        'Coverage',
+        'Total Citations',
+        'Geocoded Citations',
+        'Unknown Citations',
+        'Empty Citations',
+        'With PDF',
+        'With DOI',
+    ]
+    write_report(fn, title=title, keys=sorted_paper_keys, rows=sorted_papers)
+
+def process_paper(row, addresses, success):
+    res = {
+        'paperId': '',
+        'key': '',
+        'title': '',
+        'journal': '',
+        'address': '',
+        'address_type': '',
+        'lat': '',
+        'lng': '',
+        'pdf_link': '',
+        'report_link': '',
+        'citation_count': 0,
+        'citations_geocoded': 0,
+        'citations_unknown': 0,
+        'citations_empty': 0,
+        'citations_pdf': 0,
+        'citations_doi': 0,
+    }
+
+    geocoded_citations = []
+    unknown_citations = []
+    display_geocoded_citations = []
+    empty_citations = []
+    pdf_count = 0
+    doi_count = 0
+    address_count = 0
+
+    fn = file_path('papers', row['paper_id'], 'paper.json')
+
+    with open(fn, 'r') as f:
+        data = json.load(f)
+    print('>> {} {}'.format(data['paperId'], row['key']))
+    paper = load_paper(data['paperId'])
+    if paper is None:
+        print("Paper missing! {}".format(data['paperId']))
+        return
+
+    res['key'] = row['key']
+    res['name'] = row['name']
+    res['paperId'] = paper.paper_id
+    res['title'] = paper.title
+    res['journal'] = paper.journal
+    res['report_link'] = 'papers/{}.html'.format(paper.paper_id)
+    res['pdf_link'] = paper.pdf_link
+    # res['authors'] = ', '.join(paper.authors)
+    # res['citations'] = []
+
+    paper_institutions = load_institutions(paper.paper_id)
+    paper_address = None
+    for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
+        # print(inst[1])
+        institution = inst[1]
+        if paper_address is None:
+            paper_address = addresses.find(institution)
+
+    if paper_address:
+        # print(paper_address)
+        res['address'] = paper_address[0]
+        res['lat'] = paper_address[3]
+        res['lng'] = paper_address[4]
+        res['address_type'] = paper_address[5]
+
+    for cite in data['citations']:
+        citationId = cite['paperId']
+        citation = load_paper(citationId)
+        has_pdf = os.path.exists(file_path('pdf', citationId, 'paper.txt'))
+        has_doi = os.path.exists(file_path('doi', citationId, 'paper.doi'))
+        if has_pdf:
+            pdf_count += 1
+        if has_doi:
+            doi_count += 1
+        if citation is None or citation.data is None:
{}".format(cite['paperId'])) + continue + institutions = load_institutions(citationId) + geocoded_institutions = [] + unknown_institutions = [] + institution = '' + address = None + for inst in sorted(institutions, key=operator.itemgetter(1)): + # print(inst[1]) + address_count += 1 + institution = inst[1] + next_address = addresses.find(institution) + if next_address: + address = next_address + geocoded_institutions.append(institution) + else: + unknown_institutions.append(institution) + if not address: + if has_pdf: + headings, found_abstract = read_headings(file_path('pdf', citationId, 'paper.txt'), citation) + heading_string = '\n'.join(headings[0:20]) + found_addresses = [] + if len(headings): + for heading in headings: + l = heading.lower().strip() + if l: + next_address = addresses.find(l) + if next_address: + address = next_address + geocoded_institutions.append(heading) + else: + unknown_institutions.append(heading) + else: + empty_citations.append([ + citationId, + citation.title, + ]) + + # res['citations'].append({ + # 'title': citation.title, + # 'journal': citation.journal, + # 'authors': citation.authors, + # 'institutions': [inst[1] for inst in institutions], + # 'geocoded': geocoded_institutions, + # }) + if address: + success[citationId] = True + geocoded_citations.append([ + citation.title, + institution, + ] + address + [ + citation.year, + ]) + display_geocoded_citations.append([ + citationId, + LinkLine(citation.pdf_link, '[pdf]'), + citation.title, + ] + address[0:5]) + else: + success[citationId] = False + unknown_citations.append([ + citationId, + LinkLine(citation.pdf_link, '[pdf]'), + citation.title, + '<br>'.join(unknown_institutions), + ]) + res['citation_count'] = len(data['citations']) + res['citations_geocoded'] = len(geocoded_citations) + res['citations_unknown'] = len(unknown_citations) + res['citations_empty'] = len(empty_citations) + res['citations_pdf'] = pdf_count + res['citations_doi'] = doi_count + + total_citations = len(geocoded_citations) + len(unknown_citations) + os.makedirs('reports/papers/', exist_ok=True) + with open('reports/papers/{}.html'.format(paper.paper_id), 'w') as f: + f.write("<!doctype html>") + f.write("<html>") + f.write("<head>") + f.write('<meta charset="utf-8">') + f.write("<title>{}</title>".format(paper.title)) + f.write("<link rel='stylesheet' href='../reports.css'>") + f.write('<link rel="stylesheet" href="https://unpkg.com/leaflet@1.3.4/dist/leaflet.css" integrity="sha512-puBpdR0798OZvTTbP4A8Ix/l+A4dHDD0DGqYW6RQ+9jxkRFclaxxQb/SJAWZfWAkuyeQUytO7+7N4QKrDh+drA==" crossorigin=""/>') + f.write("</head>") + f.write("<body>") + f.write("<div id='mapid'></div>") + f.write("<h2>{}</h2>".format(paper.title)) + f.write('<ul>') + if paper.journal: + f.write('<li>Journal: {}</li>'.format(paper.journal)) + if paper_address: + f.write('<li>Research institution: {}</li>'.format(paper_address[0])) + f.write('<li>Address: {}</li>'.format(paper_address[2])) + f.write('<li>Lat/Lng: {}, {}</li>'.format(paper_address[3], paper_address[4])) + f.write('<li>Year: {}</li>'.format(paper.year)) + if total_citations == 0: + f.write('<li>Coverage: No citations found!</li>') + else: + f.write('<li>Coverage: {} / {} citations were located ({} %).</li>'.format(len(geocoded_citations), total_citations, math.floor(len(geocoded_citations) / total_citations * 100))) + f.write('</ul>') + f.write('<h3>{}</h3>'.format('Geocoded Citations')) + write_table(f, keys=None, rows=sorted(display_geocoded_citations, key=operator.itemgetter(0))) + 
+        f.write('<h3>{}</h3>'.format('Other Citations'))
+        write_table(f, keys=None, rows=sorted(unknown_citations, key=operator.itemgetter(0)))
+        f.write("</body>")
+        f.write('<script src="../snap.svg-min.js"></script>')
+        f.write('<script src="https://unpkg.com/leaflet@1.3.4/dist/leaflet.js" integrity="sha512-nMMmRyTVoLYqjP9hrbed9S+FzjZHW5gY1TWCHA5ckwXZBadntCNs8kEqAWdrb9O7rxbCaA4lKTIWjDXZxflOcA==" crossorigin=""></script>')
+        f.write('<script src="../leaflet.arc.js"></script>')
+        f.write('<script src="../leaflet.bezier.js"></script>')
+        f.write('<script type="text/json" id="address">')
+        json.dump(paper_address, f)
+        f.write('</script>')
+        f.write('<script type="text/json" id="citations">')
+        json.dump(geocoded_citations, f)
+        f.write('</script>')
+        f.write('<script src="../map.js"></script>')
+        f.write("</html>")
+    # template = env.get_template('paper.html')
+    with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f:
+        json.dump({
+            'id': paper.paper_id,
+            'paper': res,
+            'address': paper_address,
+            'citations': geocoded_citations,
+        }, f)
+    return res
+
+def load_megapixels_queries():
+    keys, rows = read_csv('datasets/citation_lookup.csv')
+    recs = []
+    for row in rows:
+        rec = {}
+        for index, key in enumerate(keys):
+            rec[key] = row[index]
+        recs.append(rec)
+    return recs
+
+def load_institutions(paperId):
+    if os.path.exists(file_path('pdf', paperId, 'institutions.json')):
+        return read_json(file_path('pdf', paperId, 'institutions.json'))['institutions']
+    elif os.path.exists(file_path('doi', paperId, 'institutions.json')):
+        return read_json(file_path('doi', paperId, 'institutions.json'))['institutions']
+    else:
+        return []
+
+def data_path(key, paper_id):
+    return 'datasets/s2/{}/{}/{}'.format(key, paper_id[0:2], paper_id)
+def file_path(key, paper_id, fn):
+    return os.path.join(data_path(key, paper_id), fn)
+
+if __name__ == '__main__':
+    s2_citation_report()
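
Note: the script imports a project-local util module (from util import *) for helpers such as AddressBook, load_paper, percent, read_csv, read_json, write_csv, write_report, write_table, read_headings, fetch_google_sheet, and LinkLine, none of which appear in this diff. As a minimal sketch only, assuming percent returns a floored integer percentage (matching the math.floor coverage calculation in the HTML report) and read_json simply parses a file from disk, two of these helpers might look like:

    # Illustrative sketches of two assumed util helpers;
    # the project's actual util module may differ.
    import math
    import simplejson as json

    def percent(part, total):
        # Floored integer percentage; returns 0 for an empty denominator
        # so papers with no citations do not raise ZeroDivisionError.
        if not total:
            return 0
        return math.floor(part / total * 100)

    def read_json(fn):
        # Load and decode a JSON file from disk.
        with open(fn, 'r') as f:
            return json.load(f)

Since s2_citation_report is a bare @click.command() with no options, the report is presumably generated by running python s2-citation-report.py from the scraper/ directory; the final s3cmd put -P --recursive call then publishes the generated citation JSON to s3://megapixels/v1/citations/.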
