import os import re import glob import simplejson as json import math import operator import click import subprocess from util import * DIR_PUBLIC_CITATIONS = "../site/datasets/citations" paper_location_lookup = fetch_google_lookup('paper_locations', item_key='paper_id') @click.command() def s2_citation_report(): addresses = AddressBook() megapixels = load_megapixels_queries() successful_geocodes = {} papers = [] for row in megapixels: paper_data = process_paper(row, addresses, successful_geocodes) if paper_data is not None: papers.append(paper_data) write_papers_report('reports/report_index.html', 'All Papers', papers, 'title') write_papers_report('reports/report_coverage.html', 'Coverage', papers, 'citations_geocoded', reverse=True) paper_count = 0 geocode_count = 0 for key, value in successful_geocodes.items(): if value: geocode_count += 1 paper_count += 1 print("citations: {}".format(paper_count)) print("geocoded: {} ({}%)".format(geocode_count, percent(geocode_count, paper_count))) write_master_report('{}/{}'.format(DIR_PUBLIC_CITATIONS, "datasets.csv"), papers) sts = subprocess.call([ "s3cmd", "put", "-P", "--recursive", DIR_PUBLIC_CITATIONS + '/', "s3://megapixels/v1/citations/", ]) def write_master_report(fn, papers): # first make a lookup of the keys that have papers paper_key_lookup = {} for paper in papers: if paper['key'] not in paper_key_lookup: paper_key_lookup[paper['key']] = paper # then fetch the statistics csv which has things like "year" fields, rows = fetch_google_sheet('statistics') master_papers = [] statistics = {} def clean(n): if type(n) is int: return n if type(n) is str and n: s = str(n).replace(',','').replace('.','').replace('?','').strip() try: return int(s) except e: return s if n: return n return None for row in rows: key = row[0] if key not in paper_key_lookup: continue paper = paper_key_lookup[key] stats = {} for index, field in enumerate(fields): stats[field] = row[index] report_fn = '../site/content/datasets/{}/index.md'.format(key) has_report = os.path.exists(report_fn) statistics[key] = stats # search_result = read_json('./datasets/s2/entries/{}.json'.format(paper['paperId'])) image_count = stats['images'] if type(image_count) is str: if len(image_count): image_count = clean(image_count) else: image_count = None, master_papers.append([ stats['key'], stats['name'], '/datasets/{}/'.format(key) if has_report else '', image_count, clean(stats['faces_unique']) or None, stats['year_published'], clean(paper['citation_count']) or 0, # clean(search_result['citationStats']['numKeyCitations']) or 0, # origin ]) master_paper_keys = [ 'key', 'title', 'link', 'images', 'people', 'year', 'citations', # 'influenced', # 'origin' ] write_csv(fn, keys=master_paper_keys, rows=master_papers) def write_papers_report(fn, title, papers, key, reverse=False): sorted_papers = [] for paper in sorted(papers, key=lambda x: x[key], reverse=reverse): sorted_papers.append([ paper['paperId'], paper['key'], paper['name'], LinkLine(paper['report_link'], paper['title']), LinkLine(paper['pdf_link'], '[pdf]'), paper['journal'], paper['address_type'], paper['address'], paper['lat'], paper['lng'], str(percent(paper['citations_geocoded'], paper['citation_count'])) + '%', paper['citation_count'], paper['citations_geocoded'], paper['citations_unknown'], paper['citations_empty'], paper['citations_pdf'], paper['citations_doi'], ]) sorted_paper_keys = [ 'Paper ID', 'Megapixels Key', 'Megapixels Name', 'Report Link', 'PDF Link', 'Journal', 'Type', 'Address', 'Lat', 'Lng', 'Coverage', 'Total Citations', 'Geocoded Citations', 'Unknown Citations', 'Empty Citations', 'With PDF', 'With DOI', ] write_report(fn, title=title, keys=sorted_paper_keys, rows=sorted_papers) def process_paper(row, addresses, success): res = { 'paperId': '', 'key': '', 'title': '', 'journal': '', 'address': '', 'address_type': '', 'lat': '', 'lng': '', 'pdf_link': '', 'report_link': '', 'citation_count': 0, 'citations_geocoded': 0, 'citations_unknown': 0, 'citations_empty': 0, 'citations_pdf': 0, 'citations_doi': 0, } geocoded_citations = [] unknown_citations = [] display_geocoded_citations = [] empty_citations = [] pdf_count = 0 doi_count = 0 address_count = 0 fn = file_path('papers', row['paper_id'], 'paper.json') with open(fn, 'r') as f: data = json.load(f) print('>> {} {}'.format(data['paperId'], row['key'])) paper = load_paper(data['paperId']) if paper is None: print("Paper missing! {}".format(data['paperId'])) return res['key'] = row['key'] res['name'] = row['name'] res['paperId'] = paper.paper_id res['title'] = paper.title res['journal'] = paper.journal res['report_link'] = 'papers/{}.html'.format(paper.paper_id) res['pdf_link'] = paper.pdf_link # res['authors'] = ', '.join(paper.authors) # res['citations'] = [] paper_institutions = load_institutions(paper.paper_id, paper_location_lookup) paper_address = None for inst in sorted(paper_institutions, key=operator.itemgetter(1)): # print(inst[1]) institution = inst[1] if paper_address is None: paper_address = addresses.find(institution) if paper_address: # print(paper_address) res['address'] = paper_address[0] res['lat'] = paper_address[3] res['lng'] = paper_address[4] res['address_type'] = paper_address[5] for cite in data['citations']: citationId = cite['paperId'] citation = load_paper(citationId) has_pdf = os.path.exists(file_path('pdf', citationId, 'paper.txt')) has_doi = os.path.exists(file_path('doi', citationId, 'paper.doi')) if has_pdf: pdf_count += 1 if has_doi: doi_count += 1 if citation is None or citation.data is None: print("Citation missing! {}".format(cite['paperId'])) continue institutions = load_institutions(citationId, paper_location_lookup) geocoded_institutions = [] unknown_institutions = [] institution = '' address = None for inst in sorted(institutions, key=operator.itemgetter(1)): # print(inst[1]) address_count += 1 institution = inst[1] next_address = addresses.find(institution) if next_address: address = next_address geocoded_institutions.append(institution) else: unknown_institutions.append(institution) if not address: if has_pdf: headings, found_abstract = read_headings(file_path('pdf', citationId, 'paper.txt'), citation) heading_string = '\n'.join(headings[0:20]) found_addresses = [] if len(headings): for heading in headings: l = heading.lower().strip() if l: next_address = addresses.find(l) if next_address: address = next_address geocoded_institutions.append(heading) else: unknown_institutions.append(heading) else: empty_citations.append([ citationId, citation.title, ]) # res['citations'].append({ # 'title': citation.title, # 'journal': citation.journal, # 'authors': citation.authors, # 'institutions': [inst[1] for inst in institutions], # 'geocoded': geocoded_institutions, # }) if address: success[citationId] = True geocoded_citations.append([ citation.title, institution, ] + address + [ citation.year, ]) display_geocoded_citations.append([ citationId, LinkLine(citation.pdf_link, '[pdf]'), citation.title, ] + address[0:5]) else: success[citationId] = False unknown_citations.append([ citationId, LinkLine(citation.pdf_link, '[pdf]'), citation.title, '
'.join(unknown_institutions), ]) res['citation_count'] = len(data['citations']) res['citations_geocoded'] = len(geocoded_citations) res['citations_unknown'] = len(unknown_citations) res['citations_empty'] = len(empty_citations) res['citations_pdf'] = pdf_count res['citations_doi'] = doi_count total_citations = len(geocoded_citations) + len(unknown_citations) os.makedirs('reports/papers/', exist_ok=True) with open('reports/papers/{}.html'.format(paper.paper_id), 'w') as f: f.write("") f.write("") f.write("") f.write('') f.write("{}".format(paper.title)) f.write("") f.write('') f.write("") f.write("") f.write("
") f.write("

{}

".format(paper.title)) f.write('') f.write('

{}

'.format('Geocoded Citations')) write_table(f, keys=None, rows=sorted(display_geocoded_citations, key=operator.itemgetter(0))) f.write('

{}

'.format('Other Citations')) write_table(f, keys=None, rows=sorted(unknown_citations, key=operator.itemgetter(0))) f.write("") f.write('') f.write('') f.write('') f.write('') f.write('') f.write('') f.write('') f.write("") # template = env.get_template('paper.html') with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f: json.dump({ 'id': paper.paper_id, 'paper': res, 'address': paper_address, 'citations': geocoded_citations, }, f) return res def load_megapixels_queries(): keys, rows = fetch_google_sheet('citation_lookup') recs = [] for row in rows: rec = {} for index, key in enumerate(keys): rec[key] = row[index] recs.append(rec) return recs #def load_institutions(paperId): # if os.path.exists(file_path('pdf', paperId, 'institutions.json')): # return read_json(file_path('pdf', paperId, 'institutions.json'))['institutions'] # elif os.path.exists(file_path('doi', paperId, 'institutions.json')): # return read_json(file_path('doi', paperId, 'institutions.json'))['institutions'] # else: # return [] def data_path(key, paper_id): return 'datasets/s2/{}/{}/{}'.format(key, paper_id[0:2], paper_id) def file_path(key, paper_id, fn): return os.path.join(data_path(key, paper_id), fn) if __name__ == '__main__': s2_citation_report()