From e8ce7876c5869522f982073d70c3ee7be179e1f9 Mon Sep 17 00:00:00 2001 From: Jules Laplace Date: Sat, 10 Nov 2018 15:59:24 +0100 Subject: citation coverage reports --- s2-citation-report.py | 167 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 136 insertions(+), 31 deletions(-) (limited to 's2-citation-report.py') diff --git a/s2-citation-report.py b/s2-citation-report.py index 58b7ed8f..26e148fe 100644 --- a/s2-citation-report.py +++ b/s2-citation-report.py @@ -10,16 +10,83 @@ from util import * @click.command() def s2_citation_report(): addresses = AddressBook() + megapixels = load_megapixels_queries() + papers = [] for fn in glob.iglob('datasets/s2/papers/**/*.json', recursive=True): - process_paper(fn, addresses) + paper_data = process_paper(fn, addresses, megapixels) + papers.append(paper_data) + write_papers_report('reports/report_index.html', 'All Papers', papers, 'paperId') + write_papers_report('reports/report_coverage.html', 'Coverage', papers, 'citations_geocoded', reverse=True) + +def write_papers_report(fn, title, papers, key, reverse=False): + sorted_papers = [] + for paper in sorted(papers, key=lambda x: x[key], reverse=reverse): + sorted_papers.append([ + paper['paperId'], + paper['key'], + LinkLine(paper['report_link'], paper['title']), + LinkLine(paper['pdf_link'], '[pdf]'), + paper['journal'], + paper['address_type'], + paper['address'], + paper['lat'], + paper['lng'], + str(percent(paper['citation_count'], paper['citations_geocoded'])) + '%', + paper['citation_count'], + paper['citations_geocoded'], + paper['citations_unknown'], + paper['citations_empty'], + paper['citations_pdf'], + paper['citations_doi'], + ]) + sorted_paper_keys = [ + 'Paper ID', + 'Megapixels Key', + 'Report Link', + 'PDF Link', + 'Journal', + 'Type', + 'Address', + 'Lat', + 'Lng', + 'Coverage', + 'Total Citations', + 'Geocoded Citations', + 'Unknown Citations', + 'Empty Citations', + 'With PDF', + 'With DOI', + ] + write_report(fn, title=title, keys=sorted_paper_keys, rows=sorted_papers) + +def process_paper(fn, addresses, megapixels): + res = { + 'paperId': '', + 'key': '', + 'title': '', + 'journal': '', + 'address': '', + 'address_type': '', + 'lat': '', + 'lng': '', + 'pdf_link': '', + 'report_link': '', + 'citation_count': 0, + 'citations_geocoded': 0, + 'citations_unknown': 0, + 'citations_empty': 0, + 'citations_pdf': 0, + 'citations_doi': 0, + } -def process_paper(fn, addresses): - res = {} - address_count = 0 - geocode_count = 0 geocoded_citations = [] unknown_citations = [] display_geocoded_citations = [] + empty_citations = [] + pdf_count = 0 + doi_count = 0 + address_count = 0 + with open(fn, 'r') as f: data = json.load(f) print('>> {}'.format(data['paperId'])) @@ -27,14 +94,42 @@ def process_paper(fn, addresses): if paper.data is None: print("Paper missing! {}".format(data['paperId'])) return + res['paperId'] = paper.paper_id res['title'] = paper.title res['journal'] = paper.journal - res['authors'] = paper.authors - res['citations'] = [] + res['report_link'] = 'papers/{}.html'.format(paper.paper_id) + res['pdf_link'] = paper.pdf_link + # res['authors'] = ', '.join(paper.authors) + # res['citations'] = [] + + if res['title'] in megapixels: + res['key'] = megapixels[res['title']]['Database Name'] + + paper_institutions = load_institutions(paper.paper_id) + paper_address = None + for inst in sorted(paper_institutions, key=operator.itemgetter(1)): + # print(inst[1]) + institution = inst[1] + if paper_address is None: + paper_address = addresses.find(institution) + + if paper_address: + print(paper_address) + res['address'] = paper_address[0] + res['lat'] = paper_address[3] + res['lng'] = paper_address[4] + res['address_type'] = paper_address[5] + for cite in data['citations']: citationId = cite['paperId'] citation = load_paper(citationId) + has_pdf = os.path.exists(file_path('pdf', citationId, 'paper.txt')) + has_doi = os.path.exists(file_path('doi', citationId, 'paper.doi')) + if has_pdf: + pdf_count += 1 + if has_doi: + doi_count += 1 if citation.data is None: print("Citation missing! {}".format(cite['paperId'])) continue @@ -50,12 +145,11 @@ def process_paper(fn, addresses): next_address = addresses.find(institution) if next_address: address = next_address - geocode_count += 1 geocoded_institutions.append(institution) else: unknown_institutions.append(institution) if not address: - if os.path.exists(file_path('pdf', citationId, 'paper.txt')): + if has_pdf: headings, found_abstract = read_headings(file_path('pdf', citationId, 'paper.txt'), citation) heading_string = '\n'.join(headings[0:20]) found_addresses = [] @@ -66,18 +160,22 @@ def process_paper(fn, addresses): next_address = addresses.find(l) if next_address: address = next_address - geocode_count += 1 geocoded_institutions.append(heading) else: unknown_institutions.append(heading) + else: + empty_citations.append([ + citationId, + citation.title, + ]) - res['citations'].append({ - 'title': citation.title, - 'journal': citation.journal, - 'authors': citation.authors, - 'institutions': [inst[1] for inst in institutions], - 'geocoded': geocoded_institutions, - }) + # res['citations'].append({ + # 'title': citation.title, + # 'journal': citation.journal, + # 'authors': citation.authors, + # 'institutions': [inst[1] for inst in institutions], + # 'geocoded': geocoded_institutions, + # }) if address: geocoded_citations.append([ citation.title, @@ -88,20 +186,16 @@ def process_paper(fn, addresses): ] + address) else: unknown_citations.append([ + # citationId, citation.title, '
'.join(unknown_institutions), ]) - - paper_institutions = load_institutions(paper.paper_id) - paper_address = None - for inst in sorted(paper_institutions, key=operator.itemgetter(1)): - # print(inst[1]) - address_count += 1 - institution = inst[1] - paper_address = addresses.find(institution) - - if paper_address: - print(paper_address) + res['citation_count'] = len(data['citations']) + res['citations_geocoded'] = len(geocoded_citations) + res['citations_unknown'] = len(unknown_citations) + res['citations_empty'] = len(empty_citations) + res['citations_pdf'] = pdf_count + res['citations_doi'] = doi_count total_citations = len(geocoded_citations) + len(unknown_citations) os.makedirs('reports/papers/', exist_ok=True) @@ -122,9 +216,10 @@ def process_paper(fn, addresses): f.write('
  • Journal: {}
  • '.format(paper.journal)) if paper_address: f.write('
  • Research institution: {}
  • '.format(paper_address[0])) - f.write('
  • Address: {}
  • '.format(paper_address[3])) - f.write('
  • {}
  • '.format(paper.year)) - f.write('
  • {} / {} citations were located ({} %).
  • '.format(len(geocoded_citations), total_citations, math.floor(len(geocoded_citations) / total_citations * 100))) + f.write('
  • Address: {}
  • '.format(paper_address[2])) + f.write('
  • Lat/Lng: {}, {}
  • '.format(paper_address[3], paper_address[4])) + f.write('
  • Year: {}
  • '.format(paper.year)) + f.write('
  • Coverage: {} / {} citations were located ({} %).
  • '.format(len(geocoded_citations), total_citations, math.floor(len(geocoded_citations) / total_citations * 100))) f.write('') f.write('

    {}

    '.format('Geocoded Citations')) write_table(f, keys=None, rows=sorted(display_geocoded_citations, key=operator.itemgetter(0))) @@ -145,6 +240,16 @@ def process_paper(fn, addresses): f.write("") return res +def load_megapixels_queries(): + keys, rows = read_csv('datasets/citations-2018310.csv') + lookup = {} + for row in rows: + rec = {} + for index, key in enumerate(keys): + rec[key] = row[index] + lookup[rec['Title'].strip()] = rec + return lookup + def load_institutions(paperId): if os.path.exists(file_path('pdf', paperId, 'institutions.json')): return read_json(file_path('pdf', paperId, 'institutions.json'))['institutions'] -- cgit v1.2.3-70-g09d2