path: root/s2-citation-report.py
author    Jules Laplace <julescarbon@gmail.com>  2018-11-10 15:59:24 +0100
committer Jules Laplace <julescarbon@gmail.com>  2018-11-10 15:59:24 +0100
commit    e8ce7876c5869522f982073d70c3ee7be179e1f9 (patch)
tree      367d30870781187f4f78eb074cb7cb0b632aa0c7 /s2-citation-report.py
parent    c412e5f0f8b71d137e4f18f8a8c7361e15c8f500 (diff)
citation coverage reports
Diffstat (limited to 's2-citation-report.py')
-rw-r--r--  s2-citation-report.py  167
1 file changed, 136 insertions(+), 31 deletions(-)
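
This commit turns the per-paper citation pass into a two-level report: each paper still gets its own page of geocoded and unknown citations, and two new index pages rank all papers, one by paper ID and one by citation coverage (the share of a paper's citations that could be geocoded). The Coverage column is rendered via util's percent() helper, whose implementation is not part of this diff; a minimal sketch of what the call site implies, with the (total, part) argument order being an assumption:

    # Hypothetical stand-in for util.percent(); only its call site appears in this diff.
    def percent(total, part):
        if total == 0:
            return 0  # assumed guard; the real helper may behave differently
        return round(part / total * 100)

    percent(48, 12)  # -> 25: 12 of 48 citations geocoded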
diff --git a/s2-citation-report.py b/s2-citation-report.py
index 58b7ed8f..26e148fe 100644
--- a/s2-citation-report.py
+++ b/s2-citation-report.py
@@ -10,16 +10,83 @@ from util import *
 @click.command()
 def s2_citation_report():
     addresses = AddressBook()
+    megapixels = load_megapixels_queries()
+    papers = []
     for fn in glob.iglob('datasets/s2/papers/**/*.json', recursive=True):
-        process_paper(fn, addresses)
+        paper_data = process_paper(fn, addresses, megapixels)
+        papers.append(paper_data)
+    write_papers_report('reports/report_index.html', 'All Papers', papers, 'paperId')
+    write_papers_report('reports/report_coverage.html', 'Coverage', papers, 'citations_geocoded', reverse=True)
+
+def write_papers_report(fn, title, papers, key, reverse=False):
+    sorted_papers = []
+    for paper in sorted(papers, key=lambda x: x[key], reverse=reverse):
+        sorted_papers.append([
+            paper['paperId'],
+            paper['key'],
+            LinkLine(paper['report_link'], paper['title']),
+            LinkLine(paper['pdf_link'], '[pdf]'),
+            paper['journal'],
+            paper['address_type'],
+            paper['address'],
+            paper['lat'],
+            paper['lng'],
+            str(percent(paper['citation_count'], paper['citations_geocoded'])) + '%',
+            paper['citation_count'],
+            paper['citations_geocoded'],
+            paper['citations_unknown'],
+            paper['citations_empty'],
+            paper['citations_pdf'],
+            paper['citations_doi'],
+        ])
+    sorted_paper_keys = [
+        'Paper ID',
+        'Megapixels Key',
+        'Report Link',
+        'PDF Link',
+        'Journal',
+        'Type',
+        'Address',
+        'Lat',
+        'Lng',
+        'Coverage',
+        'Total Citations',
+        'Geocoded Citations',
+        'Unknown Citations',
+        'Empty Citations',
+        'With PDF',
+        'With DOI',
+    ]
+    write_report(fn, title=title, keys=sorted_paper_keys, rows=sorted_papers)
 
-def process_paper(fn, addresses):
-    res = {}
-    address_count = 0
-    geocode_count = 0
+def process_paper(fn, addresses, megapixels):
+    res = {
+        'paperId': '',
+        'key': '',
+        'title': '',
+        'journal': '',
+        'address': '',
+        'address_type': '',
+        'lat': '',
+        'lng': '',
+        'pdf_link': '',
+        'report_link': '',
+        'citation_count': 0,
+        'citations_geocoded': 0,
+        'citations_unknown': 0,
+        'citations_empty': 0,
+        'citations_pdf': 0,
+        'citations_doi': 0,
+    }
     geocoded_citations = []
     unknown_citations = []
     display_geocoded_citations = []
+    empty_citations = []
+    pdf_count = 0
+    doi_count = 0
+    address_count = 0
+
     with open(fn, 'r') as f:
         data = json.load(f)
     print('>> {}'.format(data['paperId']))
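
write_papers_report emits one row per paper in the same order as sorted_paper_keys and hands both lists to util's write_report(). Note that process_paper returns None for papers whose cached data is missing, and the caller above appends that result unconditionally, so the sorted(papers, key=...) call would need those entries filtered out. The table contract is just parallel key/row lists; a self-contained sketch of a write_report() with that shape (the real helper and LinkLine live in util and are not shown here, so this HTML layout is an assumption):

    # Hypothetical minimal write_report(): pair column keys with row values in an HTML table.
    def write_report(fn, title, keys, rows):
        with open(fn, 'w') as f:
            f.write('<html><body><h2>{}</h2><table>'.format(title))
            f.write('<tr>' + ''.join('<th>{}</th>'.format(k) for k in keys) + '</tr>')
            for row in rows:
                f.write('<tr>' + ''.join('<td>{}</td>'.format(v) for v in row) + '</tr>')
            f.write('</table></body></html>')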
@@ -27,14 +94,42 @@ def process_paper(fn, addresses):
     if paper.data is None:
         print("Paper missing! {}".format(data['paperId']))
         return
+
     res['paperId'] = paper.paper_id
     res['title'] = paper.title
     res['journal'] = paper.journal
-    res['authors'] = paper.authors
-    res['citations'] = []
+    res['report_link'] = 'papers/{}.html'.format(paper.paper_id)
+    res['pdf_link'] = paper.pdf_link
+    # res['authors'] = ', '.join(paper.authors)
+    # res['citations'] = []
+
+    if res['title'] in megapixels:
+        res['key'] = megapixels[res['title']]['Database Name']
+
+    paper_institutions = load_institutions(paper.paper_id)
+    paper_address = None
+    for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
+        # print(inst[1])
+        institution = inst[1]
+        if paper_address is None:
+            paper_address = addresses.find(institution)
+
+    if paper_address:
+        print(paper_address)
+        res['address'] = paper_address[0]
+        res['lat'] = paper_address[3]
+        res['lng'] = paper_address[4]
+        res['address_type'] = paper_address[5]
+
     for cite in data['citations']:
         citationId = cite['paperId']
         citation = load_paper(citationId)
+        has_pdf = os.path.exists(file_path('pdf', citationId, 'paper.txt'))
+        has_doi = os.path.exists(file_path('doi', citationId, 'paper.doi'))
+        if has_pdf:
+            pdf_count += 1
+        if has_doi:
+            doi_count += 1
         if citation.data is None:
             print("Citation missing! {}".format(cite['paperId']))
             continue
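
The relocated institution loop keeps the first address-book hit (the if paper_address is None guard), where the removed version further down overwrote paper_address on every iteration and so kept only the last lookup, even a failed one. The same first-match idea as a standalone helper, assuming the institutions are (id, name) pairs and addresses.find() returns None for unknown names:

    # Hypothetical helper mirroring the loop above: first geocoded
    # institution from (id, name) pairs, or None if none resolve.
    def first_address(addresses, institutions):
        for _, name in sorted(institutions, key=lambda inst: inst[1]):
            address = addresses.find(name)
            if address is not None:
                return address
        return None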
@@ -50,12 +145,11 @@ def process_paper(fn, addresses):
             next_address = addresses.find(institution)
             if next_address:
                 address = next_address
-                geocode_count += 1
                 geocoded_institutions.append(institution)
             else:
                 unknown_institutions.append(institution)
         if not address:
-            if os.path.exists(file_path('pdf', citationId, 'paper.txt')):
+            if has_pdf:
                 headings, found_abstract = read_headings(file_path('pdf', citationId, 'paper.txt'), citation)
                 heading_string = '\n'.join(headings[0:20])
                 found_addresses = []
@@ -66,18 +160,22 @@ def process_paper(fn, addresses):
                     next_address = addresses.find(l)
                     if next_address:
                         address = next_address
-                        geocode_count += 1
                         geocoded_institutions.append(heading)
                     else:
                         unknown_institutions.append(heading)
-        res['citations'].append({
-            'title': citation.title,
-            'journal': citation.journal,
-            'authors': citation.authors,
-            'institutions': [inst[1] for inst in institutions],
-            'geocoded': geocoded_institutions,
-        })
+            else:
+                empty_citations.append([
+                    citationId,
+                    citation.title,
+                ])
+        # res['citations'].append({
+        #     'title': citation.title,
+        #     'journal': citation.journal,
+        #     'authors': citation.authors,
+        #     'institutions': [inst[1] for inst in institutions],
+        #     'geocoded': geocoded_institutions,
+        # })
         if address:
             geocoded_citations.append([
                 citation.title,
@@ -88,20 +186,16 @@ def process_paper(fn, addresses):
             ] + address)
         else:
             unknown_citations.append([
+                # citationId,
                 citation.title,
                 '<br>'.join(unknown_institutions),
             ])
-
-    paper_institutions = load_institutions(paper.paper_id)
-    paper_address = None
-    for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
-        # print(inst[1])
-        address_count += 1
-        institution = inst[1]
-        paper_address = addresses.find(institution)
-
-    if paper_address:
-        print(paper_address)
+    res['citation_count'] = len(data['citations'])
+    res['citations_geocoded'] = len(geocoded_citations)
+    res['citations_unknown'] = len(unknown_citations)
+    res['citations_empty'] = len(empty_citations)
+    res['citations_pdf'] = pdf_count
+    res['citations_doi'] = doi_count
     total_citations = len(geocoded_citations) + len(unknown_citations)
     os.makedirs('reports/papers/', exist_ok=True)
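
total_citations counts geocoded plus unknown citations, i.e. every citation whose cached record loaded; citations skipped at the continue above are still counted in res['citation_count']. The per-paper page percentage can therefore differ from the index's Coverage column, which is computed as percent(citation_count, citations_geocoded) over the full citation list.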
@@ -122,9 +216,10 @@ def process_paper(fn, addresses):
         f.write('<li>Journal: {}</li>'.format(paper.journal))
         if paper_address:
             f.write('<li>Research institution: {}</li>'.format(paper_address[0]))
-            f.write('<li>Address: {}</li>'.format(paper_address[3]))
-        f.write('<li>{}</li>'.format(paper.year))
-        f.write('<li>{} / {} citations were located ({} %).</li>'.format(len(geocoded_citations), total_citations, math.floor(len(geocoded_citations) / total_citations * 100)))
+            f.write('<li>Address: {}</li>'.format(paper_address[2]))
+            f.write('<li>Lat/Lng: {}, {}</li>'.format(paper_address[3], paper_address[4]))
+        f.write('<li>Year: {}</li>'.format(paper.year))
+        f.write('<li>Coverage: {} / {} citations were located ({} %).</li>'.format(len(geocoded_citations), total_citations, math.floor(len(geocoded_citations) / total_citations * 100)))
         f.write('</ul>')
         f.write('<h3>{}</h3>'.format('Geocoded Citations'))
         write_table(f, keys=None, rows=sorted(display_geocoded_citations, key=operator.itemgetter(0)))
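
One hazard in the coverage line: the division by total_citations raises ZeroDivisionError when a paper has no loadable citations at all. A guard along these lines (not part of this commit) would keep report generation from aborting:

    # Hypothetical guard; the commit divides unconditionally.
    coverage = math.floor(len(geocoded_citations) / total_citations * 100) if total_citations else 0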
@@ -145,6 +240,16 @@ def process_paper(fn, addresses):
         f.write("</html>")
     return res
 
+def load_megapixels_queries():
+    keys, rows = read_csv('datasets/citations-2018310.csv')
+    lookup = {}
+    for row in rows:
+        rec = {}
+        for index, key in enumerate(keys):
+            rec[key] = row[index]
+        lookup[rec['Title'].strip()] = rec
+    return lookup
+
 def load_institutions(paperId):
     if os.path.exists(file_path('pdf', paperId, 'institutions.json')):
         return read_json(file_path('pdf', paperId, 'institutions.json'))['institutions']
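
load_megapixels_queries() builds the title lookup by zipping read_csv's header row into a dict per data row. The standard library's csv.DictReader expresses the same construction directly, assuming the file has a 'Title' header as the code above implies:

    import csv

    # Equivalent lookup using csv.DictReader instead of util.read_csv().
    def load_megapixels_queries():
        with open('datasets/citations-2018310.csv', newline='') as f:
            return {row['Title'].strip(): row for row in csv.DictReader(f)}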