summary refs log tree commit diff
path: root/s2-citation-report.py
diff options
context:
space:
mode:
Diffstat (limited to 's2-citation-report.py')
-rw-r--r-- s2-citation-report.py 67
1 files changed, 36 insertions, 31 deletions
diff --git a/s2-citation-report.py b/s2-citation-report.py
index 19b018f8..58b7ed8f 100644
--- a/s2-citation-report.py
+++ b/s2-citation-report.py
@@ -1,5 +1,5 @@
import os
-import gzip
+import re
import glob
import json
import math
@@ -9,7 +9,7 @@ from util import *
@click.command()
def s2_citation_report():
- addresses = load_addresses()
+ addresses = AddressBook()
for fn in glob.iglob('datasets/s2/papers/**/*.json', recursive=True):
process_paper(fn, addresses)
@@ -40,22 +40,37 @@ def process_paper(fn, addresses):
continue
institutions = load_institutions(citationId)
geocoded_institutions = []
+ unknown_institutions = []
institution = ''
address = None
for inst in sorted(institutions, key=operator.itemgetter(1)):
# print(inst[1])
address_count += 1
institution = inst[1]
- if institution in addresses:
- address = addresses[institution]
+ next_address = addresses.find(institution)
+ if next_address:
+ address = next_address
geocode_count += 1
geocoded_institutions.append(institution)
else:
- for part in institution.split(', '):
- if part in addresses:
- address = addresses[part]
- geocode_count += 1
- geocoded_institutions.append(institution)
+ unknown_institutions.append(institution)
+ if not address:
+ if os.path.exists(file_path('pdf', citationId, 'paper.txt')):
+ headings, found_abstract = read_headings(file_path('pdf', citationId, 'paper.txt'), citation)
+ heading_string = '\n'.join(headings[0:20])
+ found_addresses = []
+ if len(headings):
+ for heading in headings:
+ l = heading.lower().strip()
+ if l:
+ next_address = addresses.find(l)
+ if next_address:
+ address = next_address
+ geocode_count += 1
+ geocoded_institutions.append(heading)
+ else:
+ unknown_institutions.append(heading)
+
res['citations'].append({
'title': citation.title,
'journal': citation.journal,
@@ -63,31 +78,27 @@ def process_paper(fn, addresses):
'institutions': [inst[1] for inst in institutions],
'geocoded': geocoded_institutions,
})
- if len(geocoded_institutions):
+ if address:
geocoded_citations.append([
citation.title,
institution,
- address,
- ])
+ ] + address)
display_geocoded_citations.append([
citation.title,
- institution,
- ', '.join(address),
- ])
+ ] + address)
else:
unknown_citations.append([
citation.title,
- institution,
+ '<br>'.join(unknown_institutions),
])
paper_institutions = load_institutions(paper.paper_id)
paper_address = None
- for inst in sorted(institutions, key=operator.itemgetter(1)):
+ for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
# print(inst[1])
address_count += 1
institution = inst[1]
- if institution in addresses:
- paper_address = addresses[institution]
+ paper_address = addresses.find(institution)
if paper_address:
print(paper_address)
@@ -134,24 +145,18 @@ def process_paper(fn, addresses):
f.write("</html>")
return res
-def load_addresses():
- data = read_csv('reports/all_institutions.csv', keys=None)
- lookup = {}
- for row in data:
- name = row[0]
- lookup[name] = row
- return lookup
-
def load_institutions(paperId):
- if os.path.exists(os.path.join(data_path('pdf', paperId), 'institutions.json')):
- return read_json(os.path.join(data_path('pdf', paperId), 'institutions.json'))['institutions']
- elif os.path.exists(os.path.join(data_path('doi', paperId), 'institutions.json')):
- return read_json(os.path.join(data_path('doi', paperId), 'institutions.json'))['institutions']
+ if os.path.exists(file_path('pdf', paperId, 'institutions.json')):
+ return read_json(file_path('pdf', paperId, 'institutions.json'))['institutions']
+ elif os.path.exists(file_path('doi', paperId, 'institutions.json')):
+ return read_json(file_path('doi', paperId, 'institutions.json'))['institutions']
else:
return []
def data_path(key, paper_id):
return 'datasets/s2/{}/{}/{}'.format(key, paper_id[0:2], paper_id)
+def file_path(key, paper_id, fn):
+ return os.path.join(data_path(key, paper_id), fn)
if __name__ == '__main__':
s2_citation_report()