diff options
Diffstat (limited to 's2-citation-report.py')
| -rw-r--r-- | s2-citation-report.py | 67 |
1 file changed, 36 insertions, 31 deletions
diff --git a/s2-citation-report.py b/s2-citation-report.py
index 19b018f8..58b7ed8f 100644
--- a/s2-citation-report.py
+++ b/s2-citation-report.py
@@ -1,5 +1,5 @@
 import os
-import gzip
+import re
 import glob
 import json
 import math
@@ -9,7 +9,7 @@ from util import *
 
 @click.command()
 def s2_citation_report():
-    addresses = load_addresses()
+    addresses = AddressBook()
     for fn in glob.iglob('datasets/s2/papers/**/*.json', recursive=True):
         process_paper(fn, addresses)
 
@@ -40,22 +40,37 @@ def process_paper(fn, addresses):
             continue
         institutions = load_institutions(citationId)
         geocoded_institutions = []
+        unknown_institutions = []
         institution = ''
         address = None
         for inst in sorted(institutions, key=operator.itemgetter(1)):
             # print(inst[1])
             address_count += 1
             institution = inst[1]
-            if institution in addresses:
-                address = addresses[institution]
+            next_address = addresses.find(institution)
+            if next_address:
+                address = next_address
                 geocode_count += 1
                 geocoded_institutions.append(institution)
             else:
-                for part in institution.split(', '):
-                    if part in addresses:
-                        address = addresses[part]
-                        geocode_count += 1
-                        geocoded_institutions.append(institution)
+                unknown_institutions.append(institution)
+        if not address:
+            if os.path.exists(file_path('pdf', citationId, 'paper.txt')):
+                headings, found_abstract = read_headings(file_path('pdf', citationId, 'paper.txt'), citation)
+                heading_string = '\n'.join(headings[0:20])
+                found_addresses = []
+                if len(headings):
+                    for heading in headings:
+                        l = heading.lower().strip()
+                        if l:
+                            next_address = addresses.find(l)
+                            if next_address:
+                                address = next_address
+                                geocode_count += 1
+                                geocoded_institutions.append(heading)
+                            else:
+                                unknown_institutions.append(heading)
+
         res['citations'].append({
             'title': citation.title,
             'journal': citation.journal,
@@ -63,31 +78,27 @@ def process_paper(fn, addresses):
             'institutions': [inst[1] for inst in institutions],
             'geocoded': geocoded_institutions,
         })
-        if len(geocoded_institutions):
+        if address:
             geocoded_citations.append([
                 citation.title,
                 institution,
-                address,
-            ])
+            ] + address)
             display_geocoded_citations.append([
                 citation.title,
-                institution,
-                ', '.join(address),
-            ])
+            ] + address)
         else:
             unknown_citations.append([
                 citation.title,
-                institution,
+                '<br>'.join(unknown_institutions),
             ])
 
     paper_institutions = load_institutions(paper.paper_id)
     paper_address = None
-    for inst in sorted(institutions, key=operator.itemgetter(1)):
+    for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
         # print(inst[1])
         address_count += 1
         institution = inst[1]
-        if institution in addresses:
-            paper_address = addresses[institution]
+        paper_address = addresses.find(institution)
 
     if paper_address:
         print(paper_address)
@@ -134,24 +145,18 @@ def process_paper(fn, addresses):
         f.write("</html>")
     return res
 
-def load_addresses():
-    data = read_csv('reports/all_institutions.csv', keys=None)
-    lookup = {}
-    for row in data:
-        name = row[0]
-        lookup[name] = row
-    return lookup
-
 def load_institutions(paperId):
-    if os.path.exists(os.path.join(data_path('pdf', paperId), 'institutions.json')):
-        return read_json(os.path.join(data_path('pdf', paperId), 'institutions.json'))['institutions']
-    elif os.path.exists(os.path.join(data_path('doi', paperId), 'institutions.json')):
-        return read_json(os.path.join(data_path('doi', paperId), 'institutions.json'))['institutions']
+    if os.path.exists(file_path('pdf', paperId, 'institutions.json')):
+        return read_json(file_path('pdf', paperId, 'institutions.json'))['institutions']
+    elif os.path.exists(file_path('doi', paperId, 'institutions.json')):
+        return read_json(file_path('doi', paperId, 'institutions.json'))['institutions']
     else:
         return []
 
 def data_path(key, paper_id):
     return 'datasets/s2/{}/{}/{}'.format(key, paper_id[0:2], paper_id)
 
+def file_path(key, paper_id, fn):
+    return os.path.join(data_path(key, paper_id), fn)
 
 if __name__ == '__main__':
     s2_citation_report()
