import os
import sys
import csv
import subprocess
import time
import random
import re
import operator
import urllib.parse

import click

from s2 import SemanticScholarAPI
from util import *

s2 = SemanticScholarAPI()


@click.command()
def fetch_papers():
    """Build HTML sanity-check reports comparing our paper titles to S2's.

    Reads ``./datasets/citation_lookup.csv`` (each row unpacks to
    ``key, name, title, paper_id``), fetches the matching paper record,
    resolves an address from the paper's institutions, and writes three
    reports under ``./reports/``: all papers, papers whose title differs
    from the fetched title (case/whitespace-insensitive), and papers for
    which no address was found.
    """
    addresses = AddressBook()
    # The first element returned by read_csv is not needed here.
    _, lines = read_csv('./datasets/citation_lookup.csv')
    report_keys = [
        "key", "name", "our title", 'found title', '', '', 'address', 's2 id'
    ]
    all_rows = []
    no_location_rows = []
    nonmatching_rows = []

    for line in lines:
        key, name, title, paper_id = line
        # NOTE(review): assumes fetch_paper always returns a mapping with
        # 'title' and 'paperId' -- confirm it cannot return None.
        paper = fetch_paper(s2, paper_id)
        db_paper = load_paper(paper_id)
        pdf_link = db_paper.pdf_link if db_paper else ""

        paper_address = _first_known_address(
            addresses, load_institutions(paper_id))

        normalized_title = title.strip().lower()
        # Percent-encode the title so characters such as '&', '#', '+' or
        # '"' cannot truncate the query or break the generated link.
        s2_link = "https://www.semanticscholar.org/search?q={}&sort=relevance".format(
            urllib.parse.quote(normalized_title, safe=''))

        row = [
            key,
            name,
            title,
            paper['title'],
            LinkLine(pdf_link, '[pdf]'),
            LinkLine(s2_link, '[s2]'),
            paper_address,
            paper['paperId'],
        ]
        all_rows.append(row)
        if normalized_title != paper['title'].strip().lower():
            nonmatching_rows.append(row)
        if paper_address == '':
            no_location_rows.append(row)

    write_report('./reports/paper_title_report.html',
                 'Paper Title Sanity Check', report_keys, all_rows)
    write_report('./reports/paper_title_report_nonmatching.html',
                 'Paper Titles that do not match', report_keys,
                 nonmatching_rows)
    write_report('./reports/paper_title_report_no_location.html',
                 'Papers with no location', report_keys, no_location_rows)


def _first_known_address(addresses, institutions):
    """Return the address of the first institution the address book knows.

    Institutions are visited in order of their second element (the value
    passed to ``addresses.findObject``). Returns ``""`` when no institution
    yields a match.
    """
    for inst in sorted(institutions, key=operator.itemgetter(1)):
        match = addresses.findObject(inst[1])
        if match is not None:
            return match['address']
    return ""


def load_institutions(paperId):
    """Load the ``'institutions'`` entry for a paper.

    Looks for ``institutions.json`` under the ``'pdf'`` data directory
    first, then under ``'doi'``; returns ``[]`` when neither file exists.
    """
    for source in ('pdf', 'doi'):
        path = file_path(source, paperId, 'institutions.json')
        if os.path.exists(path):
            return read_json(path)['institutions']
    return []


def data_path(key, paper_id):
    """Return the dataset directory for *paper_id* under category *key*.

    Directories are sharded by the first two characters of the paper id.
    """
    return 'datasets/s2/{}/{}/{}'.format(key, paper_id[0:2], paper_id)


def file_path(key, paper_id, fn):
    """Return the path of file *fn* inside the paper's dataset directory."""
    return os.path.join(data_path(key, paper_id), fn)


if __name__ == '__main__':
    fetch_papers()