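"""Cross-check paper titles from the 'citation_lookup' Google Sheet against
Semantic Scholar.

For every row that has a Semantic Scholar paper id, fetch the paper, resolve
its institutions to a known address, and write three HTML reports: all
papers, papers whose titles do not match the sheet, and papers with no
resolved location.
"""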

import importlib
import operator
import os
from urllib.parse import quote_plus

import click

from s2 import SemanticScholarAPI
# util provides AddressBook, fetch_google_sheet, fetch_paper, load_paper,
# LinkLine, read_json, and write_report.
from util import *

# The module name contains a hyphen, so it cannot be imported with a plain
# import statement.
raw_papers_api = importlib.import_module('s2-raw-papers')

s2 = SemanticScholarAPI()


@click.command()
@click.option('--freshen/--no-freshen', '-f',
              help='Re-query the paper API even when a cached result exists.')
def fetch_papers(freshen):
    """Build the paper title sanity-check reports."""
    addresses = AddressBook()
    lookup_keys, lines = fetch_google_sheet('citation_lookup')
    report_keys = [
        'key', 'name', 'our title', 'found title', 'pdf', 's2', 'address', 's2 id',
    ]
    all_rows = []
    no_location_rows = []
    nonmatching_rows = []
    for line in lines:
        # Sheet columns: key, name, title, paper_id, is_unknown, notes
        key, name, title, paper_id = line[:4]
        if paper_id == '':
            # Skip rows that have not been matched to a paper yet.
            continue
        paper = fetch_paper(s2, paper_id, freshen)
        if paper is None:
            continue
        if freshen:
            raw_papers_api.fetch_raw_paper(paper_id, freshen)
        db_paper = load_paper(paper_id)
        pdf_link = db_paper.pdf_link if db_paper else ''
        paper_institutions = load_institutions(paper_id)
        # Take the address of the first institution (sorted by name) that the
        # address book recognizes.
        paper_address = None
        for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
            institution = inst[1]
            if paper_address is None:
                paper_address = addresses.findObject(institution)
        if paper_address is None:
            paper_address = ''
        elif 'name' in paper_address:
            paper_address = paper_address['name']
        else:
            print('paper_address has an unexpected shape:')
            print(paper_address)
s2_link = "https://www.semanticscholar.org/search?q={}&sort=relevance".format(title.strip().lower())
row = [
key,
name,
title,
paper['title'],
LinkLine(pdf_link, '[pdf]'),
LinkLine(s2_link, '[s2]'),
paper_address,
paper['paperId'],
]
all_rows.append(row)
if title.strip().lower() != paper['title'].strip().lower():
nonmatching_rows.append(row)
if paper_address == '':
no_location_rows.append(row)
write_report('./reports/paper_title_report.html', 'Paper Title Sanity Check', report_keys, all_rows)
write_report('./reports/paper_title_report_nonmatching.html', 'Paper Titles that do not match', report_keys, nonmatching_rows)
write_report('./reports/paper_title_report_no_location.html', 'Papers with no location', report_keys, no_location_rows)


def load_institutions(paper_id):
    """Return the institutions recorded for a paper, preferring pdf-derived
    data over doi-derived data."""
    for key in ('pdf', 'doi'):
        path = file_path(key, paper_id, 'institutions.json')
        if os.path.exists(path):
            return read_json(path)['institutions']
    return []


def data_path(key, paper_id):
    # Papers are sharded by the first two characters of their id.
    return 'datasets/s2/{}/{}/{}'.format(key, paper_id[0:2], paper_id)


def file_path(key, paper_id, fn):
    return os.path.join(data_path(key, paper_id), fn)
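
# For example, with a hypothetical id 'ab12cd', file_path('pdf', 'ab12cd',
# 'institutions.json') resolves to 'datasets/s2/pdf/ab/ab12cd/institutions.json'.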


if __name__ == '__main__':
    fetch_papers()
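
# Example invocations (assuming this file is saved as fetch_papers.py):
#   python fetch_papers.py            # use cached API results
#   python fetch_papers.py --freshen  # re-query the Semantic Scholar API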