# scraper/s2-papers.py
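"""Sanity-check report generator for the citation_lookup sheet.

For each row with a Semantic Scholar paper id, fetch the paper record,
resolve an institution address, and compare our title against the one
Semantic Scholar returns. Writes three HTML reports under ./reports/.
"""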
import os
import time
import operator
import importlib
from urllib.parse import quote_plus

import click

from s2 import SemanticScholarAPI
from util import *

# The raw-papers module name contains a hyphen, so it has to be loaded
# dynamically rather than with a plain import statement.
raw_papers_api = importlib.import_module('s2-raw-papers')

s2 = SemanticScholarAPI()

@click.command()
@click.option('--freshen/--no-freshen', '-f', help='Re-query the paper API instead of using cached data')
def fetch_papers(freshen):
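  """Cross-check sheet titles against Semantic Scholar and write HTML reports."""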
  addresses = AddressBook()
  lookup_keys, lines = fetch_google_sheet('citation_lookup')
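  # The two blank headers cover the unlabeled [pdf] and [s2] link columns.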
  report_keys = [
    "key", "name", "our title", 'found title', '', '', 'address', 's2 id'
  ]
  all_rows = []
  no_location_rows = []
  nonmatching_rows = []
  for line in lines:
    # Sheet columns: key, name, title, paper_id, is_unknown, notes
    key, name, title, paper_id = line[:4]
    if paper_id == '':
      continue
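    # Fetch the paper record; skip rows Semantic Scholar cannot resolve.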
    paper = fetch_paper(s2, paper_id, freshen)
    if paper is None:
      continue
    if freshen:
      raw_papers_api.fetch_raw_paper(paper_id, freshen)
    db_paper = load_paper(paper_id)
    pdf_link = db_paper.pdf_link if db_paper else ""

    # Take the first institution (sorted by name) that resolves to an address.
    paper_institutions = load_institutions(paper_id)
    paper_address = None
    for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
      paper_address = addresses.findObject(inst[1])
      if paper_address is not None:
        break

    if paper_address is None:
      paper_address = ''
    elif 'name' in paper_address:
      paper_address = paper_address['name']
    else:
      print('paper_address has an unexpected shape:')
      print(paper_address)
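    # Throttle so re-queries do not hammer the API.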
    if freshen:
      time.sleep(2)

    s2_link = 'https://www.semanticscholar.org/search?q={}&sort=relevance'.format(
      quote_plus(title.strip().lower()))
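    # Column order mirrors report_keys above.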
    row = [
      key,
      name,
      title,
      paper['title'],
      LinkLine(pdf_link, '[pdf]'),
      LinkLine(s2_link, '[s2]'),
      paper_address,
      paper['paperId'],
    ]
    all_rows.append(row)
    if title.strip().lower() != paper['title'].strip().lower():
      nonmatching_rows.append(row)
    if paper_address == '':
      no_location_rows.append(row)
  write_report('./reports/paper_title_report.html', 'Paper Title Sanity Check', report_keys, all_rows)
  write_report('./reports/paper_title_report_nonmatching.html', 'Paper Titles that do not match', report_keys, nonmatching_rows)
  write_report('./reports/paper_title_report_no_location.html', 'Papers with no location', report_keys, no_location_rows)

def load_institutions(paper_id):
  """Return a paper's extracted institutions, preferring pdf data over doi."""
  for key in ('pdf', 'doi'):
    path = file_path(key, paper_id, 'institutions.json')
    if os.path.exists(path):
      return read_json(path)['institutions']
  return []

def data_path(key, paper_id):
  """Directory for a paper's cached data, sharded by the first two id chars."""
  return 'datasets/s2/{}/{}/{}'.format(key, paper_id[0:2], paper_id)

def file_path(key, paper_id, fn):
  """Path to a named file inside a paper's data directory."""
  return os.path.join(data_path(key, paper_id), fn)

if __name__ == '__main__':
  fetch_papers()