path: root/scraper/s2-papers.py
author    Jules Laplace <julescarbon@gmail.com>    2019-02-08 23:19:04 +0100
committer Jules Laplace <julescarbon@gmail.com>    2019-02-08 23:19:04 +0100
commit    8e26cbff5171fb204082e1b6778d17f786c1eb16 (patch)
tree      f8420a6268d1c624572091881f0b02cf17d0b695 /scraper/s2-papers.py
parent    6059ce2eb68a931a4cbb12049c202c3299e4966b (diff)
reports of which paper titles matched
Diffstat (limited to 'scraper/s2-papers.py')
-rw-r--r--    scraper/s2-papers.py    88
1 file changed, 60 insertions(+), 28 deletions(-)
diff --git a/scraper/s2-papers.py b/scraper/s2-papers.py
index bf77a734..86e2d614 100644
--- a/scraper/s2-papers.py
+++ b/scraper/s2-papers.py
@@ -5,42 +5,74 @@ import subprocess
 import time
 import random
 import re
-import simplejson as json
+import operator
 import click
 from s2 import SemanticScholarAPI
 from util import *
 
-'''
-s2 search API format:
-results
-matchedAuthors
-matchedPresentations
-query
-querySuggestions
-results
-stats
-totalPages
-totalResults
-'''
-
 s2 = SemanticScholarAPI()
 
 @click.command()
-@click.option('--index', '-n', default=0, help='Index of CSV (query,)')
-@click.option('--depth', '-d', default=1, help='Depth to recurse (not implemented).')
-def fetch_papers(index, depth):
-    keys, lines = read_citation_list(index)
+def fetch_papers():
+    addresses = AddressBook()
+    lookup_keys, lines = read_csv('./datasets/citation_lookup.csv')
+    report_keys = [
+        "key", "name", "our title", 'found title', '', '', 'address', 's2 id'
+    ]
+    all_rows = []
+    no_location_rows = []
+    nonmatching_rows = []
     for line in lines:
-        label = line[0]
-        title = re.sub(r'[^-0-9a-zA-Z ]+', '', line[1])
-        entry_fn = './datasets/s2/entries/{}.json'.format(title)
-        if not os.path.exists(entry_fn):
-            print('not found: {}'.format(entry_fn))
-            continue
-        result = read_json(entry_fn)
-        paper_id = result['id']
-        paper = fetch_paper(paper_id)
-        # get all of the paper's citations
+        key, name, title, paper_id = line
+        paper = fetch_paper(s2, paper_id)
+        db_paper = load_paper(paper_id)
+        pdf_link = db_paper.pdf_link if db_paper else ""
+
+        paper_institutions = load_institutions(paper_id)
+        paper_address = None
+        for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
+            # print(inst[1])
+            institution = inst[1]
+            if paper_address is None:
+                paper_address = addresses.findObject(institution)
+
+        if paper_address is None:
+            paper_address = ""
+        else:
+            paper_address = paper_address['address']
+
+        s2_link = "https://www.semanticscholar.org/search?q={}&sort=relevance".format(title.strip().lower())
+        row = [
+            key,
+            name,
+            title,
+            paper['title'],
+            LinkLine(pdf_link, '[pdf]'),
+            LinkLine(s2_link, '[s2]'),
+            paper_address,
+            paper['paperId'],
+        ]
+        all_rows.append(row)
+        if title.strip().lower() != paper['title'].strip().lower():
+            nonmatching_rows.append(row)
+        if paper_address == '':
+            no_location_rows.append(row)
+    write_report('./reports/paper_title_report.html', 'Paper Title Sanity Check', report_keys, all_rows)
+    write_report('./reports/paper_title_report_nonmatching.html', 'Paper Titles that do not match', report_keys, nonmatching_rows)
+    write_report('./reports/paper_title_report_no_location.html', 'Papers with no location', report_keys, no_location_rows)
+
+def load_institutions(paperId):
+    if os.path.exists(file_path('pdf', paperId, 'institutions.json')):
+        return read_json(file_path('pdf', paperId, 'institutions.json'))['institutions']
+    elif os.path.exists(file_path('doi', paperId, 'institutions.json')):
+        return read_json(file_path('doi', paperId, 'institutions.json'))['institutions']
+    else:
+        return []
+
+def data_path(key, paper_id):
+    return 'datasets/s2/{}/{}/{}'.format(key, paper_id[0:2], paper_id)
+def file_path(key, paper_id, fn):
+    return os.path.join(data_path(key, paper_id), fn)
 
 if __name__ == '__main__':
     fetch_papers()
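
Note: AddressBook and its findObject method live in util and are not part of this patch. From the call sites above, findObject takes an institution name and returns either None or a dict carrying an 'address' field. A minimal sketch of that assumed shape (the backing file path and the matching rule here are hypothetical, not taken from util):

class AddressBook:
    # Assumed: a JSON map of institution name -> record with an 'address' field.
    def __init__(self, fn='./datasets/addresses.json'):  # hypothetical path
        self.entries = read_json(fn)  # read_json as used elsewhere in the scraper

    def findObject(self, name):
        # Case-insensitive exact match on institution name; None if absent.
        needle = name.strip().lower()
        for key, record in self.entries.items():
            if key.strip().lower() == needle:
                return record
        return None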
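
Likewise, LinkLine and write_report are util helpers not shown in this diff. Judging only from how they are called, LinkLine pairs a URL with a display label and write_report renders a header row plus data rows as an HTML table. A sketch under those assumptions, not the actual util implementation:

import html
from collections import namedtuple

# Assumed (url, label) field order, per LinkLine(pdf_link, '[pdf]') above.
LinkLine = namedtuple('LinkLine', ['url', 'label'])

def write_report(out_fn, title, keys, rows):
    # Render LinkLine cells as anchors and everything else as escaped text.
    def cell(value):
        if isinstance(value, LinkLine):
            if not value.url:
                return '<td></td>'
            return '<td><a href="{}">{}</a></td>'.format(
                html.escape(value.url, quote=True), html.escape(value.label))
        return '<td>{}</td>'.format(html.escape(str(value)))

    with open(out_fn, 'w') as f:
        f.write('<html><head><title>{0}</title></head><body><h1>{0}</h1>'.format(html.escape(title)))
        f.write('<table><tr>{}</tr>'.format(''.join('<th>{}</th>'.format(html.escape(k)) for k in keys)))
        for row in rows:
            f.write('<tr>{}</tr>'.format(''.join(cell(v) for v in row)))
        f.write('</table></body></html>')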
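
The new data_path/file_path helpers shard the cached S2 data by the first two characters of the paper id, so no single directory accumulates every entry. A worked example of the layout (the id below is a placeholder reusing this commit's hash, not a real paper id):

paper_id = '8e26cbff5171fb204082e1b6778d17f786c1eb16'  # placeholder, for illustration only

data_path('pdf', paper_id)
# -> 'datasets/s2/pdf/8e/8e26cbff5171fb204082e1b6778d17f786c1eb16'

file_path('pdf', paper_id, 'institutions.json')
# -> 'datasets/s2/pdf/8e/8e26cbff5171fb204082e1b6778d17f786c1eb16/institutions.json'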