path: root/scraper/s2-papers.py
author    Jules Laplace <julescarbon@gmail.com>    2019-02-08 23:19:04 +0100
committer Jules Laplace <julescarbon@gmail.com>    2019-02-08 23:19:04 +0100
commit    8e26cbff5171fb204082e1b6778d17f786c1eb16 (patch)
tree      f8420a6268d1c624572091881f0b02cf17d0b695 /scraper/s2-papers.py
parent    6059ce2eb68a931a4cbb12049c202c3299e4966b (diff)
reports of which paper titles matched
Diffstat (limited to 'scraper/s2-papers.py')
-rw-r--r--    scraper/s2-papers.py    88
1 file changed, 60 insertions(+), 28 deletions(-)
diff --git a/scraper/s2-papers.py b/scraper/s2-papers.py
index bf77a734..86e2d614 100644
--- a/scraper/s2-papers.py
+++ b/scraper/s2-papers.py
@@ -5,42 +5,74 @@ import subprocess
 import time
 import random
 import re
-import simplejson as json
+import operator
 import click
 from s2 import SemanticScholarAPI
 from util import *
 
-'''
-s2 search API format:
-results
-matchedAuthors
-matchedPresentations
-query
-querySuggestions
-results
-stats
-totalPages
-totalResults
-'''
-
 s2 = SemanticScholarAPI()
 
 @click.command()
-@click.option('--index', '-n', default=0, help='Index of CSV (query,)')
-@click.option('--depth', '-d', default=1, help='Depth to recurse (not implemented).')
-def fetch_papers(index, depth):
-    keys, lines = read_citation_list(index)
+def fetch_papers():
+    addresses = AddressBook()
+    lookup_keys, lines = read_csv('./datasets/citation_lookup.csv')
+    report_keys = [
+        "key", "name", "our title", 'found title', '', '', 'address', 's2 id'
+    ]
+    all_rows = []
+    no_location_rows = []
+    nonmatching_rows = []
     for line in lines:
-        label = line[0]
-        title = re.sub(r'[^-0-9a-zA-Z ]+', '', line[1])
-        entry_fn = './datasets/s2/entries/{}.json'.format(title)
-        if not os.path.exists(entry_fn):
-            print('not found: {}'.format(entry_fn))
-            continue
-        result = read_json(entry_fn)
-        paper_id = result['id']
-        paper = fetch_paper(paper_id)
-        # get all of the paper's citations
+        key, name, title, paper_id = line
+        paper = fetch_paper(s2, paper_id)
+        db_paper = load_paper(paper_id)
+        pdf_link = db_paper.pdf_link if db_paper else ""
+
+        paper_institutions = load_institutions(paper_id)
+        paper_address = None
+        for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
+            # print(inst[1])
+            institution = inst[1]
+            if paper_address is None:
+                paper_address = addresses.findObject(institution)
+
+        if paper_address is None:
+            paper_address = ""
+        else:
+            paper_address = paper_address['address']
+
+        s2_link = "https://www.semanticscholar.org/search?q={}&sort=relevance".format(title.strip().lower())
+        row = [
+            key,
+            name,
+            title,
+            paper['title'],
+            LinkLine(pdf_link, '[pdf]'),
+            LinkLine(s2_link, '[s2]'),
+            paper_address,
+            paper['paperId'],
+        ]
+        all_rows.append(row)
+        if title.strip().lower() != paper['title'].strip().lower():
+            nonmatching_rows.append(row)
+        if paper_address == '':
+            no_location_rows.append(row)
+    write_report('./reports/paper_title_report.html', 'Paper Title Sanity Check', report_keys, all_rows)
+    write_report('./reports/paper_title_report_nonmatching.html', 'Paper Titles that do not match', report_keys, nonmatching_rows)
+    write_report('./reports/paper_title_report_no_location.html', 'Papers with no location', report_keys, no_location_rows)
+
+def load_institutions(paperId):
+    if os.path.exists(file_path('pdf', paperId, 'institutions.json')):
+        return read_json(file_path('pdf', paperId, 'institutions.json'))['institutions']
+    elif os.path.exists(file_path('doi', paperId, 'institutions.json')):
+        return read_json(file_path('doi', paperId, 'institutions.json'))['institutions']
+    else:
+        return []
+
+def data_path(key, paper_id):
+    return 'datasets/s2/{}/{}/{}'.format(key, paper_id[0:2], paper_id)
+def file_path(key, paper_id, fn):
+    return os.path.join(data_path(key, paper_id), fn)
 
 if __name__ == '__main__':
     fetch_papers()
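
Note: AddressBook and its findObject method live in util and are not part of this patch. From the call sites above, findObject takes an institution name and returns either None or a dict carrying an 'address' field. A minimal sketch of that assumed shape (the backing file path and the matching rule here are hypothetical, not taken from util):

class AddressBook:
    # Assumed: a JSON map of institution name -> record with an 'address' field.
    def __init__(self, fn='./datasets/addresses.json'):  # hypothetical path
        self.entries = read_json(fn)  # read_json as used elsewhere in the scraper

    def findObject(self, name):
        # Case-insensitive exact match on institution name; None if absent.
        needle = name.strip().lower()
        for key, record in self.entries.items():
            if key.strip().lower() == needle:
                return record
        return None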
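
Likewise, LinkLine and write_report are util helpers not shown in this diff. Judging only from how they are called, LinkLine pairs a URL with a display label and write_report renders a header row plus data rows as an HTML table. A sketch under those assumptions, not the actual util implementation:

import html
from collections import namedtuple

# Assumed (url, label) field order, per LinkLine(pdf_link, '[pdf]') above.
LinkLine = namedtuple('LinkLine', ['url', 'label'])

def write_report(out_fn, title, keys, rows):
    # Render LinkLine cells as anchors and everything else as escaped text.
    def cell(value):
        if isinstance(value, LinkLine):
            if not value.url:
                return '<td></td>'
            return '<td><a href="{}">{}</a></td>'.format(
                html.escape(value.url, quote=True), html.escape(value.label))
        return '<td>{}</td>'.format(html.escape(str(value)))

    with open(out_fn, 'w') as f:
        f.write('<html><head><title>{0}</title></head><body><h1>{0}</h1>'.format(html.escape(title)))
        f.write('<table><tr>{}</tr>'.format(''.join('<th>{}</th>'.format(html.escape(k)) for k in keys)))
        for row in rows:
            f.write('<tr>{}</tr>'.format(''.join(cell(v) for v in row)))
        f.write('</table></body></html>')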
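
The new data_path/file_path helpers shard the cached S2 data by the first two characters of the paper id, so no single directory accumulates every entry. A worked example of the layout (the id below is a placeholder reusing this commit's hash, not a real paper id):

paper_id = '8e26cbff5171fb204082e1b6778d17f786c1eb16'  # placeholder, for illustration only

data_path('pdf', paper_id)
# -> 'datasets/s2/pdf/8e/8e26cbff5171fb204082e1b6778d17f786c1eb16'

file_path('pdf', paper_id, 'institutions.json')
# -> 'datasets/s2/pdf/8e/8e26cbff5171fb204082e1b6778d17f786c1eb16/institutions.json'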