|  |  |  |
|---|---|---|
| author | Jules Laplace <julescarbon@gmail.com> | 2019-02-08 23:19:04 +0100 |
| committer | Jules Laplace <julescarbon@gmail.com> | 2019-02-08 23:19:04 +0100 |
| commit | 8e26cbff5171fb204082e1b6778d17f786c1eb16 (patch) | |
| tree | f8420a6268d1c624572091881f0b02cf17d0b695 /scraper/s2-papers.py | |
| parent | 6059ce2eb68a931a4cbb12049c202c3299e4966b (diff) | |
reports of which paper titles matched
Diffstat (limited to 'scraper/s2-papers.py')
|  |  |  |
|---|---|---|
| -rw-r--r-- | scraper/s2-papers.py | 88 |
1 file changed, 60 insertions(+), 28 deletions(-)
```diff
diff --git a/scraper/s2-papers.py b/scraper/s2-papers.py
index bf77a734..86e2d614 100644
--- a/scraper/s2-papers.py
+++ b/scraper/s2-papers.py
@@ -5,42 +5,74 @@ import subprocess
 import time
 import random
 import re
-import simplejson as json
+import operator
 import click
 from s2 import SemanticScholarAPI
 from util import *
 
-'''
-s2 search API format:
-results
-matchedAuthors
-matchedPresentations
-query
-querySuggestions
-results
-stats
-totalPages
-totalResults
-'''
-
 s2 = SemanticScholarAPI()
 
 @click.command()
-@click.option('--index', '-n', default=0, help='Index of CSV (query,)')
-@click.option('--depth', '-d', default=1, help='Depth to recurse (not implemented).')
-def fetch_papers(index, depth):
-    keys, lines = read_citation_list(index)
+def fetch_papers():
+    addresses = AddressBook()
+    lookup_keys, lines = read_csv('./datasets/citation_lookup.csv')
+    report_keys = [
+        "key", "name", "our title", 'found title', '', '', 'address', 's2 id'
+    ]
+    all_rows = []
+    no_location_rows = []
+    nonmatching_rows = []
     for line in lines:
-        label = line[0]
-        title = re.sub(r'[^-0-9a-zA-Z ]+', '', line[1])
-        entry_fn = './datasets/s2/entries/{}.json'.format(title)
-        if not os.path.exists(entry_fn):
-            print('not found: {}'.format(entry_fn))
-            continue
-        result = read_json(entry_fn)
-        paper_id = result['id']
-        paper = fetch_paper(paper_id)
-        # get all of the paper's citations
+        key, name, title, paper_id = line
+        paper = fetch_paper(s2, paper_id)
+        db_paper = load_paper(paper_id)
+        pdf_link = db_paper.pdf_link if db_paper else ""
+
+        paper_institutions = load_institutions(paper_id)
+        paper_address = None
+        for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
+            # print(inst[1])
+            institution = inst[1]
+            if paper_address is None:
+                paper_address = addresses.findObject(institution)
+
+        if paper_address is None:
+            paper_address = ""
+        else:
+            paper_address = paper_address['address']
+
+        s2_link = "https://www.semanticscholar.org/search?q={}&sort=relevance".format(title.strip().lower())
+        row = [
+            key,
+            name,
+            title,
+            paper['title'],
+            LinkLine(pdf_link, '[pdf]'),
+            LinkLine(s2_link, '[s2]'),
+            paper_address,
+            paper['paperId'],
+        ]
+        all_rows.append(row)
+        if title.strip().lower() != paper['title'].strip().lower():
+            nonmatching_rows.append(row)
+        if paper_address == '':
+            no_location_rows.append(row)
+    write_report('./reports/paper_title_report.html', 'Paper Title Sanity Check', report_keys, all_rows)
+    write_report('./reports/paper_title_report_nonmatching.html', 'Paper Titles that do not match', report_keys, nonmatching_rows)
+    write_report('./reports/paper_title_report_no_location.html', 'Papers with no location', report_keys, no_location_rows)
+
+def load_institutions(paperId):
+    if os.path.exists(file_path('pdf', paperId, 'institutions.json')):
+        return read_json(file_path('pdf', paperId, 'institutions.json'))['institutions']
+    elif os.path.exists(file_path('doi', paperId, 'institutions.json')):
+        return read_json(file_path('doi', paperId, 'institutions.json'))['institutions']
+    else:
+        return []
+
+def data_path(key, paper_id):
+    return 'datasets/s2/{}/{}/{}'.format(key, paper_id[0:2], paper_id)
+def file_path(key, paper_id, fn):
+    return os.path.join(data_path(key, paper_id), fn)
 
 if __name__ == '__main__':
     fetch_papers()
```
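The report rows above mix plain strings with `LinkLine` cells, and each list of rows is flushed to a standalone HTML page by `write_report`. Both helpers come from `scraper/util.py` (pulled in by `from util import *`) and are not shown in this patch; the following is a minimal sketch of the shape their call sites imply, not the actual implementation.

```python
# Hypothetical sketch of the util helpers this patch calls; the real
# definitions live in scraper/util.py and may differ.
import html

class LinkLine:
    """A report cell that should render as a hyperlink, not escaped text."""
    def __init__(self, href, text):
        self.href = href
        self.text = text

    def render(self):
        if not self.href:
            return ''  # e.g. papers with no pdf_link produce an empty cell
        return '<a href="{}">{}</a>'.format(
            html.escape(self.href, quote=True), html.escape(self.text))

def write_report(path, title, keys, rows):
    """Dump rows as a single-table HTML page; keys become the header row."""
    def cell(value):
        return value.render() if isinstance(value, LinkLine) else html.escape(str(value))

    with open(path, 'w') as f:
        f.write('<html><head><title>{0}</title></head><body><h1>{0}</h1>\n'.format(html.escape(title)))
        f.write('<table>\n<tr>{}</tr>\n'.format(''.join('<th>{}</th>'.format(html.escape(k)) for k in keys)))
        for row in rows:
            f.write('<tr>{}</tr>\n'.format(''.join('<td>{}</td>'.format(cell(v)) for v in row)))
        f.write('</table></body></html>\n')
```

Keeping links as objects until render time lets `write_report` escape ordinary cells uniformly while still emitting real anchors for the `[pdf]` and `[s2]` columns.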
