From 8f0d59a5f44c71aeb4eecf60cb323d2fe0306a3e Mon Sep 17 00:00:00 2001
From: Jules Laplace
Date: Fri, 8 Mar 2019 21:02:58 +0100
Subject: possibly freshen raw papers

---
 scraper/s2-papers.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'scraper/s2-papers.py')

diff --git a/scraper/s2-papers.py b/scraper/s2-papers.py
index 782dc198..40489e75 100644
--- a/scraper/s2-papers.py
+++ b/scraper/s2-papers.py
@@ -13,7 +13,8 @@ from util import *
 s2 = SemanticScholarAPI()
 
 @click.command()
-def fetch_papers():
+@click.option('--freshen/--no-freshen', '-f', help='Force it to query the paper API again')
+def fetch_papers(freshen):
     addresses = AddressBook()
     lookup_keys, lines = fetch_google_sheet('citation_lookup')
     report_keys = [
@@ -30,7 +31,7 @@ def fetch_papers():
         paper_id = line[3]
         if paper_id == '':
             continue
-        paper = fetch_paper(s2, paper_id)
+        paper = fetch_paper(s2, paper_id, freshen)
         if paper is None:
            continue
         db_paper = load_paper(paper_id)
-- 
cgit v1.2.3-70-g09d2


From faa59f192c00dadd84165fe14a90307c908e6ab2 Mon Sep 17 00:00:00 2001
From: Jules Laplace
Date: Fri, 8 Mar 2019 21:05:03 +0100
Subject: possibly freshen raw papers

---
 scraper/s2-papers.py     | 4 ++++
 scraper/s2-raw-papers.py | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'scraper/s2-papers.py')

diff --git a/scraper/s2-papers.py b/scraper/s2-papers.py
index 40489e75..92bf8ebf 100644
--- a/scraper/s2-papers.py
+++ b/scraper/s2-papers.py
@@ -9,6 +9,8 @@ import operator
 import click
 from s2 import SemanticScholarAPI
 from util import *
+from urllib.parse import unquote
+raw_papers_api = import_module('s2-raw-papers')
 
 s2 = SemanticScholarAPI()
 
@@ -34,6 +36,8 @@ def fetch_papers(freshen):
         paper = fetch_paper(s2, paper_id, freshen)
         if paper is None:
             continue
+        if freshen:
+            raw_papers_api.fetch_raw_paper(paper_id, freshen)
         db_paper = load_paper(paper_id)
 
         pdf_link = db_paper.pdf_link if db_paper else ""
diff --git a/scraper/s2-raw-papers.py b/scraper/s2-raw-papers.py
index 612c8099..8881cda0 100644
--- a/scraper/s2-raw-papers.py
+++ b/scraper/s2-raw-papers.py
@@ -18,10 +18,10 @@ def fetch_raw_papers(fn):
     lines = read_csv(fn, keys=False)
     parallelize(fetch_raw_paper, lines)
 
-def fetch_raw_paper(paper_id):
+def fetch_raw_paper(paper_id, freshen=False):
     os.makedirs(make_raw_paper_path(paper_id), exist_ok=True)
     paper_fn = make_raw_paper_fn(paper_id)
-    if os.path.exists(paper_fn):
+    if os.path.exists(paper_fn) and not freshen:
         paper = read_json(paper_fn)
     else:
         paper = s2.raw_paper(paper_id)
-- 
cgit v1.2.3-70-g09d2


From 8396d4b3af07dc84c4243c3ee131750759ee5324 Mon Sep 17 00:00:00 2001
From: Jules Laplace
Date: Fri, 8 Mar 2019 21:07:12 +0100
Subject: possibly freshen raw papers

---
 scraper/s2-doi-report.py | 2 +-
 scraper/s2-papers.py     | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'scraper/s2-papers.py')

diff --git a/scraper/s2-doi-report.py b/scraper/s2-doi-report.py
index 1d7bf44a..ea708de2 100644
--- a/scraper/s2-doi-report.py
+++ b/scraper/s2-doi-report.py
@@ -7,8 +7,8 @@ import click
 import operator
 from util import *
 from bs4 import BeautifulSoup
-from importlib import import_module
 from urllib.parse import unquote
+from importlib import import_module
 doi = import_module('s2-fetch-doi')
 
 DOI_DIR = 'datasets/s2/doi'
diff --git a/scraper/s2-papers.py b/scraper/s2-papers.py
index 92bf8ebf..ef7d3d81 100644
--- a/scraper/s2-papers.py
+++ b/scraper/s2-papers.py
@@ -10,7 +10,8 @@ import click
 from s2 import SemanticScholarAPI
 from util import *
 from urllib.parse import unquote
-raw_papers_api = import_module('s2-raw-papers')
+import importlib
+raw_papers_api = importlib.import_module('s2-raw-papers')
 
 s2 = SemanticScholarAPI()
 
-- 
cgit v1.2.3-70-g09d2
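
Taken together, the three patches above thread a freshen flag from the click command down to the raw-paper cache so that cached JSON can be re-queried on demand. Below is a minimal, self-contained sketch of that cache-bypass pattern, not the repository's actual code: CACHE_DIR and fetch_from_api are hypothetical stand-ins for the repo's make_raw_paper_path/make_raw_paper_fn helpers and the s2.raw_paper API call.

# Sketch of the freshen/cache-bypass behaviour introduced by these commits.
# CACHE_DIR and fetch_from_api are placeholders, not the project's real helpers.
import json
import os

CACHE_DIR = "datasets/s2/raw-papers"  # hypothetical cache location

def fetch_from_api(paper_id):
    # Stand-in for the real Semantic Scholar request (s2.raw_paper in the repo).
    return {"paperId": paper_id}

def fetch_raw_paper(paper_id, freshen=False):
    # Reuse the cached JSON unless freshen is True; otherwise re-query and overwrite it.
    os.makedirs(CACHE_DIR, exist_ok=True)
    paper_fn = os.path.join(CACHE_DIR, paper_id + ".json")
    if os.path.exists(paper_fn) and not freshen:
        with open(paper_fn) as f:
            return json.load(f)
    paper = fetch_from_api(paper_id)
    with open(paper_fn, "w") as f:
        json.dump(paper, f)
    return paper

if __name__ == "__main__":
    # With freshen=True the cached file is ignored and rewritten.
    print(fetch_raw_paper("example-paper-id", freshen=True))

On the real command, passing --freshen (or -f) takes the same re-query path for every paper pulled from the citation_lookup sheet.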