summaryrefslogtreecommitdiff
path: root/scraper
diff options
context:
space:
mode:
Diffstat (limited to 'scraper')
-rw-r--r--scraper/s2-doi-report.py2
-rw-r--r--scraper/s2-geocode-spreadsheet.py4
-rw-r--r--scraper/s2-papers.py10
-rw-r--r--scraper/s2-raw-papers.py6
-rw-r--r--scraper/util.py4
5 files changed, 18 insertions, 8 deletions
diff --git a/scraper/s2-doi-report.py b/scraper/s2-doi-report.py
index 1d7bf44a..ea708de2 100644
--- a/scraper/s2-doi-report.py
+++ b/scraper/s2-doi-report.py
@@ -7,8 +7,8 @@ import click
import operator
from util import *
from bs4 import BeautifulSoup
-from importlib import import_module
from urllib.parse import unquote
+from importlib import import_module
doi = import_module('s2-fetch-doi')
DOI_DIR = 'datasets/s2/doi'
diff --git a/scraper/s2-geocode-spreadsheet.py b/scraper/s2-geocode-spreadsheet.py
index 32d7c669..c36625a6 100644
--- a/scraper/s2-geocode-spreadsheet.py
+++ b/scraper/s2-geocode-spreadsheet.py
@@ -110,6 +110,8 @@ def update_country_from_address(address, i, countries, worksheet):
country = None
if possible_country in countries:
country = countries[possible_country]
+ elif "CHINA" in address:
+ country = "China"
elif "China" in address:
country = "China"
elif "Hong Kong" in address:
@@ -118,6 +120,8 @@ def update_country_from_address(address, i, countries, worksheet):
country = "Singapore"
elif "Taiwan" in address:
country = "Taiwan"
+ elif "Saudi Arabia" in address:
+ country = "Saudi Arabia"
elif "Russia" in address:
country = "Russia"
elif "Ukraine" in address:
diff --git a/scraper/s2-papers.py b/scraper/s2-papers.py
index 782dc198..ef7d3d81 100644
--- a/scraper/s2-papers.py
+++ b/scraper/s2-papers.py
@@ -9,11 +9,15 @@ import operator
import click
from s2 import SemanticScholarAPI
from util import *
+from urllib.parse import unquote
+import importlib
+raw_papers_api = importlib.import_module('s2-raw-papers')
s2 = SemanticScholarAPI()
@click.command()
-def fetch_papers():
+@click.option('--freshen/--no-freshen', '-f', help='Force it to query the paper API again')
+def fetch_papers(freshen):
addresses = AddressBook()
lookup_keys, lines = fetch_google_sheet('citation_lookup')
report_keys = [
@@ -30,9 +34,11 @@ def fetch_papers():
paper_id = line[3]
if paper_id == '':
continue
- paper = fetch_paper(s2, paper_id)
+ paper = fetch_paper(s2, paper_id, freshen)
if paper is None:
continue
+ if freshen:
+ raw_papers_api.fetch_raw_paper(paper_id, freshen)
db_paper = load_paper(paper_id)
pdf_link = db_paper.pdf_link if db_paper else ""
diff --git a/scraper/s2-raw-papers.py b/scraper/s2-raw-papers.py
index 612c8099..67ff0b65 100644
--- a/scraper/s2-raw-papers.py
+++ b/scraper/s2-raw-papers.py
@@ -18,10 +18,10 @@ def fetch_raw_papers(fn):
lines = read_csv(fn, keys=False)
parallelize(fetch_raw_paper, lines)
-def fetch_raw_paper(paper_id):
+def fetch_raw_paper(paper_id, freshen=False):
os.makedirs(make_raw_paper_path(paper_id), exist_ok=True)
paper_fn = make_raw_paper_fn(paper_id)
- if os.path.exists(paper_fn):
+ if os.path.exists(paper_fn) and not freshen:
paper = read_json(paper_fn)
else:
paper = s2.raw_paper(paper_id)
@@ -33,7 +33,7 @@ def fetch_raw_paper(paper_id):
if 'responseType' in paper and paper['responseType'] == 'CANONICAL':
write_json(paper_fn, paper)
- paper = s2.raw_paper(data['canonicalId'])
+ paper = s2.raw_paper(paper['canonicalId'])
paper_fn = make_raw_paper_fn(paper_id)
return paper
diff --git a/scraper/util.py b/scraper/util.py
index fdbc0534..830dbe8b 100644
--- a/scraper/util.py
+++ b/scraper/util.py
@@ -386,10 +386,10 @@ def parallelize(func, rows):
with Pool(processes=processCount) as pool:
pool.starmap(func, rows, chunksize)
-def fetch_paper(s2, paper_id):
+def fetch_paper(s2, paper_id, freshen=False):
os.makedirs('./datasets/s2/papers/{}/{}'.format(paper_id[0:2], paper_id), exist_ok=True)
paper_fn = './datasets/s2/papers/{}/{}/paper.json'.format(paper_id[0:2], paper_id)
- if os.path.exists(paper_fn):
+ if os.path.exists(paper_fn) and not freshen:
return read_json(paper_fn)
print(paper_id)
paper = s2.paper(paper_id)