diff options
Diffstat (limited to 'scraper')
-rw-r--r--  scraper/s2-doi-report.py           |  2
-rw-r--r--  scraper/s2-geocode-spreadsheet.py  |  4
-rw-r--r--  scraper/s2-papers.py               | 10
-rw-r--r--  scraper/s2-raw-papers.py           |  6
-rw-r--r--  scraper/util.py                    |  4
5 files changed, 18 insertions, 8 deletions
diff --git a/scraper/s2-doi-report.py b/scraper/s2-doi-report.py
index 1d7bf44a..ea708de2 100644
--- a/scraper/s2-doi-report.py
+++ b/scraper/s2-doi-report.py
@@ -7,8 +7,8 @@ import click
 import operator
 from util import *
 from bs4 import BeautifulSoup
-from importlib import import_module
 from urllib.parse import unquote
+from importlib import import_module
 
 doi = import_module('s2-fetch-doi')
 DOI_DIR = 'datasets/s2/doi'
diff --git a/scraper/s2-geocode-spreadsheet.py b/scraper/s2-geocode-spreadsheet.py
index 32d7c669..c36625a6 100644
--- a/scraper/s2-geocode-spreadsheet.py
+++ b/scraper/s2-geocode-spreadsheet.py
@@ -110,6 +110,8 @@ def update_country_from_address(address, i, countries, worksheet):
     country = None
     if possible_country in countries:
         country = countries[possible_country]
+    elif "CHINA" in address:
+        country = "China"
     elif "China" in address:
         country = "China"
     elif "Hong Kong" in address:
@@ -118,6 +120,8 @@
         country = "Singapore"
     elif "Taiwan" in address:
         country = "Taiwan"
+    elif "Saudi Arabia" in address:
+        country = "Saudi Arabia"
     elif "Russia" in address:
         country = "Russia"
     elif "Ukraine" in address:
diff --git a/scraper/s2-papers.py b/scraper/s2-papers.py
index 782dc198..ef7d3d81 100644
--- a/scraper/s2-papers.py
+++ b/scraper/s2-papers.py
@@ -9,11 +9,15 @@ import operator
 import click
 from s2 import SemanticScholarAPI
 from util import *
+from urllib.parse import unquote
+import importlib
+raw_papers_api = importlib.import_module('s2-raw-papers')
 
 s2 = SemanticScholarAPI()
 
 @click.command()
-def fetch_papers():
+@click.option('--freshen/--no-freshen', '-f', help='Force it to query the paper API again')
+def fetch_papers(freshen):
     addresses = AddressBook()
     lookup_keys, lines = fetch_google_sheet('citation_lookup')
     report_keys = [
@@ -30,9 +34,11 @@
         paper_id = line[3]
         if paper_id == '':
             continue
-        paper = fetch_paper(s2, paper_id)
+        paper = fetch_paper(s2, paper_id, freshen)
         if paper is None:
             continue
+        if freshen:
+            raw_papers_api.fetch_raw_paper(paper_id, freshen)
         db_paper = load_paper(paper_id)
 
         pdf_link = db_paper.pdf_link if db_paper else ""
diff --git a/scraper/s2-raw-papers.py b/scraper/s2-raw-papers.py
index 612c8099..67ff0b65 100644
--- a/scraper/s2-raw-papers.py
+++ b/scraper/s2-raw-papers.py
@@ -18,10 +18,10 @@ def fetch_raw_papers(fn):
     lines = read_csv(fn, keys=False)
     parallelize(fetch_raw_paper, lines)
 
-def fetch_raw_paper(paper_id):
+def fetch_raw_paper(paper_id, freshen=False):
     os.makedirs(make_raw_paper_path(paper_id), exist_ok=True)
     paper_fn = make_raw_paper_fn(paper_id)
-    if os.path.exists(paper_fn):
+    if os.path.exists(paper_fn) and not freshen:
         paper = read_json(paper_fn)
     else:
         paper = s2.raw_paper(paper_id)
@@ -33,7 +33,7 @@
 
     if 'responseType' in paper and paper['responseType'] == 'CANONICAL':
         write_json(paper_fn, paper)
-        paper = s2.raw_paper(data['canonicalId'])
+        paper = s2.raw_paper(paper['canonicalId'])
         paper_fn = make_raw_paper_fn(paper_id)
 
     return paper
diff --git a/scraper/util.py b/scraper/util.py
index fdbc0534..830dbe8b 100644
--- a/scraper/util.py
+++ b/scraper/util.py
@@ -386,10 +386,10 @@ def parallelize(func, rows):
     with Pool(processes=processCount) as pool:
         pool.starmap(func, rows, chunksize)
 
-def fetch_paper(s2, paper_id):
+def fetch_paper(s2, paper_id, freshen=False):
    os.makedirs('./datasets/s2/papers/{}/{}'.format(paper_id[0:2], paper_id), exist_ok=True)
    paper_fn = './datasets/s2/papers/{}/{}/paper.json'.format(paper_id[0:2], paper_id)
-    if os.path.exists(paper_fn):
+    if os.path.exists(paper_fn) and not freshen:
        return read_json(paper_fn)
    print(paper_id)
    paper = s2.paper(paper_id)
