-rw-r--r--   scraper/s2-search-deep.py   | 84
-rw-r--r--   scraper/s2.py               |  8
2 files changed, 88 insertions, 4 deletions
diff --git a/scraper/s2-search-deep.py b/scraper/s2-search-deep.py
new file mode 100644
index 00000000..47c2f021
--- /dev/null
+++ b/scraper/s2-search-deep.py
@@ -0,0 +1,84 @@
+import os
+import sys
+import csv
+import subprocess
+import time
+import random
+import re
+import simplejson as json
+import click
+from s2 import SemanticScholarAPI
+from util import *
+from importlib import import_module
+raw_paper_module = import_module('s2-raw-papers')  # hyphen in filename, so load via importlib
+
+'''
+s2 search API format:
+results
+matchedAuthors
+matchedPresentations
+query
+querySuggestions
+results
+stats
+totalPages
+totalResults
+'''
+
+s2 = SemanticScholarAPI()
+
+def fetch_query(query, since=None, refresh=False):
+    clean_title = re.sub(r'[^-0-9a-zA-Z ]+', '', query)
+    yearFilter = {'min': since, 'max': 2020} if since else None
+    results_path = './datasets/s2/search/{}'.format(clean_title)
+    os.makedirs(results_path, exist_ok=True)
+    page = 1
+    total = 0
+    paper_ids = {}
+
+    while True:
+        dump_fn = '{}/{}.json'.format(results_path, page)
+        if not refresh and os.path.exists(dump_fn):  # reuse the cached page unless --refresh
+            results = read_json(dump_fn)
+        else:
+            results = s2.search(q=clean_title, page=page, pageSize=10, yearFilter=yearFilter)
+            write_json(dump_fn, results)
+
+        total += len(results['results'])
+        if len(results['results']) == 0:
+            break
+
+        print("+ {} page {}".format(query, page))
+
+        for result in results['results']:
+            paper_id = result['id']
+            if paper_id not in paper_ids:
+                paper_ids[paper_id] = True
+        if total >= results['totalResults']:
+            break
+        page += 1
+    return paper_ids
+
+@click.command()
+@click.option('--refresh/--no-refresh', '-f', help='Force it to query the paper API again')
+def search_deep(refresh):
+    s2_queries = fetch_google_sheet_objects("s2_queries")
+    os.makedirs('./datasets/s2/search_papers', exist_ok=True)
+    for row in s2_queries:
+        since = row['since']
+        row_paper_ids = {}
+        for i in range(1, 6):  # columns query1 .. query5
+            query_key = 'query{}'.format(i)
+            query = row[query_key]
+            if query:
+                paper_ids = fetch_query(query, since, refresh)
+                for paper_id in paper_ids:
+                    row_paper_ids[paper_id] = True
+
+        row_fn = './datasets/s2/search_papers/{}'.format(row['key'])
+        write_csv(row_fn, keys=False, rows=row_paper_ids.keys())
+
+        # parallelize(raw_paper_module.fetch_raw_paper, paper_ids.keys())
+
+if __name__ == '__main__':
+    search_deep()
diff --git a/scraper/s2.py b/scraper/s2.py
index 01c0b4d5..26334de8 100644
--- a/scraper/s2.py
+++ b/scraper/s2.py
@@ -177,19 +177,19 @@ class SemanticScholarAPI(object):
         return None if resp.status_code != 200 else resp.json() # Paper(**resp.json())
 
     @staticmethod
-    def search(q):
+    def search(q, page=1, pageSize=10, yearFilter=None):
         resp = requests.post(SemanticScholarAPI.SEARCH_ENDPOINT, json={
             'authors': [],
             'coAuthors': [],
             'facets': {},
-            'page': 1,
-            'pageSize': 10,
+            'page': page,
+            'pageSize': pageSize,
             'publicationTypes': [],
             'queryString': q,
             'requireViewablePdf': False,
             'sort': "relevance",
             'venues': [],
-            'yearFilter': None,
+            'yearFilter': yearFilter,
         }, headers=SemanticScholarAPI.headers)
         # print(resp.status_code)
         return None if resp.status_code != 200 else resp.json()
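Usage note (not part of the commit): a minimal sketch of driving the extended search() signature by hand, mirroring the paging and caching loop that fetch_query() uses. The query string and year range below are made-up examples, and the response fields (results, totalResults, id) are assumed to follow the shape listed in the module docstring above.

    from s2 import SemanticScholarAPI

    s2 = SemanticScholarAPI()

    def iter_paper_ids(query, since=None):
        # Page through search results until the reported totalResults is reached.
        yearFilter = {'min': since, 'max': 2020} if since else None
        page, total = 1, 0
        while True:
            results = s2.search(q=query, page=page, pageSize=10, yearFilter=yearFilter)
            if not results or not results['results']:
                break
            for result in results['results']:
                yield result['id']
            total += len(results['results'])
            if total >= results['totalResults']:
                break
            page += 1

    for paper_id in iter_paper_ids('semantic scholar', since=2015):
        print(paper_id)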
