Diffstat (limited to 'scraper')
-rw-r--r--  scraper/s2-search-deep.py | 84
-rw-r--r--  scraper/s2.py             |  8
2 files changed, 88 insertions(+), 4 deletions(-)
diff --git a/scraper/s2-search-deep.py b/scraper/s2-search-deep.py
new file mode 100644
index 00000000..47c2f021
--- /dev/null
+++ b/scraper/s2-search-deep.py
@@ -0,0 +1,84 @@
+import os
+import sys
+import csv
+import subprocess
+import time
+import random
+import re
+import simplejson as json
+import click
+from s2 import SemanticScholarAPI
+from util import *
+from importlib import import_module
+raw_paper_module = import_module('s2-raw-papers')
+
+'''
+s2 search API format:
+matchedAuthors
+matchedPresentations
+query
+querySuggestions
+results
+stats
+totalPages
+totalResults
+'''
+
+s2 = SemanticScholarAPI()
+
+def fetch_query(query, since=None, refresh=False):
+    clean_title = re.sub(r'[^-0-9a-zA-Z ]+', '', query)
+    yearFilter = {'min': since, 'max': 2020} if since else None
+    results_path = './datasets/s2/search/{}'.format(clean_title)
+    os.makedirs(results_path, exist_ok=True)
+    page = 1
+    total = 0
+    paper_ids = {}
+
+    while True:
+        dump_fn = '{}/{}.json'.format(results_path, page)
+        if not refresh and os.path.exists(dump_fn):
+            results = read_json(dump_fn)
+        else:
+            results = s2.search(q=clean_title, page=page, pageSize=10, yearFilter=yearFilter)
+            write_json(dump_fn, results)
+
+        # stop on an API error (search() returns None) or an empty page
+        if not results or not results['results']:
+            break
+        total += len(results['results'])
+
+        print("+ {} page {}".format(query, page))
+
+        for result in results['results']:
+            paper_id = result['id']
+            if paper_id not in paper_ids:
+                paper_ids[paper_id] = True
+        if total >= results['totalResults']:
+            break
+        page += 1
+    return paper_ids
+
+@click.command()
+@click.option('--refresh/--no-refresh', '-f', help='Force it to query the paper API again')
+def search_deep(refresh):
+    s2_queries = fetch_google_sheet_objects("s2_queries")
+    os.makedirs('./datasets/s2/search_papers', exist_ok=True)
+    for row in s2_queries:
+        since = row['since']
+        row_paper_ids = {}
+        # each sheet row can hold up to five queries: query1 .. query5
+        for i in range(1, 6):
+            query_key = 'query' + str(i)
+            query = row[query_key]
+            if query:
+                paper_ids = fetch_query(query, since, refresh=refresh)
+                for paper_id in paper_ids:
+                    row_paper_ids[paper_id] = True
+
+        row_fn = './datasets/s2/search_papers/{}'.format(row['key'])
+        write_csv(row_fn, keys=False, rows=row_paper_ids.keys())
+
+        # parallelize(raw_paper_module.fetch_raw_paper, row_paper_ids.keys())
+
+if __name__ == '__main__':
+    search_deep()
diff --git a/scraper/s2.py b/scraper/s2.py
index 01c0b4d5..26334de8 100644
--- a/scraper/s2.py
+++ b/scraper/s2.py
@@ -177,19 +177,19 @@ class SemanticScholarAPI(object):
         return None if resp.status_code != 200 else resp.json() # Paper(**resp.json())
 
     @staticmethod
-    def search(q):
+    def search(q, page=1, pageSize=10, yearFilter=None):
         resp = requests.post(SemanticScholarAPI.SEARCH_ENDPOINT, json={
             'authors': [],
             'coAuthors': [],
             'facets': {},
-            'page': 1,
-            'pageSize': 10,
+            'page': page,
+            'pageSize': pageSize,
             'publicationTypes': [],
             'queryString': q,
             'requireViewablePdf': False,
             'sort': "relevance",
             'venues': [],
-            'yearFilter': None,
+            'yearFilter': yearFilter,
         }, headers=SemanticScholarAPI.headers)
         # print(resp.status_code)
         return None if resp.status_code != 200 else resp.json()
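
For reference, a minimal usage sketch of the extended search() signature (not part of the commit): the query string and year bounds below are made-up values, and the response fields used (results, id, totalResults) are the ones s2-search-deep.py relies on.

    from s2 import SemanticScholarAPI

    s2 = SemanticScholarAPI()
    seen = {}
    page = 1
    while True:
        # page, pageSize and yearFilter are forwarded verbatim into the JSON body of the search request
        results = s2.search(q='neural networks', page=page, pageSize=10,
                            yearFilter={'min': 2015, 'max': 2020})
        if not results or not results['results']:
            break  # HTTP error (search() returns None) or an empty page
        for result in results['results']:
            seen[result['id']] = True
        if len(seen) >= results['totalResults']:
            break
        page += 1
    print('{} unique paper ids'.format(len(seen)))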