-rw-r--r--  scraper/s2-search-deep.py | 36
-rw-r--r--  scraper/s2.py             |  6
2 files changed, 27 insertions, 15 deletions
diff --git a/scraper/s2-search-deep.py b/scraper/s2-search-deep.py
index 47c2f021..9846f2a3 100644
--- a/scraper/s2-search-deep.py
+++ b/scraper/s2-search-deep.py
@@ -27,7 +27,7 @@ totalResults
 s2 = SemanticScholarAPI()
 
-def fetch_query(query, since=None):
+def fetch_query(query, since=None, refresh=False):
     clean_title = re.sub(r'[^-0-9a-zA-Z ]+', '', query)
     yearFilter = {'min': since, 'max': 2020 } if since else None
     results_path = './datasets/s2/search/{}'.format(clean_title)
@@ -38,47 +38,57 @@ def fetch_query(query, since=None):
     while True:
         dump_fn = '{}/{}.json'.format(results_path, page)
+        #print(dump_fn)
         if not refresh and os.path.exists(dump_fn):
             results = read_json(dump_fn)
         else:
-            results = s2.search(q=clean_title, page=page, pageSize=10, yearFilter=yearFilter)
+            results = s2.search(q=clean_title.replace(' ', '+'), page=page, pageSize=10, yearFilter=yearFilter)
             write_json(dump_fn, results)
+            time.sleep(5)
-        total += len(results['results'])
-        if len(results['results']) == 0:
+        #print(results)
+        if not results or len(results['results']) == 0:
            break
+        total += len(results['results'])
+
        print("+ {} page {}".format(query, page))
        for result in results['results']:
            paper_id = result['id']
            if paper_id not in paper_ids:
                paper_ids[paper_id] = True
-        if total >= results['totalResults']:
+        page += 1
+        if total >= results['totalResults'] - 9:
            break
    return paper_ids
 @click.command()
 @click.option('--refresh/--no-refresh', '-f', help='Force it to query the paper API again')
-def search_deep(index, refresh):
+def search_deep(refresh):
     s2_queries = fetch_google_sheet_objects("s2_queries")
     os.makedirs('./datasets/s2/search_papers', exist_ok=True)
     for row in s2_queries:
         since = row['since']
+        if not since:
+            continue
+        since = int(since)
         queries = []
         row_paper_ids = {}
         for i in range(1, 6):
-            query_key = 'query' + i
+            query_key = 'query{}'.format(i)
             query = row[query_key]
             if query:
-                paper_ids = fetch_query(query, since)
+                paper_ids = fetch_query(query, since, refresh)
                 for paper_id in paper_ids:
                     row_paper_ids[paper_id] = True
-        row_fn = './datasets/s2/search_papers/{}'.format(row['key'])
-        write_csv(row_fn, keys=False, rows=row_paper_ids.keys())
-
-        # parallelize(raw_paper_module.fetch_raw_paper, paper_ids.keys())
+        paper_ids = list(row_paper_ids.keys())
+        if len(paper_ids):
+            print("Writing {} paper ids".format(len(paper_ids)))
+            row_fn = './datasets/s2/search_papers/{}.json'.format(row['key'])
+            write_json(row_fn, paper_ids)
+            parallelize(raw_paper_module.fetch_raw_paper, [(id,) for id in paper_ids])
 if __name__ == '__main__':
-    fetch_entries()
+    search_deep()
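
The fetch_query() changes above add a polite delay between requests, guard against failed responses, and stop paging once the collected count is within one page of the reported total. Below is a minimal standalone sketch of that loop shape, assuming a hypothetical callable search_page(page) that returns {'results': [...], 'totalResults': N} or None on error; collect_ids() and PAGE_SIZE are illustrative names, not part of the patch.

    import time

    PAGE_SIZE = 10  # mirrors the pageSize=10 used in the diff

    def collect_ids(search_page):
        """Gather unique paper ids page by page until the API runs dry."""
        paper_ids = {}
        total = 0
        page = 1
        while True:
            results = search_page(page)   # hypothetical fetch for one page
            time.sleep(5)                 # crude rate limiting, as in the patch
            if not results or len(results['results']) == 0:
                break                     # failed request or empty page ends the loop
            total += len(results['results'])
            for result in results['results']:
                paper_ids[result['id']] = True   # dict doubles as an ordered set
            page += 1
            # Stop within one page of the reported total; totalResults can
            # overshoot what the endpoint will actually serve.
            if total >= results['totalResults'] - (PAGE_SIZE - 1):
                break
        return paper_ids
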
diff --git a/scraper/s2.py b/scraper/s2.py
index 26334de8..36dbc906 100644
--- a/scraper/s2.py
+++ b/scraper/s2.py
@@ -178,10 +178,12 @@ class SemanticScholarAPI(object):
     @staticmethod
     def search(q, page=1, pageSize=10, yearFilter=None):
+        #print(q)
+        #print(yearFilter)
         resp = requests.post(SemanticScholarAPI.SEARCH_ENDPOINT, json={
             'authors': [],
             'coAuthors': [],
-            'facets': {},
+            #'facets': {},
             'page': page,
             'pageSize': pageSize,
             'publicationTypes': [],
@@ -191,5 +193,5 @@ class SemanticScholarAPI(object):
             'venues': [],
             'yearFilter': yearFilter,
         }, headers=SemanticScholarAPI.headers)
-        # print(resp.status_code)
+        #print(resp.status_code)
         return None if resp.status_code != 200 else resp.json()