author    Jules Laplace <julescarbon@gmail.com>  2019-05-03 17:48:22 +0200
committer Jules Laplace <julescarbon@gmail.com>  2019-05-03 17:48:22 +0200
commit    a53909352266a2258ddfa287508f979da59a9d1d
tree      1ba841e8deabecc2201f24e53317de012653a875  /scraper/s2-search-deep.py
parent    c8d79d1b0f1b71706db2bbf0be2b60ef56904004
search-deep.py
Diffstat (limited to 'scraper/s2-search-deep.py')
-rw-r--r--  scraper/s2-search-deep.py  84
1 file changed, 84 insertions, 0 deletions
diff --git a/scraper/s2-search-deep.py b/scraper/s2-search-deep.py
new file mode 100644
index 00000000..47c2f021
--- /dev/null
+++ b/scraper/s2-search-deep.py
@@ -0,0 +1,84 @@
+import os
+import sys
+import csv
+import subprocess
+import time
+import random
+import re
+import simplejson as json
+import click
+from s2 import SemanticScholarAPI
+from util import *
+from importlib import import_module
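+# the module filename contains a hyphen, so it cannot be pulled in with a plain
+# "import" statement; import_module loads it by its string name instead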
+raw_paper_module = import_module('s2-raw-papers')
+
+'''
+s2 search API response format:
+matchedAuthors
+matchedPresentations
+query
+querySuggestions
+results
+stats
+totalPages
+totalResults
+'''
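+# Rough sketch of one page of results as this script reads it (shape inferred from
+# the fields listed above and the keys accessed below; values are illustrative):
+#   {
+#     "query": "...",
+#     "totalResults": 123,
+#     "totalPages": 13,
+#     "results": [{"id": "<s2 paper id>", ...}, ...],
+#     "matchedAuthors": [...], "querySuggestions": [...], "stats": {...}
+#   }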
+
+s2 = SemanticScholarAPI()
+
+def fetch_query(query, since=None, refresh=False):
+    clean_title = re.sub(r'[^-0-9a-zA-Z ]+', '', query)
+    yearFilter = {'min': since, 'max': 2020} if since else None
+    results_path = './datasets/s2/search/{}'.format(clean_title)
+    os.makedirs(results_path, exist_ok=True)
+    page = 1
+    total = 0
+    paper_ids = {}
+
+    while True:
+        dump_fn = '{}/{}.json'.format(results_path, page)
+        # reuse the cached page on disk unless a refresh was requested
+        if not refresh and os.path.exists(dump_fn):
+            results = read_json(dump_fn)
+        else:
+            results = s2.search(q=clean_title, page=page, pageSize=10, yearFilter=yearFilter)
+            write_json(dump_fn, results)
+
+        total += len(results['results'])
+        if len(results['results']) == 0:
+            break
+
+        print("+ {} page {}".format(query, page))
+
+        for result in results['results']:
+            paper_id = result['id']
+            if paper_id not in paper_ids:
+                paper_ids[paper_id] = True
+        if total >= results['totalResults']:
+            break
+        page += 1
+    return paper_ids
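+# Hypothetical call: fetch_query('style transfer', since=2015) pages through the
+# search results for that string and returns {paper_id: True, ...} for every
+# unique paper id it saw.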
+
+@click.command()
+@click.option('--refresh/--no-refresh', '-f', help='Force it to query the paper API again')
+def search_deep(refresh):
+    s2_queries = fetch_google_sheet_objects("s2_queries")
+    os.makedirs('./datasets/s2/search_papers', exist_ok=True)
+    for row in s2_queries:
+        since = row['since']
+        queries = []
+        row_paper_ids = {}
+        # each sheet row can hold up to five query columns: query1 .. query5
+        for i in range(1, 6):
+            query_key = 'query' + str(i)
+            query = row[query_key]
+            if query:
+                paper_ids = fetch_query(query, since, refresh)
+                for paper_id in paper_ids:
+                    row_paper_ids[paper_id] = True
+
+        row_fn = './datasets/s2/search_papers/{}'.format(row['key'])
+        write_csv(row_fn, keys=False, rows=row_paper_ids.keys())
+
+        # parallelize(raw_paper_module.fetch_raw_paper, row_paper_ids.keys())
+
+if __name__ == '__main__':
+    search_deep()
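A possible invocation, assuming the Google Sheet helpers in util are configured: python scraper/s2-search-deep.py reuses any page JSON already cached under ./datasets/s2/search/, while python scraper/s2-search-deep.py --refresh (or -f) re-queries the search API for every query column of each row in the s2_queries sheet.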