summary | refs | log | tree | commit | diff
path: root/scraper/s2-search.py
diff options
context:
space:
mode:
Diffstat (limited to 'scraper/s2-search.py')
-rw-r--r-- scraper/s2-search.py 27
1 files changed, 23 insertions, 4 deletions
diff --git a/scraper/s2-search.py b/scraper/s2-search.py
index 169a8d19..db5731d5 100644
--- a/scraper/s2-search.py
+++ b/scraper/s2-search.py
@@ -7,7 +7,7 @@ import random
import re
import simplejson as json
import click
-from s2 import SemanticScholarAPI, fetch_paper
+from s2 import SemanticScholarAPI
from util import *
'''
@@ -32,8 +32,9 @@ def fetch_entries(index):
for line in lines:
key = line[0]
name = line[1]
- title = re.sub(r'[^-0-9a-zA-Z ]+', '', line[2])
- if len(title) < 2:
+ title = line[2].strip()  # str has no .trim() (AttributeError); strip() removes surrounding whitespace
+ clean_title = re.sub(r'[^-0-9a-zA-Z ]+', '', line[2])
+ if len(clean_title) < 2:
continue
dump_fn = './datasets/s2/dumps/{}.json'.format(key)
entry_fn = './datasets/s2/entries/{}.json'.format(key)
@@ -41,7 +42,7 @@ def fetch_entries(index):
if os.path.exists(entry_fn):
result = read_json(entry_fn)
else:
- results = s2.search(title)
+ results = s2.search(clean_title)
write_json(dump_fn, results)
if len(results['results']) == 0:
print("- {}".format(title))
@@ -55,5 +56,23 @@ def fetch_entries(index):
citation_lookup.append([key, name, title, paper_id])
write_csv("datasets/citation_lookup.csv", keys=['key', 'name', 'title', 'paper_id'], rows=citation_lookup)
+def fetch_paper(s2, paper_id):
+    """Return the record for `paper_id`, reading the local JSON cache first.
+
+    The cache file is ./datasets/s2/papers/<first two chars of id>/<id>/paper.json.
+    Its directory is created on every call, even when the lookup ends up failing.
+
+    Args:
+        s2: client exposing `paper(paper_id)` (presumably a SemanticScholarAPI
+            instance -- confirm against the caller).
+        paper_id: sliceable identifier (presumably a string); its first two
+            characters name the shard directory.
+
+    Returns:
+        The cached or freshly fetched record, or None when `s2.paper` returns
+        None on both the first call and the single retry.
+    """
+    shard = paper_id[0:2]
+    os.makedirs('./datasets/s2/papers/{}/{}'.format(shard, paper_id), exist_ok=True)
+    cache_fn = './datasets/s2/papers/{}/{}/paper.json'.format(shard, paper_id)
+    if os.path.exists(cache_fn):
+        return read_json(cache_fn)
+
+    print(paper_id)
+    record = s2.paper(paper_id)
+    if record is None:
+        # Retry once, immediately (no delay between the two attempts).
+        print("Got none paper??")
+        record = s2.paper(paper_id)
+    if record is None:
+        print("Paper not found")
+        return None
+
+    write_json(cache_fn, record)
+    return record
+
if __name__ == '__main__':  # runs only when executed as a script, not on import
fetch_entries()  # no explicit args; presumably a click command that parses argv -- TODO confirm