author	Jules Laplace <julescarbon@gmail.com>	2018-12-07 17:17:29 +0100
committer	Jules Laplace <julescarbon@gmail.com>	2018-12-07 17:17:29 +0100
commit	927fd8825101749cc8fcdc14f05ffd50d14ed652 (patch)
tree	a4f19a0a4284e15c2ccb9698c20a86fc3832430f /scraper
parent	f9d7d6d4b63fc688d5e7fb249e51a97d8d806e7d (diff)
cleaning up
Diffstat (limited to 'scraper')
-rw-r--r--	scraper/README.md	6
-rw-r--r--	scraper/s2-search.py	27
-rw-r--r--	scraper/s2.py	20
3 files changed, 24 insertions(+), 29 deletions(-)
diff --git a/scraper/README.md b/scraper/README.md
index a17f1efe..782fa30a 100644
--- a/scraper/README.md
+++ b/scraper/README.md
@@ -38,11 +38,7 @@ http://labs.semanticscholar.org/corpus/
### s2-search.py
-Loads titles from citations file and queries the S2 search API to get paper IDs.
-
-### s2-papers.py
-
-Uses the paper IDs from the search entries to query the S2 papers API to get first-degree citations, authors, etc.
+Loads titles from the citations file and queries the S2 search API to get paper IDs, then uses those paper IDs to query the S2 papers API for first-degree citations, authors, etc.
### s2-dump-ids.py
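
Note: the merged README entry describes a two-step flow, search the title for a paper ID, then fetch that paper. A minimal sketch of that flow, assuming the SemanticScholarAPI wrapper from s2.py constructs with no arguments and that the first search hit carries an 'id' field (the result shape is an assumption, not taken from this diff):

    from s2 import SemanticScholarAPI

    s2 = SemanticScholarAPI()

    def lookup(title):
        # Step 1: S2 search API, title -> candidate papers.
        # The 'results'/'id' shape here is assumed.
        results = s2.search(title)
        if not results or not results['results']:
            return None
        paper_id = results['results'][0]['id']
        # Step 2: S2 papers API, id -> first-degree citations, authors, etc.
        return s2.paper(paper_id)
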
diff --git a/scraper/s2-search.py b/scraper/s2-search.py
index 169a8d19..db5731d5 100644
--- a/scraper/s2-search.py
+++ b/scraper/s2-search.py
@@ -7,7 +7,7 @@ import random
import re
import simplejson as json
import click
-from s2 import SemanticScholarAPI, fetch_paper
+from s2 import SemanticScholarAPI
from util import *
'''
@@ -32,8 +32,9 @@ def fetch_entries(index):
     for line in lines:
         key = line[0]
         name = line[1]
-        title = re.sub(r'[^-0-9a-zA-Z ]+', '', line[2])
-        if len(title) < 2:
+        title = line[2].strip()
+        clean_title = re.sub(r'[^-0-9a-zA-Z ]+', '', title)
+        if len(clean_title) < 2:
             continue
         dump_fn = './datasets/s2/dumps/{}.json'.format(key)
         entry_fn = './datasets/s2/entries/{}.json'.format(key)
@@ -41,7 +42,7 @@ def fetch_entries(index):
         if os.path.exists(entry_fn):
             result = read_json(entry_fn)
         else:
-            results = s2.search(title)
+            results = s2.search(clean_title)
             write_json(dump_fn, results)
             if len(results['results']) == 0:
                 print("- {}".format(title))
@@ -55,5 +56,23 @@ def fetch_entries(index):
         citation_lookup.append([key, name, title, paper_id])
     write_csv("datasets/citation_lookup.csv", keys=['key', 'name', 'title', 'paper_id'], rows=citation_lookup)
+def fetch_paper(s2, paper_id):
+    os.makedirs('./datasets/s2/papers/{}/{}'.format(paper_id[0:2], paper_id), exist_ok=True)
+    paper_fn = './datasets/s2/papers/{}/{}/paper.json'.format(paper_id[0:2], paper_id)
+    if os.path.exists(paper_fn):
+        return read_json(paper_fn)
+    print(paper_id)
+    paper = s2.paper(paper_id)
+    if paper is None:
+        print("Got none paper??")
+        # time.sleep(random.randint(1, 2))
+        paper = s2.paper(paper_id)
+        if paper is None:
+            print("Paper not found")
+            return None
+    write_json(paper_fn, paper)
+    # time.sleep(random.randint(1, 2))
+    return paper
+
 if __name__ == '__main__':
     fetch_entries()
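
Note: the relocated fetch_paper is a read-through disk cache that shards each paper's JSON under the first two characters of its ID, so the papers directory never becomes one huge flat listing. The same pattern in isolation, as a hypothetical cached_fetch helper (read_json and write_json are the util helpers the script already imports; unlike fetch_paper, this sketch skips the single retry and only caches successful fetches):

    import os
    from util import read_json, write_json

    def cached_fetch(paper_id, fetch):
        # Shard on the two-char ID prefix so no directory grows unbounded.
        shard_dir = './datasets/s2/papers/{}/{}'.format(paper_id[:2], paper_id)
        os.makedirs(shard_dir, exist_ok=True)
        path = os.path.join(shard_dir, 'paper.json')
        if os.path.exists(path):
            return read_json(path)   # cache hit: no network call
        paper = fetch(paper_id)      # cache miss: one API call
        if paper is not None:
            write_json(path, paper)  # only cache successful fetches
        return paper

    # usage: cached_fetch(paper_id, s2.paper)
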
diff --git a/scraper/s2.py b/scraper/s2.py
index 4fdd5f28..b1b9742c 100644
--- a/scraper/s2.py
+++ b/scraper/s2.py
@@ -1,7 +1,5 @@
import os
import requests
-import time
-import random
from util import *
class AuthorStub(object):
@@ -195,21 +193,3 @@ class SemanticScholarAPI(object):
         }, headers=SemanticScholarAPI.headers)
         # print(resp.status_code)
         return None if resp.status_code != 200 else resp.json()
-
-def fetch_paper(s2, paper_id):
-    os.makedirs('./datasets/s2/papers/{}/{}'.format(paper_id[0:2], paper_id), exist_ok=True)
-    paper_fn = './datasets/s2/papers/{}/{}/paper.json'.format(paper_id[0:2], paper_id)
-    if os.path.exists(paper_fn):
-        return read_json(paper_fn)
-    print(paper_id)
-    paper = s2.paper(paper_id)
-    if paper is None:
-        print("Got none paper??")
-        # time.sleep(random.randint(1, 2))
-        paper = s2.paper(paper_id)
-        if paper is None:
-            print("Paper not found")
-            return None
-    write_json(paper_fn, paper)
-    # time.sleep(random.randint(1, 2))
-    return paper
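
Note: the s2.py hunk only shows the tail of a request helper (return None unless HTTP 200). For reference, a standalone sketch of the paper lookup in the same style; the api.semanticscholar.org/v1 URL is an assumption about the endpoint the wrapper targets, not something visible in this diff:

    import requests

    # Assumed public endpoint; only the "None unless HTTP 200" idiom is from the diff.
    S2_API = 'https://api.semanticscholar.org/v1'

    def get_paper(paper_id):
        resp = requests.get('{}/paper/{}'.format(S2_API, paper_id))
        return None if resp.status_code != 200 else resp.json()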