author	Jules Laplace <julescarbon@gmail.com>	2018-12-07 17:17:29 +0100
committer	Jules Laplace <julescarbon@gmail.com>	2018-12-07 17:17:29 +0100
commit	927fd8825101749cc8fcdc14f05ffd50d14ed652 (patch)
tree	a4f19a0a4284e15c2ccb9698c20a86fc3832430f /scraper
parent	f9d7d6d4b63fc688d5e7fb249e51a97d8d806e7d (diff)
cleaning up
Diffstat (limited to 'scraper')
-rw-r--r--	scraper/README.md	6
-rw-r--r--	scraper/s2-search.py	27
-rw-r--r--	scraper/s2.py	20
3 files changed, 24 insertions(+), 29 deletions(-)
diff --git a/scraper/README.md b/scraper/README.md
index a17f1efe..782fa30a 100644
--- a/scraper/README.md
+++ b/scraper/README.md
@@ -38,11 +38,7 @@ http://labs.semanticscholar.org/corpus/
### s2-search.py
-Loads titles from citations file and queries the S2 search API to get paper IDs.
-
-### s2-papers.py
-
-Uses the paper IDs from the search entries to query the S2 papers API to get first-degree citations, authors, etc.
+Loads titles from the citations file and queries the S2 search API to get paper IDs, then uses those paper IDs to query the S2 papers API for first-degree citations, authors, etc.
### s2-dump-ids.py
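
Note: the merged README entry describes a two-step flow, search the title for a paper ID, then fetch that paper. A minimal sketch of that flow, assuming the SemanticScholarAPI wrapper from s2.py constructs with no arguments and that the first search hit carries an 'id' field (the result shape is an assumption, not taken from this diff):

    from s2 import SemanticScholarAPI

    s2 = SemanticScholarAPI()

    def lookup(title):
        # Step 1: S2 search API, title -> candidate papers.
        # The 'results'/'id' shape here is assumed.
        results = s2.search(title)
        if not results or not results['results']:
            return None
        paper_id = results['results'][0]['id']
        # Step 2: S2 papers API, id -> first-degree citations, authors, etc.
        return s2.paper(paper_id)
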
diff --git a/scraper/s2-search.py b/scraper/s2-search.py
index 169a8d19..db5731d5 100644
--- a/scraper/s2-search.py
+++ b/scraper/s2-search.py
@@ -7,7 +7,7 @@ import random
import re
import simplejson as json
import click
-from s2 import SemanticScholarAPI, fetch_paper
+from s2 import SemanticScholarAPI
from util import *
'''
@@ -32,8 +32,9 @@ def fetch_entries(index):
     for line in lines:
         key = line[0]
         name = line[1]
-        title = re.sub(r'[^-0-9a-zA-Z ]+', '', line[2])
-        if len(title) < 2:
+        title = line[2].strip()
+        clean_title = re.sub(r'[^-0-9a-zA-Z ]+', '', title)
+        if len(clean_title) < 2:
             continue
         dump_fn = './datasets/s2/dumps/{}.json'.format(key)
         entry_fn = './datasets/s2/entries/{}.json'.format(key)
@@ -41,7 +42,7 @@ def fetch_entries(index):
         if os.path.exists(entry_fn):
             result = read_json(entry_fn)
         else:
-            results = s2.search(title)
+            results = s2.search(clean_title)
             write_json(dump_fn, results)
             if len(results['results']) == 0:
                 print("- {}".format(title))
@@ -55,5 +56,23 @@ def fetch_entries(index):
         citation_lookup.append([key, name, title, paper_id])
     write_csv("datasets/citation_lookup.csv", keys=['key', 'name', 'title', 'paper_id'], rows=citation_lookup)
+def fetch_paper(s2, paper_id):
+    os.makedirs('./datasets/s2/papers/{}/{}'.format(paper_id[0:2], paper_id), exist_ok=True)
+    paper_fn = './datasets/s2/papers/{}/{}/paper.json'.format(paper_id[0:2], paper_id)
+    if os.path.exists(paper_fn):
+        return read_json(paper_fn)
+    print(paper_id)
+    paper = s2.paper(paper_id)
+    if paper is None:
+        print("Got none paper??")
+        # time.sleep(random.randint(1, 2))
+        paper = s2.paper(paper_id)
+        if paper is None:
+            print("Paper not found")
+            return None
+    write_json(paper_fn, paper)
+    # time.sleep(random.randint(1, 2))
+    return paper
+
 if __name__ == '__main__':
     fetch_entries()
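
Note: the relocated fetch_paper is a read-through disk cache that shards each paper's JSON under the first two characters of its ID, so the papers directory never becomes one huge flat listing. The same pattern in isolation, as a hypothetical cached_fetch helper (read_json and write_json are the util helpers the script already imports; unlike fetch_paper, this sketch skips the single retry and only caches successful fetches):

    import os
    from util import read_json, write_json

    def cached_fetch(paper_id, fetch):
        # Shard on the two-char ID prefix so no directory grows unbounded.
        shard_dir = './datasets/s2/papers/{}/{}'.format(paper_id[:2], paper_id)
        os.makedirs(shard_dir, exist_ok=True)
        path = os.path.join(shard_dir, 'paper.json')
        if os.path.exists(path):
            return read_json(path)   # cache hit: no network call
        paper = fetch(paper_id)      # cache miss: one API call
        if paper is not None:
            write_json(path, paper)  # only cache successful fetches
        return paper

    # usage: cached_fetch(paper_id, s2.paper)
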
diff --git a/scraper/s2.py b/scraper/s2.py
index 4fdd5f28..b1b9742c 100644
--- a/scraper/s2.py
+++ b/scraper/s2.py
@@ -1,7 +1,5 @@
import os
import requests
-import time
-import random
from util import *
class AuthorStub(object):
@@ -195,21 +193,3 @@ class SemanticScholarAPI(object):
         }, headers=SemanticScholarAPI.headers)
         # print(resp.status_code)
         return None if resp.status_code != 200 else resp.json()
-
-def fetch_paper(s2, paper_id):
-    os.makedirs('./datasets/s2/papers/{}/{}'.format(paper_id[0:2], paper_id), exist_ok=True)
-    paper_fn = './datasets/s2/papers/{}/{}/paper.json'.format(paper_id[0:2], paper_id)
-    if os.path.exists(paper_fn):
-        return read_json(paper_fn)
-    print(paper_id)
-    paper = s2.paper(paper_id)
-    if paper is None:
-        print("Got none paper??")
-        # time.sleep(random.randint(1, 2))
-        paper = s2.paper(paper_id)
-        if paper is None:
-            print("Paper not found")
-            return None
-    write_json(paper_fn, paper)
-    # time.sleep(random.randint(1, 2))
-    return paper
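
Note: the s2.py hunk only shows the tail of a request helper (return None unless HTTP 200). For reference, a standalone sketch of the paper lookup in the same style; the api.semanticscholar.org/v1 URL is an assumption about the endpoint the wrapper targets, not something visible in this diff:

    import requests

    # Assumed public endpoint; only the "None unless HTTP 200" idiom is from the diff.
    S2_API = 'https://api.semanticscholar.org/v1'

    def get_paper(paper_id):
        resp = requests.get('{}/paper/{}'.format(S2_API, paper_id))
        return None if resp.status_code != 200 else resp.json()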