update s2_search

author: Jules Laplace <julescarbon@gmail.com> 2018-12-07 15:25:06 +0100
committer: Jules Laplace <julescarbon@gmail.com> 2018-12-07 15:25:13 +0100
commit: 602527f66a438ffe9340299a242900057b175926 (patch)
tree: fd07a5ab955b2d8ad1b7a9d65c6fae9edd73c3d1
parent: 2d950c3fa3b8107f941a80f88127ab45e371d128 (diff)
2 files changed, 20 insertions, 9 deletions
diff --git a/scraper/README.md b/scraper/README.md
index 964a3ee3..a17f1efe 100644
--- a/scraper/README.md
+++ b/scraper/README.md
@@ -3,7 +3,7 @@
 ## installation
 
 ```
-conda create -n megapixels python=3.7
+conda create -n megapixels python=3.6
 pip install urllib3
 pip install requests
 pip install simplejson
diff --git a/scraper/s2-search.py b/scraper/s2-search.py
index 44bab08b..ddecf2f9 100644
--- a/scraper/s2-search.py
+++ b/scraper/s2-search.py
@@ -27,20 +27,31 @@ totalResults
 @click.option('--index', '-n', default=0, help='Index of CSV (query,)')
 def fetch_entries(index):
   keys, lines = read_citation_list(index)
+  citation_lookup = []
   s2 = SemanticScholarAPI()
   for line in lines:
-    label = line[0]
-    title = re.sub(r'[^-0-9a-zA-Z ]+', '', line[1])
-    entry_fn = './datasets/s2/entries/{}.json'.format(title)
-    if not os.path.exists(entry_fn):
+    key = line[0]
+    name = line[1]
+    title = re.sub(r'[^-0-9a-zA-Z ]+', '', line[2])
+    dump_fn = './datasets/s2/dumps/{}.json'.format(key)
+    entry_fn = './datasets/s2/entries/{}.json'.format(key)
+    result = None
+    if os.path.exists(entry_fn):
+      result = read_json(entry_fn)
+    else:
       results = s2.search(title)
       write_json(dump_fn, results)
       if len(results['results']) == 0:
-        print("No results for {}".format(title))
+        print("- {}".format(title))
       else:
-        print(title)
-        write_json(entry_fn, results['results'][0])
-      time.sleep(random.randint(10, 20))
+        print("+ {}".format(title))
+        result = results['results'][0]
+        write_json(entry_fn, result)
+    if result:
+      paper_id = result['id']
+      paper = fetch_paper(paper_id)
+      citation_lookup.append([key, name, title, paper_id])
+  write_csv("datasets/citation_lookup.csv", keys=['key', 'name', 'title', 'paper_id'], rows=citation_lookup)
 
 if __name__ == '__main__':
   fetch_entries()
author	Jules Laplace <julescarbon@gmail.com>	2018-12-07 15:25:06 +0100
committer	Jules Laplace <julescarbon@gmail.com>	2018-12-07 15:25:13 +0100
commit	602527f66a438ffe9340299a242900057b175926 (patch)
tree	fd07a5ab955b2d8ad1b7a9d65c6fae9edd73c3d1
parent	2d950c3fa3b8107f941a80f88127ab45e371d128 (diff)