diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2018-12-07 17:17:29 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-12-07 17:17:29 +0100 |
| commit | 927fd8825101749cc8fcdc14f05ffd50d14ed652 (patch) | |
| tree | a4f19a0a4284e15c2ccb9698c20a86fc3832430f /scraper | |
| parent | f9d7d6d4b63fc688d5e7fb249e51a97d8d806e7d (diff) | |
cleaning up
Diffstat (limited to 'scraper')
| -rw-r--r-- | scraper/README.md | 6 | ||||
| -rw-r--r-- | scraper/s2-search.py | 27 | ||||
| -rw-r--r-- | scraper/s2.py | 20 |
3 files changed, 24 insertions, 29 deletions
diff --git a/scraper/README.md b/scraper/README.md index a17f1efe..782fa30a 100644 --- a/scraper/README.md +++ b/scraper/README.md @@ -38,11 +38,7 @@ http://labs.semanticscholar.org/corpus/ ### s2-search.py -Loads titles from citations file and queries the S2 search API to get paper IDs. - -### s2-papers.py - -Uses the paper IDs from the search entries to query the S2 papers API to get first-degree citations, authors, etc. +Loads titles from citations file and queries the S2 search API to get paper IDs, then uses the paper IDs from the search entries to query the S2 papers API to get first-degree citations, authors, etc. ### s2-dump-ids.py diff --git a/scraper/s2-search.py b/scraper/s2-search.py index 169a8d19..db5731d5 100644 --- a/scraper/s2-search.py +++ b/scraper/s2-search.py @@ -7,7 +7,7 @@ import random import re import simplejson as json import click -from s2 import SemanticScholarAPI, fetch_paper +from s2 import SemanticScholarAPI from util import * ''' @@ -32,8 +32,9 @@ def fetch_entries(index): for line in lines: key = line[0] name = line[1] - title = re.sub(r'[^-0-9a-zA-Z ]+', '', line[2]) - if len(title) < 2: + title = line[2].trim() + clean_title = re.sub(r'[^-0-9a-zA-Z ]+', '', line[2]) + if len(clean_title) < 2: continue dump_fn = './datasets/s2/dumps/{}.json'.format(key) entry_fn = './datasets/s2/entries/{}.json'.format(key) @@ -41,7 +42,7 @@ def fetch_entries(index): if os.path.exists(entry_fn): result = read_json(entry_fn) else: - results = s2.search(title) + results = s2.search(clean_title) write_json(dump_fn, results) if len(results['results']) == 0: print("- {}".format(title)) @@ -55,5 +56,23 @@ def fetch_entries(index): citation_lookup.append([key, name, title, paper_id]) write_csv("datasets/citation_lookup.csv", keys=['key', 'name', 'title', 'paper_id'], rows=citation_lookup) +def fetch_paper(s2, paper_id): + os.makedirs('./datasets/s2/papers/{}/{}'.format(paper_id[0:2], paper_id), exist_ok=True) + paper_fn = './datasets/s2/papers/{}/{}/paper.json'.format(paper_id[0:2], paper_id) + if os.path.exists(paper_fn): + return read_json(paper_fn) + print(paper_id) + paper = s2.paper(paper_id) + if paper is None: + print("Got none paper??") + # time.sleep(random.randint(1, 2)) + paper = s2.paper(paper_id) + if paper is None: + print("Paper not found") + return None + write_json(paper_fn, paper) + # time.sleep(random.randint(1, 2)) + return paper + if __name__ == '__main__': fetch_entries() diff --git a/scraper/s2.py b/scraper/s2.py index 4fdd5f28..b1b9742c 100644 --- a/scraper/s2.py +++ b/scraper/s2.py @@ -1,7 +1,5 @@ import os import requests -import time -import random from util import * class AuthorStub(object): @@ -195,21 +193,3 @@ class SemanticScholarAPI(object): }, headers=SemanticScholarAPI.headers) # print(resp.status_code) return None if resp.status_code != 200 else resp.json() - -def fetch_paper(s2, paper_id): - os.makedirs('./datasets/s2/papers/{}/{}'.format(paper_id[0:2], paper_id), exist_ok=True) - paper_fn = './datasets/s2/papers/{}/{}/paper.json'.format(paper_id[0:2], paper_id) - if os.path.exists(paper_fn): - return read_json(paper_fn) - print(paper_id) - paper = s2.paper(paper_id) - if paper is None: - print("Got none paper??") - # time.sleep(random.randint(1, 2)) - paper = s2.paper(paper_id) - if paper is None: - print("Paper not found") - return None - write_json(paper_fn, paper) - # time.sleep(random.randint(1, 2)) - return paper |
