diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2018-12-07 15:25:06 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-12-07 15:25:13 +0100 |
| commit | 602527f66a438ffe9340299a242900057b175926 (patch) | |
| tree | fd07a5ab955b2d8ad1b7a9d65c6fae9edd73c3d1 | |
| parent | 2d950c3fa3b8107f941a80f88127ab45e371d128 (diff) | |
update s2_search
| -rw-r--r-- | scraper/README.md | 2 | ||||
| -rw-r--r-- | scraper/s2-search.py | 27 |
2 files changed, 20 insertions, 9 deletions
````diff
diff --git a/scraper/README.md b/scraper/README.md
index 964a3ee3..a17f1efe 100644
--- a/scraper/README.md
+++ b/scraper/README.md
@@ -3,7 +3,7 @@
 ## installation
 
 ```
-conda create -n megapixels python=3.7
+conda create -n megapixels python=3.6
 pip install urllib3
 pip install requests
 pip install simplejson
diff --git a/scraper/s2-search.py b/scraper/s2-search.py
index 44bab08b..ddecf2f9 100644
--- a/scraper/s2-search.py
+++ b/scraper/s2-search.py
@@ -27,20 +27,31 @@ totalResults
 @click.option('--index', '-n', default=0, help='Index of CSV (query,)')
 def fetch_entries(index):
   keys, lines = read_citation_list(index)
+  citation_lookup = []
   s2 = SemanticScholarAPI()
   for line in lines:
-    label = line[0]
-    title = re.sub(r'[^-0-9a-zA-Z ]+', '', line[1])
-    entry_fn = './datasets/s2/entries/{}.json'.format(title)
-    if not os.path.exists(entry_fn):
+    key = line[0]
+    name = line[1]
+    title = re.sub(r'[^-0-9a-zA-Z ]+', '', line[2])
+    dump_fn = './datasets/s2/dumps/{}.json'.format(key)
+    entry_fn = './datasets/s2/entries/{}.json'.format(key)
+    result = None
+    if os.path.exists(entry_fn):
+      result = read_json(entry_fn)
+    else:
       results = s2.search(title)
       write_json(dump_fn, results)
       if len(results['results']) == 0:
-        print("No results for {}".format(title))
+        print("- {}".format(title))
       else:
-        print(title)
-        write_json(entry_fn, results['results'][0])
-      time.sleep(random.randint(10, 20))
+        print("+ {}".format(title))
+        result = results['results'][0]
+        write_json(entry_fn, result)
+    if result:
+      paper_id = result['id']
+      paper = fetch_paper(paper_id)
+      citation_lookup.append([key, name, title, paper_id])
+  write_csv("datasets/citation_lookup.csv", keys=['key', 'name', 'title', 'paper_id'], rows=citation_lookup)
 
 if __name__ == '__main__':
   fetch_entries()
````
