diff options
| -rw-r--r-- | scraper/README.md | 2 | ||||
| -rw-r--r-- | scraper/s2-search.py | 27 |
2 files changed, 20 insertions, 9 deletions
diff --git a/scraper/README.md b/scraper/README.md index 964a3ee3..a17f1efe 100644 --- a/scraper/README.md +++ b/scraper/README.md @@ -3,7 +3,7 @@ ## installation ``` -conda create -n megapixels python=3.7 +conda create -n megapixels python=3.6 pip install urllib3 pip install requests pip install simplejson diff --git a/scraper/s2-search.py b/scraper/s2-search.py index 44bab08b..ddecf2f9 100644 --- a/scraper/s2-search.py +++ b/scraper/s2-search.py @@ -27,20 +27,31 @@ totalResults @click.option('--index', '-n', default=0, help='Index of CSV (query,)') def fetch_entries(index): keys, lines = read_citation_list(index) + citation_lookup = [] s2 = SemanticScholarAPI() for line in lines: - label = line[0] - title = re.sub(r'[^-0-9a-zA-Z ]+', '', line[1]) - entry_fn = './datasets/s2/entries/{}.json'.format(title) - if not os.path.exists(entry_fn): + key = line[0] + name = line[1] + title = re.sub(r'[^-0-9a-zA-Z ]+', '', line[2]) + dump_fn = './datasets/s2/dumps/{}.json'.format(key) + entry_fn = './datasets/s2/entries/{}.json'.format(key) + result = None + if os.path.exists(entry_fn): + result = read_json(entry_fn) + else: results = s2.search(title) write_json(dump_fn, results) if len(results['results']) == 0: - print("No results for {}".format(title)) + print("- {}".format(title)) else: - print(title) - write_json(entry_fn, results['results'][0]) - time.sleep(random.randint(10, 20)) + print("+ {}".format(title)) + result = results['results'][0] + write_json(entry_fn, result) + if result: + paper_id = result['id'] + paper = fetch_paper(paper_id) + citation_lookup.append([key, name, title, paper_id]) + write_csv("datasets/citation_lookup.csv", keys=['key', 'name', 'title', 'paper_id'], rows=citation_lookup) if __name__ == '__main__': fetch_entries() |
