summary | refs | log | tree | commit | diff
path: root/scraper/s2-search.py
diff options
context:
space:
mode:
author: Jules Laplace <julescarbon@gmail.com> 2018-12-07 15:25:06 +0100
committer: Jules Laplace <julescarbon@gmail.com> 2018-12-07 15:25:13 +0100
commit: 602527f66a438ffe9340299a242900057b175926 (patch)
tree: fd07a5ab955b2d8ad1b7a9d65c6fae9edd73c3d1 /scraper/s2-search.py
parent: 2d950c3fa3b8107f941a80f88127ab45e371d128 (diff)
update s2_search
Diffstat (limited to 'scraper/s2-search.py')
-rw-r--r-- scraper/s2-search.py | 27
1 files changed, 19 insertions, 8 deletions
diff --git a/scraper/s2-search.py b/scraper/s2-search.py
index 44bab08b..ddecf2f9 100644
--- a/scraper/s2-search.py
+++ b/scraper/s2-search.py
@@ -27,20 +27,31 @@ totalResults
@click.option('--index', '-n', default=0, help='Index of CSV (query,)')
def fetch_entries(index):
keys, lines = read_citation_list(index)
+ citation_lookup = []
s2 = SemanticScholarAPI()
for line in lines:
- label = line[0]
- title = re.sub(r'[^-0-9a-zA-Z ]+', '', line[1])
- entry_fn = './datasets/s2/entries/{}.json'.format(title)
- if not os.path.exists(entry_fn):
+ key = line[0]
+ name = line[1]
+ title = re.sub(r'[^-0-9a-zA-Z ]+', '', line[2])
+ dump_fn = './datasets/s2/dumps/{}.json'.format(key)
+ entry_fn = './datasets/s2/entries/{}.json'.format(key)
+ result = None
+ if os.path.exists(entry_fn):
+ result = read_json(entry_fn)
+ else:
results = s2.search(title)
write_json(dump_fn, results)
if len(results['results']) == 0:
- print("No results for {}".format(title))
+ print("- {}".format(title))
else:
- print(title)
- write_json(entry_fn, results['results'][0])
- time.sleep(random.randint(10, 20))
+ print("+ {}".format(title))
+ result = results['results'][0]
+ write_json(entry_fn, result)
+ if result:
+ paper_id = result['id']
+ paper = fetch_paper(paper_id)
+ citation_lookup.append([key, name, title, paper_id])
+ write_csv("datasets/citation_lookup.csv", keys=['key', 'name', 'title', 'paper_id'], rows=citation_lookup)
if __name__ == '__main__':
fetch_entries()