diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2018-11-06 16:52:21 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-11-06 16:52:21 +0100 |
| commit | ffa32666f9a7a4a80c5725f67a8191c9c79ed01f (patch) | |
| tree | a74ae03842efe210501ebc066340e1b268804d83 | |
| parent | 0c0deb93edfaf50dc82b8a9118a4c00bdae1f043 (diff) | |
doi fetch script
| -rw-r--r-- | s2-fetch-doi.py | 16 | ||||
| -rw-r--r-- | s2.py | 2 |
2 files changed, 12 insertions, 6 deletions
```diff
diff --git a/s2-fetch-doi.py b/s2-fetch-doi.py
index a021fd2b..cd1d7999 100644
--- a/s2-fetch-doi.py
+++ b/s2-fetch-doi.py
@@ -17,9 +17,10 @@ s2 = SemanticScholarAPI()
 @click.option('--fn', '-i', default='db_paper_doi.csv', help='Filename of CSV (id, url,)')
 def fetch_doi_list(fn):
   lines = read_csv(fn, keys=False)
+  domains = []
   for line in lines:
     paper_id, url = line
-    fetch_doi(paper_id, url)
+    domain = fetch_doi(paper_id, url)
   print("{} papers processed".format(len(lines)))
 
 def fetch_doi(paper_id, url):
@@ -30,14 +31,19 @@ def fetch_doi(paper_id, url):
   if os.path.exists(doi_fn) or os.path.exists(txt_fn):
     # return read_json(doi_fn)
     return
-  size, final_url = s2.fetch_file(url, doi_fn)
+  size, final_url = s2.fetch_doi(url, doi_fn)
   if size is None:
     print("{} empty?".format(paper_id))
-    time.sleep(random.randint(5, 10))
+    time.sleep(random.randint(2, 5))
     return None
   print("{} {} kb".format(paper_id, int(size / 1024)))
-  time.sleep(random.randint(5, 10))
-  return
+  domain = urlparse(final_url).netloc
+  write_json(url_fn, {
+    'paper_id': paper_id,
+    'domain': domain
+  })
+  time.sleep(random.randint(2, 5))
+  return domain
   # return paper
 
 def make_doi_path(paper_id):
@@ -151,7 +151,7 @@ class SemanticScholarAPI(object):
         if chunk:
           size += len(chunk)
           f.write(chunk)
-    return size, response.url
+    return size, resp.url
 
   @staticmethod
   def paper(paper_id, **kwargs):
```
