diff options
| -rw-r--r-- | s2-fetch-doi.py | 16 | ||||
| -rw-r--r-- | s2.py | 2 |
2 files changed, 12 insertions, 6 deletions
diff --git a/s2-fetch-doi.py b/s2-fetch-doi.py
index a021fd2b..cd1d7999 100644
--- a/s2-fetch-doi.py
+++ b/s2-fetch-doi.py
@@ -17,9 +17,10 @@ s2 = SemanticScholarAPI()
 @click.option('--fn', '-i', default='db_paper_doi.csv', help='Filename of CSV (id, url,)')
 def fetch_doi_list(fn):
     lines = read_csv(fn, keys=False)
+    domains = []
     for line in lines:
         paper_id, url = line
-        fetch_doi(paper_id, url)
+        domain = fetch_doi(paper_id, url)
     print("{} papers processed".format(len(lines)))
 
 def fetch_doi(paper_id, url):
@@ -30,14 +31,19 @@ def fetch_doi(paper_id, url):
     if os.path.exists(doi_fn) or os.path.exists(txt_fn):
         # return read_json(doi_fn)
         return
-    size, final_url = s2.fetch_file(url, doi_fn)
+    size, final_url = s2.fetch_doi(url, doi_fn)
     if size is None:
         print("{} empty?".format(paper_id))
-        time.sleep(random.randint(5, 10))
+        time.sleep(random.randint(2, 5))
         return None
     print("{} {} kb".format(paper_id, int(size / 1024)))
-    time.sleep(random.randint(5, 10))
-    return
+    domain = urlparse(final_url).netloc
+    write_json(url_fn, {
+        'paper_id': paper_id,
+        'domain': domain
+    })
+    time.sleep(random.randint(2, 5))
+    return domain
     # return paper
 
 def make_doi_path(paper_id):
diff --git a/s2.py b/s2.py
--- a/s2.py
+++ b/s2.py
@@ -151,7 +151,7 @@ class SemanticScholarAPI(object):
             if chunk:
                 size += len(chunk)
                 f.write(chunk)
-        return size, response.url
+        return size, resp.url
 
     @staticmethod
     def paper(paper_id, **kwargs):
