summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jules Laplace <julescarbon@gmail.com> 2018-11-06 16:52:21 +0100
committer: Jules Laplace <julescarbon@gmail.com> 2018-11-06 16:52:21 +0100
commit: ffa32666f9a7a4a80c5725f67a8191c9c79ed01f (patch)
tree: a74ae03842efe210501ebc066340e1b268804d83
parent: 0c0deb93edfaf50dc82b8a9118a4c00bdae1f043 (diff)
doi fetch script
-rw-r--r-- s2-fetch-doi.py 16
-rw-r--r-- s2.py 2
2 files changed, 12 insertions, 6 deletions
diff --git a/s2-fetch-doi.py b/s2-fetch-doi.py
index a021fd2b..cd1d7999 100644
--- a/s2-fetch-doi.py
+++ b/s2-fetch-doi.py
@@ -17,9 +17,10 @@ s2 = SemanticScholarAPI()
@click.option('--fn', '-i', default='db_paper_doi.csv', help='Filename of CSV (id, url,)')
def fetch_doi_list(fn):
lines = read_csv(fn, keys=False)
+ domains = []
for line in lines:
paper_id, url = line
- fetch_doi(paper_id, url)
+ domain = fetch_doi(paper_id, url)
print("{} papers processed".format(len(lines)))
def fetch_doi(paper_id, url):
@@ -30,14 +31,19 @@ def fetch_doi(paper_id, url):
if os.path.exists(doi_fn) or os.path.exists(txt_fn):
# return read_json(doi_fn)
return
- size, final_url = s2.fetch_file(url, doi_fn)
+ size, final_url = s2.fetch_doi(url, doi_fn)
if size is None:
print("{} empty?".format(paper_id))
- time.sleep(random.randint(5, 10))
+ time.sleep(random.randint(2, 5))
return None
print("{} {} kb".format(paper_id, int(size / 1024)))
- time.sleep(random.randint(5, 10))
- return
+ domain = urlparse(final_url).netloc
+ write_json(url_fn, {
+ 'paper_id': paper_id,
+ 'domain': domain
+ })
+ time.sleep(random.randint(2, 5))
+ return domain
# return paper
def make_doi_path(paper_id):
diff --git a/s2.py b/s2.py
index 8add3205..d1ee05d1 100644
--- a/s2.py
+++ b/s2.py
@@ -151,7 +151,7 @@ class SemanticScholarAPI(object):
if chunk:
size += len(chunk)
f.write(chunk)
- return size, response.url
+ return size, resp.url
@staticmethod
def paper(paper_id, **kwargs):