import os
import time
import random

import click
from urllib.parse import urlparse

from s2 import SemanticScholarAPI
from util import *

s2 = SemanticScholarAPI()


@click.command()
@click.option('--fn', '-i', default='db_paper_doi.csv', help='Filename of CSV (id, url,)')
def fetch_doi_list(fn):
    """Fetch the DOI landing page for every (paper_id, url) row in the CSV."""
    lines = read_csv(fn, keys=False)
    for line in lines:
        paper_id, url = line
        if url:
            domain = fetch_doi(paper_id, url)
            print(domain)
    print("{} papers processed".format(len(lines)))


def fetch_doi(paper_id, url, replace=False):
    """Download the DOI landing page for one paper and record the final domain.

    Returns the resolved domain, or None if the paper was skipped or the
    download came back empty.
    """
    os.makedirs(make_doi_path(paper_id), exist_ok=True)
    doi_fn = make_doi_fn(paper_id)
    url_fn = make_url_fn(paper_id)
    txt_fn = make_txt_fn(paper_id)

    # When re-fetching, keep the previous download under the *.doi2/*.url2 names.
    if replace and os.path.exists(doi_fn):
        os.rename(doi_fn, old_doi_fn(paper_id))
        os.rename(url_fn, old_url_fn(paper_id))

    # Skip papers that were already fetched or already have extracted text.
    if os.path.exists(doi_fn) or os.path.exists(txt_fn):
        return None

    size, final_url = s2.fetch_doi(url, doi_fn)
    if size is None:
        print("{} empty?".format(paper_id))
        time.sleep(random.randint(2, 5))
        return None

    print("{} {} kb".format(paper_id, int(size / 1024)))
    domain = urlparse(final_url).netloc
    write_json(url_fn, {
        'paper_id': paper_id,
        'domain': domain
    })

    # Pause between requests to avoid hammering the publisher.
    time.sleep(random.randint(2, 5))
    return domain


def make_doi_path(paper_id):
    return './datasets/s2/doi/{}/{}'.format(paper_id[0:2], paper_id)


def make_doi_fn(paper_id):
    return './datasets/s2/doi/{}/{}/paper.doi'.format(paper_id[0:2], paper_id)


def make_url_fn(paper_id):
    return './datasets/s2/doi/{}/{}/paper.url'.format(paper_id[0:2], paper_id)


def make_txt_fn(paper_id):
    return './datasets/s2/pdf/{}/{}/paper.txt'.format(paper_id[0:2], paper_id)


def old_doi_fn(paper_id):
    return './datasets/s2/doi/{}/{}/paper.doi2'.format(paper_id[0:2], paper_id)


def old_url_fn(paper_id):
    return './datasets/s2/doi/{}/{}/paper.url2'.format(paper_id[0:2], paper_id)


if __name__ == '__main__':
    fetch_doi_list()