import os
import time
import random
from urllib.parse import urlparse

import click

from s2 import SemanticScholarAPI
from util import *

s2 = SemanticScholarAPI()


@click.command()
@click.option('--fn', '-i', default='db_paper_doi.csv',
              help='Filename of CSV (paper_id, url)')
def fetch_doi_list(fn):
    """Fetch the DOI landing page for every (paper_id, url) row in the CSV."""
    lines = read_csv(fn, keys=False)
    for line in lines:
        paper_id, url = line
        if url:
            domain = fetch_doi(paper_id, url)
            print(domain)
    print("{} papers processed".format(len(lines)))


def fetch_doi(paper_id, url, replace=False):
    """Download the DOI page for one paper and record the resolved domain.

    Returns the domain of the final (post-redirect) URL, or None if the
    paper was already fetched or the download came back empty.
    """
    os.makedirs(make_doi_path(paper_id), exist_ok=True)
    doi_fn = make_doi_fn(paper_id)
    url_fn = make_url_fn(paper_id)
    txt_fn = make_txt_fn(paper_id)

    # When replacing, move any previous fetch aside before refetching.
    if replace and os.path.exists(doi_fn):
        os.rename(doi_fn, old_doi_fn(paper_id))
        if os.path.exists(url_fn):
            os.rename(url_fn, old_url_fn(paper_id))

    # Skip papers that already have a DOI page or an extracted text file.
    if os.path.exists(doi_fn) or os.path.exists(txt_fn):
        return None

    size, final_url = s2.fetch_doi(url, doi_fn)
    if size is None:
        print("{} empty?".format(paper_id))
        time.sleep(random.randint(2, 5))
        return None

    print("{} {} kb".format(paper_id, int(size / 1024)))
    domain = urlparse(final_url).netloc
    write_json(url_fn, {
        'paper_id': paper_id,
        'domain': domain,
    })
    # Randomized pause between requests to avoid hammering the server.
    time.sleep(random.randint(2, 5))
    return domain


# Papers are sharded into subdirectories by the first two characters of
# their id, e.g. ./datasets/s2/doi/ab/ab123.../paper.doi.
def make_doi_path(paper_id):
    return './datasets/s2/doi/{}/{}'.format(paper_id[0:2], paper_id)


def make_doi_fn(paper_id):
    return './datasets/s2/doi/{}/{}/paper.doi'.format(paper_id[0:2], paper_id)


def make_url_fn(paper_id):
    return './datasets/s2/doi/{}/{}/paper.url'.format(paper_id[0:2], paper_id)


def make_txt_fn(paper_id):
    return './datasets/s2/pdf/{}/{}/paper.txt'.format(paper_id[0:2], paper_id)


def old_doi_fn(paper_id):
    return './datasets/s2/doi/{}/{}/paper.doi2'.format(paper_id[0:2], paper_id)


def old_url_fn(paper_id):
    return './datasets/s2/doi/{}/{}/paper.url2'.format(paper_id[0:2], paper_id)


if __name__ == '__main__':
    fetch_doi_list()
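
# The `from util import *` above pulls in small I/O helpers not shown in this
# file. A minimal sketch of what this script assumes about them, based only on
# how they are called here (the real util module may differ):
#
#   def read_csv(fn, keys=True):
#       import csv
#       with open(fn, newline='') as f:
#           rows = list(csv.reader(f))
#       # keys=False: raw rows as lists; keys=True: dicts keyed by the header
#       if not keys:
#           return rows
#       header, body = rows[0], rows[1:]
#       return [dict(zip(header, row)) for row in body]
#
#   def write_json(fn, obj):
#       import simplejson as json
#       with open(fn, 'w') as f:
#           json.dump(obj, f)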