import os
import sys
import csv
import subprocess
import time
import random
import re
import simplejson as json
import click
from multiprocessing import Pool
from s2 import SemanticScholarAPI
from util import *

# Shared API client; fetch_pdf() uses it to download files.
s2 = SemanticScholarAPI()


# Command-line entry point: read the (id, url) CSV and run fetch_pdf over its
# rows via parallelize().
# NOTE: documented with comments on purpose -- click shows a command's
# docstring as its --help text, so adding a docstring would change output.
# read_csv / parallelize are not defined in this file; presumably they come
# from the `util` star-import.
@click.command()
@click.option('--fn', '-i', default='db_paper_pdf.csv', help='Filename of CSV (id, url,)')
def fetch_pdfs(fn):
    rows = read_csv(fn, keys=False)
    parallelize(fetch_pdf, rows)
    print("{} papers processed".format(len(rows)))


def fetch_pdf(paper_id, url):
    """Download the PDF for one paper unless it has already been handled.

    The paper's directory is always created.  Nothing is fetched when the
    directory already holds ``paper.pdf``, ``paper.txt`` or the ``pdf.empty``
    marker.  Otherwise ``s2.fetch_file(url, pdf_fn)`` is called; if it
    returns ``None`` a ``pdf.empty`` JSON marker recording the paper id and
    URL is written, else the size in kb is printed.

    :param paper_id: paper identifier; its first two characters name the
        shard directory.
    :param url: location passed to ``s2.fetch_file``.
    :returns: ``None`` in every case.
    """
    os.makedirs(make_pdf_path(paper_id), exist_ok=True)

    pdf_fn = make_pdf_fn(paper_id)
    empty_fn = make_empty_fn(paper_id)
    handled_markers = (pdf_fn, make_txt_fn(paper_id), empty_fn)
    if any(os.path.exists(marker) for marker in handled_markers):
        return

    size = s2.fetch_file(url, pdf_fn)
    if size is not None:
        print("{} {} kb {}".format(paper_id, int(size / 1024), url))
        return

    # No size came back: leave a marker so this paper is skipped on reruns.
    print("{} empty?".format(paper_id))
    write_json(empty_fn, {'paper_id': paper_id, 'url': url})


def make_pdf_path(paper_id):
    """Return the directory for a paper, sharded by the id's first two chars."""
    shard = paper_id[:2]
    return './datasets/s2/pdf/{}/{}'.format(shard, paper_id)


def make_pdf_fn(paper_id):
    """Return the path of the paper's downloaded PDF file."""
    return '{}/paper.pdf'.format(make_pdf_path(paper_id))


def make_txt_fn(paper_id):
    """Return the path of the paper's ``paper.txt`` file."""
    return '{}/paper.txt'.format(make_pdf_path(paper_id))


def make_empty_fn(paper_id):
    """Return the path of the marker written when a fetch yields no size."""
    return '{}/pdf.empty'.format(make_pdf_path(paper_id))


if __name__ == '__main__':
    fetch_pdfs()