"""Convert downloaded PDFs under datasets/s2/pdf into plain-text files.

For every PDF matched by the glob pattern below, a ``paper.txt`` is written
to the mirrored directory under ``txt`` by invoking ``pdf2txt.py``. PDFs
whose ``paper.txt`` already exists are skipped.
"""
import csv
import glob
import os
import random
import re
import subprocess
import sys
import time
from multiprocessing import Pool

import click
import simplejson as json

from s2 import SemanticScholarAPI
from util import *

s2 = SemanticScholarAPI()


@click.command()
def extract_full_pdf_txt():
    """Run pdf2txt.py on every PDF that does not yet have a paper.txt."""
    fns = []
    for fn in glob.iglob('datasets/s2/pdf/*/*/*.pdf'):
        # Replace only the first 'pdf': given the glob pattern above, that is
        # always the 'datasets/s2/pdf' component. An unbounded replace would
        # also rewrite any wildcard directory whose name contains 'pdf'.
        out_path = os.path.dirname(fn).replace('pdf', 'txt', 1)
        out_fn = os.path.join(out_path, 'paper.txt')
        if not os.path.exists(out_fn):
            fns.append((fn, out_path, out_fn))
    # NOTE(review): parallelize is provided by util (star import); presumably
    # it unpacks each tuple into extract_txt's three arguments -- confirm.
    parallelize(extract_txt, fns)


def extract_txt(in_fn, out_path, out_fn):
    """Extract the text of one PDF with pdf2txt.py.

    Args:
        in_fn: Path of the source PDF.
        out_path: Directory passed to pdf2txt.py via ``-O``; created here if
            it does not exist.
        out_fn: Path of the text file passed to pdf2txt.py via ``-o``.

    A failure is reported on stderr instead of raising, so one bad PDF does
    not abort the rest of the batch.
    """
    # Create the output directory up front rather than relying on pdf2txt.py
    # to do it. Another worker may create a shared parent concurrently, so an
    # OSError is only a problem if the directory still does not exist.
    try:
        os.makedirs(out_path)
    except OSError:
        if not os.path.isdir(out_path):
            sys.stderr.write(
                'could not create output directory %s for %s\n'
                % (out_path, in_fn))
            return

    status = subprocess.call([
        "pdf2txt.py",
        in_fn,
        '-o', out_fn,
        '-O', out_path,
    ])
    if status != 0:
        # Surface failed conversions; previously the exit status was dropped.
        sys.stderr.write(
            'pdf2txt.py exited with status %d for %s\n' % (status, in_fn))


if __name__ == '__main__':
    extract_full_pdf_txt()