"""Extract plain text from every downloaded Semantic Scholar PDF.

Walks ``datasets/s2/pdf/*/*/*.pdf`` and, for each PDF that has no
``paper.txt`` in the mirrored ``datasets/s2/txt/...`` directory, runs the
``pdf2txt.py`` command-line tool to produce it.

Origin: scraper/s2-extract-full-pdf-txt.py, added in commit 768757fe
("extract full pdf txt", Jules Laplace, 2019-02-19).
"""
import os
import sys
import csv
import subprocess
import time
import random
import re
import glob
from multiprocessing import Pool

import simplejson as json
import click

from s2 import SemanticScholarAPI
from util import *

s2 = SemanticScholarAPI()


@click.command()
def extract_full_pdf_txt():
    """Queue every PDF that lacks a paper.txt and hand the batch to parallelize()."""
    fns = []
    for fn in glob.iglob('datasets/s2/pdf/*/*/*.pdf'):
        # Swap only the first "pdf" (the directory component in the fixed glob
        # prefix) so a "pdf" substring deeper in the path is left untouched.
        out_path = os.path.dirname(fn).replace('pdf', 'txt', 1)
        out_fn = os.path.join(out_path, 'paper.txt')
        # Skip PDFs whose text output already exists, so reruns only do new work.
        if not os.path.exists(out_fn):
            fns.append((fn, out_path, out_fn))
    # NOTE(review): parallelize is not defined in this file; presumably it comes
    # from `util` and unpacks each tuple into extract_txt's three arguments --
    # confirm against util.parallelize.
    parallelize(extract_txt, fns)


def extract_txt(in_fn, out_path, out_fn):
    """Run pdf2txt.py on one PDF, writing its text to out_fn.

    Args:
        in_fn: Path of the source PDF.
        out_path: Directory that receives the output; passed to pdf2txt.py as
            ``-O`` (per the pdfminer docs this is the directory for extracted
            images -- confirm for the installed version).
        out_fn: Path of the text file to create; passed as ``-o``.

    The exit status of pdf2txt.py is not checked, matching the original
    best-effort behavior.
    """
    # Create the output directory up front so pdf2txt.py can open out_fn.
    os.makedirs(out_path, exist_ok=True)
    subprocess.call([
        "pdf2txt.py",
        in_fn,
        '-o', out_fn,
        '-O', out_path,
    ])


if __name__ == '__main__':
    extract_full_pdf_txt()