From 0b8469cf54a1a895ab549a1703d034cbd2030598 Mon Sep 17 00:00:00 2001
From: Jules Laplace
Date: Sun, 10 Feb 2019 17:24:16 +0100
Subject: s2-extract-pdf-txt.py

---
 scraper/s2-extract-pdf-txt.py | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 scraper/s2-extract-pdf-txt.py

diff --git a/scraper/s2-extract-pdf-txt.py b/scraper/s2-extract-pdf-txt.py
new file mode 100644
index 00000000..ab38202a
--- /dev/null
+++ b/scraper/s2-extract-pdf-txt.py
@@ -0,0 +1,53 @@
+import os
+import sys
+import csv
+import subprocess
+import time
+import random
+import re
+import simplejson as json
+import click
+import glob
+from multiprocessing import Pool
+from s2 import SemanticScholarAPI
+from util import *
+
+s2 = SemanticScholarAPI()
+
+
+@click.command()
+def extract_pdf_txt():
+    """Extract text from every downloaded PDF that has no paper.txt yet.
+
+    Scans datasets/s2/pdf/*/*/*.pdf and, for each PDF whose directory does
+    not already contain a paper.txt, queues an (input, output) pair that is
+    handed to extract_txt via parallelize.
+    """
+    fns = []
+    for fn in glob.iglob('datasets/s2/pdf/*/*/*.pdf'):
+        out_fn = os.path.join(os.path.dirname(fn), 'paper.txt')
+        # Skip PDFs that were already converted so the script can be re-run.
+        if not os.path.exists(out_fn):
+            fns.append((fn, out_fn))
+    # NOTE(review): parallelize comes from util (not shown here); this assumes
+    # it unpacks each (in_fn, out_fn) tuple into extract_txt's two arguments.
+    parallelize(extract_txt, fns)
+
+
+def extract_txt(in_fn, out_fn):
+    """Run pdf2txt.py on in_fn with `-p 1`, writing the text to out_fn.
+
+    Shell equivalent: pdf2txt.py -p 1 -o out_fn in_fn
+    """
+    # Options go before the input file so that they are parsed as options
+    # even by getopt-style parsers that stop at the first positional argument.
+    subprocess.call([
+        "pdf2txt.py",
+        '-p', '1',
+        '-o', out_fn,
+        in_fn,
+    ])
+
+
+if __name__ == '__main__':
+    extract_pdf_txt()
-- 
cgit v1.2.3-70-g09d2