diff options
Diffstat (limited to 'scraper/s2-extract-full-pdf-txt.py')
| -rw-r--r-- | scraper/s2-extract-full-pdf-txt.py | 36 |
1 files changed, 36 insertions, 0 deletions
# scraper/s2-extract-full-pdf-txt.py
"""Convert downloaded PDFs to plain text with pdfminer's ``pdf2txt.py``.

Walks ``datasets/s2/pdf/*/*/*.pdf`` and, for every PDF that does not yet
have a matching ``paper.txt`` under the parallel ``txt`` directory tree,
runs ``pdf2txt.py`` to produce it.
"""
import os
import sys
import csv
import subprocess
import time
import random
import re
import simplejson as json
import click
import glob
from multiprocessing import Pool
from s2 import SemanticScholarAPI
from util import *

s2 = SemanticScholarAPI()


@click.command()
def extract_full_pdf_txt():
    """Queue every PDF lacking a ``paper.txt`` and extract them all."""
    fns = []
    for fn in glob.iglob('datasets/s2/pdf/*/*/*.pdf'):
        # Mirror the PDF's directory into the "txt" tree. The original code
        # stored this in `path` but then appended an undefined `out_path`,
        # raising NameError on the first unprocessed PDF.
        out_path = os.path.dirname(fn).replace('pdf', 'txt')
        out_fn = os.path.join(out_path, 'paper.txt')
        # Skip PDFs that were already converted so reruns are incremental.
        if not os.path.exists(out_fn):
            fns.append((fn, out_path, out_fn))
    # `parallelize` comes from `util` (not visible here); presumably it
    # unpacks each tuple into extract_txt's three arguments -- TODO confirm.
    parallelize(extract_txt, fns)


def extract_txt(in_fn, out_path, out_fn):
    """Run ``pdf2txt.py`` on one PDF.

    Args:
        in_fn: Path of the source PDF.
        out_path: Directory passed to ``pdf2txt.py`` via ``-O``.
        out_fn: Text file passed to ``pdf2txt.py`` via ``-o``.

    The exit status of the subprocess is not checked.
    """
    subprocess.call([
        "pdf2txt.py",
        in_fn,
        '-o', out_fn,
        '-O', out_path,
    ])


if __name__ == '__main__':
    extract_full_pdf_txt()
