diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2019-02-10 17:24:16 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2019-02-10 17:24:16 +0100 |
| commit | 0b8469cf54a1a895ab549a1703d034cbd2030598 (patch) | |
| tree | 314d9cc05adbe22d114df05657e78cd4d80cfae2 /scraper | |
| parent | 7fe9d36385a2350c9c4fd5dec105e34d85fe637a (diff) | |
s2-extract-pdf-txt.py
Diffstat (limited to 'scraper')
| -rw-r--r-- | scraper/s2-extract-pdf-txt.py | 37 |
1 files changed, 37 insertions, 0 deletions
# scraper/s2-extract-pdf-txt.py
"""Extract text from downloaded PDFs by shelling out to pdf2txt.py.

Scans datasets/s2/pdf/*/*/*.pdf and, for every PDF that does not yet have a
paper.txt in the same directory, runs pdf2txt.py to create one.
"""

import os
import sys
import csv
import subprocess
import time
import random
import re
import simplejson as json
import click
import glob
from multiprocessing import Pool
from s2 import SemanticScholarAPI
from util import *

s2 = SemanticScholarAPI()


@click.command()
def extract_pdf_txt():
    """Run pdf2txt.py on every PDF that has no paper.txt beside it."""
    # The command declares no click arguments, so the callback takes none.
    jobs = []
    for pdf_fn in glob.iglob('datasets/s2/pdf/*/*/*.pdf'):
        # The text file is written next to the PDF it was extracted from.
        out_fn = os.path.join(os.path.dirname(pdf_fn), 'paper.txt')
        # An existing paper.txt marks the PDF as already processed.
        if not os.path.exists(out_fn):
            jobs.append((pdf_fn, out_fn))
    if not jobs:
        # Nothing left to extract; skip starting any workers.
        return
    # NOTE(review): parallelize comes from `util` (star import). This assumes
    # it unpacks each (in_fn, out_fn) tuple into extract_txt's two arguments
    # -- confirm against util.parallelize.
    parallelize(extract_txt, jobs)


def extract_txt(in_fn, out_fn):
    """Run pdf2txt.py on one PDF and write its output to out_fn.

    Args:
        in_fn: path of the PDF to read.
        out_fn: path of the text file to write.

    Returns:
        The exit status of the pdf2txt.py process (0 on success).
    """
    # `-p 1` presumably limits extraction to page 1 -- confirm against the
    # installed pdf2txt.py. Options are placed before the input path so that
    # a getopt-style parser, which stops at the first positional argument,
    # still sees `-o`.
    status = subprocess.call([
        'pdf2txt.py',
        '-p', '1',
        '-o', out_fn,
        in_fn,
    ])
    if status != 0:
        # Report failures instead of silently dropping the exit status.
        sys.stderr.write(
            'pdf2txt.py exited with status {} for {}\n'.format(status, in_fn)
        )
    return status


if __name__ == '__main__':
    extract_pdf_txt()
