diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2019-02-19 18:11:51 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2019-02-19 18:11:51 +0100 |
| commit | 768757fe47d55b62c1d3ef87c982332e0292393e (patch) | |
| tree | a2a8db79388da496508dba9b8bf05e61b120f68b /scraper/s2-extract-full-pdf-txt.py | |
| parent | d87d52d4877977825dda1f617875625dd46fa8b9 (diff) | |
extract full pdf txt
Diffstat (limited to 'scraper/s2-extract-full-pdf-txt.py')
| -rw-r--r-- | scraper/s2-extract-full-pdf-txt.py | 36 |
1 files changed, 36 insertions, 0 deletions
# scraper/s2-extract-full-pdf-txt.py
"""Extract the text of every downloaded PDF into a parallel ``txt`` tree.

Each ``datasets/s2/pdf/*/*/*.pdf`` file is converted with the ``pdf2txt.py``
command-line tool, and the result is written to ``paper.txt`` in the
directory obtained by replacing ``pdf`` with ``txt`` in the PDF's directory
path.  PDFs whose ``paper.txt`` already exists are skipped, so the script can
be re-run to pick up where it left off.
"""
import os
import sys
import csv
import subprocess
import time
import random
import re
import simplejson as json
import click
import glob
from multiprocessing import Pool
from s2 import SemanticScholarAPI
from util import *

s2 = SemanticScholarAPI()


@click.command()
def extract_full_pdf_txt():
    """Queue every PDF that has no ``paper.txt`` yet and convert them all."""
    fns = []
    for fn in glob.iglob('datasets/s2/pdf/*/*/*.pdf'):
        # NOTE: str.replace swaps *every* 'pdf' substring in the directory
        # path, not only the top-level 'pdf' folder name.
        out_path = os.path.dirname(fn).replace('pdf', 'txt')
        out_fn = os.path.join(out_path, 'paper.txt')
        if not os.path.exists(out_fn):
            # The directory was previously stored in a variable named `path`
            # while `out_path` was appended here, raising NameError.
            fns.append((fn, out_path, out_fn))
    # `parallelize` comes from `util` (star import); presumably it unpacks
    # each tuple into extract_txt's three arguments -- confirm against util.
    parallelize(extract_txt, fns)


def extract_txt(in_fn, out_path, out_fn):
    """Run ``pdf2txt.py`` on one PDF.

    Args:
        in_fn: Path of the source PDF.
        out_path: Directory passed to ``pdf2txt.py`` via ``-O``.
        out_fn: Path of the text file passed to ``pdf2txt.py`` via ``-o``.

    The exit status of the subprocess is ignored, so a PDF that fails to
    convert does not abort the rest of the batch.
    """
    # Make sure the destination directory exists so the output file can be
    # created; exist_ok avoids failing when it is already there.
    os.makedirs(out_path, exist_ok=True)
    subprocess.call([
        "pdf2txt.py",
        in_fn,
        '-o', out_fn,
        '-O', out_path,
    ])


if __name__ == '__main__':
    extract_full_pdf_txt()
