summaryrefslogtreecommitdiff
path: root/scraper/s2-extract-pdf-txt.py
diff options
context:
space:
mode:
author Jules Laplace <julescarbon@gmail.com> 2019-02-10 17:24:16 +0100
committer Jules Laplace <julescarbon@gmail.com> 2019-02-10 17:24:16 +0100
commit 0b8469cf54a1a895ab549a1703d034cbd2030598 (patch)
tree 314d9cc05adbe22d114df05657e78cd4d80cfae2 /scraper/s2-extract-pdf-txt.py
parent 7fe9d36385a2350c9c4fd5dec105e34d85fe637a (diff)
s2-extract-pdf-txt.py
Diffstat (limited to 'scraper/s2-extract-pdf-txt.py')
-rw-r--r--scraper/s2-extract-pdf-txt.py37
1 files changed, 37 insertions, 0 deletions
diff --git a/scraper/s2-extract-pdf-txt.py b/scraper/s2-extract-pdf-txt.py
new file mode 100644
index 00000000..ab38202a
--- /dev/null
+++ b/scraper/s2-extract-pdf-txt.py
@@ -0,0 +1,37 @@
+import os
+import sys
+import csv
+import subprocess
+import time
+import random
+import re
+import simplejson as json
+import click
+import glob
+from multiprocessing import Pool
+from s2 import SemanticScholarAPI
+from util import *
+
+s2 = SemanticScholarAPI()  # instantiated at import time; not referenced by the functions visible in this patch
+
+@click.command()
+def extract_pdf_txt(fn):
+ fns = []
+ for fn in glob.iglob('datasets/s2/pdf/*/*/*.pdf'):
+ path = os.path.pathname(fn)
+ out_fn = os.path.join(path, 'paper.txt')
+ if not os.path.exists():
+ fns.append((fn, out_fn))
+ parallelize(extract_txt, paths)
+
+def extract_txt(in_fn, out_fn):
+ subprocess.call([
+ "pdf2txt.py",
+ '-p', '1', in_fn,
+ '-o', out_fn,
+ ])
+
+ -p 1 fn > $OUTPUT
+
+if __name__ == '__main__':
+ extract_pdf_txt()