summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- scraper/s2-extract-pdf-txt.py 37
1 files changed, 37 insertions, 0 deletions
diff --git a/scraper/s2-extract-pdf-txt.py b/scraper/s2-extract-pdf-txt.py
new file mode 100644
index 00000000..ab38202a
--- /dev/null
+++ b/scraper/s2-extract-pdf-txt.py
@@ -0,0 +1,37 @@
+import os
+import sys
+import csv
+import subprocess
+import time
+import random
+import re
+import simplejson as json
+import click
+import glob
+from multiprocessing import Pool
+from s2 import SemanticScholarAPI
+from util import *
+
+# Module-level instance of SemanticScholarAPI (imported from s2 above).
+# NOTE(review): nothing in the code visible in this file references `s2` --
+# confirm whether it is still needed before removing it.
+s2 = SemanticScholarAPI()
+
+@click.command()
+def extract_pdf_txt(fn):
+ fns = []
+ for fn in glob.iglob('datasets/s2/pdf/*/*/*.pdf'):
+ path = os.path.pathname(fn)
+ out_fn = os.path.join(path, 'paper.txt')
+ if not os.path.exists():
+ fns.append((fn, out_fn))
+ parallelize(extract_txt, paths)
+
+def extract_txt(in_fn, out_fn):
+ subprocess.call([
+ "pdf2txt.py",
+ '-p', '1', in_fn,
+ '-o', out_fn,
+ ])
+
+ -p 1 fn > $OUTPUT
+
+# Invoke the click command only when this file is executed as a script.
+if __name__ == '__main__':
+ extract_pdf_txt()