extract full pdf txt

author: Jules Laplace <julescarbon@gmail.com> 2019-02-19 18:11:51 +0100
committer: Jules Laplace <julescarbon@gmail.com> 2019-02-19 18:11:51 +0100
commit: 768757fe47d55b62c1d3ef87c982332e0292393e (patch)
tree: a2a8db79388da496508dba9b8bf05e61b120f68b /scraper/s2-extract-full-pdf-txt.py
parent: d87d52d4877977825dda1f617875625dd46fa8b9 (diff)
1 files changed, 36 insertions, 0 deletions
diff --git a/scraper/s2-extract-full-pdf-txt.py b/scraper/s2-extract-full-pdf-txt.py
new file mode 100644
index 00000000..c748b6a1
--- /dev/null
+++ b/scraper/s2-extract-full-pdf-txt.py
@@ -0,0 +1,36 @@
+import os
+import sys
+import csv
+import subprocess
+import time
+import random
+import re
+import simplejson as json
+import click
+import glob
+from multiprocessing import Pool
+from s2 import SemanticScholarAPI
+from util import *
+
+s2 = SemanticScholarAPI()  # client built at import time; not referenced by the code visible in this file
+
+@click.command()
+def extract_full_pdf_txt():
+  fns = []
+  for fn in glob.iglob('datasets/s2/pdf/*/*/*.pdf'):
+    path = os.path.dirname(fn).replace('pdf', 'txt')
+    out_fn = os.path.join(path, 'paper.txt')
+    if not os.path.exists(out_fn):
+      fns.append((fn, out_path, out_fn))
+  parallelize(extract_txt, fns)
+
+def extract_txt(in_fn, out_path, out_fn):
+  subprocess.call([
+    "pdf2txt.py",
+    in_fn,
+    '-o', out_fn,
+    '-O', out_path,
+  ])
+
+if __name__ == '__main__':  # run only when executed as a script, not on import
+  extract_full_pdf_txt()  # invoke the click command defined above
author	Jules Laplace <julescarbon@gmail.com>	2019-02-19 18:11:51 +0100
committer	Jules Laplace <julescarbon@gmail.com>	2019-02-19 18:11:51 +0100
commit	768757fe47d55b62c1d3ef87c982332e0292393e (patch)
tree	a2a8db79388da496508dba9b8bf05e61b120f68b /scraper/s2-extract-full-pdf-txt.py
parent	d87d52d4877977825dda1f617875625dd46fa8b9 (diff)