summary | refs | log | tree | commit | diff
path: root/scraper/s2-extract-full-pdf-txt.py
diff options
context:
space:
mode:
author: Jules Laplace <julescarbon@gmail.com> 2019-02-19 18:11:51 +0100
committer: Jules Laplace <julescarbon@gmail.com> 2019-02-19 18:11:51 +0100
commit: 768757fe47d55b62c1d3ef87c982332e0292393e (patch)
tree: a2a8db79388da496508dba9b8bf05e61b120f68b /scraper/s2-extract-full-pdf-txt.py
parent: d87d52d4877977825dda1f617875625dd46fa8b9 (diff)
extract full pdf txt
Diffstat (limited to 'scraper/s2-extract-full-pdf-txt.py')
-rw-r--r-- scraper/s2-extract-full-pdf-txt.py 36
1 files changed, 36 insertions, 0 deletions
diff --git a/scraper/s2-extract-full-pdf-txt.py b/scraper/s2-extract-full-pdf-txt.py
new file mode 100644
index 00000000..c748b6a1
--- /dev/null
+++ b/scraper/s2-extract-full-pdf-txt.py
@@ -0,0 +1,36 @@
+import os
+import sys
+import csv
+import subprocess
+import time
+import random
+import re
+import simplejson as json
+import click
+import glob
+from multiprocessing import Pool
+from s2 import SemanticScholarAPI
+from util import *
+
+# Module-level SemanticScholarAPI client, created at import time.
+# NOTE(review): nothing in the code visible in this file references `s2` -- confirm whether it is still needed.
+s2 = SemanticScholarAPI()
+
+@click.command()
+def extract_full_pdf_txt():
+    """Extract text from every downloaded PDF that has no paper.txt yet.
+
+    Globs ``datasets/s2/pdf/*/*/*.pdf``. For each PDF the output directory
+    is the PDF's directory with ``pdf`` replaced by ``txt``, and the target
+    file is ``paper.txt`` inside that directory. PDFs whose target file
+    already exists are skipped. The remaining
+    ``(pdf_fn, out_path, out_fn)`` tuples are handed to ``parallelize``
+    together with ``extract_txt``.
+    """
+    fns = []
+    for fn in glob.iglob('datasets/s2/pdf/*/*/*.pdf'):
+        # NOTE: str.replace rewrites every 'pdf' substring of the directory
+        # path, not only the top-level 'pdf' folder component.
+        out_path = os.path.dirname(fn).replace('pdf', 'txt')
+        out_fn = os.path.join(out_path, 'paper.txt')
+        if not os.path.exists(out_fn):
+            # The directory used to be bound to `path` while this tuple read
+            # the undefined name `out_path`, raising NameError on the first
+            # PDF that still needed extraction.
+            fns.append((fn, out_path, out_fn))
+    # `parallelize` is not defined in this file; presumably it comes from
+    # `from util import *` -- verify how it unpacks each tuple.
+    parallelize(extract_txt, fns)
+
+def extract_txt(in_fn, out_path, out_fn):
+    """Run the ``pdf2txt.py`` command on a single PDF.
+
+    ``in_fn`` is passed as the input file, ``out_fn`` through ``-o`` and
+    ``out_path`` through ``-O``. The command's exit status is not inspected
+    and nothing is returned.
+    """
+    command = ["pdf2txt.py", in_fn]
+    command += ['-o', out_fn]
+    command += ['-O', out_path]
+    subprocess.call(command)
+
+# Script entry point: invoke the click command only when this file is run directly.
+if __name__ == '__main__':
+ extract_full_pdf_txt()