summary | refs | log | tree | commit | diff
path: root/scraper/pdf_dump_all.sh
diff options
context:
space:
mode:
author adamhrv <adam@ahprojects.com> 2019-02-09 14:02:22 +0100
committer adamhrv <adam@ahprojects.com> 2019-02-09 14:02:22 +0100
commit 31305e9d9de9b7624cb9b2dfb462a3e68c120798 (patch)
tree 51b37499dc21bb600639d9a3e6050a5d01e2dbd1 /scraper/pdf_dump_all.sh
parent edc5e1542071fdc1a18a2bb1af2c2b5bed8be02a (diff)
parent 865be13c0d7e22db4f23f1d4dddc381e7392fe55 (diff)
Merge branch 'master' of github.com:adamhrv/megapixels_dev
Diffstat (limited to 'scraper/pdf_dump_all.sh')
-rw-r--r-- scraper/pdf_dump_all.sh 20
1 files changed, 20 insertions, 0 deletions
diff --git a/scraper/pdf_dump_all.sh b/scraper/pdf_dump_all.sh
new file mode 100644
index 00000000..a17c8d44
--- /dev/null
+++ b/scraper/pdf_dump_all.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+for i in datasets/s2/pdf/*/*/*.pdf
+ do
+ OUTPUT="${i%.*}.txt"
+ OUTPUT="${OUTPUT/pdf/txt}"
+ IMDIR=`dirname ${OUTPUT}`
+ if [[ ! -e $OUTPUT ]]
+ then
+ pdf2txt.py -o "${OUTPUT}" -O "${IMDIR}" "${i}"
+ if [ -s $OUTPUT ]
+ then
+ echo "found $OUTPUT"
+ else
+ echo "rm empty $OUTPUT"
+ rm -f $OUTPUT
+ fi
+ fi
+ done
+