diff options
| author | adamhrv <adam@ahprojects.com> | 2019-02-09 14:02:22 +0100 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2019-02-09 14:02:22 +0100 |
| commit | 31305e9d9de9b7624cb9b2dfb462a3e68c120798 (patch) | |
| tree | 51b37499dc21bb600639d9a3e6050a5d01e2dbd1 /scraper/pdf_dump_all.sh | |
| parent | edc5e1542071fdc1a18a2bb1af2c2b5bed8be02a (diff) | |
| parent | 865be13c0d7e22db4f23f1d4dddc381e7392fe55 (diff) | |
Merge branch 'master' of github.com:adamhrv/megapixels_dev
Diffstat (limited to 'scraper/pdf_dump_all.sh')
| -rw-r--r-- | scraper/pdf_dump_all.sh | 20 |
1 files changed, 20 insertions, 0 deletions
#!/bin/bash
#
# pdf_dump_all.sh - extract text from every downloaded PDF.
#
# Walks datasets/s2/pdf/<dir>/<dir>/*.pdf (relative to the current working
# directory) and runs pdf2txt.py on each file, writing the text to the
# mirrored path under datasets/s2/txt/. PDFs whose .txt already exists are
# skipped. If the conversion leaves an empty (or missing) output file, the
# output is removed so that a later run tries that PDF again.
#
# Usage: run with no arguments from the directory that contains datasets/.
# Requires: pdf2txt.py on PATH.

#######################################
# Convert one PDF to text unless its output already exists.
# Arguments: $1 - path to the PDF (as produced by the glob in main)
# Outputs:   "found <txt>" or "rm empty <txt>" on stdout
# Returns:   0 (a failed conversion is reported, not fatal)
#######################################
convert_pdf() {
  local pdf=$1

  # foo.pdf -> foo.txt, then swap the first "pdf" in the path for "txt";
  # for paths coming from the glob below that is the datasets/s2/pdf/
  # directory component.
  local output="${pdf%.*}.txt"
  output="${output/pdf/txt}"

  # Directory part of the output path; also handed to pdf2txt.py via -O.
  # The path always contains a slash, so this matches what dirname returns.
  local imdir="${output%/*}"

  # Already converted on a previous run.
  if [[ -e "$output" ]]; then
    return 0
  fi

  # NOTE(review): pdf2txt.py presumably does not create the directory for
  # its -o file, so make sure the mirrored directory exists first.
  mkdir -p -- "$imdir"

  pdf2txt.py -o "$output" -O "$imdir" "$pdf"

  if [[ -s "$output" ]]; then
    echo "found $output"
  else
    # Nothing was extracted: drop the stub so the existence check above
    # does not skip this PDF next time.
    echo "rm empty $output"
    rm -f -- "$output"
  fi
}

#######################################
# Convert every PDF found under datasets/s2/pdf/*/*/.
#######################################
main() {
  local pdf
  for pdf in datasets/s2/pdf/*/*/*.pdf; do
    # When nothing matches, the glob stays as the literal pattern. Skip it:
    # acting on it would make the cleanup step target the whole txt tree.
    [[ -e "$pdf" ]] || continue
    convert_pdf "$pdf"
  done
}

main "$@"
