diff options
| author | jules@lens <julescarbon@gmail.com> | 2019-02-08 23:23:45 +0100 |
|---|---|---|
| committer | jules@lens <julescarbon@gmail.com> | 2019-02-08 23:23:45 +0100 |
| commit | 410b7c88aaaccb2ceaf778015cb5a696b561e03c (patch) | |
| tree | 1b1ee808ecf6378dc9859acac2fc656cf05fa12f | |
| parent | 14369a4f4a9c411ab4a8e4759e3ef4d5e1ad21cb (diff) | |
pdf_dump_all.sh
| -rw-r--r-- | scraper/pdf_dump_all.sh | 20 |
1 files changed, 20 insertions, 0 deletions
#!/bin/bash
#
# scraper/pdf_dump_all.sh
#
# Run pdf2txt.py over every PDF matching datasets/s2/pdf/*/*/*.pdf and write
# the extracted text to the mirrored path under datasets/s2/txt/ (same
# sub-directories, ".txt" extension). PDFs whose .txt already exists are
# skipped, so the script can be re-run to resume an interrupted dump.
#
# Usage: run from the directory that contains datasets/; pdf2txt.py must be
# on PATH.

# Deliberately no `set -e`: a PDF that pdf2txt.py cannot convert must not
# abort the rest of the batch; the empty-output check below handles it.
set -u

#######################################
# Convert every not-yet-converted PDF and discard empty results.
# Globals:   none
# Arguments: none
# Outputs:   one "found <txt>" or "rm empty <txt>" line per attempted PDF
# Returns:   status of the last command run in the loop (0 if no PDFs)
#######################################
main() {
  local pdf rel txt_path image_dir

  for pdf in datasets/s2/pdf/*/*/*.pdf; do
    # An unmatched glob is left as the literal pattern; skip it instead of
    # handing a non-existent file to pdf2txt.py.
    [[ -e "${pdf}" ]] || continue

    # datasets/s2/pdf/<a>/<b>/<name>.pdf -> datasets/s2/txt/<a>/<b>/<name>.txt
    rel="${pdf#datasets/s2/pdf/}"
    txt_path="datasets/s2/txt/${rel%.*}.txt"
    # Directory of the text file; passed to pdf2txt.py as -O.
    image_dir="${txt_path%/*}"

    # Already converted on a previous run.
    if [[ -e "${txt_path}" ]]; then
      continue
    fi

    # Exit status is intentionally ignored: success is judged solely by
    # whether a non-empty text file was produced.
    pdf2txt.py -o "${txt_path}" -O "${image_dir}" "${pdf}"

    if [[ -s "${txt_path}" ]]; then
      echo "found ${txt_path}"
    else
      # Missing or zero-length output: remove it so the PDF is not treated
      # as converted on the next run.
      echo "rm empty ${txt_path}"
      rm -f -- "${txt_path}"
    fi
  done
}

main "$@"
