summaryrefslogtreecommitdiff
path: root/scraper/pdf_dump_all.sh
blob: a17c8d4429370c988ff6c9581034e55408439e62 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
#!/bin/bash

# Convert every PDF matching datasets/s2/pdf/*/*/*.pdf to text with
# pdf2txt.py, writing each result to the mirrored path in which the first
# "pdf" path component is replaced by "txt" and the extension becomes .txt.
# PDFs whose text file already exists are skipped; empty results are removed
# so that a later run tries them again.
#
# Strict mode (set -e) is intentionally not enabled: a failed conversion of
# one PDF must not stop the remaining files from being processed.

#######################################
# Convert a single PDF to text unless its output file already exists.
# Arguments: $1 - path of the PDF (expected to contain a "pdf" directory)
# Outputs:   "found <txt>" or "rm empty <txt>" on stdout
# Returns:   non-zero only if the destination directory cannot be created
#######################################
dump_pdf() {
  local pdf=$1
  local output imdir

  # Swap the extension first, then the first "pdf" occurrence (the directory
  # component for paths under datasets/s2/pdf/) to get the mirrored location.
  output="${pdf%.*}.txt"
  output="${output/pdf/txt}"
  imdir=$(dirname -- "${output}")

  # Already converted on an earlier run: nothing to do.
  if [[ -e "${output}" ]]; then
    return 0
  fi

  # The output file cannot be created inside a directory that does not exist.
  mkdir -p -- "${imdir}" || return

  # -O receives the directory next to the text file (presumably where
  # pdf2txt.py puts extracted images, going by the original IMDIR name).
  # Its exit status is deliberately ignored; the size check below decides.
  pdf2txt.py -o "${output}" -O "${imdir}" "${pdf}"

  if [[ -s "${output}" ]]; then
    printf '%s\n' "found ${output}"
  else
    # Missing or zero-length output: remove it so the PDF is retried later.
    printf '%s\n' "rm empty ${output}"
    rm -f -- "${output}"
  fi
}

#######################################
# Walk the PDF tree relative to the current directory and convert each file.
# Arguments: none
#######################################
main() {
  local pdf
  for pdf in datasets/s2/pdf/*/*/*.pdf; do
    # An unmatched glob is left as the literal pattern; skip it rather than
    # handing a nonexistent file to the converter.
    [[ -e "${pdf}" ]] || continue
    dump_pdf "${pdf}"
  done
}

main "$@"