diff options
Diffstat (limited to 'datasets/download-from-youtube.sh')
| -rwxr-xr-x | datasets/download-from-youtube.sh | 30 |
1 files changed, 30 insertions, 0 deletions
#!/bin/sh
#
# datasets/download-from-youtube.sh
#
# Download the audio track of a video with youtube-dl, convert it to a
# mono, 16 kHz WAV file with ffmpeg and split that file into consecutive
# chunks of a fixed length, written as <dataset path>/0.wav, 1.wav, ...
#
# Usage: download-from-youtube.sh <youtube url> <chunk size in seconds> <dataset path>
#
# Arguments:
#   $1 - URL handed to youtube-dl
#   $2 - chunk length in whole seconds (positive integer)
#   $3 - output directory; created if it does not exist
#
# Only complete chunks are written: trailing audio shorter than one chunk
# is dropped.
#
# Requires: youtube-dl, ffmpeg, ffprobe, bc, mktemp.
# Exit status: 0 on success, non-zero on any failure.

set -eu

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if [ "$#" -ne 3 ]; then
  echo "Usage: $0 <youtube url> <chunk size in seconds> <dataset path>" >&2
  exit 1
fi

url=$1
chunk_size=$2
dataset_path=$3

# chunk_size is used as a bc divisor and inside $(( )): reject empty,
# non-numeric and zero values. Leading zeros are rejected as well because
# shell arithmetic would read them as octal.
case $chunk_size in
  '' | *[!0-9]* | 0*)
    die "chunk size must be a positive integer, got: '$chunk_size'"
    ;;
esac

# Keep intermediate files in a private directory that is removed on every
# exit path, instead of fixed names in the current directory.
workdir=$(mktemp -d) || die "cannot create temporary directory"
cleanup() {
  rm -rf -- "$workdir"
}
trap cleanup EXIT
# Turn signals into a normal exit so the EXIT trap also runs under shells
# that do not fire it on signals.
trap 'exit 1' HUP INT TERM

downloaded="$workdir/download"
converted="$workdir/converted.wav"

# Pick the format code of the last line mentioning "audio" in the
# youtube-dl format listing.
format=$(youtube-dl -F -- "$url" \
  | grep audio \
  | sed -E 's|([0-9]+).*|\1|g' \
  | tail -n 1)
[ -n "$format" ] || die "no audio format found for: $url"

youtube-dl -f "$format" -o "$downloaded" -- "$url"

# Down-mix to one channel and resample to 16 kHz.
ffmpeg -i "$downloaded" -ac 1 -ab 16k -ar 16000 "$converted"
rm -f -- "$downloaded"

mkdir -p -- "$dataset_path"

# Duration in (fractional) seconds, printed without the CSV section prefix.
length=$(ffprobe -i "$converted" -show_entries format=duration \
  -v quiet -of 'csv=p=0')
[ -n "$length" ] || die "could not determine duration of converted audio"

# bc divides with scale 0 here, so this is the number of complete chunks.
chunk_count=$(echo "$length / $chunk_size" | bc)

echo "splitting..."
i=0
while [ "$i" -lt "$chunk_count" ]; do
  ffmpeg -hide_banner -loglevel error \
    -ss "$((i * chunk_size))" -t "$chunk_size" \
    -i "$converted" "$dataset_path/$i.wav"
  i=$((i + 1))
done
echo "done"
