summary refs log tree commit diff
path: root/datasets
diff options
context:
space:
mode:
Diffstat (limited to 'datasets')
-rwxr-xr-x datasets/download-from-youtube.sh 30
1 files changed, 30 insertions, 0 deletions
diff --git a/datasets/download-from-youtube.sh b/datasets/download-from-youtube.sh
new file mode 100755
index 0000000..2bcec33
--- /dev/null
+++ b/datasets/download-from-youtube.sh
@@ -0,0 +1,68 @@
+#!/bin/sh
+#
+# Download the audio track of a YouTube video, convert it to a mono 16 kHz
+# WAV file and cut it into fixed-length chunks <dataset path>/0.wav, 1.wav, ...
+# A trailing remainder shorter than one chunk is not written.
+#
+# Usage: download-from-youtube.sh <youtube url> <chunk size in seconds> <dataset path>
+# Needs youtube-dl, ffmpeg, ffprobe and bc on PATH.
+
+# Abort on the first failing command or unset variable instead of carrying on
+# with a missing download or conversion.
+set -eu
+
+if [ "$#" -ne 3 ]; then
+  echo "Usage: $0 <youtube url> <chunk size in seconds> <dataset path>" >&2
+  exit 1
+fi
+
+url=$1
+chunk_size=$2
+dataset_path=$3
+
+# The chunk size is used in shell arithmetic and as a bc divisor, so only a
+# positive decimal integer is accepted (a leading zero would be read as octal).
+case $chunk_size in
+  '' | 0* | *[!0-9]*)
+    echo "chunk size must be a positive integer, got: $chunk_size" >&2
+    exit 1
+    ;;
+esac
+
+# Keep the intermediate files in a private directory under the current
+# directory and remove it on every exit path, including interruption.
+workdir=$(mktemp -d ./.download-from-youtube.XXXXXX)
+trap 'rm -rf -- "$workdir"' EXIT
+trap 'exit 1' HUP INT TERM
+
+# Pick the format code on the last line of the format listing that mentions "audio".
+downloaded="$workdir/audio"
+format=$(youtube-dl -F -- "$url" | grep audio | sed -E 's|([0-9]+).*|\1|g' | tail -n 1)
+if [ -z "$format" ]; then
+  echo "no audio format found for $url" >&2
+  exit 1
+fi
+youtube-dl -f "$format" -o "$downloaded" -- "$url"
+
+converted="$workdir/audio.wav"
+ffmpeg -i "$downloaded" -ac 1 -ab 16k -ar 16000 "$converted"
+rm -f -- "$downloaded"
+
+mkdir -p -- "$dataset_path"
+length=$(ffprobe -i "$converted" -show_entries format=duration -v quiet -of csv="p=0")
+# bc's default scale is 0, so this is the number of whole chunks.
+chunks=$(echo "$length / $chunk_size" | bc)
+case $chunks in
+  '' | *[!0-9]*)
+    echo "could not determine the duration of the converted audio" >&2
+    exit 1
+    ;;
+esac
+
+echo "splitting..."
+i=0
+while [ "$i" -lt "$chunks" ]; do
+  ffmpeg -hide_banner -loglevel error -ss "$((i * chunk_size))" -t "$chunk_size" -i "$converted" "$dataset_path/$i.wav"
+  i=$((i + 1))
+done
+echo "done"