diff options
Diffstat (limited to 'bin/incoming/dl.pl')
| -rwxr-xr-x | bin/incoming/dl.pl | 238 |
1 files changed, 238 insertions, 0 deletions
#!/usr/bin/perl
#
# dl.pl - fetch a web page with wget, then download every file the page
# links to (<a href=...>) or embeds (<img src=...>) whose URL matches a
# file-type pattern.
#
# usage: ./dl.pl DIRNAME http://... 'TYPES' [PREFIX]
#
# Arguments may be given in any order; each one is classified by its shape:
#   - starts with "http"                       -> page to scan (SOURCE_URL)
#   - contains "|" or a known file extension   -> URL filter regex (GREP_BY)
#   - contains "/" or is exactly "."           -> download directory (DIRNAME)
#   - anything else                            -> filename prefix (PREFIX)

use strict;
use warnings;

my $DEBUG = 1;

my $DIRNAME      = '.';
my $PREFIX       = '';
my $SOURCE_URL   = '';
my $GREP_BY      = '';
my $URL_FILE     = 'index.html';
my $BASE_HREF    = '';
my $WGET         = "/usr/bin/wget";
my $WGET_SINGLE  = $WGET;
my $WGET_WEBPAGE = $WGET;
my $BIN_MV       = "/bin/mv";
my $TYPE_A       = "html|txt|pdf";
my $TYPE_I       = "gif|jpe?g|png|tiff?";
my $TYPE_B       = $TYPE_I."|mp3|mov|avi|wav|aiff?|tiff?|zip|bz2|compress|m4a|m4b";
my $VALID_TYPES  = "$TYPE_A|$TYPE_B";
my %SEEN         = ();
my %FILES        = ();

# Run the downloader only when executed as a script, so the helper subs
# below can be loaded (require/do) without triggering any network access.
main() unless caller;

# Entry point: parse @ARGV, fetch the source page, then scan it line by
# line for matching links.  Returns nothing; failures are reported on
# STDERR and end the run early (the process exit status stays 0, as before).
sub main {
    parse_args(@ARGV);

    # Without a page to scan there is nothing to do.
    if (length($DIRNAME) == 0 || length($SOURCE_URL) == 0) {
        print STDERR "usage: ./dl.pl DIRNAME http://... '$VALID_TYPES'\n";
        return;
    }

    $BASE_HREF = get_basehref($SOURCE_URL);
    $URL_FILE  = get_local_filename($SOURCE_URL);
    $GREP_BY ||= $VALID_TYPES;

    if ($DEBUG == 1) {
        print STDERR <<ARGZ;
DIRNAME = $DIRNAME
PREFIX = $PREFIX
SOURCE_URL = $SOURCE_URL
BASE_HREF = $BASE_HREF
GREP_BY = $GREP_BY
URL_FILE = $URL_FILE
ARGZ
    }

    # Assign to the existing file-level lexicals; re-declaring them with
    # "my" would mask the earlier declarations.
    ($WGET_SINGLE, $WGET_WEBPAGE) = make_wget_commands();

    wget_single($SOURCE_URL);
    if (!-e $URL_FILE) {
        print STDERR "\n**** $URL_FILE DOES NOT EXIST\n";
        return;
    }

    open my $urlz, '<', $URL_FILE or die "cannot open $URL_FILE: $!";
    while (my $line = <$urlz>) {
        process_line(strip($line));
    }
    close $urlz;
    return;
}

# Classify each command-line argument by its shape (see the header comment)
# and store it in the matching file-level setting.
sub parse_args {
    my (@args) = @_;

    for my $arg (map { strip($_) } @args) {
        if ($arg =~ /^http/) {
            $SOURCE_URL = $arg;
        }
        elsif ($arg =~ /\|/ || $arg =~ /($VALID_TYPES)/) {
            # Accept the pattern with or without surrounding parentheses.
            $GREP_BY = $arg;
            $GREP_BY =~ s/^\(//;
            $GREP_BY =~ s/\)$//;
        }
        elsif ($arg =~ m{/} || $arg eq ".") {
            $DIRNAME = $arg;
        }
        else {
            $PREFIX = $arg;
            $PREFIX =~ s/[^a-zA-Z0-9\.\/]//g;
        }
    }
    return;
}

# Scan one line of HTML for links and download each URL that matches
# $GREP_BY.  A line may hold several links, so after handling the first
# attribute the remainder of the line is scanned again.
#
# Example of a line this is meant to handle:
# <p align=Center> <a href="newsamples/outtake6_20thcentury.wav">outtake6_20thcentury</a></td>
sub process_line {
    my ($line) = @_;

    while (defined $line && length $line) {
        my $catch;
        if ($line =~ /<a.*href/i && $line =~ /(?:$GREP_BY)/i) {
            $catch = "href";
        }
        elsif ($line =~ /<img.*src/i && $GREP_BY =~ /($TYPE_I)/i) {
            # Images are only followed when the filter asks for image types.
            $catch = "src";
        }
        else {
            return;
        }

        my ($url, $rest) = extract_attr_value($line, $catch);
        return unless defined $url;

        # Case-insensitive, like the line-level filter above.
        if ($url =~ /(?:$GREP_BY)/i) {
            wget_single($url);
            foil_redirect($url);
        }

        # Each pass consumes at least the attribute name, so this terminates.
        $line = $rest;
    }
    return;
}

# Find the first "$attr=" in $text and return (value, rest-of-text).
# The value may be wrapped in single or double quotes or be unquoted; an
# unquoted value ends at whitespace or ">".  Returns an empty list when the
# attribute is not present.  "rest" is undef when nothing follows the value.
sub extract_attr_value {
    my ($text, $attr) = @_;
    return unless defined $text && $text =~ /$attr=(['"]?)/i;

    my $quote = $1;
    my $post  = substr($text, $+[0]);    # everything after attr= and quote

    my ($value, $rest);
    if (length $quote) {
        ($value, $rest) = split(/$quote/, $post, 2);
    }
    elsif ($post =~ /\A([^\s>]*)(.*)\z/s) {
        ($value, $rest) = ($1, $2);
    }

    $value = '' unless defined $value;
    $value =~ s/["'].*\z//s;             # drop anything after a stray quote
    return ($value, $rest);
}

# Download a single URL with wget into $DIRNAME, at most once per URL.
# Relative URLs are resolved against $BASE_HREF.  When $PREFIX is set the
# downloaded file is renamed to "<PREFIX>-<name>" within its directory.
sub wget_single {
    my ($url) = @_;
    return if !defined $url || length($url) < 1;
    return if $url =~ /\#/;
    return if $SEEN{$url};
    $SEEN{$url} = 1;

    my $file = get_local_filename($url);

    $url = $BASE_HREF.$url unless $url =~ m{^https?://}i;

    # The URL comes from scraped HTML, so it must be quoted for the shell.
    my $cmd = $WGET_SINGLE." ".shell_quote($url);

    system($cmd);
    sleep 1;    # be gentle with the remote server

    if ($PREFIX && -e $file) {
        # Prefix the final path component only, whatever DIRNAME's depth.
        (my $prefixed = $file) =~ s{/([^/]*)$}{/${PREFIX}-$1};
        system($BIN_MV, $file, $prefixed);
    }
    return;
}

# Some servers answer a request for a media file with an HTML page that
# merely embeds the real file.  If the file downloaded for $_url turns out
# to contain "<html", take the first src= attribute found in it and download
# that target instead.
sub foil_redirect {
    my ($_url) = @_;
    my $_file = get_local_filename($_url);
    if (!-e $_file) {
        print STDERR "weird: no $_file\n\n";
        return;
    }
    return unless $_file =~ /($TYPE_B)/;

    open my $fh, '<', $_file or die "cannot open $_file: $!";
    my $scanning = 0;
    my $src_url;
    while (my $line = <$fh>) {
        $scanning = 1 if $line =~ /<html[\s>]/i;
        next unless $scanning;
        next unless $line =~ /src=/i;
        print STDERR $line;
        chomp $line;
        ($src_url) = extract_attr_value($line, "src");
        last;
    }
    close $fh;

    if (defined $src_url && length $src_url) {
        my $_newfile = get_filename($src_url);

        # The real file would land on the same name as the HTML stub:
        # move the stub out of the way first.
        if (length $_newfile && $_file =~ /\Q$_newfile\E/) {
            system($BIN_MV, $_file, "$_file.temp");
        }
        wget_single($src_url);

        # If the download ended up in the current directory, move it into
        # DIRNAME.
        if (length $_newfile && -e $_newfile) {
            system($BIN_MV, $_newfile, get_local_filename($_newfile));
        }
    }

    print STDERR "^^^ SUSPICIOUS\n" if $scanning;
    return;
}

# Return $url with its last path component removed, keeping the trailing
# slash: "http://host/dir/file.jpg" -> "http://host/dir/".  A URL without
# any slash is returned unchanged.
sub get_basehref {
    my ($url) = @_;
    $url =~ s{/[^/]*$}{/};
    return $url;
}

# Return the file extension (without the dot) of the last path component
# of $_url, or an empty string when it has none.
sub get_type {
    my ($_url) = @_;
    return '' unless defined $_url && $_url =~ m{/[^/]*\.(\w+)$};
    return $1;
}

# Map a URL (or bare filename) to the local path wget will save it under:
# "$DIRNAME/<filename>", with runs of slashes collapsed.
sub get_local_filename {
    my ($_url) = @_;
    $_url = '' unless defined $_url;

    my $_file = ($_url =~ m{/}) ? get_filename($_url) : $_url;
    $_file = "$DIRNAME/$_file";
    $_file =~ s{/+}{/}g;
    print STDERR " -> $_url => $_file\n" if $DEBUG == 1;
    return $_file;
}

# Return the last path component of $_url.  A URL ending in "/" maps to
# "index.html"; a URL without any slash is returned as-is.
sub get_filename {
    my ($_url) = @_;
    $_url = '' unless defined $_url;
    $_url =~ s/\#*$//;

    return "index.html" if $_url =~ m{/$};

    print STDERR "possible cgi: $_url\n" if $_url =~ /\?$/;

    # Strip everything through the last slash.  This is a plain prefix
    # removal: URL characters such as "+" or "?" must not act as regex
    # metacharacters.
    (my $_file = $_url) =~ s{\A.*/}{}s;

    if ($DEBUG == 1) {
        my $_basehref = get_basehref($_url);
        print STDERR "u: $_url\nb: $_basehref\nf: $_file\n";
    }
    return $_file;
}

# Build the two wget command lines used by this script and return them as
# (single-file command, whole-webpage command).  The URL is appended by the
# caller.
sub make_wget_commands {
    my $ua = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.0.3705)";
    my $dp = $DIRNAME || ".";

    # -E = --html-extension
    # -H = --span-hosts
    # -k = --convert-links
    # -K = --backup-converted
    # -p = --page-requisite

    my $agent  = "--user-agent=".shell_quote($ua);
    my $prefix = "--directory-prefix=".shell_quote($dp);

    my $SINGLE  = "$WGET -erobots=off $agent $prefix";
    my $WEBPAGE = "$WGET -erobots=off -d -o wgetlog " .
                  "$agent -E -H -K -k -p --no-directories " .
                  $prefix;
    return ($SINGLE, $WEBPAGE);
}

# Quote $text as a single word for /bin/sh: wrap it in single quotes and
# escape any embedded single quote.
sub shell_quote {
    my ($text) = @_;
    $text = '' unless defined $text;
    $text =~ s/'/'\\''/g;
    return "'$text'";
}

# Return $q with leading and trailing whitespace removed.
sub strip {
    my ($q) = @_;
    $q = '' unless defined $q;
    $q =~ s/^\s+//;
    $q =~ s/\s+$//;
    return $q;
}

1;
