summary refs log tree commit diff
path: root/bin/incoming/dl.pl
diff options
context:
space:
mode:
author Jules Laplace <carbon@melanarchy.org> 2013-08-02 17:16:34 -0500
committer Jules Laplace <carbon@melanarchy.org> 2013-08-02 17:16:34 -0500
commit ff9fe30cf16474407580daddc990686cf868aa15 (patch)
tree bf274d7fa78a4c067585ea7751015f1a8b691e09 /bin/incoming/dl.pl
parent c36b26f23c2ce37b509ad90c92c0ba7a840d575f (diff)
incoming/ scripts
Diffstat (limited to 'bin/incoming/dl.pl')
-rwxr-xr-x bin/incoming/dl.pl 238
1 files changed, 238 insertions, 0 deletions
diff --git a/bin/incoming/dl.pl b/bin/incoming/dl.pl
new file mode 100755
index 0000000..d0a2cff
--- /dev/null
+++ b/bin/incoming/dl.pl
@@ -0,0 +1,238 @@
+#!/usr/bin/perl
+
+use strict;
+# Debug switch: when 1, argument and filename diagnostics go to STDERR.
+my $DEBUG = 1;
+
+# Run-time settings; the command-line arguments override the first four.
+my ($DIRNAME, $PREFIX, $SOURCE_URL, $GREP_BY) = ('.', '', '', '');
+my $URL_FILE  = 'index.html';
+my $BASE_HREF = '';
+
+# External programs.  The two wget command variables start out as the bare
+# binary path.
+my $WGET         = "/usr/bin/wget";
+my $WGET_SINGLE  = $WGET;
+my $WGET_WEBPAGE = $WGET;
+my $BIN_MV       = "/bin/mv";
+
+# File-extension alternations, used as regex fragments.
+my $TYPE_A = "html|txt|pdf";
+my $TYPE_I = "gif|jpe?g|png|tiff?";
+my $TYPE_B = $TYPE_I."|mp3|mov|avi|wav|aiff?|tiff?|zip|bz2|compress|m4a|m4b";
+my $VALID_TYPES = join('|', $TYPE_A, $TYPE_B);
+
+# Bookkeeping hashes, both initially empty.
+my %SEEN  = ();
+my %FILES = ();
+
+# Classify each command-line argument by its shape: source URL, type filter,
+# download directory, or (as the fallback) a filename prefix.
+for my $ARG (@ARGV)
+{
+    # The loop variable aliases the @ARGV element, so this trims it in place.
+    $ARG = strip($ARG);
+
+    if ($ARG =~ /^http/)
+    {
+        $SOURCE_URL = $ARG;
+        next;
+    }
+
+    if ($ARG =~ /\|/ || $ARG =~ /($VALID_TYPES)/)
+    {
+        # A filter may be written as "(a|b)"; drop one enclosing paren pair.
+        ($GREP_BY = $ARG) =~ s/^\(//;
+        $GREP_BY =~ s/\)$//;
+        next;
+    }
+
+    if ($ARG =~ /\// || $ARG eq ".")
+    {
+        $DIRNAME = $ARG;
+        next;
+    }
+
+    # Anything else is a prefix; keep only letters, digits, dots and slashes.
+    ($PREFIX = $ARG) =~ s/[^a-zA-Z0-9\.\/]//g;
+}
+
+# Main flow: derive the link base and the local name of the index page from
+# the source URL, fetch that page, then scan it line by line for links.
+$BASE_HREF = get_basehref($SOURCE_URL);
+$URL_FILE = get_local_filename($SOURCE_URL);
+$GREP_BY ||= $VALID_TYPES;
+
+if ($DEBUG == 1)
+{
+    print STDERR <<ARGZ;
+DIRNAME = $DIRNAME
+PREFIX = $PREFIX
+SOURCE_URL = $SOURCE_URL
+BASE_HREF = $BASE_HREF
+GREP_BY = $GREP_BY
+URL_FILE = $URL_FILE
+ARGZ
+}
+
+# $DIRNAME defaults to ".", so testing it alone never triggers the usage
+# message.  The source URL is the one mandatory argument, so check it too.
+if (length($DIRNAME) == 0 || length($SOURCE_URL) == 0)
+{
+    print STDERR "usage: ./dl.pl DIRNAME http://... '$VALID_TYPES'\n";
+    exit 1;
+}
+
+# Assign to the $WGET_SINGLE / $WGET_WEBPAGE declared with the other
+# settings; a second "my" here would mask those variables.
+($WGET_SINGLE, $WGET_WEBPAGE) = make_wget_commands();
+
+wget_single($SOURCE_URL);
+if (! -e $URL_FILE)
+{
+    print STDERR "\n**** $URL_FILE DOES NOT EXIST\n";
+    exit 1;
+}
+
+# Three-argument open: the filename is derived from a URL and must never be
+# interpreted as an open mode or a pipe.
+open(my $urlz, '<', $URL_FILE) or die "$URL_FILE: $!";
+# Read one line at a time instead of loading the whole page into a list.
+while (my $line = <$urlz>)
+{
+    process_line(strip($line));
+}
+close $urlz;
+
+# Scan one line of HTML for link targets and download each one that matches
+# $GREP_BY (via wget_single, followed by foil_redirect).
+#
+# While the remaining text contains "<a ... href" and a $GREP_BY match,
+# href= values are followed.  Otherwise src= values are followed, but only
+# if the text contains "<img ... src" and $GREP_BY itself overlaps the image
+# types in $TYPE_I.  When neither applies, scanning of the line stops.
+#
+# Example input:
+# <p align=Center> <a href="newsamples/outtake6_20thcentury.wav">outtake6_20thcentury</a></td>
+#
+#   $line - text to scan (may be undef or empty)
+sub process_line
+{
+    my ($line) = @_;
+
+    # Walk the rest of the line in a loop rather than recursing once per link.
+    while (defined $line && length $line)
+    {
+        my $catch = "href";
+        unless ( $line =~ /<a.*href/i && $line =~ /($GREP_BY)/i)
+        {
+            return unless ( $line =~ /<img.*src/i && $GREP_BY =~ /($TYPE_I)/i );
+            $catch = "src";
+        }
+
+        # The value may be "double quoted", 'single quoted' or bare.  All
+        # three forms are parsed by one match whose success is checked, so a
+        # stale capture from an earlier regex can never be mistaken for the
+        # quote character.
+        return unless $line =~ /$catch\s*=\s*(?:"([^"]*)"?|'([^']*)'?|([^\s>"']+))/i;
+        my ($url) = grep { defined } ($1, $2, $3);
+        my $rest = substr($line, $+[0]);
+
+        # Cut the value at any stray quote character.
+        $url =~ s/(\"|\').*$//;
+
+        # Case-insensitive, matching the guard at the top of the loop.
+        if ($url =~ /($GREP_BY)/i)
+        {
+            wget_single($url);
+            foil_redirect($url);
+        }
+        $line = $rest;
+    }
+}
+
+# Download a single URL with the $WGET_SINGLE command line, at most once per
+# distinct URL string (tracked in %SEEN).  URLs containing "#" and empty
+# URLs are ignored.  When $PREFIX is set, the downloaded file is renamed so
+# that its basename starts with "$PREFIX-".
+#
+#   $url - absolute URL, or a URL relative to $BASE_HREF
+sub wget_single
+{
+    my ($url) = @_;
+    return if $url =~ /\#/;
+    return if length($url) < 1;
+    return if $SEEN{$url};
+    $SEEN{$url} = 1;
+
+    my $file = get_local_filename($url);
+
+    # Make the URL absolute.  https:// counts as absolute too.
+    unless ($url =~ m{^https?://}i)
+    {
+        if ($url =~ m{^//} && $BASE_HREF =~ m{^([a-z][a-z0-9+.\-]*:)}i)
+        { $url = $1.$url; }          # protocol-relative: borrow the scheme
+        elsif ($url =~ m{^/} && $BASE_HREF =~ m{^([a-z][a-z0-9+.\-]*://[^/]+)}i)
+        { $url = $1.$url; }          # root-relative: borrow scheme and host
+        else
+        { $url = $BASE_HREF.$url; }
+    }
+
+    # The URL comes from downloaded HTML and is passed through the shell, so
+    # escape embedded single quotes before wrapping it in single quotes.
+    (my $quoted = $url) =~ s/'/'\\''/g;
+    my $cmd = $WGET_SINGLE." '$quoted'";
+
+    system($cmd);
+    sleep 1;
+    if ($PREFIX && -e $file)
+    {
+        # Insert the prefix after the LAST slash so that only the basename
+        # changes, even when $DIRNAME has several path components.
+        my $prefixed = $file;
+        $prefixed =~ s{/([^/]*)$}{/${PREFIX}-$1};
+        system($BIN_MV, $file, $prefixed);
+    }
+}
+
+# Check whether a file downloaded for $_url, whose name matches $TYPE_B, is
+# really an HTML page.  The file is read line by line; once a line containing
+# "<html>" has been seen, the first line containing "src=" supplies a new URL,
+# which is downloaded in its place.  If the new file name occurs in the old
+# local name, the old file is first moved aside to "<name>.temp".
+#
+#   $_url - the URL that was just downloaded
+sub foil_redirect
+{
+    my ($_url) = @_;
+    my $_file = get_local_filename($_url);
+    if ( ! -e $_file )
+    {
+        print STDERR "weird: no $_file\n\n";
+        return;
+    }
+    return unless ($_file =~ /($TYPE_B)/);
+
+    # Three-argument open with a lexical handle, read one line at a time:
+    # the file may be a large binary and its name comes from a URL.
+    open(my $fh, '<', $_file) or die "$_file: $!";
+    my $scanning = 0;
+    my $src;
+    while (my $line = <$fh>)
+    {
+        $scanning = 1 if $line =~ /<html>/;
+        next unless $scanning;
+        next unless $line =~ /src=/;
+        print STDERR "$line";
+        # Accept double-quoted, single-quoted and bare attribute values.
+        if ($line =~ /src=(?:"([^"]*)"?|'([^']*)'?|([^\s>"']+))/)
+        { ($src) = grep { defined } ($1, $2, $3); }
+        last;
+    }
+    close $fh;
+
+    if (defined $src && length $src)
+    {
+        # Same rule as get_local_filename: a URL without a slash is already
+        # a bare file name.
+        my $_newfile = ($src =~ /\//) ? get_filename($src) : $src;
+        # \Q...\E: the file name is literal text, not a regex.
+        if (length $_newfile && $_file =~ /\Q$_newfile\E/)
+        { system($BIN_MV, $_file, "$_file.temp"); }
+        wget_single($src);
+        if (length $_newfile && -e $_newfile)
+        { system($BIN_MV, $_newfile, get_local_filename($_newfile)); }
+    }
+    print STDERR "^^^ SUSPICIOUS\n" if $scanning;
+}
+# Return the "directory" part of a URL: everything up to and including the
+# last slash.  A URL without any slash is returned unchanged.
+#
+#   $url - URL or relative path
+sub get_basehref
+{
+    my ($url) = @_;
+    # "scheme://host" with no path: the last slash is part of "//", so
+    # cutting after it would leave only "scheme://".  The base is the host
+    # root instead.
+    return "$url/" if $url =~ m{^[a-z][a-z0-9+.\-]*://[^/?#]+$}i;
+    $url =~ s/\/([^\/]*)$/\//;
+    return $url;
+}
+# Return the file extension (without the dot) of the last path component of
+# a URL, or undef when that component has no extension.
+#
+#   $_url - URL or file name
+sub get_type
+{
+    my ($_url) = @_;
+    # The dot is matched literally and the result comes from this match only,
+    # so a failed match yields undef rather than a stale capture.
+    my ($_type) = $_url =~ m{(?:^|/)[^/]*\.(\w+)$};
+    return $_type;
+}
+# Map a URL (or bare file name) to its path under $DIRNAME.  A value that
+# contains a slash is reduced with get_filename; anything else is used as is.
+# Runs of slashes in the result are collapsed to one.
+sub get_local_filename
+{
+    my ($_url) = @_;
+    my $_file = ($_url =~ /\//) ? get_filename($_url) : $_url;
+    ($_file = "$DIRNAME/$_file") =~ s/\/+/\//g;
+    if ($DEBUG == 1)
+    { print STDERR " -> $_url => $_file\n"; }
+    return $_file;
+}
+# Return the file-name part of a URL (the text after the last slash).
+# Trailing "#" characters are removed first.  A URL that ends in a slash, or
+# that is just "scheme://host", maps to "index.html".  A value without any
+# slash is returned as is.
+#
+#   $_url - URL or file name
+sub get_filename
+{
+    my ($_url) = @_;
+    $_url =~ s/\#*$//;
+    return "index.html" if $_url =~ /\/$/;
+    # A bare host has no file part; wget stores such a page as index.html.
+    return "index.html" if $_url =~ m{^[a-z][a-z0-9+.\-]*://[^/?#]+$}i;
+
+    if ($_url =~ /\?$/)
+    { print STDERR "possible cgi: $_url\n"; }
+    my $_basehref = get_basehref($_url);
+    # Strip through the last slash literally.  Using the base URL as a regex
+    # breaks on metacharacters such as "+" or "(", and a slash-less name
+    # would delete itself.
+    (my $_file = $_url) =~ s{^.*/}{}s;
+    print STDERR "u: $_url\nb: $_basehref\nf: $_file\n";
+    return $_file;
+}
+
+# Build the two wget shell command lines used by the script.
+#
+# Returns a two-element list:
+#   - the command for fetching a single URL
+#   - the command for fetching a whole web page with its requisites
+# Both send a browser-like user agent, disable robots.txt handling, and
+# store files under $DIRNAME (or "." when it is empty).
+sub make_wget_commands
+{
+    my $ua = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.0.3705)";
+
+    # The directory goes through the shell: quote it so that a path with
+    # spaces or other shell metacharacters stays a single argument.
+    (my $dp = $DIRNAME || ".") =~ s/'/'\\''/g;
+
+# -E = --html-extension
+# -H = --span-hosts
+# -k = --convert-links
+# -K = --backup-converted
+# -p = --page-requisite
+
+    my $SINGLE = "$WGET -erobots=off --user-agent='$ua' --directory-prefix='$dp'";
+    my $WEBPAGE = "$WGET -erobots=off -d -o wgetlog " .
+        "--user-agent='$ua' -E -H -K -k -p --no-directories " .
+        "--directory-prefix='$dp'";
+    return ($SINGLE, $WEBPAGE);
+}
+
+# Return a copy of the argument with leading and trailing whitespace removed.
+sub strip
+{
+    my ($q) = @_;
+    for ($q)
+    {
+        s/^\s+//;
+        s/\s+$//;
+    }
+    return $q;
+}
+