#!/usr/bin/perl
#
# Scan a web page for links and download every linked file whose URL matches
# a pattern, using wget for the transfers.
#
# Arguments may be given in any order; each one is classified by its shape:
#   starts with "http"                      the page to scan for links
#   contains "|" or a known file type       regex a link must match
#   contains "/" or is exactly "."          directory for the downloads
#   anything else                           prefix for downloaded file names
#
# NOTE(review): this file arrived with its line breaks collapsed and with
# every "<...>" sequence stripped, which destroyed the debug heredoc, the
# read loop over the page, the guard at the top of process_line and the
# marker regex in foil_redirect.  Those spots were rebuilt from the
# surrounding code and are tagged "reconstructed" below -- please confirm
# them against the original script.

use strict;
use warnings;

my $DEBUG        = 1;
my $DIRNAME      = '.';
my $PREFIX       = '';
my $SOURCE_URL   = '';
my $GREP_BY      = '';
my $URL_FILE     = 'index.html';
my $BASE_HREF    = '';
my $WGET         = "/usr/bin/wget";
my $WGET_SINGLE  = $WGET;
my $WGET_WEBPAGE = $WGET;
my $BIN_MV       = "/bin/mv";

# Alternations of file extensions, interpolated into regexes below.
my $TYPE_A      = "html|txt|pdf";
my $TYPE_I      = "gif|jpe?g|png|tiff?";
my $TYPE_B      = $TYPE_I . "|mp3|mov|avi|wav|aiff?|zip|bz2|compress|m4a|m4b";
my $VALID_TYPES = "$TYPE_A|$TYPE_B";

# URLs already handed to wget_single, so nothing is fetched twice.
my %SEEN = ();

foreach my $raw_arg (@ARGV) {
    my $arg = strip($raw_arg);
    if ($arg =~ /^http/) {
        $SOURCE_URL = $arg;
    }
    elsif ($arg =~ /\|/ || $arg =~ /($VALID_TYPES)/) {
        # Allow the pattern to be written as "(a|b)" on the command line.
        $GREP_BY = $arg;
        $GREP_BY =~ s/^\(//;
        $GREP_BY =~ s/\)$//;
    }
    elsif ($arg =~ /\// || $arg eq ".") {
        $DIRNAME = $arg;
    }
    else {
        $PREFIX = $arg;
        $PREFIX =~ s/[^a-zA-Z0-9\.\/]//g;
    }
}

$GREP_BY ||= $VALID_TYPES;

# The pattern comes from the command line; reject a malformed one up front
# instead of dying in the middle of a download run.
unless (eval { qr/$GREP_BY/; 1 }) {
    die "invalid link pattern '$GREP_BY': $@";
}

$BASE_HREF = get_basehref($SOURCE_URL);

# Without a source URL, fall back to the default local file name.
$URL_FILE = get_local_filename($SOURCE_URL ne '' ? $SOURCE_URL : $URL_FILE);

# reconstructed: nothing else in the surviving code calls
# make_wget_commands, yet get_local_filename expects downloads to land in
# $DIRNAME, which only the --directory-prefix it adds guarantees.
($WGET_SINGLE, $WGET_WEBPAGE) = make_wget_commands();

if ($DEBUG == 1) {
    # reconstructed: the original heredoc text was lost.
    print STDERR <<"EOT";
source url : $SOURCE_URL
base href  : $BASE_HREF
url file   : $URL_FILE
directory  : $DIRNAME
prefix     : $PREFIX
grep by    : $GREP_BY
EOT
}

# reconstructed: fetch the page itself when there is no local copy yet.
# The plain command is used (not the -E "webpage" one) so the saved name
# agrees with get_local_filename.
if ($SOURCE_URL ne '' && !-e $URL_FILE) {
    system($WGET_SINGLE . ' ' . shell_quote($SOURCE_URL)) == 0
        or die "could not fetch $SOURCE_URL\n";
}

# reconstructed: the open call and the read expression were lost; the loop
# body and the close are original.
open my $urlz, '<', $URL_FILE or die "cannot open $URL_FILE: $!\n";
while (my $line = <$urlz>) {
    $line = strip($line);
    process_line($line);
}
close $urlz;

# Download every href on $line whose URL matches $GREP_BY, in the order the
# links appear, and check each download for a disguised HTML page.
sub process_line {
    my ($line) = @_;

    # Iterating replaces the original per-link recursion.
    foreach my $url (attr_values($line, 'href')) {
        next unless $url =~ /$GREP_BY/;
        wget_single($url);
        foil_redirect($url);
    }
    return;
}

# Return, in order, every value of the HTML attribute $attr found in $text.
# Handles double-quoted, single-quoted and unquoted values.
sub attr_values {
    my ($text, $attr) = @_;
    my @values;
    while ($text =~ /\Q$attr\E\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/gi) {
        push @values, defined $1 ? $1 : defined $2 ? $2 : $3;
    }
    return @values;
}

# True when $url is non-empty, has no fragment and was not fetched before.
sub is_fetchable {
    my ($url) = @_;
    return 0 if !defined $url || length($url) < 1;
    return 0 if $url =~ /\#/;
    return 0 if $SEEN{$url};
    return 1;
}

# Resolve $url against $BASE_HREF.  Absolute URLs of any scheme are returned
# untouched; "//host/..." and "/path" forms reuse the scheme (and host) of
# the base; everything else is appended to the base.
sub absolute_url {
    my ($url) = @_;
    return $url if $url =~ m{\A[a-z][a-z0-9+.\-]*://}i;

    my ($scheme, $host) = $BASE_HREF =~ m{\A([a-z][a-z0-9+.\-]*:)(//[^/]*)}i;
    if (defined $scheme) {
        return $scheme . $url         if $url =~ m{\A//};
        return $scheme . $host . $url if $url =~ m{\A/};
    }
    return $BASE_HREF . $url;
}

# Quote $text so a POSIX shell passes it on as exactly one argument.
sub shell_quote {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'$text'";
}

# Rename $from to $to with the external mv; warn when that fails.
sub move_file {
    my ($from, $to) = @_;
    system($BIN_MV, $from, $to) == 0
        or warn "could not move $from to $to\n";
    return;
}

# Download one URL (at most once per run) and, when a prefix was given on the
# command line, rename the result to "<prefix>-<name>".
sub wget_single {
    my ($url) = @_;
    return unless is_fetchable($url);
    $SEEN{$url} = 1;

    my $file = get_local_filename($url);

    # The URL comes from the scanned page, so it is shell-quoted.
    my $cmd = $WGET_SINGLE . ' ' . shell_quote(absolute_url($url));
    system($cmd) == 0 or warn "wget failed for $url\n";
    sleep 1;    # pause between requests

    if ($PREFIX && -e $file) {
        # Prefix the file name, i.e. whatever follows the LAST slash.
        (my $prefixed = $file) =~ s{/([^/]*)\z}{/${PREFIX}-$1};
        move_file($file, $prefixed);
    }
    return;
}

# Inspect the file downloaded for $_url: if it should be a binary type but
# contains HTML, download the first src= target found in it instead.
sub foil_redirect {
    my ($_url) = @_;
    my $file = get_local_filename($_url);

    if (!-e $file) {
        print STDERR "weird: no $file\n\n";
        return;
    }
    return unless $file =~ /\.(?:$TYPE_B)\b/i;

    open my $fh, '<', $file or die "cannot open $file: $!\n";
    my $scanning = 0;
    my $real_url;
    while (my $line = <$fh>) {
        # reconstructed: the marker that starts the scan was lost.
        # NOTE(review): presumably an opening html/body tag -- confirm.
        $scanning = 1 if $line =~ /<(?:html|body)\b/i;
        next unless $scanning;
        next unless $line =~ /src\s*=/i;
        print STDERR "$line";
        ($real_url) = attr_values($line, 'src');
        last;    # only the first src= line is considered
    }
    close $fh;

    if (is_fetchable($real_url)) {
        my $new_name = get_filename($real_url);
        my $target   = get_local_filename($new_name);

        # Move the bogus file aside when the real one would take its name.
        move_file($file, "$file.temp") if $target eq $file;

        wget_single($real_url);

        # If the download landed in the current directory, move it into
        # place, unless something already sits at the target path.
        move_file($new_name, $target) if -e $new_name && !-e $target;
    }

    print STDERR "^^^ SUSPICIOUS\n" if $scanning;
    return;
}

# Return $url cut back to its directory, with a trailing slash.  A URL with
# no slash at all is returned as is, minus any query or fragment.
sub get_basehref {
    my ($url) = @_;
    $url =~ s/[?#].*\z//s;    # a "/" in the query is not a path separator

    # "scheme://host" without a path: the base is the host root.
    return "$url/" if $url =~ m{\A[a-z][a-z0-9+.\-]*://[^/]+\z}i;

    $url =~ s{/[^/]*\z}{/};
    return $url;
}

# Return the file extension of $_url without the dot, or '' if it has none.
sub get_type {
    my ($_url) = @_;
    $_url =~ s/[?#].*\z//s;
    $_url =~ s{\A[a-z][a-z0-9+.\-]*://[^/]*}{}i;    # a host name is no extension
    return $_url =~ m{\.(\w+)\z} ? $1 : '';
}

# Map a URL (or bare file name) to the path of its local copy in $DIRNAME.
sub get_local_filename {
    my ($_url) = @_;
    my $name = $_url =~ /\// ? get_filename($_url) : $_url;

    my $path = "$DIRNAME/$name";
    $path =~ s/\/+/\//g;    # collapse runs of slashes
    print STDERR " -> $_url => $path\n" if $DEBUG == 1;
    return $path;
}

# Return the file name part of $_url: the last path segment plus any query
# string, or "index.html" when the path names a directory.
sub get_filename {
    my ($_url) = @_;
    $_url =~ s/\#.*\z//s;
    return '' if $_url eq '';
    return "index.html" if $_url =~ m{/\z};

    print STDERR "possible cgi: $_url\n" if $_url =~ /\?/;

    my ($path, $query) = split /\?/, $_url, 2;
    $path = '' unless defined $path;
    $path =~ s{\A[a-z][a-z0-9+.\-]*://[^/]*}{}i;    # drop scheme and host
    $path =~ s{\A.*/}{}s;                           # drop directories

    my $name = length $path ? $path : "index.html";
    $name .= "?$query" if defined $query;

    if ($DEBUG == 1) {
        my $basehref = get_basehref($_url);
        print STDERR "u: $_url\nb: $basehref\nf: $name\n";
    }
    return $name;
}

# Build the two wget command prefixes: one for a single file, one for a whole
# page.  Returns the list ($single, $webpage); the URL is appended by the
# caller.
sub make_wget_commands {
    my $ua = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.0.3705)";
    my $dp = $DIRNAME || ".";

    my $agent  = "--user-agent=" . shell_quote($ua);
    my $prefix = "--directory-prefix=" . shell_quote($dp);

    # -E = --html-extension
    # -H = --span-hosts
    # -k = --convert-links
    # -K = --backup-converted
    # -p = --page-requisite
    my $SINGLE  = "$WGET -erobots=off $agent $prefix";
    my $WEBPAGE = "$WGET -erobots=off -d -o wgetlog "
                . "$agent -E -H -K -k -p --no-directories "
                . $prefix;
    return ($SINGLE, $WEBPAGE);
}

# Return $q with leading and trailing whitespace removed.
sub strip {
    my ($q) = @_;
    $q =~ s/^\s+//;
    $q =~ s/\s+$//;
    return $q;
}