########################################################################### # # WgetDownload.pm -- Download base module that handles calling Wget # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 2006 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### package WgetDownload; eval {require bytes}; # suppress the annoying "subroutine redefined" warning that various # plugins cause under perl 5.6 $SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)}; use BaseDownload; use strict; use Cwd; use util; use IPC::Open3; use IO::Select; use IO::Socket; sub BEGIN { @WgetDownload::ISA = ('BaseDownload'); } my $arguments = [ { 'name' => "proxy_on", 'desc' => "{WgetDownload.proxy_on}", 'type' => "flag", 'reqd' => "no", 'hiddengli' => "yes"}, { 'name' => "proxy_host", 'desc' => "{WgetDownload.proxy_host}", 'type' => "string", 'reqd' => "no", 'hiddengli' => "yes"}, { 'name' => "proxy_port", 'desc' => "{WgetDownload.proxy_port}", 'type' => "string", 'reqd' => "no", 'hiddengli' => "yes"}, { 'name' => "user_name", 'desc' => "{WgetDownload.user_name}", 'type' => "string", 'reqd' => "no", 'hiddengli' => "yes"}, { 'name' => "user_password", 'desc' => "{WgetDownload.user_password}", 'type' => "string", 'reqd' => "no", 'hiddengli' => "yes"}]; my $options = { 'name' => "WgetDownload", 'desc' => "{WgetDownload.desc}", 'abstract' => "yes", 'inherits' => "yes", 'args' => $arguments }; # Declaring file global variables related to the wget child process so that # the termination signal handler for SIGTERM can close the streams and tidy # up before ending the child process. my $childpid; my ($chld_out, $chld_in); my ($serverSocket, $read_set); # The port this script's server socket will be listening on, to handle # incoming signals from GLI to terminate wget. This is also file global, # since OAIDownload.pm will make several calls on wget using the same # instance of this script and we want to reuse whatever port GLI gave us. my $port; # When this script is called from the command line, this handler will be called # if this process is killed or abruptly ends due to receiving one of the # terminating signals that this handler is registered to deal with. sub abrupt_end_handler { my $termination_signal = shift (@_); if(defined $childpid) { close($chld_out); close($chld_in); print STDOUT "Received termination signal: $termination_signal\n"; # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work # See http://perldoc.perl.org/perlipc.html#Signals # Warning on using kill at http://perldoc.perl.org/perlfork.html kill("TERM", $childpid); # If the SIGTERM sent on Linux calls this handler, we want to make # sure any socket connection is closed. # Otherwise sockets are only used when this script is run from GLI # in which case the handlers don't really get called. if(defined $serverSocket) { $read_set->remove($serverSocket) if defined $read_set; close($serverSocket); } } exit(0); } # Registering a handler for when termination signals SIGINT and SIGTERM are received to stop # the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill # signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows). # Note that SIGKILL can't be handled as the handler won't get called for it. More information: # http://affy.blogspot.com/p5be/ch13.htm # http://perldoc.perl.org/perlipc.html#Signals $SIG{'INT'} = \&abrupt_end_handler; $SIG{'TERM'} = \&abrupt_end_handler; sub new { my ($class) = shift (@_); my ($getlist,$inputargs,$hashArgOptLists) = @_; push(@$getlist, $class); push(@{$hashArgOptLists->{"ArgList"}},@{$arguments}); push(@{$hashArgOptLists->{"OptList"}},$options); my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists); # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?) &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set return bless $self, $class; } sub checkWgetSetup { my ($self,$blnGliCall) = @_; #TODO: proxy detection?? if((!$blnGliCall) && $self->{'proxy_on'}) { &checkProxySetup($self); } &checkURL($self); } sub getWgetOptions { my ($self) = @_; my $strOptions = ""; if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'}) { if($self->{'url'} =~ m/^https\:/) { $strOptions .= " -e https_proxy=$self->{'proxy_host'}:$self->{'proxy_port'} "; } else { $strOptions .= " -e http_proxy=$self->{'proxy_host'}:$self->{'proxy_port'} "; } if ($self->{'user_name'} && $self->{'user_password'}) { $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}"; } } if ($self->{'proxy_on'}) { $strOptions .= " --proxy "; } if($self->{'no_check_certificate'} && $self->{'url'} =~ m/^https\:/) { $strOptions .= " --no-check-certificate "; } return $strOptions; } # Checking for proxy setup: proxy server, proxy port, proxy username and password. sub checkProxySetup { my ($self) = @_; ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?"); # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'} # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password. } # Returns true if the wget status needs to be monitored through sockets # (if a socket is used to communicate with the Java program on when to # terminate wget). True if we are running gli, or if the particular type # of WgetDownload is *not* OAIDownload (in that case, the original way of # terminating the perl script from Java terminated wget as well). sub dealingWithSockets() { my ($self) = @_; return (defined $self->{'gli'} && $self->{'gli'} && !defined $port && ref($self) ne "OAIDownload"); # use ref($self) to find the classname of an object } sub useWget { #local $| = 1; # autoflush stdout buffer #print STDOUT "*** Start of subroutine useWget in $0\n"; my ($self, $cmdWget,$blnShow, $working_dir) = @_; my ($strReadIn,$strLine,$command); $strReadIn = "" unless defined $strReadIn; my $current_dir = cwd(); my $changed_dir = 0; if (defined $working_dir && -e $working_dir) { chdir "$working_dir"; $changed_dir = 1; } # When we are running this script through GLI, the SIGTERM signal handler # won't get called on Windows when wget is to be prematurely terminated. # Instead, when wget has to be terminated in the middle of execution, GLI will # connect to a serverSocket here to communicate when it's time to stop wget. if($self->dealingWithSockets()) { $port = ; # gets a port on localhost that's not yet in use chomp($port); $serverSocket = IO::Socket::INET->new( Proto => 'tcp', LocalPort => $port, Listen => 1, Reuse => 1); die "can't setup server" unless $serverSocket; #print STDOUT "[Serversocket $0 accepting clients at port $port]\n"; $read_set = new IO::Select(); # create handle set for reading $read_set->add($serverSocket); # add the main socket to the set } my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget"); $command = "\"$wget_file_path\" $cmdWget"; #print STDOUT "Command is: $command\n"; # Wget's output needs to be monitored to find out when it has naturally terminated. # Wget's output is sent to its STDERR so we can't use open2 without doing 2>&1. # On linux, 2>&1 launches a subshell which then launches wget, meaning that killing # the childpid does not kill wget on Linux but the subshell that launched it instead. # Therefore, we use open3. Though the child process wget sends output only to its stdout, # using open3 says chld_err is undefined and the output of wget only comes in chld_out(!) # However that may be, it works with open3. But to avoid the confusion of managing and # closing an extra unused handle, a single handle is used instead for both the child's # stderr and stdout. # Both open2 and open3 don't return on failure, but raise an exception. The handling # of the exception is described on p.568 of the Perl Cookbook eval { $childpid = open3($chld_in, $chld_out, $chld_out, $command); }; if ($@) { if($@ =~ m/^open3/) { die "open3 failed in $0: $!\n$@\n"; } die "Tried to launch open3 in $0, got unexpected exception: $@"; } my $loop = 1; while($loop) { if (defined(my $strLine=<$chld_out>)) { # we're reading in from child process' stdout if($blnShow) { print STDERR "$strLine\n"; } $strReadIn .= $strLine; } else { # wget finished, terminate naturally #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n"; close($chld_in); close($chld_out); waitpid $childpid, 0; $loop = 0; $childpid = undef; if(defined $port) { $read_set->remove($serverSocket); close($serverSocket); } } # if we run this script from the command-line (as opposed to from GLI), # then we're not working with sockets and can therefore can skip the next bits next unless(defined $port); # http://www.perlfect.com/articles/select.shtml # "multiplex between several filehandles within a single thread of control, # thus creating the effect of parallelism in the handling of I/O." my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting # take all readable handles in turn foreach my $rh (@rh_set) { if($rh == $serverSocket) { my $client = $rh->accept(); #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines print $client "Talked to ServerSocket (port $port). Connection accepted\n"; # Read from the client (getting rid of the trailing newline) # Has the client sent the <> signal? my $signal = <$client>; chomp($signal); if($signal eq "<>") { print $client "Perl received STOP signal (on port $port): stopping wget\n"; $loop = 0; # out of outer while loop $self->{'forced_quit'} = 1; # subclasses need to know we're quitting # Sometimes the wget process takes some time to start up. If the STOP signal # was sent, don't try to terminate the process until we know it is running. # Otherwise wget may start running after we tried to kill it. Wait 5 seconds # for it to start up, checking for whether it is running in order to kill it. for(my $seconds = 1; $seconds <= 5; $seconds++) { if(kill(0, $childpid)) { # If kill(0, $childpid) returns true, then the process is running # and we need to kill it. close($chld_in); close($chld_out); kill("TERM", $childpid); $childpid = undef; # Stop monitoring the read_handle and close the serverSocket # (the Java end will close the client socket that Java opened) $read_set->remove($rh); #$read_set->remove($serverSocket); close($rh); #close($serverSocket); print $client "Perl terminated wget and is about to exit\n"; last; # out of inner for loop } else { # the process may just be starting up, wait sleep(1); } } last; # out of foreach loop } } } } if ($changed_dir) { chdir $current_dir; } return $strReadIn; } sub useWgetMonitored { #local $| = 1; # autoflush stdout buffer #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n"; my ($self, $cmdWget,$blnShow, $working_dir) = @_; my $current_dir = cwd(); my $changed_dir = 0; if (defined $working_dir && -e $working_dir) { chdir "$working_dir"; $changed_dir = 1; } # When we are running this script through GLI, the SIGTERM signal handler # won't get called on Windows when wget is to be prematurely terminated. # Instead, when wget has to be terminated in the middle of execution, GLI will # connect to a serverSocket here to communicate when it's time to stop wget. if($self->dealingWithSockets()) { $port = ; # gets a port on localhost that's not yet in use chomp($port); $serverSocket = IO::Socket::INET->new( Proto => 'tcp', LocalPort => $port, Listen => 1, Reuse => 1); die "can't setup server" unless $serverSocket; #print STDOUT "[Serversocket $0 accepting clients at port $port]\n"; $read_set = new IO::Select(); # create handle set for reading $read_set->add($serverSocket); # add the main socket to the set } my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget"); my $command = "\"$wget_file_path\" $cmdWget"; #print STDOUT "Command is: $command\n"; eval { # see p.568 of Perl Cookbook $childpid = open3($chld_in, $chld_out, $chld_out, $command); }; if ($@) { if($@ =~ m/^open3/) { die "open3 failed in $0: $!\n$@\n"; } die "Tried to launch open3 in $0, got unexpected exception: $@"; } my $full_text = ""; my $error_text = ""; my @follow_list = (); my $line; my $loop = 1; while($loop) { if (defined($line=<$chld_out>)) { # we're reading in from child process' stdout if((defined $blnShow) && $blnShow) { print STDERR "$line"; } if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) { my $follow_url = $1; push(@follow_list,$follow_url); } if ($line =~ m/ERROR\s+\d+/) { $error_text .= $line; } $full_text .= $line; } else { # wget finished, terminate naturally #print STDERR "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n"; close($chld_in); close($chld_out); # Program terminates only when the following line is included # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary # it prevents the child from turning into a "zombie process". # While the wget process terminates without it, this perl script does not: # the DOS prompt is not returned without it. waitpid $childpid, 0; $loop = 0; $childpid = undef; if(defined $port) { $read_set->remove($serverSocket); close($serverSocket); } } # if we run this script from the command-line (as opposed to from GLI), # then we're not working with sockets and can therefore can skip the next bits next unless(defined $port); # http://www.perlfect.com/articles/select.shtml # "multiplex between several filehandles within a single thread of control, # thus creating the effect of parallelism in the handling of I/O." my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting # take all readable handles in turn foreach my $rh (@rh_set) { if($rh == $serverSocket) { my $client = $rh->accept(); #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines print $client "Talked to ServerSocket (port $port). Connection accepted\n"; # Read from the client (getting rid of trailing newline) # Has the client sent the <> signal? my $signal = <$client>; chomp($signal); if($signal eq "<>") { print $client "Perl received STOP signal (on port $port): stopping wget\n"; $loop = 0; # out of outer while loop $self->{'forced_quit'} = 1; # subclasses need to know we're quitting # Sometimes the wget process takes some time to start up. If the STOP signal # was sent, don't try to terminate the process until we know it is running. # Otherwise wget may start running after we tried to kill it. Wait 5 seconds # for it to start up, checking for whether it is running in order to kill it. for(my $seconds = 1; $seconds <= 5; $seconds++) { if(kill(0, $childpid)) { # If kill(0, $childpid) returns true, then the process is running # and we need to kill it. close($chld_in); close($chld_out); kill("TERM", $childpid); $childpid = undef; # Stop monitoring the read_handle and close the serverSocket # (the Java end will close the client socket that Java opened) $read_set->remove($rh); #$read_set->remove($serverSocket); close($rh); #close($serverSocket); print $client "Perl terminated wget and is about to exit\n"; last; # out of inner for loop } else { # the process may just be starting up, wait sleep(1); } } last; # out of foreach loop } } } } my $command_status = $?; if ($command_status != 0) { $error_text .= "Exit error: $command_status"; } if ($changed_dir) { chdir $current_dir; } my $final_follow = pop(@follow_list); # might be undefined, but that's OK return ($full_text,$error_text,$final_follow); } # TODO: Check if the URL is valid?? Not sure what should be in this function yet!! sub checkURL { my ($self) = @_; if ($self->{'url'} eq "") { &error("checkURL","no URL is specified!! Please specifies the URL for downloading."); } } sub error { my ($strFunctionName,$strError) = @_; { print "Error occoured in WgetDownload.pm\n". "In Function:".$strFunctionName."\n". "Error Message:".$strError."\n"; exit(-1); } } 1;