root/main/trunk/greenstone2/perllib/downloaders/WgetDownload.pm @ 31975

Revision 31975, 34.5 KB (checked in by ak19, 3 years ago)

Another bugfix to downloading. Downloading over OAI wasn't working. Dr Bainbridge discovered that this was because OAIDownload.pm was still doublequoting filepaths and URLs too, whereas open3() launching the wget cmd can't handle quotes in its arguments. WgetDownload? used split to convert the cmd string into a cmd array. A clean solution was not passing WgetDownload::useWget() methods an array of cmd parameters (too involved and error prone to change all the calling code constructing the parameter cmd string), but to use the quotewords() method in place of split. This will preserve spaces in double quoted params in the cmd string, while splitting on spaces outside quoted strings. Then it also removes double quotes (and unescapes double backslashes). Tested on Mac: both OAI and Web downloading now work.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use BaseDownload;
35use strict;
36no strict 'subs'; # make an exception so we can use variables as filehandles to pass STDERR/STDOUT to functions, needed for gsprintf()
37use Cwd;
38use util;
39use IPC::Open3;
40use IO::Select;
41use IO::Socket;
42use Text::ParseWords; # part of Core modules. Needed to use quotewords() subroutine
43
44#use IO::Select qw( );
45#use IPC::Open3 qw( open3 );
46use Socket     qw( AF_UNIX SOCK_STREAM PF_UNSPEC ); # http://perlmeme.org/howtos/perlfunc/qw_function.html
47
48
49sub BEGIN {
50    @WgetDownload::ISA = ('BaseDownload');
51}
52
53my $arguments =
54     [ { 'name' => "proxy_on",
55    'desc' => "{WgetDownload.proxy_on}",
56    'type' => "flag",
57    'reqd' => "no",
58    'hiddengli' => "yes"},
59      { 'name' => "http_proxy_host",
60    'desc' => "{WgetDownload.http_proxy_host}",
61    'type' => "string",
62    'reqd' => "no",
63    'hiddengli' => "yes"},
64      { 'name' => "http_proxy_port",
65    'desc' => "{WgetDownload.http_proxy_port}",
66    'type' => "string",
67    'reqd' => "no",
68    'hiddengli' => "yes"},
69      { 'name' => "https_proxy_host", 
70    'desc' => "{WgetDownload.https_proxy_host}",         
71    'type' => "string",
72    'reqd' => "no",
73    'hiddengli' => "yes"},
74      { 'name' => "https_proxy_port",
75    'desc' => "{WgetDownload.https_proxy_port}",         
76    'type' => "string",
77    'reqd' => "no",
78    'hiddengli' => "yes"},
79      { 'name' => "ftp_proxy_host", 
80    'desc' => "{WgetDownload.ftp_proxy_host}",         
81    'type' => "string",
82    'reqd' => "no",
83    'hiddengli' => "yes"},
84      { 'name' => "ftp_proxy_port",
85    'desc' => "{WgetDownload.ftp_proxy_port}",         
86    'type' => "string",
87    'reqd' => "no",
88    'hiddengli' => "yes"},
89      { 'name' => "user_name", 
90    'desc' => "{WgetDownload.user_name}",
91    'type' => "string",
92    'reqd' => "no",
93    'hiddengli' => "yes"},
94      { 'name' => "user_password",
95    'desc' => "{WgetDownload.user_password}",
96    'type' => "string",
97    'reqd' => "no",
98    'hiddengli' => "yes"},
99      { 'name' => "no_check_certificate",
100    'desc' => "{WgetDownload.no_check_certificate}",
101    'type' => "flag",
102    'reqd' => "no",
103    'hiddengli' => "yes"}
104     ];
105
106my $options = { 'name'     => "WgetDownload",
107        'desc'     => "{WgetDownload.desc}",
108        'abstract' => "yes",
109        'inherits' => "yes",
110        'args'     => $arguments };
111
112
113# Declaring file global variables related to the wget child process so that
114# the termination signal handler for SIGTERM can close the streams and tidy
115# up before ending the child process.
116my $childpid;
117my ($chld_out, $chld_in);
118my ($serverSocket, $read_set);
119
120my $TIMEOUT = 1; # seconds
121my $NUM_TRIES = 10;
122
123# The port this script's server socket will be listening on, to handle
124# incoming signals from GLI to terminate wget. This is also file global,
125# since OAIDownload.pm will make several calls on wget using the same
126# instance of this script and we want to reuse whatever port GLI gave us.
127my $port;
128
129# When this script is called from the command line, this handler will be called
130# if this process is killed or abruptly ends due to receiving one of the
131# terminating signals that this handler is registered to deal with.
132sub abrupt_end_handler {
133    my $termination_signal = shift (@_);
134
135    if(defined $childpid) {
136    close($chld_out);
137    close($chld_in);
138   
139    print STDOUT "Received termination signal: $termination_signal\n";
140
141    # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
142    # See http://perldoc.perl.org/perlipc.html#Signals
143    # Warning on using kill at http://perldoc.perl.org/perlfork.html
144    kill("TERM", $childpid); # prefix - to signal to kill process group
145
146    # If the SIGTERM sent on Linux calls this handler, we want to make
147    # sure any socket connection is closed.
148    # Otherwise sockets are only used when this script is run from GLI
149    # in which case the handlers don't really get called.
150    if(defined $serverSocket) {
151        $read_set->remove($serverSocket) if defined $read_set;
152        close($serverSocket);
153    }
154    }
155
156    exit(0);
157}
158
159# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
160# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
161# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
162# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
163# http://affy.blogspot.com/p5be/ch13.htm
164# http://perldoc.perl.org/perlipc.html#Signals
165$SIG{'INT'} = \&abrupt_end_handler;
166$SIG{'TERM'} = \&abrupt_end_handler;
167
168sub new {
169    my ($class) = shift (@_);
170    my ($getlist,$inputargs,$hashArgOptLists) = @_;
171    push(@$getlist, $class);
172
173    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
174    push(@{$hashArgOptLists->{"OptList"}},$options);
175
176    my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
177
178    # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
179    &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set
180
181    return bless $self, $class;
182}
183
184sub checkWgetSetup
185{
186    my ($self,$blnGliCall) = @_;
187    #TODO: proxy detection??
188   
189    if((!$blnGliCall) && $self->{'proxy_on'})
190    {
191    &checkProxySetup($self);
192    }
193    &checkURL($self);
194}
195
196# Not using this. On Windows, we used to pass proxying settings as flags to wget. But, as that can be
197# seen with Task Manager, we now have the proxy settings set in the environment and are no longer passing it
198sub addProxySettingsAsWgetFlags
199{
200    my ($self) = @_;
201    my $strOptions = "";
202
203    if($self->{'http_proxy_host'} && $self->{'http_proxy_port'}) {
204    $strOptions .= " -e http_proxy=$self->{'http_proxy_host'}:$self->{'http_proxy_port'} ";
205    }
206    if($self->{'https_proxy_host'} && $self->{'https_proxy_port'}) {
207    $strOptions .= " -e https_proxy=$self->{'https_proxy_host'}:$self->{'https_proxy_port'} ";
208    }
209    if($self->{'ftp_proxy_host'} && $self->{'ftp_proxy_port'}) {
210    $strOptions .= " -e ftp_proxy=$self->{'ftp_proxy_host'}:$self->{'ftp_proxy_port'} ";
211    }
212   
213    # For wget, there is only one set pair of proxy-user and proxy-passwd, so wget seems to assume
214    # that all 3 proxy protocols (http|https|ftp) will use the same username and pwd combination?
215    # Note that this only matters when passing the proxying details as flags to wget, not when
216    # the proxies are setup as environment variables.
217    if ($self->{'user_name'} && $self->{'user_password'})
218    {
219    $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";     
220    # how is "--proxy-passwd" instead of "--proxy-password" even working????       
221    # see https://www.gnu.org/software/wget/manual/html_node/Proxies.html
222    # and https://www.gnu.org/software/wget/manual/wget.html
223    # Not touching this, in case the manual is simply wrong. Since our code works in
224    # practice (when we were still using wget proxy username/pwd flags for windows).
225    }   
226   
227    return $strOptions;
228}
229
230sub getWgetOptions
231{
232    my ($self) = @_;
233    my $strOptions = "";
234   
235    # If proxy settings are set up in the environment, wget is ready to use them. More secure.
236    # But if proxy settings are not set up in the environment, pass them as flags to wget
237    # This is less secure, as pwd etc visible in task manager, but it was the original way in
238    # which wget was run on windows.
239    # Truth in Perl: https://home.ubalt.edu/abento/452/perl/perltruth.html
240    # http://www.perlmonks.org/?node=what%20is%20true%20and%20false%20in%20Perl%3F
241   
242    if ($self->{'proxy_on'}) {
243    if(!$ENV{'http_proxy'} && !$ENV{'https_proxy'} && !$ENV{'ftp_proxy'}) {
244        $strOptions .= $self->addProxySettingsAsWgetFlags();
245    } # else wget will use proxy settings in environment, assume enough settings have been provided
246    # either way, we're using the proxy
247    $strOptions .= " --proxy ";
248    }
249   
250    if($self->{'no_check_certificate'}) { # URL may be http that gets redirected to https, so if no_check_certificate is on, turn it on even if URL is http
251   
252    $strOptions .= " --no-check-certificate ";
253    }
254   
255    return $strOptions;
256}
257
258# Checking for proxy setup: proxy server, proxy port, proxy username and password.
259sub checkProxySetup
260{
261    my ($self) = @_;
262    ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
263    # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
264    # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
265
266}
267
268# Returns true if the wget status needs to be monitored through sockets
269# (if a socket is used to communicate with the Java program on when to
270# terminate wget). True if we are running gli, or if the particular type
271# of WgetDownload is *not* OAIDownload (in that case, the original way of
272# terminating the perl script from Java would terminate wget as well).
273sub dealingWithSockets() {
274    my ($self) = @_;
275    return (defined $self->{'gli'} && $self->{'gli'} && !defined $port && ref($self) ne "OAIDownload");
276                       # use ref($self) to find the classname of an object
277}
278
279# On Windows, we can only use IO::Select's can_read() with Sockets, not with the usual handles to a child process' iostreams
280# However, we can use Sockets as the handles to connect to a child process' streams, which then allows us to use can_read()
281# not just on Unix but Windows too. The 2 subroutines below to use Sockets to connect to a child process' iostreams come from
282# http://www.perlmonks.org/?node_id=869942
283# http://www.perlmonks.org/?node_id=811650
284# It was suggested that IPC::Run will take care of all this or circumvent the need for all this,
285# but IPC::Run has limitations on Windows, see http://search.cpan.org/~toddr/IPC-Run-0.96/lib/IPC/Run.pm#Win32_LIMITATIONS
286
287# Create a unidirectional pipe to an iostream of a process that is actually a socket
288sub _pipe {
289    socketpair($_[0], $_[1], AF_UNIX, SOCK_STREAM, PF_UNSPEC)
290        or return undef;
291    shutdown($_[0], 1);  # No more writing for reader. See http://www.perlmonks.org/?node=108244
292    shutdown($_[1], 0);  # No more reading for writer
293    return 1;
294}
295
296sub _open3 {
297    local (*TO_CHLD_R,     *TO_CHLD_W);
298    local (*FR_CHLD_R,     *FR_CHLD_W);
299    #local (*FR_CHLD_ERR_R, *FR_CHLD_ERR_W);
300
301    if ($^O =~ /Win32/) {
302        _pipe(*TO_CHLD_R,     *TO_CHLD_W    ) or die $^E;
303        _pipe(*FR_CHLD_R,     *FR_CHLD_W    ) or die $^E;
304        #_pipe(*FR_CHLD_ERR_R, *FR_CHLD_ERR_W) or die $^E;
305    } else {
306        pipe(*TO_CHLD_R,     *TO_CHLD_W    ) or die $!;
307        pipe(*FR_CHLD_R,     *FR_CHLD_W    ) or die $!;
308        #pipe(*FR_CHLD_ERR_R, *FR_CHLD_ERR_W) or die $!;
309    }
310
311    #my $pid = open3('>&TO_CHLD_R', '<&FR_CHLD_W', '<&FR_CHLD_ERR_W', @_);
312    my $pid = open3('>&TO_CHLD_R', '<&FR_CHLD_W', '<&FR_CHLD_W', @_); # use one handle, chldout, for both stdout and stderr of child proc,
313                                                                      # see http://blog.0x1fff.com/2009/09/howto-execute-system-commands-in-perl.html
314   
315    #return ( $pid, *TO_CHLD_W, *FR_CHLD_R, *FR_CHLD_ERR_R );
316    return ( $pid, *TO_CHLD_W, *FR_CHLD_R);
317}
318
319# useWget and useWgetMonitored are very similar and, when updating, will probably need updating in tandem
320# useWget(Monitored) runs the wget command using open3 and then sits in a loop doing two things per iteration:
321# - processing a set buffer size of the wget (child) process' stdout/stderr streams, if anything has appeared there
322# - followed by checking the socket connection to Java GLI, to see if GLI is trying to cancel the wget process we're running.
323# Then the loop of these two things repeats.
324sub useWget
325{
326    #local $| = 1; # autoflush stdout buffer
327    #print STDOUT "*** Start of subroutine useWget in $0\n";
328
329    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
330
331    my ($strReadIn,$strLine,$command);
332    $strReadIn = "" unless defined $strReadIn;
333
334    my $current_dir = cwd();
335    my $changed_dir = 0;
336    if (defined $working_dir && -e $working_dir) {
337    chdir "$working_dir";
338    $changed_dir = 1;
339    }
340
341    # When we are running this script through GLI, the SIGTERM signal handler
342    # won't get called on Windows when wget is to be prematurely terminated.
343    # Instead, when wget has to be terminated in the middle of execution, GLI will
344    # connect to a serverSocket here to communicate when it's time to stop wget.
345    if($self->dealingWithSockets()) {
346
347    $port = <STDIN>; # gets a port on localhost that's not yet in use
348    chomp($port);
349   
350    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
351                           LocalPort => $port,
352                           Listen    => 1,
353                           Reuse     => 1);
354   
355    die "can't setup server" unless $serverSocket;
356    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
357
358    $read_set = new IO::Select();         # create handle set for reading
359    $read_set->add($serverSocket);        # add the main socket to the set
360    }
361
362    my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
363   
364    # Shouldn't use double quotes around wget path after all? See final comment at
365    # http://www.perlmonks.org/?node_id=394709
366    # http://coldattic.info/shvedsky/pro/blogs/a-foo-walks-into-a-bar/posts/63
367    # Therefore, compose the command as an array rather than as a string, to preserve spaces in the filepath
368    # because single/double quotes using open3 seem to launch a subshell, see also final comment at
369    # http://www.perlmonks.org/?node_id=394709 and that ends up causing problems in terminating wget, as 2 processes
370    # got launched then which don't have parent-child pid relationship (so that terminating one doesn't terminate the other).
371
372    # remove leading and trailing spaces, https://stackoverflow.com/questions/4597937/perl-function-to-trim-string-leading-and-trailing-whitespace
373    $cmdWget =~ s/^\s+//;
374    $cmdWget =~ s/\s+$//;
375
376    # split on "words"
377    #my @commandargs = split(' ', $cmdWget);
378    # quotewords: to split on spaces except within quotes, then removes quotes and unescapes double backslash too
379      # https://stackoverflow.com/questions/19762412/regex-to-split-key-value-pairs-ignoring-space-in-double-quotes
380      # https://docstore.mik.ua/orelly/perl/perlnut/c08_389.htm
381    my @commandargs = quotewords('\s+', 0, $cmdWget);
382    unshift(@commandargs, $wget_file_path); # prepend the wget cmd
383    #print STDERR "Command is: ".join(",", @commandargs) . "\n"; # goes into ServerInfoDialog
384   
385    # Wget's output needs to be monitored to find out when it has naturally terminated.
386    # Wget's output is sent to its STDERR so we can't use open2 without doing 2>&1.
387    # On linux, 2>&1 launches a subshell which then launches wget, meaning that killing
388    # the childpid does not kill wget on Linux but the subshell that launched it instead.
389    # Therefore, we use open3. Though the child process wget sends output only to its stdout [is this meant to be "stderr"?],
390    # using open3 says chld_err is undefined and the output of wget only comes in chld_out(!)
391    # However that may be, it works with open3. But to avoid the confusion of managing and
392    # closing an extra unused handle, a single handle is used instead for both the child's
393    # stderr and stdout.
394    # See http://blog.0x1fff.com/2009/09/howto-execute-system-commands-in-perl.html
395    # for why this is the right thing to do.
396
397    # Both open2 and open3 don't return on failure, but raise an exception. The handling
398    # of the exception is described on p.568 of the Perl Cookbook
399    eval {
400    #$childpid = open3($chld_in, $chld_out, $chld_out, $command); # There should be no double quotes in command, like around filepaths to wget, else need to use array version of command as below
401    #$childpid = open3($chld_in, $chld_out, $chld_out, @commandargs);
402   
403    # instead of calling open3 directly, call wrapper _open3() subroutine that will use sockets to
404    # connect to the child process' iostreams, because we can then use IO::Select's can_read() even on Windows
405    ($childpid, $chld_in, $chld_out) = _open3(@commandargs);
406    };
407    if ($@) {
408    if($@ =~ m/^open3/) {
409        die "open3 failed in $0: $!\n$@\n";     
410    }
411    die "Tried to launch open3 in $0, got unexpected exception: $@";
412    }
413
414    # Switching to use IO::Select, which allows timeouts, instead of doing the potentially blocking
415    #     if defined(my $strLine=<$chld_out>)
416    # Google: perl open3 read timeout
417    # Google: perl open3 select() example
418    # https://stackoverflow.com/questions/10029406/why-does-ipcopen3-get-deadlocked
419    # https://codereview.stackexchange.com/questions/84496/the-right-way-to-use-ipcopen3-in-perl
420    # https://gist.github.com/shalk/6988937
421    # https://stackoverflow.com/questions/18373500/how-to-check-if-command-executed-with-ipcopen3-is-hung
422    # http://perldoc.perl.org/IO/Select.html
423    # http://perldoc.perl.org/IPC/Open3.html - explains the need for select()/IO::Select with open3
424    # http://www.perlmonks.org/?node_id=951554
425    # http://search.cpan.org/~dmuey/IPC-Open3-Utils-0.91/lib/IPC/Open3/Utils.pm
426    # https://stackoverflow.com/questions/3000907/wget-not-behaving-via-ipcopen3-vs-bash?rq=1
427
428    # create the select object and add our streamhandle(s)
429    my $sel = new IO::Select;
430    $sel->add($chld_out);
431
432    my $num_consecutive_timedouts = 0;
433    my $error = 0;
434    my $loop = 1;
435   
436    while($loop)
437    {
438    # assume we're going to timeout trying to read from child process
439    $num_consecutive_timedouts++;
440
441   
442    # block until data is available on the registered filehandles or until the timeout specified   
443    if(my @readyhandles = $sel->can_read($TIMEOUT)) {
444
445        $num_consecutive_timedouts = 0; # re-zero, as we didn't timeout reading from child process after all
446        # since we're in this if statement
447       
448        # now there's a list of registered filehandles we can read from to loop through reading from.
449        # though we've registered only one, chld_out
450        foreach my $fh (@readyhandles) {
451        my $strLine;
452        #sleep 3;
453       
454        # read up to 4096 bytes from this filehandle fh.
455        # if there is less than 4096 bytes, we'll only get
456        # those available bytes and won't block.  If there
457        # is more than 4096 bytes, we'll only read 4096 and
458        # wait for the next iteration through the loop to
459        # read the rest.
460        my $len = sysread($fh, $strLine, 4096);
461       
462        if($len) { # read something
463            if($blnShow) {
464            print STDERR "$strLine\n";
465            }
466            $strReadIn .= $strLine;
467        }
468        else { # error or EOF: (!defined $len || $len == 0)         
469           
470            if(!defined $len) { # could be an error reading
471              # On Windows, the socket ends up forcibly closed on the "other" side. It's just the way it's implemented
472              # on Windows when using sockets to our child process' iostreams. So $len not being defined is not an error in that case. Refer to
473              # https://stackoverflow.com/questions/16675950/perl-select-returning-undef-on-sysread-when-using-windows-ipcopen3-and-ios/16676271
474              if(!$!{ECONNRESET}) { # anything other ECONNRESET error means it's a real case of undefined $len being an error
475                print STDERR "WgetDownload: Error reading from child stream: $!\n";
476                # SHOULD THIS 'die "errmsg";' instead? - no, sockets may need closing
477                $error = 1;
478              } else { # $! contains message "An existing connection was forcibly closed by remote host" where "remote" is a reference to the sockets to our wget child process,
479                    # NOT to the remote web server we're downloading from. In such a case, the error code is ECONNRESET, and it's not an error, despite $len being undefined.
480                #print STDERR "WgetDownload: wget finished\n";
481              }
482            }
483            elsif ($len == 0) { # EOF
484            # Finished reading from this filehandle $fh because we read 0 bytes.
485            # wget finished, terminate naturally
486            #print STDERR "WgetDownload: wget finished\n"; #print STDOUT "\nPerl: open3 command, input streams closed. Wget terminated naturally.\n";
487            }
488
489            $loop = 0; # error or EOF, either way will need to clean up and break out of outer loop
490           
491            # last; # if we have more than one filehandle registered with IO::Select
492           
493            $sel->remove($fh); # if more than one filehandle registered, we should unregister all of them here on error         
494           
495        } # end else error or EOF
496       
497        } # end foreach on readyhandles
498    } # end if on can_read
499   
500    if($num_consecutive_timedouts >= $NUM_TRIES) {
501        $error = 1;
502        $loop = 0;                          # to break out of outer while loop
503
504        $num_consecutive_timedouts = 0;
505
506        &gsprintf::gsprintf(STDERR, "{WgetDownload.wget_timed_out_warning}\n", $NUM_TRIES);
507    }
508
509    if($loop == 0) { # error or EOF, either way, clean up
510        if($error) {
511        $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
512       
513        if(kill(0, $childpid)) {
514            # If kill(0, $childpid) returns true, then the process is running
515            # and we need to kill it.
516            close($chld_in);
517            close($chld_out);
518            kill('TERM', $childpid); # kill the process group by prefixing - to signal
519
520            # https://coderwall.com/p/q-ovnw/killing-all-child-processes-in-a-shell-script
521            # https://stackoverflow.com/questions/392022/best-way-to-kill-all-child-processes
522            #print STDERR "SENT SIGTERM TO CHILD PID: $childpid\n";         
523            #print STDERR "Perl terminated wget after timing out repeatedly and is about to exit\n";
524        }
525        }
526        else { # wget finished (no errors), terminate naturally
527        #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
528        close($chld_in);
529        close($chld_out);
530        waitpid $childpid, 0;       
531        }
532
533        # error or not
534        $childpid = undef;
535        # Stop monitoring the read_handle and close the serverSocket
536        # (the Java end will close the client socket that Java opened)
537        if(defined $port) {
538        $read_set->remove($serverSocket);
539        close($serverSocket);
540        }
541    }
542
543    # If we've already terminated, either naturally or on error, we can get out of the while loop
544    next if($loop == 0);
545   
546    # Otherwise check for whether Java GLI has attempted to connect to this perl script via socket
547   
548    # if we run this script from the command-line (as opposed to from GLI),
549    # then we're not working with sockets and can therefore skip the next bits
550    next unless(defined $port);
551   
552    # http://www.perlfect.com/articles/select.shtml
553    # "multiplex between several filehandles within a single thread of control,
554    # thus creating the effect of parallelism in the handling of I/O."
555    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
556
557    # take all readable handles in turn
558    foreach my $rh (@rh_set) {
559        if($rh == $serverSocket) {
560        my $client = $rh->accept();
561        #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
562        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
563       
564        # Read from the client (getting rid of the trailing newline)
565        # Has the client sent the <<STOP>> signal?
566        my $signal = <$client>;
567        chomp($signal);
568        if($signal eq "<<STOP>>") {
569            print $client "Perl received STOP signal (on port $port): stopping wget\n";
570            $loop = 0;                          # out of outer while loop
571            $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
572           
573            # Sometimes the wget process takes some time to start up. If the STOP signal
574            # was sent, don't try to terminate the process until we know it is running.
575            # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
576            # for it to start up, checking for whether it is running in order to kill it.
577            for(my $seconds = 1; $seconds <= 5; $seconds++) {
578            if(kill(0, $childpid)) {
579                # If kill(0, $childpid) returns true, then the process is running
580                # and we need to kill it.
581                close($chld_in);
582                close($chld_out);
583                kill("TERM", $childpid); # prefix - to signal to kill process group
584               
585                $childpid = undef;
586               
587                # Stop monitoring the read_handle and close the serverSocket
588                # (the Java end will close the client socket that Java opened)
589                $read_set->remove($rh);     #$read_set->remove($serverSocket);
590                close($rh);             #close($serverSocket);
591                print $client "Perl terminated wget and is about to exit\n";
592                last;                           # out of inner for loop
593            }
594            else { # the process may just be starting up, wait
595                sleep(1);
596            }
597            }
598            last;                               # out of foreach loop
599        }
600        }
601    }
602    }
603
604    if ($changed_dir) {
605    chdir $current_dir;
606    }
607   
608    return $strReadIn;
609}
610
611
612sub useWgetMonitored
613{
614    #local $| = 1; # autoflush stdout buffer
615    #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
616
617    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
618
619
620    my $current_dir = cwd();
621    my $changed_dir = 0;
622    if (defined $working_dir && -e $working_dir) {
623    chdir "$working_dir";
624    $changed_dir = 1;
625    }
626
627    # When we are running this script through GLI, the SIGTERM signal handler
628    # won't get called on Windows when wget is to be prematurely terminated.
629    # Instead, when wget has to be terminated in the middle of execution, GLI will
630    # connect to a serverSocket here to communicate when it's time to stop wget.
631    if($self->dealingWithSockets()) {
632
633    $port = <STDIN>; # gets a port on localhost that's not yet in use
634    chomp($port);
635   
636    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
637                           LocalPort => $port,
638                           Listen    => 1,
639                           Reuse     => 1);
640   
641    die "can't setup server" unless $serverSocket;
642    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
643
644    $read_set = new IO::Select();         # create handle set for reading
645    $read_set->add($serverSocket);        # add the main socket to the set
646    }
647
648    my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
649    # compose the command as an array for open3, to preserve spaces in any filepath
650    # Do so by removing leading and trailing spaces, then splitting on "words" (preserving spaces in quoted words and removing quotes)
651    $cmdWget =~ s/^\s+//;
652    $cmdWget =~ s/\s+$//;
653    my @commandargs = quotewords('\s+', 0, $cmdWget);
654    unshift(@commandargs, $wget_file_path); # prepend wget cmd to the command array
655    #print STDOUT "Command is: ".join(",", @commandargs) . "\n";
656
657    eval {     # see p.568 of Perl Cookbook
658    #$childpid = open3($chld_in, $chld_out, $chld_out, @commandargs);
659    ($childpid, $chld_in, $chld_out) = _open3(@commandargs);
660    };
661    if ($@) {
662    if($@ =~ m/^open3/) {
663        die "open3 failed in $0: $!\n$@\n";     
664    }
665    die "Tried to launch open3 in $0, got unexpected exception: $@";
666    }
667
668    my $full_text = "";
669    my $error_text = "";
670    my @follow_list = ();
671    my $line;
672
673    # create the select object and add our streamhandle(s)
674    my $sel = new IO::Select;
675    $sel->add($chld_out);
676   
677    my $num_consecutive_timedouts = 0;
678    my $error = 0;
679    my $loop = 1;
680    while($loop)
681    {
682    # assume we're going to timeout trying to read from child process
683    $num_consecutive_timedouts++;
684
685    # block until data is available on the registered filehandles or until the timeout specified   
686    if(my @readyhandles = $sel->can_read($TIMEOUT)) {
687        $num_consecutive_timedouts = 0; # re-zero, as we didn't timeout reading from child process after all
688        # since we're in this if statement
689       
690        foreach my $fh (@readyhandles) {
691        my $len = sysread($fh, $line, 4096); # read up to 4k from current ready filehandle
692        if($len) { # read something
693       
694           
695            if((defined $blnShow) && $blnShow)
696            {
697            print STDERR "$line";
698            }
699           
700            if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
701            my $follow_url = $1;
702            push(@follow_list,$follow_url);
703            }
704           
705            if ($line =~ m/ERROR\s+\d+/) {
706            $error_text .= $line;
707            }
708           
709            $full_text .= $line;
710        } else { # error or EOF
711            if(!defined $len) { # error reading
712            #print STDERR "WgetDownload: Error reading from child stream: $!\n";
713            $error = 1;
714            }
715             if(!defined $len) {
716              if(!$!{ECONNRESET}) { # anything other ECONNRESET error means it's a real case of undefined $len being an error
717                #print STDERR "WgetDownload: Error reading from child stream: $!\n";               
718                $error = 1;
719              } else { # the error code is ECONNRESET, and it's not an error, despite $len being undefined.
720                       # Happens on Windows when using sockets to a child process' iostreams
721                #print STDERR "WgetDownload: wget finished\n";
722              }
723            }
724            elsif ($len == 0) { # EOF, finished with this filehandle because 0 bytes read
725            #print STDERR "WgetDownload: wget finished\n"; # wget terminated naturally
726            }
727
728            $loop = 0; # error or EOF, either way will need to clean up and break out of outer loop
729           
730            # last; # if we have more than one filehandle registered with IO::Select
731           
732            $sel->remove($fh); # if more than one filehandle registered, we should unregister all of them here on error         
733        } # end else error or EOF
734       
735        } # end foreach on readyhandles
736    }  # end if on can_read
737
738    if($num_consecutive_timedouts >= $NUM_TRIES) {
739        $error = 1;
740        $loop = 0;                          # to break out of outer while loop
741
742        $num_consecutive_timedouts = 0;
743
744        #&gsprintf::gsprintf(STDERR, "{WgetDownload.wget_timed_out_warning}\n", $NUM_TRIES);
745    }
746
747    if($loop == 0) { # error or EOF, either way, clean up
748       
749        if($error) {
750        $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
751       
752        if(kill(0, $childpid)) {
753            # If kill(0, $childpid) returns true, then the process is running
754            # and we need to kill it.
755            close($chld_in);
756            close($chld_out);
757            kill("TERM", $childpid); # prefix - to signal to kill process group
758           
759            #print STDERR "Perl terminated wget after timing out repeatedly and is about to exit\n";
760        }
761        }
762        else { # wget finished, terminate naturally
763        close($chld_in);
764        close($chld_out);
765        # Program terminates only when the following line is included
766        # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
767        # it prevents the child from turning into a "zombie process".
768        # While the wget process terminates without it, this perl script does not:
769        # the DOS prompt is not returned without it.
770        waitpid $childpid, 0;
771        }
772       
773        # error or not:
774        $childpid = undef;     
775        if(defined $port) {
776        $read_set->remove($serverSocket);
777        close($serverSocket);
778        }
779    }
780   
781    # If we've already terminated, either naturally or on error, we can get out of the while loop
782    next if($loop == 0);
783
784    # Otherwise check for whether Java GLI has attempted to connect to this perl script via socket
785   
786    # if we run this script from the command-line (as opposed to from GLI),
787    # then we're not working with sockets and can therefore skip the next bits
788    next unless(defined $port);
789
790    # http://www.perlfect.com/articles/select.shtml
791    # "multiplex between several filehandles within a single thread of control,
792    # thus creating the effect of parallelism in the handling of I/O."
793    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
794
795    # take all readable handles in turn
796    foreach my $rh (@rh_set) {
797        if($rh == $serverSocket) {
798        my $client = $rh->accept();
799        #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
800        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
801       
802        # Read from the client (getting rid of trailing newline)
803        # Has the client sent the <<STOP>> signal?
804        my $signal = <$client>;
805        chomp($signal);
806        if($signal eq "<<STOP>>") {
807            print $client "Perl received STOP signal (on port $port): stopping wget\n";
808            $loop = 0;                          # out of outer while loop
809            $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
810           
811            # Sometimes the wget process takes some time to start up. If the STOP signal
812            # was sent, don't try to terminate the process until we know it is running.
813            # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
814            # for it to start up, checking for whether it is running in order to kill it.
815            for(my $seconds = 1; $seconds <= 5; $seconds++) {
816            if(kill(0, $childpid)) {
817                # If kill(0, $childpid) returns true, then the process is running
818                # and we need to kill it.
819                close($chld_in);
820                close($chld_out);
821                kill("TERM", $childpid); # prefix - to signal to kill process group
822               
823                $childpid = undef;
824               
825                # Stop monitoring the read_handle and close the serverSocket
826                # (the Java end will close the client socket that Java opened)
827                $read_set->remove($rh);     #$read_set->remove($serverSocket);
828                close($rh);             #close($serverSocket);
829                print $client "Perl terminated wget and is about to exit\n";
830                last;                           # out of inner for loop
831            }
832            else { # the process may just be starting up, wait
833                sleep(1);
834            }
835            }
836            last;                               # out of foreach loop
837        }
838        }
839    }
840    }
841
842    my $command_status = $?;
843    if ($command_status != 0) {
844    $error_text .= "Exit error: $command_status";
845    }
846
847    if ($changed_dir) {
848    chdir $current_dir;
849    }
850   
851    my $final_follow = pop(@follow_list); # might be undefined, but that's OK
852   
853    return ($full_text,$error_text,$final_follow);
854}
855
856
857# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
858sub checkURL
859{
860    my ($self) = @_;
861    if ($self->{'url'} eq "")
862    {
863    &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
864    }
865}
866
867sub error
868{
869    my ($strFunctionName,$strError) = @_;
870    {
871    print "Error occoured in WgetDownload.pm\n".
872        "In Function:".$strFunctionName."\n".
873        "Error Message:".$strError."\n";
874    exit(-1);
875    }
876}
877
8781;
879
Note: See TracBrowser for help on using the browser.