root/main/trunk/greenstone2/perllib/downloaders/WgetDownload.pm @ 32779

Revision 32779, 34.9 KB (checked in by ak19, 20 months ago)

1. Bugfix to wget downloading: call to quotewords() necessarily has 2nd parameter keep set to to false/0, in order to remove quotes from cmd args, but which also has the undesirable side-effect of removing single backslashes (while double backslashes get turned to single backslashes). This caused a bug in the download tutorial failing as the cache dir on windows, which contained backslashes going in, ended up containing no backslashes at all when wget was run. The correct place to fix it is before quotewords() gets called in WgetDownload?'s 2 UseWget?() methods. Before quotewords() is called, singlebackslashes are now protected as double backslashes, so that quotewords with 2nd param keep=0 now gets things back to normal by turning double backslashes to single again. 2. Minor change to MediaWikiDownload? to reuse variable.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use BaseDownload;
35use strict;
36no strict 'subs'; # make an exception so we can use variables as filehandles to pass STDERR/STDOUT to functions, needed for gsprintf()
37use Cwd;
38use util;
39use IPC::Open3;
40use IO::Select;
41use IO::Socket;
42use Text::ParseWords; # part of Core modules. Needed to use quotewords() subroutine
43
44#use IO::Select qw( );
45#use IPC::Open3 qw( open3 );
46use Socket     qw( AF_UNIX SOCK_STREAM PF_UNSPEC ); # http://perlmeme.org/howtos/perlfunc/qw_function.html
47
48
49sub BEGIN {
50    @WgetDownload::ISA = ('BaseDownload');
51}
52
53my $arguments =
54     [ { 'name' => "proxy_on",
55    'desc' => "{WgetDownload.proxy_on}",
56    'type' => "flag",
57    'reqd' => "no",
58    'hiddengli' => "yes"},
59      { 'name' => "http_proxy_host",
60    'desc' => "{WgetDownload.http_proxy_host}",
61    'type' => "string",
62    'reqd' => "no",
63    'hiddengli' => "yes"},
64      { 'name' => "http_proxy_port",
65    'desc' => "{WgetDownload.http_proxy_port}",
66    'type' => "string",
67    'reqd' => "no",
68    'hiddengli' => "yes"},
69      { 'name' => "https_proxy_host", 
70    'desc' => "{WgetDownload.https_proxy_host}",         
71    'type' => "string",
72    'reqd' => "no",
73    'hiddengli' => "yes"},
74      { 'name' => "https_proxy_port",
75    'desc' => "{WgetDownload.https_proxy_port}",         
76    'type' => "string",
77    'reqd' => "no",
78    'hiddengli' => "yes"},
79      { 'name' => "ftp_proxy_host", 
80    'desc' => "{WgetDownload.ftp_proxy_host}",         
81    'type' => "string",
82    'reqd' => "no",
83    'hiddengli' => "yes"},
84      { 'name' => "ftp_proxy_port",
85    'desc' => "{WgetDownload.ftp_proxy_port}",         
86    'type' => "string",
87    'reqd' => "no",
88    'hiddengli' => "yes"},
89      { 'name' => "user_name", 
90    'desc' => "{WgetDownload.user_name}",
91    'type' => "string",
92    'reqd' => "no",
93    'hiddengli' => "yes"},
94      { 'name' => "user_password",
95    'desc' => "{WgetDownload.user_password}",
96    'type' => "string",
97    'reqd' => "no",
98    'hiddengli' => "yes"},
99      { 'name' => "no_check_certificate",
100    'desc' => "{WgetDownload.no_check_certificate}",
101    'type' => "flag",
102    'reqd' => "no",
103    'hiddengli' => "yes"}
104     ];
105
106my $options = { 'name'     => "WgetDownload",
107        'desc'     => "{WgetDownload.desc}",
108        'abstract' => "yes",
109        'inherits' => "yes",
110        'args'     => $arguments };
111
112
113# Declaring file global variables related to the wget child process so that
114# the termination signal handler for SIGTERM can close the streams and tidy
115# up before ending the child process.
116my $childpid;
117my ($chld_out, $chld_in);
118my ($serverSocket, $read_set);
119
120my $TIMEOUT = 1; # seconds
121my $NUM_TRIES = 10;
122
123# The port this script's server socket will be listening on, to handle
124# incoming signals from GLI to terminate wget. This is also file global,
125# since OAIDownload.pm will make several calls on wget using the same
126# instance of this script and we want to reuse whatever port GLI gave us.
127my $port;
128
129# When this script is called from the command line, this handler will be called
130# if this process is killed or abruptly ends due to receiving one of the
131# terminating signals that this handler is registered to deal with.
132sub abrupt_end_handler {
133    my $termination_signal = shift (@_);
134
135    if(defined $childpid) {
136    close($chld_out);
137    close($chld_in);
138   
139    print STDOUT "Received termination signal: $termination_signal\n";
140
141    # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
142    # See http://perldoc.perl.org/perlipc.html#Signals
143    # Warning on using kill at http://perldoc.perl.org/perlfork.html
144    kill("TERM", $childpid); # prefix - to signal to kill process group
145
146    # If the SIGTERM sent on Linux calls this handler, we want to make
147    # sure any socket connection is closed.
148    # Otherwise sockets are only used when this script is run from GLI
149    # in which case the handlers don't really get called.
150    if(defined $serverSocket) {
151        $read_set->remove($serverSocket) if defined $read_set;
152        close($serverSocket);
153    }
154    }
155
156    exit(0);
157}
158
159# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
160# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
161# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
162# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
163# http://affy.blogspot.com/p5be/ch13.htm
164# http://perldoc.perl.org/perlipc.html#Signals
165$SIG{'INT'} = \&abrupt_end_handler;
166$SIG{'TERM'} = \&abrupt_end_handler;
167
168sub new {
169    my ($class) = shift (@_);
170    my ($getlist,$inputargs,$hashArgOptLists) = @_;
171    push(@$getlist, $class);
172
173    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
174    push(@{$hashArgOptLists->{"OptList"}},$options);
175
176    my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
177
178    # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
179    &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set
180
181    return bless $self, $class;
182}
183
184sub checkWgetSetup
185{
186    my ($self,$blnGliCall) = @_;
187    #TODO: proxy detection??
188   
189    if((!$blnGliCall) && $self->{'proxy_on'})
190    {
191    &checkProxySetup($self);
192    }
193    &checkURL($self);
194}
195
196# Not using this. On Windows, we used to pass proxying settings as flags to wget. But, as that can be
197# seen with Task Manager, we now have the proxy settings set in the environment and are no longer passing it
198sub addProxySettingsAsWgetFlags
199{
200    my ($self) = @_;
201    my $strOptions = "";
202
203    if($self->{'http_proxy_host'} && $self->{'http_proxy_port'}) {
204    $strOptions .= " -e http_proxy=$self->{'http_proxy_host'}:$self->{'http_proxy_port'} ";
205    }
206    if($self->{'https_proxy_host'} && $self->{'https_proxy_port'}) {
207    $strOptions .= " -e https_proxy=$self->{'https_proxy_host'}:$self->{'https_proxy_port'} ";
208    }
209    if($self->{'ftp_proxy_host'} && $self->{'ftp_proxy_port'}) {
210    $strOptions .= " -e ftp_proxy=$self->{'ftp_proxy_host'}:$self->{'ftp_proxy_port'} ";
211    }
212   
213    # For wget, there is only one set pair of proxy-user and proxy-passwd, so wget seems to assume
214    # that all 3 proxy protocols (http|https|ftp) will use the same username and pwd combination?
215    # Note that this only matters when passing the proxying details as flags to wget, not when
216    # the proxies are setup as environment variables.
217    if ($self->{'user_name'} && $self->{'user_password'})
218    {
219    $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";     
220    # how is "--proxy-passwd" instead of "--proxy-password" even working????       
221    # see https://www.gnu.org/software/wget/manual/html_node/Proxies.html
222    # and https://www.gnu.org/software/wget/manual/wget.html
223    # Not touching this, in case the manual is simply wrong. Since our code works in
224    # practice (when we were still using wget proxy username/pwd flags for windows).
225    }   
226   
227    return $strOptions;
228}
229
230sub getWgetOptions
231{
232    my ($self) = @_;
233    my $strOptions = "";
234   
235    # If proxy settings are set up in the environment, wget is ready to use them. More secure.
236    # But if proxy settings are not set up in the environment, pass them as flags to wget
237    # This is less secure, as pwd etc visible in task manager, but it was the original way in
238    # which wget was run on windows.
239    # Truth in Perl: https://home.ubalt.edu/abento/452/perl/perltruth.html
240    # http://www.perlmonks.org/?node=what%20is%20true%20and%20false%20in%20Perl%3F
241   
242    if ($self->{'proxy_on'}) {
243    if(!$ENV{'http_proxy'} && !$ENV{'https_proxy'} && !$ENV{'ftp_proxy'}) {
244        $strOptions .= $self->addProxySettingsAsWgetFlags();
245    } # else wget will use proxy settings in environment, assume enough settings have been provided
246    # either way, we're using the proxy
247    $strOptions .= " --proxy ";
248    }
249   
250    if($self->{'no_check_certificate'}) { # URL may be http that gets redirected to https, so if no_check_certificate is on, turn it on even if URL is http
251   
252    $strOptions .= " --no-check-certificate ";
253    }
254   
255    return $strOptions;
256}
257
258# Checking for proxy setup: proxy server, proxy port, proxy username and password.
259sub checkProxySetup
260{
261    my ($self) = @_;
262    ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
263    # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
264    # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
265
266}
267
268# Returns true if the wget status needs to be monitored through sockets
269# (if a socket is used to communicate with the Java program on when to
270# terminate wget). True if we are running gli, or if the particular type
271# of WgetDownload is *not* OAIDownload (in that case, the original way of
272# terminating the perl script from Java would terminate wget as well).
273sub dealingWithSockets() {
274    my ($self) = @_;
275    return (defined $self->{'gli'} && $self->{'gli'} && !defined $port && ref($self) ne "OAIDownload");
276                       # use ref($self) to find the classname of an object
277}
278
279# On Windows, we can only use IO::Select's can_read() with Sockets, not with the usual handles to a child process' iostreams
280# However, we can use Sockets as the handles to connect to a child process' streams, which then allows us to use can_read()
281# not just on Unix but Windows too. The 2 subroutines below to use Sockets to connect to a child process' iostreams come from
282# http://www.perlmonks.org/?node_id=869942
283# http://www.perlmonks.org/?node_id=811650
284# It was suggested that IPC::Run will take care of all this or circumvent the need for all this,
285# but IPC::Run has limitations on Windows, see http://search.cpan.org/~toddr/IPC-Run-0.96/lib/IPC/Run.pm#Win32_LIMITATIONS
286
287# Create a unidirectional pipe to an iostream of a process that is actually a socket
288sub _pipe {
289    socketpair($_[0], $_[1], AF_UNIX, SOCK_STREAM, PF_UNSPEC)
290        or return undef;
291    shutdown($_[0], 1);  # No more writing for reader. See http://www.perlmonks.org/?node=108244
292    shutdown($_[1], 0);  # No more reading for writer
293    return 1;
294}
295
296sub _open3 {
297    local (*TO_CHLD_R,     *TO_CHLD_W);
298    local (*FR_CHLD_R,     *FR_CHLD_W);
299    #local (*FR_CHLD_ERR_R, *FR_CHLD_ERR_W);
300
301    if ($^O =~ /Win32/) {
302        _pipe(*TO_CHLD_R,     *TO_CHLD_W    ) or die $^E;
303        _pipe(*FR_CHLD_R,     *FR_CHLD_W    ) or die $^E;
304        #_pipe(*FR_CHLD_ERR_R, *FR_CHLD_ERR_W) or die $^E;
305    } else {
306        pipe(*TO_CHLD_R,     *TO_CHLD_W    ) or die $!;
307        pipe(*FR_CHLD_R,     *FR_CHLD_W    ) or die $!;
308        #pipe(*FR_CHLD_ERR_R, *FR_CHLD_ERR_W) or die $!;
309    }
310
311    #my $pid = open3('>&TO_CHLD_R', '<&FR_CHLD_W', '<&FR_CHLD_ERR_W', @_);
312    my $pid = open3('>&TO_CHLD_R', '<&FR_CHLD_W', '<&FR_CHLD_W', @_); # use one handle, chldout, for both stdout and stderr of child proc,
313                                                                      # see http://blog.0x1fff.com/2009/09/howto-execute-system-commands-in-perl.html
314   
315    #return ( $pid, *TO_CHLD_W, *FR_CHLD_R, *FR_CHLD_ERR_R );
316    return ( $pid, *TO_CHLD_W, *FR_CHLD_R);
317}
318
319# useWget and useWgetMonitored are very similar and, when updating, will probably need updating in tandem
320# useWget(Monitored) runs the wget command using open3 and then sits in a loop doing two things per iteration:
321# - processing a set buffer size of the wget (child) process' stdout/stderr streams, if anything has appeared there
322# - followed by checking the socket connection to Java GLI, to see if GLI is trying to cancel the wget process we're running.
323# Then the loop of these two things repeats.
324sub useWget
325{
326    #local $| = 1; # autoflush stdout buffer
327    #print STDOUT "*** Start of subroutine useWget in $0\n";
328
329    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
330
331    my ($strReadIn,$strLine,$command);
332    $strReadIn = "" unless defined $strReadIn;
333
334    my $current_dir = cwd();
335    my $changed_dir = 0;
336    if (defined $working_dir && -e $working_dir) {
337    chdir "$working_dir";
338    $changed_dir = 1;
339    }
340
341    # When we are running this script through GLI, the SIGTERM signal handler
342    # won't get called on Windows when wget is to be prematurely terminated.
343    # Instead, when wget has to be terminated in the middle of execution, GLI will
344    # connect to a serverSocket here to communicate when it's time to stop wget.
345    if($self->dealingWithSockets()) {
346
347    $port = <STDIN>; # gets a port on localhost that's not yet in use
348    chomp($port);
349   
350    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
351                           LocalPort => $port,
352                           Listen    => 1,
353                           Reuse     => 1);
354   
355    die "can't setup server" unless $serverSocket;
356    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
357
358    $read_set = new IO::Select();         # create handle set for reading
359    $read_set->add($serverSocket);        # add the main socket to the set
360    }
361
362    my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
363   
364    # Shouldn't use double quotes around wget path after all? See final comment at
365    # http://www.perlmonks.org/?node_id=394709
366    # http://coldattic.info/shvedsky/pro/blogs/a-foo-walks-into-a-bar/posts/63
367    # Therefore, compose the command as an array rather than as a string, to preserve spaces in the filepath
368    # because single/double quotes using open3 seem to launch a subshell, see also final comment at
369    # http://www.perlmonks.org/?node_id=394709 and that ends up causing problems in terminating wget, as 2 processes
370    # got launched then which don't have parent-child pid relationship (so that terminating one doesn't terminate the other).
371
372    # remove leading and trailing spaces, https://stackoverflow.com/questions/4597937/perl-function-to-trim-string-leading-and-trailing-whitespace
373    $cmdWget =~ s/^\s+//;
374    $cmdWget =~ s/\s+$//;
375
376    # replace backslashes with double backslashes, so that we preserve backslash after doing quotewords() step below with its necessary 2nd param keep=0
377    $cmdWget =~ s@\\@\\\\@g;
378   
379    # split on "words"
380    #my @commandargs = split(' ', $cmdWget);
381    # quotewords: to split on spaces except within quotes, then removes quotes and unescapes double backslash too
382      # https://stackoverflow.com/questions/19762412/regex-to-split-key-value-pairs-ignoring-space-in-double-quotes
383      # https://docstore.mik.ua/orelly/perl/perlnut/c08_389.htm
384    my @commandargs = quotewords('\s+', 0, $cmdWget);
385    unshift(@commandargs, $wget_file_path); # prepend the wget cmd
386    print STDOUT "Command is: ".join(",", @commandargs) . "\n"; # if STDERR, cmd will go into ServerInfoDialog
387   
388    # Wget's output needs to be monitored to find out when it has naturally terminated.
389    # Wget's output is sent to its STDERR so we can't use open2 without doing 2>&1.
390    # On linux, 2>&1 launches a subshell which then launches wget, meaning that killing
391    # the childpid does not kill wget on Linux but the subshell that launched it instead.
392    # Therefore, we use open3. Though the child process wget sends output only to its stdout [is this meant to be "stderr"?],
393    # using open3 says chld_err is undefined and the output of wget only comes in chld_out(!)
394    # However that may be, it works with open3. But to avoid the confusion of managing and
395    # closing an extra unused handle, a single handle is used instead for both the child's
396    # stderr and stdout.
397    # See http://blog.0x1fff.com/2009/09/howto-execute-system-commands-in-perl.html
398    # for why this is the right thing to do.
399
400    # Both open2 and open3 don't return on failure, but raise an exception. The handling
401    # of the exception is described on p.568 of the Perl Cookbook
402    eval {
403    #$childpid = open3($chld_in, $chld_out, $chld_out, $command); # There should be no double quotes in command, like around filepaths to wget, else need to use array version of command as below
404    #$childpid = open3($chld_in, $chld_out, $chld_out, @commandargs);
405   
406    # instead of calling open3 directly, call wrapper _open3() subroutine that will use sockets to
407    # connect to the child process' iostreams, because we can then use IO::Select's can_read() even on Windows
408    ($childpid, $chld_in, $chld_out) = _open3(@commandargs);
409    };
410    if ($@) {
411    if($@ =~ m/^open3/) {
412        die "open3 failed in $0: $!\n$@\n";     
413    }
414    die "Tried to launch open3 in $0, got unexpected exception: $@";
415    }
416
417    # Switching to use IO::Select, which allows timeouts, instead of doing the potentially blocking
418    #     if defined(my $strLine=<$chld_out>)
419    # Google: perl open3 read timeout
420    # Google: perl open3 select() example
421    # https://stackoverflow.com/questions/10029406/why-does-ipcopen3-get-deadlocked
422    # https://codereview.stackexchange.com/questions/84496/the-right-way-to-use-ipcopen3-in-perl
423    # https://gist.github.com/shalk/6988937
424    # https://stackoverflow.com/questions/18373500/how-to-check-if-command-executed-with-ipcopen3-is-hung
425    # http://perldoc.perl.org/IO/Select.html
426    # http://perldoc.perl.org/IPC/Open3.html - explains the need for select()/IO::Select with open3
427    # http://www.perlmonks.org/?node_id=951554
428    # http://search.cpan.org/~dmuey/IPC-Open3-Utils-0.91/lib/IPC/Open3/Utils.pm
429    # https://stackoverflow.com/questions/3000907/wget-not-behaving-via-ipcopen3-vs-bash?rq=1
430
431    # create the select object and add our streamhandle(s)
432    my $sel = new IO::Select;
433    $sel->add($chld_out);
434
435    my $num_consecutive_timedouts = 0;
436    my $error = 0;
437    my $loop = 1;
438   
439    while($loop)
440    {
441    # assume we're going to timeout trying to read from child process
442    $num_consecutive_timedouts++;
443
444   
445    # block until data is available on the registered filehandles or until the timeout specified   
446    if(my @readyhandles = $sel->can_read($TIMEOUT)) {
447
448        $num_consecutive_timedouts = 0; # re-zero, as we didn't timeout reading from child process after all
449        # since we're in this if statement
450       
451        # now there's a list of registered filehandles we can read from to loop through reading from.
452        # though we've registered only one, chld_out
453        foreach my $fh (@readyhandles) {
454        my $strLine;
455        #sleep 3;
456       
457        # read up to 4096 bytes from this filehandle fh.
458        # if there is less than 4096 bytes, we'll only get
459        # those available bytes and won't block.  If there
460        # is more than 4096 bytes, we'll only read 4096 and
461        # wait for the next iteration through the loop to
462        # read the rest.
463        my $len = sysread($fh, $strLine, 4096);
464       
465        if($len) { # read something
466            if($blnShow) {
467            print STDERR "$strLine\n";
468            }
469            $strReadIn .= $strLine;
470        }
471        else { # error or EOF: (!defined $len || $len == 0)         
472           
473            if(!defined $len) { # could be an error reading
474              # On Windows, the socket ends up forcibly closed on the "other" side. It's just the way it's implemented
475              # on Windows when using sockets to our child process' iostreams. So $len not being defined is not an error in that case. Refer to
476              # https://stackoverflow.com/questions/16675950/perl-select-returning-undef-on-sysread-when-using-windows-ipcopen3-and-ios/16676271
477              if(!$!{ECONNRESET}) { # anything other ECONNRESET error means it's a real case of undefined $len being an error
478                print STDERR "WgetDownload: Error reading from child stream: $!\n";
479                # SHOULD THIS 'die "errmsg";' instead? - no, sockets may need closing
480                $error = 1;
481              } else { # $! contains message "An existing connection was forcibly closed by remote host" where "remote" is a reference to the sockets to our wget child process,
482                    # NOT to the remote web server we're downloading from. In such a case, the error code is ECONNRESET, and it's not an error, despite $len being undefined.
483                #print STDERR "WgetDownload: wget finished\n";
484              }
485            }
486            elsif ($len == 0) { # EOF
487            # Finished reading from this filehandle $fh because we read 0 bytes.
488            # wget finished, terminate naturally
489            #print STDERR "WgetDownload: wget finished\n"; #print STDOUT "\nPerl: open3 command, input streams closed. Wget terminated naturally.\n";
490            }
491
492            $loop = 0; # error or EOF, either way will need to clean up and break out of outer loop
493           
494            # last; # if we have more than one filehandle registered with IO::Select
495           
496            $sel->remove($fh); # if more than one filehandle registered, we should unregister all of them here on error         
497           
498        } # end else error or EOF
499       
500        } # end foreach on readyhandles
501    } # end if on can_read
502   
503    if($num_consecutive_timedouts >= $NUM_TRIES) {
504        $error = 1;
505        $loop = 0;                          # to break out of outer while loop
506
507        $num_consecutive_timedouts = 0;
508
509        &gsprintf::gsprintf(STDERR, "{WgetDownload.wget_timed_out_warning}\n", $NUM_TRIES);
510    }
511
512    if($loop == 0) { # error or EOF, either way, clean up
513        if($error) {
514        $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
515       
516        if(kill(0, $childpid)) {
517            # If kill(0, $childpid) returns true, then the process is running
518            # and we need to kill it.
519            close($chld_in);
520            close($chld_out);
521            kill('TERM', $childpid); # kill the process group by prefixing - to signal
522
523            # https://coderwall.com/p/q-ovnw/killing-all-child-processes-in-a-shell-script
524            # https://stackoverflow.com/questions/392022/best-way-to-kill-all-child-processes
525            #print STDERR "SENT SIGTERM TO CHILD PID: $childpid\n";         
526            #print STDERR "Perl terminated wget after timing out repeatedly and is about to exit\n";
527        }
528        }
529        else { # wget finished (no errors), terminate naturally
530        #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
531        close($chld_in);
532        close($chld_out);
533        waitpid $childpid, 0;       
534        }
535
536        # error or not
537        $childpid = undef;
538        # Stop monitoring the read_handle and close the serverSocket
539        # (the Java end will close the client socket that Java opened)
540        if(defined $port) {
541        $read_set->remove($serverSocket);
542        close($serverSocket);
543        }
544    }
545
546    # If we've already terminated, either naturally or on error, we can get out of the while loop
547    next if($loop == 0);
548   
549    # Otherwise check for whether Java GLI has attempted to connect to this perl script via socket
550   
551    # if we run this script from the command-line (as opposed to from GLI),
552    # then we're not working with sockets and can therefore skip the next bits
553    next unless(defined $port);
554   
555    # http://www.perlfect.com/articles/select.shtml
556    # "multiplex between several filehandles within a single thread of control,
557    # thus creating the effect of parallelism in the handling of I/O."
558    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
559
560    # take all readable handles in turn
561    foreach my $rh (@rh_set) {
562        if($rh == $serverSocket) {
563        my $client = $rh->accept();
564        #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
565        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
566       
567        # Read from the client (getting rid of the trailing newline)
568        # Has the client sent the <<STOP>> signal?
569        my $signal = <$client>;
570        chomp($signal);
571        if($signal eq "<<STOP>>") {
572            print $client "Perl received STOP signal (on port $port): stopping wget\n";
573            $loop = 0;                          # out of outer while loop
574            $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
575           
576            # Sometimes the wget process takes some time to start up. If the STOP signal
577            # was sent, don't try to terminate the process until we know it is running.
578            # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
579            # for it to start up, checking for whether it is running in order to kill it.
580            for(my $seconds = 1; $seconds <= 5; $seconds++) {
581            if(kill(0, $childpid)) {
582                # If kill(0, $childpid) returns true, then the process is running
583                # and we need to kill it.
584                close($chld_in);
585                close($chld_out);
586                kill("TERM", $childpid); # prefix - to signal to kill process group
587               
588                $childpid = undef;
589               
590                # Stop monitoring the read_handle and close the serverSocket
591                # (the Java end will close the client socket that Java opened)
592                $read_set->remove($rh);     #$read_set->remove($serverSocket);
593                close($rh);             #close($serverSocket);
594                print $client "Perl terminated wget and is about to exit\n";
595                last;                           # out of inner for loop
596            }
597            else { # the process may just be starting up, wait
598                sleep(1);
599            }
600            }
601            last;                               # out of foreach loop
602        }
603        }
604    }
605    }
606
607    if ($changed_dir) {
608    chdir $current_dir;
609    }
610   
611    return $strReadIn;
612}
613
614
615sub useWgetMonitored
616{
617    #local $| = 1; # autoflush stdout buffer
618    #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
619
620    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
621
622
623    my $current_dir = cwd();
624    my $changed_dir = 0;
625    if (defined $working_dir && -e $working_dir) {
626    chdir "$working_dir";
627    $changed_dir = 1;
628    }
629
630    # When we are running this script through GLI, the SIGTERM signal handler
631    # won't get called on Windows when wget is to be prematurely terminated.
632    # Instead, when wget has to be terminated in the middle of execution, GLI will
633    # connect to a serverSocket here to communicate when it's time to stop wget.
634    if($self->dealingWithSockets()) {
635
636    $port = <STDIN>; # gets a port on localhost that's not yet in use
637    chomp($port);
638   
639    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
640                           LocalPort => $port,
641                           Listen    => 1,
642                           Reuse     => 1);
643   
644    die "can't setup server" unless $serverSocket;
645    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
646
647    $read_set = new IO::Select();         # create handle set for reading
648    $read_set->add($serverSocket);        # add the main socket to the set
649    }
650
651    my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
652    # compose the command as an array for open3, to preserve spaces in any filepath
653    # Do so by removing leading and trailing spaces, then splitting on "words" (preserving spaces in quoted words and removing quotes)
654    $cmdWget =~ s/^\s+//;
655    $cmdWget =~ s/\s+$//;
656   
657    # replace backslashes with double backslashes, so that we preserve backslash after doing quotewords() step below with its necessary 2nd param keep=0
658    $cmdWget =~ s@\\@\\\\@g;
659   
660    my @commandargs = quotewords('\s+', 0, $cmdWget);
661    unshift(@commandargs, $wget_file_path); # prepend wget cmd to the command array
662    #print STDOUT "Command is: ".join(",", @commandargs) . "\n";
663
664    eval {     # see p.568 of Perl Cookbook
665    #$childpid = open3($chld_in, $chld_out, $chld_out, @commandargs);
666    ($childpid, $chld_in, $chld_out) = _open3(@commandargs);
667    };
668    if ($@) {
669    if($@ =~ m/^open3/) {
670        die "open3 failed in $0: $!\n$@\n";     
671    }
672    die "Tried to launch open3 in $0, got unexpected exception: $@";
673    }
674
675    my $full_text = "";
676    my $error_text = "";
677    my @follow_list = ();
678    my $line;
679
680    # create the select object and add our streamhandle(s)
681    my $sel = new IO::Select;
682    $sel->add($chld_out);
683   
684    my $num_consecutive_timedouts = 0;
685    my $error = 0;
686    my $loop = 1;
687    while($loop)
688    {
689    # assume we're going to timeout trying to read from child process
690    $num_consecutive_timedouts++;
691
692    # block until data is available on the registered filehandles or until the timeout specified   
693    if(my @readyhandles = $sel->can_read($TIMEOUT)) {
694        $num_consecutive_timedouts = 0; # re-zero, as we didn't timeout reading from child process after all
695        # since we're in this if statement
696       
697        foreach my $fh (@readyhandles) {
698        my $len = sysread($fh, $line, 4096); # read up to 4k from current ready filehandle
699        if($len) { # read something
700       
701           
702            if((defined $blnShow) && $blnShow)
703            {
704            print STDERR "$line";
705            }
706           
707            if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
708            my $follow_url = $1;
709            push(@follow_list,$follow_url);
710            }
711           
712            if ($line =~ m/ERROR\s+\d+/) {
713            $error_text .= $line;
714            }
715           
716            $full_text .= $line;
717        } else { # error or EOF
718            if(!defined $len) { # error reading
719            #print STDERR "WgetDownload: Error reading from child stream: $!\n";
720            $error = 1;
721            }
722             if(!defined $len) {
723              if(!$!{ECONNRESET}) { # anything other ECONNRESET error means it's a real case of undefined $len being an error
724                #print STDERR "WgetDownload: Error reading from child stream: $!\n";               
725                $error = 1;
726              } else { # the error code is ECONNRESET, and it's not an error, despite $len being undefined.
727                       # Happens on Windows when using sockets to a child process' iostreams
728                #print STDERR "WgetDownload: wget finished\n";
729              }
730            }
731            elsif ($len == 0) { # EOF, finished with this filehandle because 0 bytes read
732            #print STDERR "WgetDownload: wget finished\n"; # wget terminated naturally
733            }
734
735            $loop = 0; # error or EOF, either way will need to clean up and break out of outer loop
736           
737            # last; # if we have more than one filehandle registered with IO::Select
738           
739            $sel->remove($fh); # if more than one filehandle registered, we should unregister all of them here on error         
740        } # end else error or EOF
741       
742        } # end foreach on readyhandles
743    }  # end if on can_read
744
745    if($num_consecutive_timedouts >= $NUM_TRIES) {
746        $error = 1;
747        $loop = 0;                          # to break out of outer while loop
748
749        $num_consecutive_timedouts = 0;
750
751        #&gsprintf::gsprintf(STDERR, "{WgetDownload.wget_timed_out_warning}\n", $NUM_TRIES);
752    }
753
754    if($loop == 0) { # error or EOF, either way, clean up
755       
756        if($error) {
757        $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
758       
759        if(kill(0, $childpid)) {
760            # If kill(0, $childpid) returns true, then the process is running
761            # and we need to kill it.
762            close($chld_in);
763            close($chld_out);
764            kill("TERM", $childpid); # prefix - to signal to kill process group
765           
766            #print STDERR "Perl terminated wget after timing out repeatedly and is about to exit\n";
767        }
768        }
769        else { # wget finished, terminate naturally
770        close($chld_in);
771        close($chld_out);
772        # Program terminates only when the following line is included
773        # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
774        # it prevents the child from turning into a "zombie process".
775        # While the wget process terminates without it, this perl script does not:
776        # the DOS prompt is not returned without it.
777        waitpid $childpid, 0;
778        }
779       
780        # error or not:
781        $childpid = undef;     
782        if(defined $port) {
783        $read_set->remove($serverSocket);
784        close($serverSocket);
785        }
786    }
787   
788    # If we've already terminated, either naturally or on error, we can get out of the while loop
789    next if($loop == 0);
790
791    # Otherwise check for whether Java GLI has attempted to connect to this perl script via socket
792   
793    # if we run this script from the command-line (as opposed to from GLI),
794    # then we're not working with sockets and can therefore skip the next bits
795    next unless(defined $port);
796
797    # http://www.perlfect.com/articles/select.shtml
798    # "multiplex between several filehandles within a single thread of control,
799    # thus creating the effect of parallelism in the handling of I/O."
800    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
801
802    # take all readable handles in turn
803    foreach my $rh (@rh_set) {
804        if($rh == $serverSocket) {
805        my $client = $rh->accept();
806        #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
807        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
808       
809        # Read from the client (getting rid of trailing newline)
810        # Has the client sent the <<STOP>> signal?
811        my $signal = <$client>;
812        chomp($signal);
813        if($signal eq "<<STOP>>") {
814            print $client "Perl received STOP signal (on port $port): stopping wget\n";
815            $loop = 0;                          # out of outer while loop
816            $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
817           
818            # Sometimes the wget process takes some time to start up. If the STOP signal
819            # was sent, don't try to terminate the process until we know it is running.
820            # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
821            # for it to start up, checking for whether it is running in order to kill it.
822            for(my $seconds = 1; $seconds <= 5; $seconds++) {
823            if(kill(0, $childpid)) {
824                # If kill(0, $childpid) returns true, then the process is running
825                # and we need to kill it.
826                close($chld_in);
827                close($chld_out);
828                kill("TERM", $childpid); # prefix - to signal to kill process group
829               
830                $childpid = undef;
831               
832                # Stop monitoring the read_handle and close the serverSocket
833                # (the Java end will close the client socket that Java opened)
834                $read_set->remove($rh);     #$read_set->remove($serverSocket);
835                close($rh);             #close($serverSocket);
836                print $client "Perl terminated wget and is about to exit\n";
837                last;                           # out of inner for loop
838            }
839            else { # the process may just be starting up, wait
840                sleep(1);
841            }
842            }
843            last;                               # out of foreach loop
844        }
845        }
846    }
847    }
848
849    my $command_status = $?;
850    if ($command_status != 0) {
851    $error_text .= "Exit error: $command_status";
852    }
853
854    if ($changed_dir) {
855    chdir $current_dir;
856    }
857   
858    my $final_follow = pop(@follow_list); # might be undefined, but that's OK
859   
860    return ($full_text,$error_text,$final_follow);
861}
862
863
864# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
865sub checkURL
866{
867    my ($self) = @_;
868    if ($self->{'url'} eq "")
869    {
870    &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
871    }
872}
873
874sub error
875{
876    my ($strFunctionName,$strError) = @_;
877    {
878    print "Error occoured in WgetDownload.pm\n".
879        "In Function:".$strFunctionName."\n".
880        "Error Message:".$strError."\n";
881    exit(-1);
882    }
883}
884
8851;
886
Note: See TracBrowser for help on using the browser.