root/main/trunk/greenstone2/perllib/downloaders/WgetDownload.pm @ 31920

Revision 31920, 29.1 KB (checked in by ak19, 2 years ago)

Untested on Windows as yet. 1. Major overhaul to WgetDownload?'s useWget() and useWgetMonitored() subroutines. Their use of open3 was wrong and would cause blocking if proxy set wrong or if https_proxy not set/set wrong and the url entered was http but resolves to https. The problem was more fundamental than the symptoms indicated the open3() calls were used wrong and resulted in blocking. The blocking could be indefinite. To generally avoid blocking, needed to use IO::select() to loop to check any child streams that are ready. To avoid possibly indefinite blocking, needed to use IO::select() with a timeout on the can_read() method. The need for all these and their use is indicated in the links added to the committed version of this module. 2. After the use of select() worked in principle, there was still the large problem that terminating unnaturally did not stop a second wget that had been launched. This unexpectedly had to do with doublequotes around wget's path that attempted to preserve any spaces in the path, but which behaved differently with open3(): any double quotes launched a subshell to run the command passed to open3(). And the wget cmd launched by the subshell cmd wasn't actually a child process, so it could not be terminated via the parentpid used as a processgrouppid when doing the kill TERM -processgroupid. The solution lay with the unexpected cause of the problem, which was the double quotes. Now the command passed to open3() is an array of parameters and no double quotes. The array is meant to preserve spaces in any filepaths. 3. Removed the 2 tries parameter passed to wget, since we now loop a certain number of times trying to read from the child process' streams each time this times out. If it times out n times, then we give up and assume that the URL could not be read.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use BaseDownload;
35use strict;
36use Cwd;
37use util;
38use IPC::Open3;
39use IO::Select;
40use IO::Socket;
41
42
43sub BEGIN {
44    @WgetDownload::ISA = ('BaseDownload');
45}
46
47my $arguments =
48     [ { 'name' => "proxy_on",
49    'desc' => "{WgetDownload.proxy_on}",
50    'type' => "flag",
51    'reqd' => "no",
52    'hiddengli' => "yes"},
53      { 'name' => "http_proxy_host",
54    'desc' => "{WgetDownload.http_proxy_host}",
55    'type' => "string",
56    'reqd' => "no",
57    'hiddengli' => "yes"},
58      { 'name' => "http_proxy_port",
59    'desc' => "{WgetDownload.http_proxy_port}",
60    'type' => "string",
61    'reqd' => "no",
62    'hiddengli' => "yes"},
63      { 'name' => "https_proxy_host", 
64    'desc' => "{WgetDownload.https_proxy_host}",         
65    'type' => "string",
66    'reqd' => "no",
67    'hiddengli' => "yes"},
68      { 'name' => "https_proxy_port",
69    'desc' => "{WgetDownload.https_proxy_port}",         
70    'type' => "string",
71    'reqd' => "no",
72    'hiddengli' => "yes"},
73      { 'name' => "ftp_proxy_host", 
74    'desc' => "{WgetDownload.ftp_proxy_host}",         
75    'type' => "string",
76    'reqd' => "no",
77    'hiddengli' => "yes"},
78      { 'name' => "ftp_proxy_port",
79    'desc' => "{WgetDownload.ftp_proxy_port}",         
80    'type' => "string",
81    'reqd' => "no",
82    'hiddengli' => "yes"},
83      { 'name' => "user_name", 
84    'desc' => "{WgetDownload.user_name}",
85    'type' => "string",
86    'reqd' => "no",
87    'hiddengli' => "yes"},
88      { 'name' => "user_password",
89    'desc' => "{WgetDownload.user_password}",
90    'type' => "string",
91    'reqd' => "no",
92    'hiddengli' => "yes"},
93      { 'name' => "no_check_certificate",
94    'desc' => "{WgetDownload.no_check_certificate}",
95    'type' => "flag",
96    'reqd' => "no",
97    'hiddengli' => "yes"}
98     ];
99
100my $options = { 'name'     => "WgetDownload",
101        'desc'     => "{WgetDownload.desc}",
102        'abstract' => "yes",
103        'inherits' => "yes",
104        'args'     => $arguments };
105
106
107# Declaring file global variables related to the wget child process so that
108# the termination signal handler for SIGTERM can close the streams and tidy
109# up before ending the child process.
110my $childpid;
111my ($chld_out, $chld_in);
112my ($serverSocket, $read_set);
113
114my $TIMEOUT = 1; # seconds
115my $NUM_TRIES = 10;
116
117# The port this script's server socket will be listening on, to handle
118# incoming signals from GLI to terminate wget. This is also file global,
119# since OAIDownload.pm will make several calls on wget using the same
120# instance of this script and we want to reuse whatever port GLI gave us.
121my $port;
122
123# When this script is called from the command line, this handler will be called
124# if this process is killed or abruptly ends due to receiving one of the
125# terminating signals that this handler is registered to deal with.
126sub abrupt_end_handler {
127    my $termination_signal = shift (@_);
128
129    if(defined $childpid) {
130    close($chld_out);
131    close($chld_in);
132   
133    print STDOUT "Received termination signal: $termination_signal\n";
134
135    # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
136    # See http://perldoc.perl.org/perlipc.html#Signals
137    # Warning on using kill at http://perldoc.perl.org/perlfork.html
138    kill("TERM", $childpid); # prefix - to signal to kill process group
139
140    # If the SIGTERM sent on Linux calls this handler, we want to make
141    # sure any socket connection is closed.
142    # Otherwise sockets are only used when this script is run from GLI
143    # in which case the handlers don't really get called.
144    if(defined $serverSocket) {
145        $read_set->remove($serverSocket) if defined $read_set;
146        close($serverSocket);
147    }
148    }
149
150    exit(0);
151}
152
153# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
154# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
155# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
156# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
157# http://affy.blogspot.com/p5be/ch13.htm
158# http://perldoc.perl.org/perlipc.html#Signals
159$SIG{'INT'} = \&abrupt_end_handler;
160$SIG{'TERM'} = \&abrupt_end_handler;
161
162sub new {
163    my ($class) = shift (@_);
164    my ($getlist,$inputargs,$hashArgOptLists) = @_;
165    push(@$getlist, $class);
166
167    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
168    push(@{$hashArgOptLists->{"OptList"}},$options);
169
170    my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
171
172    # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
173    &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set
174
175    return bless $self, $class;
176}
177
178sub checkWgetSetup
179{
180    my ($self,$blnGliCall) = @_;
181    #TODO: proxy detection??
182   
183    if((!$blnGliCall) && $self->{'proxy_on'})
184    {
185    &checkProxySetup($self);
186    }
187    &checkURL($self);
188}
189
190# Not using this. On Windows, we used to pass proxying settings as flags to wget. But, as that can be
191# seen with Task Manager, we now have the proxy settings set in the environment and are no longer passing it
192sub addProxySettingsAsWgetFlags
193{
194    my ($self) = @_;
195    my $strOptions = "";
196
197    if($self->{'http_proxy_host'} && $self->{'http_proxy_port'}) {
198    $strOptions .= " -e http_proxy=$self->{'http_proxy_host'}:$self->{'http_proxy_port'} ";
199    }
200    if($self->{'https_proxy_host'} && $self->{'https_proxy_port'}) {
201    $strOptions .= " -e https_proxy=$self->{'https_proxy_host'}:$self->{'https_proxy_port'} ";
202    }
203    if($self->{'ftp_proxy_host'} && $self->{'ftp_proxy_port'}) {
204    $strOptions .= " -e ftp_proxy=$self->{'ftp_proxy_host'}:$self->{'ftp_proxy_port'} ";
205    }
206   
207    # For wget, there is only one set pair of proxy-user and proxy-passwd, so wget seems to assume
208    # that all 3 proxy protocols (http|https|ftp) will use the same username and pwd combination?
209    # Note that this only matters when passing the proxying details as flags to wget, not when
210    # the proxies are setup as environment variables.
211    if ($self->{'user_name'} && $self->{'user_password'})
212    {
213    $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";     
214    # how is "--proxy-passwd" instead of "--proxy-password" even working????       
215    # see https://www.gnu.org/software/wget/manual/html_node/Proxies.html
216    # and https://www.gnu.org/software/wget/manual/wget.html
217    # Not touching this, in case the manual is simply wrong. Since our code works in
218    # practice (when we were still using wget proxy username/pwd flags for windows).
219    }   
220   
221    return $strOptions;
222}
223
224sub getWgetOptions
225{
226    my ($self) = @_;
227    my $strOptions = "";
228   
229    # If proxy settings are set up in the environment, wget is ready to use them. More secure.
230    # But if proxy settings are not set up in the environment, pass them as flags to wget
231    # This is less secure, as pwd etc visible in task manager, but it was the original way in
232    # which wget was run on windows.
233    # Truth in Perl: https://home.ubalt.edu/abento/452/perl/perltruth.html
234    # http://www.perlmonks.org/?node=what%20is%20true%20and%20false%20in%20Perl%3F
235   
236    if ($self->{'proxy_on'}) {
237    if(!$ENV{'http_proxy'} && !$ENV{'https_proxy'} && !$ENV{'ftp_proxy'}) {
238        $strOptions .= $self->addProxySettingsAsWgetFlags();
239    } # else wget will use proxy settings in environment, assume enough settings have been provided
240    # either way, we're using the proxy
241    $strOptions .= " --proxy ";
242    }
243   
244    if($self->{'no_check_certificate'}) { # URL may be http that gets redirected to https, so if no_check_certificate is on, turn it on even if URL is http
245   
246    $strOptions .= " --no-check-certificate ";
247    }
248   
249    return $strOptions;
250}
251
252# Checking for proxy setup: proxy server, proxy port, proxy username and password.
253sub checkProxySetup
254{
255    my ($self) = @_;
256    ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
257    # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
258    # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
259
260}
261
262# Returns true if the wget status needs to be monitored through sockets
263# (if a socket is used to communicate with the Java program on when to
264# terminate wget). True if we are running gli, or if the particular type
265# of WgetDownload is *not* OAIDownload (in that case, the original way of
266# terminating the perl script from Java would terminate wget as well).
267sub dealingWithSockets() {
268    my ($self) = @_;
269    return (defined $self->{'gli'} && $self->{'gli'} && !defined $port && ref($self) ne "OAIDownload");
270                       # use ref($self) to find the classname of an object
271}
272
273# Shouldn't use double quotes around wget path after all? See final comment at
274# http://www.perlmonks.org/?node_id=394709
275# http://coldattic.info/shvedsky/pro/blogs/a-foo-walks-into-a-bar/posts/63
276sub useWget
277{
278    #local $| = 1; # autoflush stdout buffer
279    #print STDOUT "*** Start of subroutine useWget in $0\n";
280
281    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
282
283    my ($strReadIn,$strLine,$command);
284    $strReadIn = "" unless defined $strReadIn;
285
286    my $current_dir = cwd();
287    my $changed_dir = 0;
288    if (defined $working_dir && -e $working_dir) {
289    chdir "$working_dir";
290    $changed_dir = 1;
291    }
292
293    # When we are running this script through GLI, the SIGTERM signal handler
294    # won't get called on Windows when wget is to be prematurely terminated.
295    # Instead, when wget has to be terminated in the middle of execution, GLI will
296    # connect to a serverSocket here to communicate when it's time to stop wget.
297    if($self->dealingWithSockets()) {
298
299    $port = <STDIN>; # gets a port on localhost that's not yet in use
300    chomp($port);
301   
302    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
303                           LocalPort => $port,
304                           Listen    => 1,
305                           Reuse     => 1);
306   
307    die "can't setup server" unless $serverSocket;
308    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
309
310    $read_set = new IO::Select();         # create handle set for reading
311    $read_set->add($serverSocket);        # add the main socket to the set
312    }
313
314    my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
315    # compose the command as an array rather than as a string, to preserve spaces in the filepath
316    # because single/double quotes using open3 seem to launch a shell, see final comment at
317    # http://www.perlmonks.org/?node_id=394709 and that ends up causing problems in terminating wget
318    # as 2 processes then got launched which don't have parent-child pid relationship.
319    my @commandargs = split(' ', $cmdWget);
320    unshift(@commandargs, $wget_file_path);
321    $command = "$wget_file_path $cmdWget";
322#    print STDOUT "Command is: $command\n"; # displayed in GLI output
323#    print STDERR "Command is: $command\n"; # goes into ServerInfoDialog
324   
325    # Wget's output needs to be monitored to find out when it has naturally terminated.
326    # Wget's output is sent to its STDERR so we can't use open2 without doing 2>&1.
327    # On linux, 2>&1 launches a subshell which then launches wget, meaning that killing
328    # the childpid does not kill wget on Linux but the subshell that launched it instead.
329    # Therefore, we use open3. Though the child process wget sends output only to its stdout [is this meant to be "stderr"?],
330    # using open3 says chld_err is undefined and the output of wget only comes in chld_out(!)
331    # However that may be, it works with open3. But to avoid the confusion of managing and
332    # closing an extra unused handle, a single handle is used instead for both the child's
333    # stderr and stdout.
334    # See http://blog.0x1fff.com/2009/09/howto-execute-system-commands-in-perl.html
335    # for why this is the right thing to do.
336
337    # Both open2 and open3 don't return on failure, but raise an exception. The handling
338    # of the exception is described on p.568 of the Perl Cookbook
339    eval {
340    #$childpid = open3($chld_in, $chld_out, $chld_out, $command);
341    $childpid = open3($chld_in, $chld_out, $chld_out, @commandargs);   
342    };
343    if ($@) {
344    if($@ =~ m/^open3/) {
345        die "open3 failed in $0: $!\n$@\n";     
346    }
347    die "Tried to launch open3 in $0, got unexpected exception: $@";
348    }
349
350    # Switching to use IO::Select, which allows timeouts, instead of doing the potentially blocking
351    #     if defined(my $strLine=<$chld_out>)
352    # Google: perl open3 read timeout
353    # Google: perl open3 select() example
354    # https://stackoverflow.com/questions/10029406/why-does-ipcopen3-get-deadlocked
355    # https://codereview.stackexchange.com/questions/84496/the-right-way-to-use-ipcopen3-in-perl
356    # https://gist.github.com/shalk/6988937
357    # https://stackoverflow.com/questions/18373500/how-to-check-if-command-executed-with-ipcopen3-is-hung
358    # http://perldoc.perl.org/IO/Select.html
359    # http://perldoc.perl.org/IPC/Open3.html - explains the need for select()/IO::Select with open3
360    # http://www.perlmonks.org/?node_id=951554
361    # http://search.cpan.org/~dmuey/IPC-Open3-Utils-0.91/lib/IPC/Open3/Utils.pm
362    # https://stackoverflow.com/questions/3000907/wget-not-behaving-via-ipcopen3-vs-bash?rq=1
363
364    # create the select object and add our streamhandle(s)
365    my $sel = new IO::Select;
366    $sel->add($chld_out);
367
368    my $num_consecutive_timedouts = 0;
369    my $error = 0;
370    my $loop = 1;
371   
372    while($loop)
373    {
374    # assume we're going to timeout trying to read from child process
375    $num_consecutive_timedouts++;
376
377   
378    # block until data is available on the registered filehandles or until the timeout specified   
379    if(my @readyhandles = $sel->can_read($TIMEOUT)) {
380
381        $num_consecutive_timedouts = 0; # re-zero, as we didn't timeout reading from child process after all
382        # since we're in this if statement
383       
384        # now there's a list of registered filehandles we can read from to loop through reading from.
385        # though we've registered only one, chld_out
386        foreach my $fh (@readyhandles) {
387        my $strLine;
388        #sleep 3;
389       
390        # read up to 4096 bytes from this filehandle fh.
391        # if there is less than 4096 bytes, we'll only get
392        # those available bytes and won't block.  If there
393        # is more than 4096 bytes, we'll only read 4096 and
394        # wait for the next iteration through the loop to
395        # read the rest.
396        my $len = sysread($fh, $strLine, 4096);
397       
398        if($len) { # read something
399            if($blnShow) {
400            print STDERR "$strLine\n";
401            }
402            $strReadIn .= $strLine;
403        }
404        else { # error or EOF: (!defined $len || $len == 0)         
405           
406            if(!defined $len) { # error reading         
407            print STDERR "WgetDownload: Error reading from child stream: $!\n";
408            # SHOULD THIS 'die "errmsg";' instead? - no, sockets may need closing
409            $error = 1;
410            }
411            elsif ($len == 0) { # EOF           
412            # Finished reading from this filehand $fh because we read 0 bytes.
413            # wget finished, terminate naturally
414            print STDERR "WgetDownload: wget finished\n";
415            #print STDOUT "\nPerl: open3 command, input streams closed. Wget terminated naturally.\n";
416            }
417
418            $loop = 0; # error or EOF, either way will need to clean up and break out of outer loop
419           
420            # last; # if we have more than one filehandle registered with IO::Select
421           
422            $sel->remove($fh); # if more than one filehandle registered, we should unregister all of them here on error         
423           
424        } # end else error or EOF
425       
426        } # end foreach on readyhandles
427    } # end if on can_read
428   
429    if($num_consecutive_timedouts >= $NUM_TRIES) {
430        $error = 1;
431        $loop = 0;                          # to break out of outer while loop
432
433        $num_consecutive_timedouts = 0;
434
435        print STDERR "WARNING from WgetDownload: wget timed out $NUM_TRIES times waiting for a response\n";
436        print STDERR "\tThe URL may be inaccessible or the proxy configuration is wrong or incomplete.\n";
437        #print STDERR "\tConsider cancelling this download?\n";
438    }
439
440    if($loop == 0) { # error or EOF, either way, clean up
441        if($error) {
442        $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
443       
444        if(kill(0, $childpid)) {
445            # If kill(0, $childpid) returns true, then the process is running
446            # and we need to kill it.
447            close($chld_in);
448            close($chld_out);
449            kill('TERM', $childpid); # kill the process group by prefixing - to signal
450
451            # https://coderwall.com/p/q-ovnw/killing-all-child-processes-in-a-shell-script
452            # https://stackoverflow.com/questions/392022/best-way-to-kill-all-child-processes
453            print STDERR "SENT SIGTERM TO CHILD PID: $childpid\n";
454           
455            #die "Perl terminated wget after timing out repeatedly and is about to exit\n";
456        }
457        }
458        else { # wget finished (no errors), terminate naturally
459        #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
460        close($chld_in);
461        close($chld_out);
462        waitpid $childpid, 0;       
463        }
464
465        # error or not
466        $childpid = undef;
467        # Stop monitoring the read_handle and close the serverSocket
468        # (the Java end will close the client socket that Java opened)
469        if(defined $port) {
470        $read_set->remove($serverSocket);
471        close($serverSocket);
472        }
473    }
474
475    # If we've already terminated, we can get out of the loop
476    #next if($loop == 0);
477   
478    # if we run this script from the command-line (as opposed to from GLI),
479    # then we're not working with sockets and can therefore skip the next bits
480    next unless(defined $port);
481   
482    # http://www.perlfect.com/articles/select.shtml
483    # "multiplex between several filehandles within a single thread of control,
484    # thus creating the effect of parallelism in the handling of I/O."
485    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
486
487    # take all readable handles in turn
488    foreach my $rh (@rh_set) {
489        if($rh == $serverSocket) {
490        my $client = $rh->accept();
491        #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
492        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
493       
494        # Read from the client (getting rid of the trailing newline)
495        # Has the client sent the <<STOP>> signal?
496        my $signal = <$client>;
497        chomp($signal);
498        if($signal eq "<<STOP>>") {
499            print $client "Perl received STOP signal (on port $port): stopping wget\n";
500            $loop = 0;                          # out of outer while loop
501            $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
502           
503            # Sometimes the wget process takes some time to start up. If the STOP signal
504            # was sent, don't try to terminate the process until we know it is running.
505            # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
506            # for it to start up, checking for whether it is running in order to kill it.
507            for(my $seconds = 1; $seconds <= 5; $seconds++) {
508            if(kill(0, $childpid)) {
509                # If kill(0, $childpid) returns true, then the process is running
510                # and we need to kill it.
511                close($chld_in);
512                close($chld_out);
513                kill("TERM", $childpid); # prefix - to signal to kill process group
514               
515                $childpid = undef;
516               
517                # Stop monitoring the read_handle and close the serverSocket
518                # (the Java end will close the client socket that Java opened)
519                $read_set->remove($rh);     #$read_set->remove($serverSocket);
520                close($rh);             #close($serverSocket);
521                print $client "Perl terminated wget and is about to exit\n";
522                last;                           # out of inner for loop
523            }
524            else { # the process may just be starting up, wait
525                sleep(1);
526            }
527            }
528            last;                               # out of foreach loop
529        }
530        }
531    }
532    }
533
534    if ($changed_dir) {
535    chdir $current_dir;
536    }
537   
538    return $strReadIn;
539}
540
541
542sub useWgetMonitored
543{
544    #local $| = 1; # autoflush stdout buffer
545    #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
546
547    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
548
549
550    my $current_dir = cwd();
551    my $changed_dir = 0;
552    if (defined $working_dir && -e $working_dir) {
553    chdir "$working_dir";
554    $changed_dir = 1;
555    }
556
557    # When we are running this script through GLI, the SIGTERM signal handler
558    # won't get called on Windows when wget is to be prematurely terminated.
559    # Instead, when wget has to be terminated in the middle of execution, GLI will
560    # connect to a serverSocket here to communicate when it's time to stop wget.
561    if($self->dealingWithSockets()) {
562
563    $port = <STDIN>; # gets a port on localhost that's not yet in use
564    chomp($port);
565   
566    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
567                           LocalPort => $port,
568                           Listen    => 1,
569                           Reuse     => 1);
570   
571    die "can't setup server" unless $serverSocket;
572    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
573
574    $read_set = new IO::Select();         # create handle set for reading
575    $read_set->add($serverSocket);        # add the main socket to the set
576    }
577
578    my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
579    # compose the command as an array for open3, to preserve spaces in any filepath
580    my @commandargs = split(' ', $cmdWget);
581    unshift(@commandargs, $wget_file_path);
582    my $command = "$wget_file_path $cmdWget";
583    #print STDOUT "Command is: $command\n";
584
585    eval {     # see p.568 of Perl Cookbook
586    $childpid = open3($chld_in, $chld_out, $chld_out, @commandargs);
587    };
588    if ($@) {
589    if($@ =~ m/^open3/) {
590        die "open3 failed in $0: $!\n$@\n";     
591    }
592    die "Tried to launch open3 in $0, got unexpected exception: $@";
593    }
594
595    my $full_text = "";
596    my $error_text = "";
597    my @follow_list = ();
598    my $line;
599
600    # create the select object and add our streamhandle(s)
601    my $sel = new IO::Select;
602    $sel->add($chld_out);
603   
604    my $num_consecutive_timedouts = 0;
605    my $error = 0;
606    my $loop = 1;
607    while($loop)
608    {
609    # assume we're going to timeout trying to read from child process
610    $num_consecutive_timedouts++;
611
612    # block until data is available on the registered filehandles or until the timeout specified   
613    if(my @readyhandles = $sel->can_read($TIMEOUT)) {
614        $num_consecutive_timedouts = 0; # re-zero, as we didn't timeout reading from child process after all
615        # since we're in this if statement
616       
617        foreach my $fh (@readyhandles) {
618        my $len = sysread($fh, $line, 4096); # read up to 4k from current ready filehandle
619        if($len) { # read something
620       
621           
622            if((defined $blnShow) && $blnShow)
623            {
624            print STDERR "$line";
625            }
626           
627            if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
628            my $follow_url = $1;
629            push(@follow_list,$follow_url);
630            }
631           
632            if ($line =~ m/ERROR\s+\d+/) {
633            $error_text .= $line;
634            }
635           
636            $full_text .= $line;
637        } else { # error or EOF
638            if(!defined $len) { # error reading
639            #print STDERR "WgetDownload: Error reading from child stream: $!\n";
640            $error = 1;
641            }
642            elsif ($len == 0) { # EOF, finished with this filehandle because 0 bytes read
643            #print STDERR "WgetDownload: wget finished\n"; # wget terminated naturally
644            }
645
646            $loop = 0; # error or EOF, either way will need to clean up and break out of outer loop
647           
648            # last; # if we have more than one filehandle registered with IO::Select
649           
650            $sel->remove($fh); # if more than one filehandle registered, we should unregister all of them here on error         
651        } # end else error or EOF
652       
653        } # end foreach on readyhandles
654    }  # end if on can_read
655
656    if($num_consecutive_timedouts >= $NUM_TRIES) {
657        $error = 1;
658        $loop = 0;                          # to break out of outer while loop
659
660        $num_consecutive_timedouts = 0;
661
662        #print STDERR "WARNING from WgetDownload: wget timed out $NUM_TRIES times waiting for a response\n";
663        #print STDERR "\tThe URL may be inaccessible or the proxy configuration is wrong or incomplete.\n";
664    }
665
666    if($loop == 0) { # error or EOF, either way, clean up
667       
668        if($error) {
669        $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
670       
671        if(kill(0, $childpid)) {
672            # If kill(0, $childpid) returns true, then the process is running
673            # and we need to kill it.
674            close($chld_in);
675            close($chld_out);
676            kill("TERM", $childpid); # prefix - to signal to kill process group
677           
678            #die "Perl terminated wget after timing out repeatedly and is about to exit\n";
679        }
680        }
681        else { # wget finished, terminate naturally
682        close($chld_in);
683        close($chld_out);
684        # Program terminates only when the following line is included
685        # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
686        # it prevents the child from turning into a "zombie process".
687        # While the wget process terminates without it, this perl script does not:
688        # the DOS prompt is not returned without it.
689        waitpid $childpid, 0;
690        }
691       
692        # error or not:
693        $childpid = undef;     
694        if(defined $port) {
695        $read_set->remove($serverSocket);
696        close($serverSocket);
697        }
698    }
699
700    # if we run this script from the command-line (as opposed to from GLI),
701    # then we're not working with sockets and can therefore skip the next bits
702    next unless(defined $port);
703
704    # http://www.perlfect.com/articles/select.shtml
705    # "multiplex between several filehandles within a single thread of control,
706    # thus creating the effect of parallelism in the handling of I/O."
707    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
708
709    # take all readable handles in turn
710    foreach my $rh (@rh_set) {
711        if($rh == $serverSocket) {
712        my $client = $rh->accept();
713        #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
714        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
715       
716        # Read from the client (getting rid of trailing newline)
717        # Has the client sent the <<STOP>> signal?
718        my $signal = <$client>;
719        chomp($signal);
720        if($signal eq "<<STOP>>") {
721            print $client "Perl received STOP signal (on port $port): stopping wget\n";
722            $loop = 0;                          # out of outer while loop
723            $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
724           
725            # Sometimes the wget process takes some time to start up. If the STOP signal
726            # was sent, don't try to terminate the process until we know it is running.
727            # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
728            # for it to start up, checking for whether it is running in order to kill it.
729            for(my $seconds = 1; $seconds <= 5; $seconds++) {
730            if(kill(0, $childpid)) {
731                # If kill(0, $childpid) returns true, then the process is running
732                # and we need to kill it.
733                close($chld_in);
734                close($chld_out);
735                kill("TERM", $childpid); # prefix - to signal to kill process group
736               
737                $childpid = undef;
738               
739                # Stop monitoring the read_handle and close the serverSocket
740                # (the Java end will close the client socket that Java opened)
741                $read_set->remove($rh);     #$read_set->remove($serverSocket);
742                close($rh);             #close($serverSocket);
743                print $client "Perl terminated wget and is about to exit\n";
744                last;                           # out of inner for loop
745            }
746            else { # the process may just be starting up, wait
747                sleep(1);
748            }
749            }
750            last;                               # out of foreach loop
751        }
752        }
753    }
754    }
755
756    my $command_status = $?;
757    if ($command_status != 0) {
758    $error_text .= "Exit error: $command_status";
759    }
760
761    if ($changed_dir) {
762    chdir $current_dir;
763    }
764   
765    my $final_follow = pop(@follow_list); # might be undefined, but that's OK
766   
767    return ($full_text,$error_text,$final_follow);
768}
769
770
771# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
772sub checkURL
773{
774    my ($self) = @_;
775    if ($self->{'url'} eq "")
776    {
777    &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
778    }
779}
780
781sub error
782{
783    my ($strFunctionName,$strError) = @_;
784    {
785    print "Error occoured in WgetDownload.pm\n".
786        "In Function:".$strFunctionName."\n".
787        "Error Message:".$strError."\n";
788    exit(-1);
789    }
790}
791
7921;
793
Note: See TracBrowser for help on using the browser.