source: main/trunk/greenstone2/perllib/downloaders/WgetDownload.pm@ 31975

Last change on this file since 31975 was 31975, checked in by ak19, 3 years ago

Another bugfix to downloading. Downloading over OAI wasn't working. Dr Bainbridge discovered that this was because OAIDownload.pm was still doublequoting filepaths and URLs too, whereas open3() launching the wget cmd can't handle quotes in its arguments. WgetDownload used split to convert the cmd string into a cmd array. A clean solution was not passing WgetDownload::useWget() methods an array of cmd parameters (too involved and error prone to change all the calling code constructing the parameter cmd string), but to use the quotewords() method in place of split. This will preserve spaces in double quoted params in the cmd string, while splitting on spaces outside quoted strings. Then it also removes double quotes (and unescapes double backslashes). Tested on Mac: both OAI and Web downloading now work.

  • Property svn:keywords set to Author Date Id Revision
File size: 34.5 KB
Line 
1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use BaseDownload;
35use strict;
36no strict 'subs'; # make an exception so we can use variables as filehandles to pass STDERR/STDOUT to functions, needed for gsprintf()
37use Cwd;
38use util;
39use IPC::Open3;
40use IO::Select;
41use IO::Socket;
42use Text::ParseWords; # part of Core modules. Needed to use quotewords() subroutine
43
44#use IO::Select qw( );
45#use IPC::Open3 qw( open3 );
46use Socket qw( AF_UNIX SOCK_STREAM PF_UNSPEC ); # http://perlmeme.org/howtos/perlfunc/qw_function.html
47
48
49sub BEGIN {
50 @WgetDownload::ISA = ('BaseDownload');
51}
52
53my $arguments =
54 [ { 'name' => "proxy_on",
55 'desc' => "{WgetDownload.proxy_on}",
56 'type' => "flag",
57 'reqd' => "no",
58 'hiddengli' => "yes"},
59 { 'name' => "http_proxy_host",
60 'desc' => "{WgetDownload.http_proxy_host}",
61 'type' => "string",
62 'reqd' => "no",
63 'hiddengli' => "yes"},
64 { 'name' => "http_proxy_port",
65 'desc' => "{WgetDownload.http_proxy_port}",
66 'type' => "string",
67 'reqd' => "no",
68 'hiddengli' => "yes"},
69 { 'name' => "https_proxy_host",
70 'desc' => "{WgetDownload.https_proxy_host}",
71 'type' => "string",
72 'reqd' => "no",
73 'hiddengli' => "yes"},
74 { 'name' => "https_proxy_port",
75 'desc' => "{WgetDownload.https_proxy_port}",
76 'type' => "string",
77 'reqd' => "no",
78 'hiddengli' => "yes"},
79 { 'name' => "ftp_proxy_host",
80 'desc' => "{WgetDownload.ftp_proxy_host}",
81 'type' => "string",
82 'reqd' => "no",
83 'hiddengli' => "yes"},
84 { 'name' => "ftp_proxy_port",
85 'desc' => "{WgetDownload.ftp_proxy_port}",
86 'type' => "string",
87 'reqd' => "no",
88 'hiddengli' => "yes"},
89 { 'name' => "user_name",
90 'desc' => "{WgetDownload.user_name}",
91 'type' => "string",
92 'reqd' => "no",
93 'hiddengli' => "yes"},
94 { 'name' => "user_password",
95 'desc' => "{WgetDownload.user_password}",
96 'type' => "string",
97 'reqd' => "no",
98 'hiddengli' => "yes"},
99 { 'name' => "no_check_certificate",
100 'desc' => "{WgetDownload.no_check_certificate}",
101 'type' => "flag",
102 'reqd' => "no",
103 'hiddengli' => "yes"}
104 ];
105
106my $options = { 'name' => "WgetDownload",
107 'desc' => "{WgetDownload.desc}",
108 'abstract' => "yes",
109 'inherits' => "yes",
110 'args' => $arguments };
111
112
113# Declaring file global variables related to the wget child process so that
114# the termination signal handler for SIGTERM can close the streams and tidy
115# up before ending the child process.
116my $childpid;
117my ($chld_out, $chld_in);
118my ($serverSocket, $read_set);
119
120my $TIMEOUT = 1; # seconds
121my $NUM_TRIES = 10;
122
123# The port this script's server socket will be listening on, to handle
124# incoming signals from GLI to terminate wget. This is also file global,
125# since OAIDownload.pm will make several calls on wget using the same
126# instance of this script and we want to reuse whatever port GLI gave us.
127my $port;
128
129# When this script is called from the command line, this handler will be called
130# if this process is killed or abruptly ends due to receiving one of the
131# terminating signals that this handler is registered to deal with.
132sub abrupt_end_handler {
133 my $termination_signal = shift (@_);
134
135 if(defined $childpid) {
136 close($chld_out);
137 close($chld_in);
138
139 print STDOUT "Received termination signal: $termination_signal\n";
140
141 # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
142 # See http://perldoc.perl.org/perlipc.html#Signals
143 # Warning on using kill at http://perldoc.perl.org/perlfork.html
144 kill("TERM", $childpid); # prefix - to signal to kill process group
145
146 # If the SIGTERM sent on Linux calls this handler, we want to make
147 # sure any socket connection is closed.
148 # Otherwise sockets are only used when this script is run from GLI
149 # in which case the handlers don't really get called.
150 if(defined $serverSocket) {
151 $read_set->remove($serverSocket) if defined $read_set;
152 close($serverSocket);
153 }
154 }
155
156 exit(0);
157}
158
159# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
160# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
161# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
162# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
163# http://affy.blogspot.com/p5be/ch13.htm
164# http://perldoc.perl.org/perlipc.html#Signals
165$SIG{'INT'} = \&abrupt_end_handler;
166$SIG{'TERM'} = \&abrupt_end_handler;
167
168sub new {
169 my ($class) = shift (@_);
170 my ($getlist,$inputargs,$hashArgOptLists) = @_;
171 push(@$getlist, $class);
172
173 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
174 push(@{$hashArgOptLists->{"OptList"}},$options);
175
176 my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
177
178 # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
179 &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set
180
181 return bless $self, $class;
182}
183
184sub checkWgetSetup
185{
186 my ($self,$blnGliCall) = @_;
187 #TODO: proxy detection??
188
189 if((!$blnGliCall) && $self->{'proxy_on'})
190 {
191 &checkProxySetup($self);
192 }
193 &checkURL($self);
194}
195
196# Not using this. On Windows, we used to pass proxying settings as flags to wget. But, as that can be
197# seen with Task Manager, we now have the proxy settings set in the environment and are no longer passing it
198sub addProxySettingsAsWgetFlags
199{
200 my ($self) = @_;
201 my $strOptions = "";
202
203 if($self->{'http_proxy_host'} && $self->{'http_proxy_port'}) {
204 $strOptions .= " -e http_proxy=$self->{'http_proxy_host'}:$self->{'http_proxy_port'} ";
205 }
206 if($self->{'https_proxy_host'} && $self->{'https_proxy_port'}) {
207 $strOptions .= " -e https_proxy=$self->{'https_proxy_host'}:$self->{'https_proxy_port'} ";
208 }
209 if($self->{'ftp_proxy_host'} && $self->{'ftp_proxy_port'}) {
210 $strOptions .= " -e ftp_proxy=$self->{'ftp_proxy_host'}:$self->{'ftp_proxy_port'} ";
211 }
212
213 # For wget, there is only one set pair of proxy-user and proxy-passwd, so wget seems to assume
214 # that all 3 proxy protocols (http|https|ftp) will use the same username and pwd combination?
215 # Note that this only matters when passing the proxying details as flags to wget, not when
216 # the proxies are setup as environment variables.
217 if ($self->{'user_name'} && $self->{'user_password'})
218 {
219 $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";
220 # how is "--proxy-passwd" instead of "--proxy-password" even working????
221 # see https://www.gnu.org/software/wget/manual/html_node/Proxies.html
222 # and https://www.gnu.org/software/wget/manual/wget.html
223 # Not touching this, in case the manual is simply wrong. Since our code works in
224 # practice (when we were still using wget proxy username/pwd flags for windows).
225 }
226
227 return $strOptions;
228}
229
230sub getWgetOptions
231{
232 my ($self) = @_;
233 my $strOptions = "";
234
235 # If proxy settings are set up in the environment, wget is ready to use them. More secure.
236 # But if proxy settings are not set up in the environment, pass them as flags to wget
237 # This is less secure, as pwd etc visible in task manager, but it was the original way in
238 # which wget was run on windows.
239 # Truth in Perl: https://home.ubalt.edu/abento/452/perl/perltruth.html
240 # http://www.perlmonks.org/?node=what%20is%20true%20and%20false%20in%20Perl%3F
241
242 if ($self->{'proxy_on'}) {
243 if(!$ENV{'http_proxy'} && !$ENV{'https_proxy'} && !$ENV{'ftp_proxy'}) {
244 $strOptions .= $self->addProxySettingsAsWgetFlags();
245 } # else wget will use proxy settings in environment, assume enough settings have been provided
246 # either way, we're using the proxy
247 $strOptions .= " --proxy ";
248 }
249
250 if($self->{'no_check_certificate'}) { # URL may be http that gets redirected to https, so if no_check_certificate is on, turn it on even if URL is http
251
252 $strOptions .= " --no-check-certificate ";
253 }
254
255 return $strOptions;
256}
257
258# Checking for proxy setup: proxy server, proxy port, proxy username and password.
259sub checkProxySetup
260{
261 my ($self) = @_;
262 ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
263 # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
264 # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
265
266}
267
268# Returns true if the wget status needs to be monitored through sockets
269# (if a socket is used to communicate with the Java program on when to
270# terminate wget). True if we are running gli, or if the particular type
271# of WgetDownload is *not* OAIDownload (in that case, the original way of
272# terminating the perl script from Java would terminate wget as well).
273sub dealingWithSockets() {
274 my ($self) = @_;
275 return (defined $self->{'gli'} && $self->{'gli'} && !defined $port && ref($self) ne "OAIDownload");
276 # use ref($self) to find the classname of an object
277}
278
279# On Windows, we can only use IO::Select's can_read() with Sockets, not with the usual handles to a child process' iostreams
280# However, we can use Sockets as the handles to connect to a child process' streams, which then allows us to use can_read()
281# not just on Unix but Windows too. The 2 subroutines below to use Sockets to connect to a child process' iostreams come from
282# http://www.perlmonks.org/?node_id=869942
283# http://www.perlmonks.org/?node_id=811650
284# It was suggested that IPC::Run will take care of all this or circumvent the need for all this,
285# but IPC::Run has limitations on Windows, see http://search.cpan.org/~toddr/IPC-Run-0.96/lib/IPC/Run.pm#Win32_LIMITATIONS
286
287# Create a unidirectional pipe to an iostream of a process that is actually a socket
288sub _pipe {
289 socketpair($_[0], $_[1], AF_UNIX, SOCK_STREAM, PF_UNSPEC)
290 or return undef;
291 shutdown($_[0], 1); # No more writing for reader. See http://www.perlmonks.org/?node=108244
292 shutdown($_[1], 0); # No more reading for writer
293 return 1;
294}
295
296sub _open3 {
297 local (*TO_CHLD_R, *TO_CHLD_W);
298 local (*FR_CHLD_R, *FR_CHLD_W);
299 #local (*FR_CHLD_ERR_R, *FR_CHLD_ERR_W);
300
301 if ($^O =~ /Win32/) {
302 _pipe(*TO_CHLD_R, *TO_CHLD_W ) or die $^E;
303 _pipe(*FR_CHLD_R, *FR_CHLD_W ) or die $^E;
304 #_pipe(*FR_CHLD_ERR_R, *FR_CHLD_ERR_W) or die $^E;
305 } else {
306 pipe(*TO_CHLD_R, *TO_CHLD_W ) or die $!;
307 pipe(*FR_CHLD_R, *FR_CHLD_W ) or die $!;
308 #pipe(*FR_CHLD_ERR_R, *FR_CHLD_ERR_W) or die $!;
309 }
310
311 #my $pid = open3('>&TO_CHLD_R', '<&FR_CHLD_W', '<&FR_CHLD_ERR_W', @_);
312 my $pid = open3('>&TO_CHLD_R', '<&FR_CHLD_W', '<&FR_CHLD_W', @_); # use one handle, chldout, for both stdout and stderr of child proc,
313 # see http://blog.0x1fff.com/2009/09/howto-execute-system-commands-in-perl.html
314
315 #return ( $pid, *TO_CHLD_W, *FR_CHLD_R, *FR_CHLD_ERR_R );
316 return ( $pid, *TO_CHLD_W, *FR_CHLD_R);
317}
318
319# useWget and useWgetMonitored are very similar and, when updating, will probably need updating in tandem
320# useWget(Monitored) runs the wget command using open3 and then sits in a loop doing two things per iteration:
321# - processing a set buffer size of the wget (child) process' stdout/stderr streams, if anything has appeared there
322# - followed by checking the socket connection to Java GLI, to see if GLI is trying to cancel the wget process we're running.
323# Then the loop of these two things repeats.
324sub useWget
325{
326 #local $| = 1; # autoflush stdout buffer
327 #print STDOUT "*** Start of subroutine useWget in $0\n";
328
329 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
330
331 my ($strReadIn,$strLine,$command);
332 $strReadIn = "" unless defined $strReadIn;
333
334 my $current_dir = cwd();
335 my $changed_dir = 0;
336 if (defined $working_dir && -e $working_dir) {
337 chdir "$working_dir";
338 $changed_dir = 1;
339 }
340
341 # When we are running this script through GLI, the SIGTERM signal handler
342 # won't get called on Windows when wget is to be prematurely terminated.
343 # Instead, when wget has to be terminated in the middle of execution, GLI will
344 # connect to a serverSocket here to communicate when it's time to stop wget.
345 if($self->dealingWithSockets()) {
346
347 $port = <STDIN>; # gets a port on localhost that's not yet in use
348 chomp($port);
349
350 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
351 LocalPort => $port,
352 Listen => 1,
353 Reuse => 1);
354
355 die "can't setup server" unless $serverSocket;
356 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
357
358 $read_set = new IO::Select(); # create handle set for reading
359 $read_set->add($serverSocket); # add the main socket to the set
360 }
361
362 my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
363
364 # Shouldn't use double quotes around wget path after all? See final comment at
365 # http://www.perlmonks.org/?node_id=394709
366 # http://coldattic.info/shvedsky/pro/blogs/a-foo-walks-into-a-bar/posts/63
367 # Therefore, compose the command as an array rather than as a string, to preserve spaces in the filepath
368 # because single/double quotes using open3 seem to launch a subshell, see also final comment at
369 # http://www.perlmonks.org/?node_id=394709 and that ends up causing problems in terminating wget, as 2 processes
370 # got launched then which don't have parent-child pid relationship (so that terminating one doesn't terminate the other).
371
372 # remove leading and trailing spaces, https://stackoverflow.com/questions/4597937/perl-function-to-trim-string-leading-and-trailing-whitespace
373 $cmdWget =~ s/^\s+//;
374 $cmdWget =~ s/\s+$//;
375
376 # split on "words"
377 #my @commandargs = split(' ', $cmdWget);
378 # quotewords: to split on spaces except within quotes, then removes quotes and unescapes double backslash too
379 # https://stackoverflow.com/questions/19762412/regex-to-split-key-value-pairs-ignoring-space-in-double-quotes
380 # https://docstore.mik.ua/orelly/perl/perlnut/c08_389.htm
381 my @commandargs = quotewords('\s+', 0, $cmdWget);
382 unshift(@commandargs, $wget_file_path); # prepend the wget cmd
383 #print STDERR "Command is: ".join(",", @commandargs) . "\n"; # goes into ServerInfoDialog
384
385 # Wget's output needs to be monitored to find out when it has naturally terminated.
386 # Wget's output is sent to its STDERR so we can't use open2 without doing 2>&1.
387 # On linux, 2>&1 launches a subshell which then launches wget, meaning that killing
388 # the childpid does not kill wget on Linux but the subshell that launched it instead.
389 # Therefore, we use open3. Though the child process wget sends output only to its stdout [is this meant to be "stderr"?],
390 # using open3 says chld_err is undefined and the output of wget only comes in chld_out(!)
391 # However that may be, it works with open3. But to avoid the confusion of managing and
392 # closing an extra unused handle, a single handle is used instead for both the child's
393 # stderr and stdout.
394 # See http://blog.0x1fff.com/2009/09/howto-execute-system-commands-in-perl.html
395 # for why this is the right thing to do.
396
397 # Both open2 and open3 don't return on failure, but raise an exception. The handling
398 # of the exception is described on p.568 of the Perl Cookbook
399 eval {
400 #$childpid = open3($chld_in, $chld_out, $chld_out, $command); # There should be no double quotes in command, like around filepaths to wget, else need to use array version of command as below
401 #$childpid = open3($chld_in, $chld_out, $chld_out, @commandargs);
402
403 # instead of calling open3 directly, call wrapper _open3() subroutine that will use sockets to
404 # connect to the child process' iostreams, because we can then use IO::Select's can_read() even on Windows
405 ($childpid, $chld_in, $chld_out) = _open3(@commandargs);
406 };
407 if ($@) {
408 if($@ =~ m/^open3/) {
409 die "open3 failed in $0: $!\n$@\n";
410 }
411 die "Tried to launch open3 in $0, got unexpected exception: $@";
412 }
413
414 # Switching to use IO::Select, which allows timeouts, instead of doing the potentially blocking
415 # if defined(my $strLine=<$chld_out>)
416 # Google: perl open3 read timeout
417 # Google: perl open3 select() example
418 # https://stackoverflow.com/questions/10029406/why-does-ipcopen3-get-deadlocked
419 # https://codereview.stackexchange.com/questions/84496/the-right-way-to-use-ipcopen3-in-perl
420 # https://gist.github.com/shalk/6988937
421 # https://stackoverflow.com/questions/18373500/how-to-check-if-command-executed-with-ipcopen3-is-hung
422 # http://perldoc.perl.org/IO/Select.html
423 # http://perldoc.perl.org/IPC/Open3.html - explains the need for select()/IO::Select with open3
424 # http://www.perlmonks.org/?node_id=951554
425 # http://search.cpan.org/~dmuey/IPC-Open3-Utils-0.91/lib/IPC/Open3/Utils.pm
426 # https://stackoverflow.com/questions/3000907/wget-not-behaving-via-ipcopen3-vs-bash?rq=1
427
428 # create the select object and add our streamhandle(s)
429 my $sel = new IO::Select;
430 $sel->add($chld_out);
431
432 my $num_consecutive_timedouts = 0;
433 my $error = 0;
434 my $loop = 1;
435
436 while($loop)
437 {
438 # assume we're going to timeout trying to read from child process
439 $num_consecutive_timedouts++;
440
441
442 # block until data is available on the registered filehandles or until the timeout specified
443 if(my @readyhandles = $sel->can_read($TIMEOUT)) {
444
445 $num_consecutive_timedouts = 0; # re-zero, as we didn't timeout reading from child process after all
446 # since we're in this if statement
447
448 # now there's a list of registered filehandles we can read from to loop through reading from.
449 # though we've registered only one, chld_out
450 foreach my $fh (@readyhandles) {
451 my $strLine;
452 #sleep 3;
453
454 # read up to 4096 bytes from this filehandle fh.
455 # if there is less than 4096 bytes, we'll only get
456 # those available bytes and won't block. If there
457 # is more than 4096 bytes, we'll only read 4096 and
458 # wait for the next iteration through the loop to
459 # read the rest.
460 my $len = sysread($fh, $strLine, 4096);
461
462 if($len) { # read something
463 if($blnShow) {
464 print STDERR "$strLine\n";
465 }
466 $strReadIn .= $strLine;
467 }
468 else { # error or EOF: (!defined $len || $len == 0)
469
470 if(!defined $len) { # could be an error reading
471 # On Windows, the socket ends up forcibly closed on the "other" side. It's just the way it's implemented
472 # on Windows when using sockets to our child process' iostreams. So $len not being defined is not an error in that case. Refer to
473 # https://stackoverflow.com/questions/16675950/perl-select-returning-undef-on-sysread-when-using-windows-ipcopen3-and-ios/16676271
474 if(!$!{ECONNRESET}) { # anything other ECONNRESET error means it's a real case of undefined $len being an error
475 print STDERR "WgetDownload: Error reading from child stream: $!\n";
476 # SHOULD THIS 'die "errmsg";' instead? - no, sockets may need closing
477 $error = 1;
478 } else { # $! contains message "An existing connection was forcibly closed by remote host" where "remote" is a reference to the sockets to our wget child process,
479 # NOT to the remote web server we're downloading from. In such a case, the error code is ECONNRESET, and it's not an error, despite $len being undefined.
480 #print STDERR "WgetDownload: wget finished\n";
481 }
482 }
483 elsif ($len == 0) { # EOF
484 # Finished reading from this filehandle $fh because we read 0 bytes.
485 # wget finished, terminate naturally
486 #print STDERR "WgetDownload: wget finished\n"; #print STDOUT "\nPerl: open3 command, input streams closed. Wget terminated naturally.\n";
487 }
488
489 $loop = 0; # error or EOF, either way will need to clean up and break out of outer loop
490
491 # last; # if we have more than one filehandle registered with IO::Select
492
493 $sel->remove($fh); # if more than one filehandle registered, we should unregister all of them here on error
494
495 } # end else error or EOF
496
497 } # end foreach on readyhandles
498 } # end if on can_read
499
500 if($num_consecutive_timedouts >= $NUM_TRIES) {
501 $error = 1;
502 $loop = 0; # to break out of outer while loop
503
504 $num_consecutive_timedouts = 0;
505
506 &gsprintf::gsprintf(STDERR, "{WgetDownload.wget_timed_out_warning}\n", $NUM_TRIES);
507 }
508
509 if($loop == 0) { # error or EOF, either way, clean up
510 if($error) {
511 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
512
513 if(kill(0, $childpid)) {
514 # If kill(0, $childpid) returns true, then the process is running
515 # and we need to kill it.
516 close($chld_in);
517 close($chld_out);
518 kill('TERM', $childpid); # kill the process group by prefixing - to signal
519
520 # https://coderwall.com/p/q-ovnw/killing-all-child-processes-in-a-shell-script
521 # https://stackoverflow.com/questions/392022/best-way-to-kill-all-child-processes
522 #print STDERR "SENT SIGTERM TO CHILD PID: $childpid\n";
523 #print STDERR "Perl terminated wget after timing out repeatedly and is about to exit\n";
524 }
525 }
526 else { # wget finished (no errors), terminate naturally
527 #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
528 close($chld_in);
529 close($chld_out);
530 waitpid $childpid, 0;
531 }
532
533 # error or not
534 $childpid = undef;
535 # Stop monitoring the read_handle and close the serverSocket
536 # (the Java end will close the client socket that Java opened)
537 if(defined $port) {
538 $read_set->remove($serverSocket);
539 close($serverSocket);
540 }
541 }
542
543 # If we've already terminated, either naturally or on error, we can get out of the while loop
544 next if($loop == 0);
545
546 # Otherwise check for whether Java GLI has attempted to connect to this perl script via socket
547
548 # if we run this script from the command-line (as opposed to from GLI),
549 # then we're not working with sockets and can therefore skip the next bits
550 next unless(defined $port);
551
552 # http://www.perlfect.com/articles/select.shtml
553 # "multiplex between several filehandles within a single thread of control,
554 # thus creating the effect of parallelism in the handling of I/O."
555 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
556
557 # take all readable handles in turn
558 foreach my $rh (@rh_set) {
559 if($rh == $serverSocket) {
560 my $client = $rh->accept();
561 #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
562 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
563
564 # Read from the client (getting rid of the trailing newline)
565 # Has the client sent the <<STOP>> signal?
566 my $signal = <$client>;
567 chomp($signal);
568 if($signal eq "<<STOP>>") {
569 print $client "Perl received STOP signal (on port $port): stopping wget\n";
570 $loop = 0; # out of outer while loop
571 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
572
573 # Sometimes the wget process takes some time to start up. If the STOP signal
574 # was sent, don't try to terminate the process until we know it is running.
575 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
576 # for it to start up, checking for whether it is running in order to kill it.
577 for(my $seconds = 1; $seconds <= 5; $seconds++) {
578 if(kill(0, $childpid)) {
579 # If kill(0, $childpid) returns true, then the process is running
580 # and we need to kill it.
581 close($chld_in);
582 close($chld_out);
583 kill("TERM", $childpid); # prefix - to signal to kill process group
584
585 $childpid = undef;
586
587 # Stop monitoring the read_handle and close the serverSocket
588 # (the Java end will close the client socket that Java opened)
589 $read_set->remove($rh); #$read_set->remove($serverSocket);
590 close($rh); #close($serverSocket);
591 print $client "Perl terminated wget and is about to exit\n";
592 last; # out of inner for loop
593 }
594 else { # the process may just be starting up, wait
595 sleep(1);
596 }
597 }
598 last; # out of foreach loop
599 }
600 }
601 }
602 }
603
604 if ($changed_dir) {
605 chdir $current_dir;
606 }
607
608 return $strReadIn;
609}
610
611
612sub useWgetMonitored
613{
614 #local $| = 1; # autoflush stdout buffer
615 #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
616
617 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
618
619
620 my $current_dir = cwd();
621 my $changed_dir = 0;
622 if (defined $working_dir && -e $working_dir) {
623 chdir "$working_dir";
624 $changed_dir = 1;
625 }
626
627 # When we are running this script through GLI, the SIGTERM signal handler
628 # won't get called on Windows when wget is to be prematurely terminated.
629 # Instead, when wget has to be terminated in the middle of execution, GLI will
630 # connect to a serverSocket here to communicate when it's time to stop wget.
631 if($self->dealingWithSockets()) {
632
633 $port = <STDIN>; # gets a port on localhost that's not yet in use
634 chomp($port);
635
636 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
637 LocalPort => $port,
638 Listen => 1,
639 Reuse => 1);
640
641 die "can't setup server" unless $serverSocket;
642 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
643
644 $read_set = new IO::Select(); # create handle set for reading
645 $read_set->add($serverSocket); # add the main socket to the set
646 }
647
648 my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
649 # compose the command as an array for open3, to preserve spaces in any filepath
650 # Do so by removing leading and trailing spaces, then splitting on "words" (preserving spaces in quoted words and removing quotes)
651 $cmdWget =~ s/^\s+//;
652 $cmdWget =~ s/\s+$//;
653 my @commandargs = quotewords('\s+', 0, $cmdWget);
654 unshift(@commandargs, $wget_file_path); # prepend wget cmd to the command array
655 #print STDOUT "Command is: ".join(",", @commandargs) . "\n";
656
657 eval { # see p.568 of Perl Cookbook
658 #$childpid = open3($chld_in, $chld_out, $chld_out, @commandargs);
659 ($childpid, $chld_in, $chld_out) = _open3(@commandargs);
660 };
661 if ($@) {
662 if($@ =~ m/^open3/) {
663 die "open3 failed in $0: $!\n$@\n";
664 }
665 die "Tried to launch open3 in $0, got unexpected exception: $@";
666 }
667
668 my $full_text = "";
669 my $error_text = "";
670 my @follow_list = ();
671 my $line;
672
673 # create the select object and add our streamhandle(s)
674 my $sel = new IO::Select;
675 $sel->add($chld_out);
676
677 my $num_consecutive_timedouts = 0;
678 my $error = 0;
679 my $loop = 1;
680 while($loop)
681 {
682 # assume we're going to timeout trying to read from child process
683 $num_consecutive_timedouts++;
684
685 # block until data is available on the registered filehandles or until the timeout specified
686 if(my @readyhandles = $sel->can_read($TIMEOUT)) {
687 $num_consecutive_timedouts = 0; # re-zero, as we didn't timeout reading from child process after all
688 # since we're in this if statement
689
690 foreach my $fh (@readyhandles) {
691 my $len = sysread($fh, $line, 4096); # read up to 4k from current ready filehandle
692 if($len) { # read something
693
694
695 if((defined $blnShow) && $blnShow)
696 {
697 print STDERR "$line";
698 }
699
700 if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
701 my $follow_url = $1;
702 push(@follow_list,$follow_url);
703 }
704
705 if ($line =~ m/ERROR\s+\d+/) {
706 $error_text .= $line;
707 }
708
709 $full_text .= $line;
710 } else { # error or EOF
711 if(!defined $len) { # error reading
712 #print STDERR "WgetDownload: Error reading from child stream: $!\n";
713 $error = 1;
714 }
715 if(!defined $len) {
716 if(!$!{ECONNRESET}) { # anything other ECONNRESET error means it's a real case of undefined $len being an error
717 #print STDERR "WgetDownload: Error reading from child stream: $!\n";
718 $error = 1;
719 } else { # the error code is ECONNRESET, and it's not an error, despite $len being undefined.
720 # Happens on Windows when using sockets to a child process' iostreams
721 #print STDERR "WgetDownload: wget finished\n";
722 }
723 }
724 elsif ($len == 0) { # EOF, finished with this filehandle because 0 bytes read
725 #print STDERR "WgetDownload: wget finished\n"; # wget terminated naturally
726 }
727
728 $loop = 0; # error or EOF, either way will need to clean up and break out of outer loop
729
730 # last; # if we have more than one filehandle registered with IO::Select
731
732 $sel->remove($fh); # if more than one filehandle registered, we should unregister all of them here on error
733 } # end else error or EOF
734
735 } # end foreach on readyhandles
736 } # end if on can_read
737
738 if($num_consecutive_timedouts >= $NUM_TRIES) {
739 $error = 1;
740 $loop = 0; # to break out of outer while loop
741
742 $num_consecutive_timedouts = 0;
743
744 #&gsprintf::gsprintf(STDERR, "{WgetDownload.wget_timed_out_warning}\n", $NUM_TRIES);
745 }
746
747 if($loop == 0) { # error or EOF, either way, clean up
748
749 if($error) {
750 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
751
752 if(kill(0, $childpid)) {
753 # If kill(0, $childpid) returns true, then the process is running
754 # and we need to kill it.
755 close($chld_in);
756 close($chld_out);
757 kill("TERM", $childpid); # prefix - to signal to kill process group
758
759 #print STDERR "Perl terminated wget after timing out repeatedly and is about to exit\n";
760 }
761 }
762 else { # wget finished, terminate naturally
763 close($chld_in);
764 close($chld_out);
765 # Program terminates only when the following line is included
766 # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
767 # it prevents the child from turning into a "zombie process".
768 # While the wget process terminates without it, this perl script does not:
769 # the DOS prompt is not returned without it.
770 waitpid $childpid, 0;
771 }
772
773 # error or not:
774 $childpid = undef;
775 if(defined $port) {
776 $read_set->remove($serverSocket);
777 close($serverSocket);
778 }
779 }
780
781 # If we've already terminated, either naturally or on error, we can get out of the while loop
782 next if($loop == 0);
783
784 # Otherwise check for whether Java GLI has attempted to connect to this perl script via socket
785
786 # if we run this script from the command-line (as opposed to from GLI),
787 # then we're not working with sockets and can therefore skip the next bits
788 next unless(defined $port);
789
790 # http://www.perlfect.com/articles/select.shtml
791 # "multiplex between several filehandles within a single thread of control,
792 # thus creating the effect of parallelism in the handling of I/O."
793 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
794
795 # take all readable handles in turn
796 foreach my $rh (@rh_set) {
797 if($rh == $serverSocket) {
798 my $client = $rh->accept();
799 #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
800 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
801
802 # Read from the client (getting rid of trailing newline)
803 # Has the client sent the <<STOP>> signal?
804 my $signal = <$client>;
805 chomp($signal);
806 if($signal eq "<<STOP>>") {
807 print $client "Perl received STOP signal (on port $port): stopping wget\n";
808 $loop = 0; # out of outer while loop
809 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
810
811 # Sometimes the wget process takes some time to start up. If the STOP signal
812 # was sent, don't try to terminate the process until we know it is running.
813 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
814 # for it to start up, checking for whether it is running in order to kill it.
815 for(my $seconds = 1; $seconds <= 5; $seconds++) {
816 if(kill(0, $childpid)) {
817 # If kill(0, $childpid) returns true, then the process is running
818 # and we need to kill it.
819 close($chld_in);
820 close($chld_out);
821 kill("TERM", $childpid); # prefix - to signal to kill process group
822
823 $childpid = undef;
824
825 # Stop monitoring the read_handle and close the serverSocket
826 # (the Java end will close the client socket that Java opened)
827 $read_set->remove($rh); #$read_set->remove($serverSocket);
828 close($rh); #close($serverSocket);
829 print $client "Perl terminated wget and is about to exit\n";
830 last; # out of inner for loop
831 }
832 else { # the process may just be starting up, wait
833 sleep(1);
834 }
835 }
836 last; # out of foreach loop
837 }
838 }
839 }
840 }
841
842 my $command_status = $?;
843 if ($command_status != 0) {
844 $error_text .= "Exit error: $command_status";
845 }
846
847 if ($changed_dir) {
848 chdir $current_dir;
849 }
850
851 my $final_follow = pop(@follow_list); # might be undefined, but that's OK
852
853 return ($full_text,$error_text,$final_follow);
854}
855
856
857# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
858sub checkURL
859{
860 my ($self) = @_;
861 if ($self->{'url'} eq "")
862 {
863 &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
864 }
865}
866
867sub error
868{
869 my ($strFunctionName,$strError) = @_;
870 {
871 print "Error occoured in WgetDownload.pm\n".
872 "In Function:".$strFunctionName."\n".
873 "Error Message:".$strError."\n";
874 exit(-1);
875 }
876}
877
8781;
879
Note: See TracBrowser for help on using the repository browser.