source: main/trunk/greenstone2/perllib/downloaders/WgetDownload.pm

Last change on this file was 32809, checked in by ak19, 5 years ago

Fix to bug that broke OAIDownload tutorial: on Windows, ECONNRESET is not error. This needs to be dealt with in 2 places in WgetDownload.pm, but only once part of the code had the latest version until now. In the remaining spot ECONNRESET was still assumed to be an error even though this happens as a matter of course on Windows. Because of it being dealt with as an error, OAIDownload terminates getting dealing with that oai file before it's actually finished and the gi.Sourcedoc meta no longer ends up in the file. Still another bug remaining in this tutorial.

  • Property svn:keywords set to Author Date Id Revision
File size: 35.4 KB
Line 
1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use BaseDownload;
35use strict;
36no strict 'subs'; # make an exception so we can use variables as filehandles to pass STDERR/STDOUT to functions, needed for gsprintf()
37use Cwd;
38use util;
39use IPC::Open3;
40use IO::Select;
41use IO::Socket;
42use Text::ParseWords; # part of Core modules. Needed to use quotewords() subroutine
43
44#use IO::Select qw( );
45#use IPC::Open3 qw( open3 );
46use Socket qw( AF_UNIX SOCK_STREAM PF_UNSPEC ); # http://perlmeme.org/howtos/perlfunc/qw_function.html
47
48
49sub BEGIN {
50 @WgetDownload::ISA = ('BaseDownload');
51}
52
53my $arguments =
54 [ { 'name' => "proxy_on",
55 'desc' => "{WgetDownload.proxy_on}",
56 'type' => "flag",
57 'reqd' => "no",
58 'hiddengli' => "yes"},
59 { 'name' => "http_proxy_host",
60 'desc' => "{WgetDownload.http_proxy_host}",
61 'type' => "string",
62 'reqd' => "no",
63 'hiddengli' => "yes"},
64 { 'name' => "http_proxy_port",
65 'desc' => "{WgetDownload.http_proxy_port}",
66 'type' => "string",
67 'reqd' => "no",
68 'hiddengli' => "yes"},
69 { 'name' => "https_proxy_host",
70 'desc' => "{WgetDownload.https_proxy_host}",
71 'type' => "string",
72 'reqd' => "no",
73 'hiddengli' => "yes"},
74 { 'name' => "https_proxy_port",
75 'desc' => "{WgetDownload.https_proxy_port}",
76 'type' => "string",
77 'reqd' => "no",
78 'hiddengli' => "yes"},
79 { 'name' => "ftp_proxy_host",
80 'desc' => "{WgetDownload.ftp_proxy_host}",
81 'type' => "string",
82 'reqd' => "no",
83 'hiddengli' => "yes"},
84 { 'name' => "ftp_proxy_port",
85 'desc' => "{WgetDownload.ftp_proxy_port}",
86 'type' => "string",
87 'reqd' => "no",
88 'hiddengli' => "yes"},
89 { 'name' => "user_name",
90 'desc' => "{WgetDownload.user_name}",
91 'type' => "string",
92 'reqd' => "no",
93 'hiddengli' => "yes"},
94 { 'name' => "user_password",
95 'desc' => "{WgetDownload.user_password}",
96 'type' => "string",
97 'reqd' => "no",
98 'hiddengli' => "yes"},
99 { 'name' => "no_check_certificate",
100 'desc' => "{WgetDownload.no_check_certificate}",
101 'type' => "flag",
102 'reqd' => "no",
103 'hiddengli' => "yes"}
104 ];
105
106my $options = { 'name' => "WgetDownload",
107 'desc' => "{WgetDownload.desc}",
108 'abstract' => "yes",
109 'inherits' => "yes",
110 'args' => $arguments };
111
112
113# Declaring file global variables related to the wget child process so that
114# the termination signal handler for SIGTERM can close the streams and tidy
115# up before ending the child process.
116my $childpid;
117my ($chld_out, $chld_in);
118my ($serverSocket, $read_set);
119
120my $TIMEOUT = 1; # seconds
121my $NUM_TRIES = 10;
122
123# The port this script's server socket will be listening on, to handle
124# incoming signals from GLI to terminate wget. This is also file global,
125# since OAIDownload.pm will make several calls on wget using the same
126# instance of this script and we want to reuse whatever port GLI gave us.
127my $port;
128
129# When this script is called from the command line, this handler will be called
130# if this process is killed or abruptly ends due to receiving one of the
131# terminating signals that this handler is registered to deal with.
132sub abrupt_end_handler {
133 my $termination_signal = shift (@_);
134
135 if(defined $childpid) {
136 close($chld_out);
137 close($chld_in);
138
139 print STDOUT "Received termination signal: $termination_signal\n";
140
141 # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
142 # See http://perldoc.perl.org/perlipc.html#Signals
143 # Warning on using kill at http://perldoc.perl.org/perlfork.html
144 kill("TERM", $childpid); # prefix - to signal to kill process group
145
146 # If the SIGTERM sent on Linux calls this handler, we want to make
147 # sure any socket connection is closed.
148 # Otherwise sockets are only used when this script is run from GLI
149 # in which case the handlers don't really get called.
150 if(defined $serverSocket) {
151 $read_set->remove($serverSocket) if defined $read_set;
152 close($serverSocket);
153 }
154 }
155
156 exit(0);
157}
158
159# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
160# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
161# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
162# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
163# http://affy.blogspot.com/p5be/ch13.htm
164# http://perldoc.perl.org/perlipc.html#Signals
165$SIG{'INT'} = \&abrupt_end_handler;
166$SIG{'TERM'} = \&abrupt_end_handler;
167
168sub new {
169 my ($class) = shift (@_);
170 my ($getlist,$inputargs,$hashArgOptLists) = @_;
171 push(@$getlist, $class);
172
173 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
174 push(@{$hashArgOptLists->{"OptList"}},$options);
175
176 my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
177
178 # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
179 &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set
180
181 return bless $self, $class;
182}
183
184sub checkWgetSetup
185{
186 my ($self,$blnGliCall) = @_;
187 #TODO: proxy detection??
188
189 if((!$blnGliCall) && $self->{'proxy_on'})
190 {
191 &checkProxySetup($self);
192 }
193 &checkURL($self);
194}
195
196# Not using this. On Windows, we used to pass proxying settings as flags to wget. But, as that can be
197# seen with Task Manager, we now have the proxy settings set in the environment and are no longer passing it
198sub addProxySettingsAsWgetFlags
199{
200 my ($self) = @_;
201 my $strOptions = "";
202
203 if($self->{'http_proxy_host'} && $self->{'http_proxy_port'}) {
204 $strOptions .= " -e http_proxy=$self->{'http_proxy_host'}:$self->{'http_proxy_port'} ";
205 }
206 if($self->{'https_proxy_host'} && $self->{'https_proxy_port'}) {
207 $strOptions .= " -e https_proxy=$self->{'https_proxy_host'}:$self->{'https_proxy_port'} ";
208 }
209 if($self->{'ftp_proxy_host'} && $self->{'ftp_proxy_port'}) {
210 $strOptions .= " -e ftp_proxy=$self->{'ftp_proxy_host'}:$self->{'ftp_proxy_port'} ";
211 }
212
213 # For wget, there is only one set pair of proxy-user and proxy-passwd, so wget seems to assume
214 # that all 3 proxy protocols (http|https|ftp) will use the same username and pwd combination?
215 # Note that this only matters when passing the proxying details as flags to wget, not when
216 # the proxies are setup as environment variables.
217 if ($self->{'user_name'} && $self->{'user_password'})
218 {
219 $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";
220 # how is "--proxy-passwd" instead of "--proxy-password" even working????
221 # see https://www.gnu.org/software/wget/manual/html_node/Proxies.html
222 # and https://www.gnu.org/software/wget/manual/wget.html
223 # Not touching this, in case the manual is simply wrong. Since our code works in
224 # practice (when we were still using wget proxy username/pwd flags for windows).
225 }
226
227 return $strOptions;
228}
229
230sub getWgetOptions
231{
232 my ($self) = @_;
233 my $strOptions = "";
234
235 # If proxy settings are set up in the environment, wget is ready to use them. More secure.
236 # But if proxy settings are not set up in the environment, pass them as flags to wget
237 # This is less secure, as pwd etc visible in task manager, but it was the original way in
238 # which wget was run on windows.
239 # Truth in Perl: https://home.ubalt.edu/abento/452/perl/perltruth.html
240 # http://www.perlmonks.org/?node=what%20is%20true%20and%20false%20in%20Perl%3F
241
242 if ($self->{'proxy_on'}) {
243 if(!$ENV{'http_proxy'} && !$ENV{'https_proxy'} && !$ENV{'ftp_proxy'}) {
244 $strOptions .= $self->addProxySettingsAsWgetFlags();
245 } # else wget will use proxy settings in environment, assume enough settings have been provided
246 # either way, we're using the proxy
247 $strOptions .= " --proxy ";
248 }
249
250 if($self->{'no_check_certificate'}) { # URL may be http that gets redirected to https, so if no_check_certificate is on, turn it on even if URL is http
251
252 $strOptions .= " --no-check-certificate ";
253 }
254
255 return $strOptions;
256}
257
258# Checking for proxy setup: proxy server, proxy port, proxy username and password.
259sub checkProxySetup
260{
261 my ($self) = @_;
262 ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
263 # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
264 # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
265
266}
267
268# Returns true if the wget status needs to be monitored through sockets
269# (if a socket is used to communicate with the Java program on when to
270# terminate wget). True if we are running gli, or if the particular type
271# of WgetDownload is *not* OAIDownload (in that case, the original way of
272# terminating the perl script from Java would terminate wget as well).
273sub dealingWithSockets() {
274 my ($self) = @_;
275 return (defined $self->{'gli'} && $self->{'gli'} && !defined $port && ref($self) ne "OAIDownload");
276 # use ref($self) to find the classname of an object
277}
278
279# On Windows, we can only use IO::Select's can_read() with Sockets, not with the usual handles to a child process' iostreams
280# However, we can use Sockets as the handles to connect to a child process' streams, which then allows us to use can_read()
281# not just on Unix but Windows too. The 2 subroutines below to use Sockets to connect to a child process' iostreams come from
282# http://www.perlmonks.org/?node_id=869942
283# http://www.perlmonks.org/?node_id=811650
284# It was suggested that IPC::Run will take care of all this or circumvent the need for all this,
285# but IPC::Run has limitations on Windows, see http://search.cpan.org/~toddr/IPC-Run-0.96/lib/IPC/Run.pm#Win32_LIMITATIONS
286
287# Create a unidirectional pipe to an iostream of a process that is actually a socket
288sub _pipe {
289 socketpair($_[0], $_[1], AF_UNIX, SOCK_STREAM, PF_UNSPEC)
290 or return undef;
291 shutdown($_[0], 1); # No more writing for reader. See http://www.perlmonks.org/?node=108244
292 shutdown($_[1], 0); # No more reading for writer
293 return 1;
294}
295
296sub _open3 {
297 local (*TO_CHLD_R, *TO_CHLD_W);
298 local (*FR_CHLD_R, *FR_CHLD_W);
299 #local (*FR_CHLD_ERR_R, *FR_CHLD_ERR_W);
300
301 if ($^O =~ /Win32/) {
302 _pipe(*TO_CHLD_R, *TO_CHLD_W ) or die $^E;
303 _pipe(*FR_CHLD_R, *FR_CHLD_W ) or die $^E;
304 #_pipe(*FR_CHLD_ERR_R, *FR_CHLD_ERR_W) or die $^E;
305 } else {
306 pipe(*TO_CHLD_R, *TO_CHLD_W ) or die $!;
307 pipe(*FR_CHLD_R, *FR_CHLD_W ) or die $!;
308 #pipe(*FR_CHLD_ERR_R, *FR_CHLD_ERR_W) or die $!;
309 }
310
311 #my $pid = open3('>&TO_CHLD_R', '<&FR_CHLD_W', '<&FR_CHLD_ERR_W', @_);
312 my $pid = open3('>&TO_CHLD_R', '<&FR_CHLD_W', '<&FR_CHLD_W', @_); # use one handle, chldout, for both stdout and stderr of child proc,
313 # see http://blog.0x1fff.com/2009/09/howto-execute-system-commands-in-perl.html
314
315 #return ( $pid, *TO_CHLD_W, *FR_CHLD_R, *FR_CHLD_ERR_R );
316 return ( $pid, *TO_CHLD_W, *FR_CHLD_R);
317}
318
319# useWget and useWgetMonitored are very similar and, when updating, will probably need updating in tandem
320# useWget(Monitored) runs the wget command using open3 and then sits in a loop doing two things per iteration:
321# - processing a set buffer size of the wget (child) process' stdout/stderr streams, if anything has appeared there
322# - followed by checking the socket connection to Java GLI, to see if GLI is trying to cancel the wget process we're running.
323# Then the loop of these two things repeats.
324sub useWget
325{
326 #local $| = 1; # autoflush stdout buffer
327 #print STDOUT "*** Start of subroutine useWget in $0\n";
328
329 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
330
331 my ($strReadIn,$strLine,$command);
332 $strReadIn = "" unless defined $strReadIn;
333
334 my $current_dir = cwd();
335 my $changed_dir = 0;
336 if (defined $working_dir && -e $working_dir) {
337 chdir "$working_dir";
338 $changed_dir = 1;
339 }
340
341 # When we are running this script through GLI, the SIGTERM signal handler
342 # won't get called on Windows when wget is to be prematurely terminated.
343 # Instead, when wget has to be terminated in the middle of execution, GLI will
344 # connect to a serverSocket here to communicate when it's time to stop wget.
345 if($self->dealingWithSockets()) {
346
347 $port = <STDIN>; # gets a port on localhost that's not yet in use
348 chomp($port);
349
350 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
351 LocalPort => $port,
352 Listen => 1,
353 Reuse => 1);
354
355 die "can't setup server" unless $serverSocket;
356 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
357
358 $read_set = new IO::Select(); # create handle set for reading
359 $read_set->add($serverSocket); # add the main socket to the set
360 }
361
362 my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
363
364 # Shouldn't use double quotes around wget path after all? See final comment at
365 # http://www.perlmonks.org/?node_id=394709
366 # http://coldattic.info/shvedsky/pro/blogs/a-foo-walks-into-a-bar/posts/63
367 # Therefore, compose the command as an array rather than as a string, to preserve spaces in the filepath
368 # because single/double quotes using open3 seem to launch a subshell, see also final comment at
369 # http://www.perlmonks.org/?node_id=394709 and that ends up causing problems in terminating wget, as 2 processes
370 # got launched then which don't have parent-child pid relationship (so that terminating one doesn't terminate the other).
371
372 # remove leading and trailing spaces, https://stackoverflow.com/questions/4597937/perl-function-to-trim-string-leading-and-trailing-whitespace
373 $cmdWget =~ s/^\s+//;
374 $cmdWget =~ s/\s+$//;
375
376 # replace backslashes with double backslashes, so that we preserve backslash after doing quotewords() step below with its necessary 2nd param keep=0
377 $cmdWget =~ s@\\@\\\\@g;
378
379 # split on "words"
380 #my @commandargs = split(' ', $cmdWget);
381 # quotewords: to split on spaces except within quotes, then removes quotes and unescapes double backslash too
382 # https://stackoverflow.com/questions/19762412/regex-to-split-key-value-pairs-ignoring-space-in-double-quotes
383 # https://docstore.mik.ua/orelly/perl/perlnut/c08_389.htm
384 my @commandargs = quotewords('\s+', 0, $cmdWget);
385 unshift(@commandargs, $wget_file_path); # prepend the wget cmd
386 print STDOUT "Command is: ".join(",", @commandargs) . "\n"; # if STDERR, cmd will go into ServerInfoDialog
387
388 # Wget's output needs to be monitored to find out when it has naturally terminated.
389 # Wget's output is sent to its STDERR so we can't use open2 without doing 2>&1.
390 # On linux, 2>&1 launches a subshell which then launches wget, meaning that killing
391 # the childpid does not kill wget on Linux but the subshell that launched it instead.
392 # Therefore, we use open3. Though the child process wget sends output only to its stdout [is this meant to be "stderr"?],
393 # using open3 says chld_err is undefined and the output of wget only comes in chld_out(!)
394 # However that may be, it works with open3. But to avoid the confusion of managing and
395 # closing an extra unused handle, a single handle is used instead for both the child's
396 # stderr and stdout.
397 # See http://blog.0x1fff.com/2009/09/howto-execute-system-commands-in-perl.html
398 # for why this is the right thing to do.
399
400 # Both open2 and open3 don't return on failure, but raise an exception. The handling
401 # of the exception is described on p.568 of the Perl Cookbook
402 eval {
403 #$childpid = open3($chld_in, $chld_out, $chld_out, $command); # There should be no double quotes in command, like around filepaths to wget, else need to use array version of command as below
404 #$childpid = open3($chld_in, $chld_out, $chld_out, @commandargs);
405
406 # instead of calling open3 directly, call wrapper _open3() subroutine that will use sockets to
407 # connect to the child process' iostreams, because we can then use IO::Select's can_read() even on Windows
408 ($childpid, $chld_in, $chld_out) = _open3(@commandargs);
409 };
410 if ($@) {
411 if($@ =~ m/^open3/) {
412 die "open3 failed in $0: $!\n$@\n";
413 }
414 die "Tried to launch open3 in $0, got unexpected exception: $@";
415 }
416
417 # Switching to use IO::Select, which allows timeouts, instead of doing the potentially blocking
418 # if defined(my $strLine=<$chld_out>)
419 # Google: perl open3 read timeout
420 # Google: perl open3 select() example
421 # https://stackoverflow.com/questions/10029406/why-does-ipcopen3-get-deadlocked
422 # https://codereview.stackexchange.com/questions/84496/the-right-way-to-use-ipcopen3-in-perl
423 # https://gist.github.com/shalk/6988937
424 # https://stackoverflow.com/questions/18373500/how-to-check-if-command-executed-with-ipcopen3-is-hung
425 # http://perldoc.perl.org/IO/Select.html
426 # http://perldoc.perl.org/IPC/Open3.html - explains the need for select()/IO::Select with open3
427 # http://www.perlmonks.org/?node_id=951554
428 # http://search.cpan.org/~dmuey/IPC-Open3-Utils-0.91/lib/IPC/Open3/Utils.pm
429 # https://stackoverflow.com/questions/3000907/wget-not-behaving-via-ipcopen3-vs-bash?rq=1
430
431 # create the select object and add our streamhandle(s)
432 my $sel = new IO::Select;
433 $sel->add($chld_out);
434
435 my $num_consecutive_timedouts = 0;
436 my $error = 0;
437 my $loop = 1;
438
439 while($loop)
440 {
441 # assume we're going to timeout trying to read from child process
442 $num_consecutive_timedouts++;
443
444
445 # block until data is available on the registered filehandles or until the timeout specified
446 if(my @readyhandles = $sel->can_read($TIMEOUT)) {
447
448 $num_consecutive_timedouts = 0; # re-zero, as we didn't timeout reading from child process after all
449 # since we're in this if statement
450
451 # now there's a list of registered filehandles we can read from to loop through reading from.
452 # though we've registered only one, chld_out
453 foreach my $fh (@readyhandles) {
454 my $strLine;
455 #sleep 3;
456
457 # read up to 4096 bytes from this filehandle fh.
458 # if there is less than 4096 bytes, we'll only get
459 # those available bytes and won't block. If there
460 # is more than 4096 bytes, we'll only read 4096 and
461 # wait for the next iteration through the loop to
462 # read the rest.
463 my $len = sysread($fh, $strLine, 4096);
464
465 if($len) { # read something
466 if($blnShow) {
467 print STDERR "$strLine\n";
468 }
469 $strReadIn .= $strLine;
470 }
471 else { # error or EOF: (!defined $len || $len == 0)
472
473 if(!defined $len) { # could be an error reading
474 # On Windows, the socket ends up forcibly closed on the "other" side. It's just the way it's implemented
475 # on Windows when using sockets to our child process' iostreams. So $len not being defined is not an error in that case. Refer to
476 # https://stackoverflow.com/questions/16675950/perl-select-returning-undef-on-sysread-when-using-windows-ipcopen3-and-ios/16676271
477 if(!$!{ECONNRESET}) { # anything other ECONNRESET error means it's a real case of undefined $len being an error
478 print STDERR "WgetDownload: Error reading from child stream: $!\n";
479 # SHOULD THIS 'die "errmsg";' instead? - no, sockets may need closing
480 $error = 1;
481 } else { # $! contains message "An existing connection was forcibly closed by remote host" where "remote" is a reference to the sockets to our wget child process,
482 # NOT to the remote web server we're downloading from. In such a case, the error code is ECONNRESET, and it's not an error, despite $len being undefined.
483 #print STDERR "WgetDownload: wget finished\n";
484 }
485 }
486 elsif ($len == 0) { # EOF
487 # Finished reading from this filehandle $fh because we read 0 bytes.
488 # wget finished, terminate naturally
489 #print STDERR "WgetDownload: wget finished\n"; #print STDOUT "\nPerl: open3 command, input streams closed. Wget terminated naturally.\n";
490 }
491
492 $loop = 0; # error or EOF, either way will need to clean up and break out of outer loop
493
494 # last; # if we have more than one filehandle registered with IO::Select
495
496 $sel->remove($fh); # if more than one filehandle registered, we should unregister all of them here on error
497
498 } # end else error or EOF
499
500 } # end foreach on readyhandles
501 } # end if on can_read
502
503 if($num_consecutive_timedouts >= $NUM_TRIES) {
504 $error = 1;
505 $loop = 0; # to break out of outer while loop
506
507 $num_consecutive_timedouts = 0;
508
509 &gsprintf::gsprintf(STDERR, "{WgetDownload.wget_timed_out_warning}\n", $NUM_TRIES);
510 }
511
512 if($loop == 0) { # error or EOF, either way, clean up
513 if($error) {
514 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
515
516 if(kill(0, $childpid)) {
517 # If kill(0, $childpid) returns true, then the process is running
518 # and we need to kill it.
519 close($chld_in);
520 close($chld_out);
521 kill('TERM', $childpid); # kill the process group by prefixing - to signal
522
523 # https://coderwall.com/p/q-ovnw/killing-all-child-processes-in-a-shell-script
524 # https://stackoverflow.com/questions/392022/best-way-to-kill-all-child-processes
525 #print STDERR "SENT SIGTERM TO CHILD PID: $childpid\n";
526 #print STDERR "Perl terminated wget after timing out repeatedly and is about to exit\n";
527 }
528 }
529 else { # wget finished (no errors), terminate naturally
530 #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
531 close($chld_in);
532 close($chld_out);
533 waitpid $childpid, 0;
534 }
535
536 # error or not
537 $childpid = undef;
538 # Stop monitoring the read_handle and close the serverSocket
539 # (the Java end will close the client socket that Java opened)
540 if(defined $port) {
541 $read_set->remove($serverSocket);
542 close($serverSocket);
543 }
544 }
545
546 # If we've already terminated, either naturally or on error, we can get out of the while loop
547 next if($loop == 0);
548
549 # Otherwise check for whether Java GLI has attempted to connect to this perl script via socket
550
551 # if we run this script from the command-line (as opposed to from GLI),
552 # then we're not working with sockets and can therefore skip the next bits
553 next unless(defined $port);
554
555 # http://www.perlfect.com/articles/select.shtml
556 # "multiplex between several filehandles within a single thread of control,
557 # thus creating the effect of parallelism in the handling of I/O."
558 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
559
560 # take all readable handles in turn
561 foreach my $rh (@rh_set) {
562 if($rh == $serverSocket) {
563 my $client = $rh->accept();
564 #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
565 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
566
567 # Read from the client (getting rid of the trailing newline)
568 # Has the client sent the <<STOP>> signal?
569 my $signal = <$client>;
570 chomp($signal);
571 if($signal eq "<<STOP>>") {
572 print $client "Perl received STOP signal (on port $port): stopping wget\n";
573 $loop = 0; # out of outer while loop
574 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
575
576 # Sometimes the wget process takes some time to start up. If the STOP signal
577 # was sent, don't try to terminate the process until we know it is running.
578 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
579 # for it to start up, checking for whether it is running in order to kill it.
580 for(my $seconds = 1; $seconds <= 5; $seconds++) {
581 if(kill(0, $childpid)) {
582 # If kill(0, $childpid) returns true, then the process is running
583 # and we need to kill it.
584 close($chld_in);
585 close($chld_out);
586 kill("TERM", $childpid); # prefix - to signal to kill process group
587
588 $childpid = undef;
589
590 # Stop monitoring the read_handle and close the serverSocket
591 # (the Java end will close the client socket that Java opened)
592 $read_set->remove($rh); #$read_set->remove($serverSocket);
593 close($rh); #close($serverSocket);
594 print $client "Perl terminated wget and is about to exit\n";
595 last; # out of inner for loop
596 }
597 else { # the process may just be starting up, wait
598 sleep(1);
599 }
600 }
601 last; # out of foreach loop
602 }
603 }
604 }
605 }
606
607 if ($changed_dir) {
608 chdir $current_dir;
609 }
610
611 return $strReadIn;
612}
613
614
615sub useWgetMonitored
616{
617 #local $| = 1; # autoflush stdout buffer
618 #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
619
620 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
621
622
623 my $current_dir = cwd();
624 my $changed_dir = 0;
625 if (defined $working_dir && -e $working_dir) {
626 chdir "$working_dir";
627 $changed_dir = 1;
628 }
629
630 # When we are running this script through GLI, the SIGTERM signal handler
631 # won't get called on Windows when wget is to be prematurely terminated.
632 # Instead, when wget has to be terminated in the middle of execution, GLI will
633 # connect to a serverSocket here to communicate when it's time to stop wget.
634 if($self->dealingWithSockets()) {
635
636 $port = <STDIN>; # gets a port on localhost that's not yet in use
637 chomp($port);
638
639 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
640 LocalPort => $port,
641 Listen => 1,
642 Reuse => 1);
643
644 die "can't setup server" unless $serverSocket;
645 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
646
647 $read_set = new IO::Select(); # create handle set for reading
648 $read_set->add($serverSocket); # add the main socket to the set
649 }
650
651 my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
652 # compose the command as an array for open3, to preserve spaces in any filepath
653 # Do so by removing leading and trailing spaces, then splitting on "words" (preserving spaces in quoted words and removing quotes)
654 $cmdWget =~ s/^\s+//;
655 $cmdWget =~ s/\s+$//;
656
657 # replace backslashes with double backslashes, so that we preserve backslash after doing quotewords() step below with its necessary 2nd param keep=0
658 $cmdWget =~ s@\\@\\\\@g;
659
660 my @commandargs = quotewords('\s+', 0, $cmdWget);
661 unshift(@commandargs, $wget_file_path); # prepend wget cmd to the command array
662 #print STDOUT "Command is: ".join(",", @commandargs) . "\n";
663
664 eval { # see p.568 of Perl Cookbook
665 #$childpid = open3($chld_in, $chld_out, $chld_out, @commandargs);
666 ($childpid, $chld_in, $chld_out) = _open3(@commandargs);
667 };
668 if ($@) {
669 if($@ =~ m/^open3/) {
670 die "open3 failed in $0: $!\n$@\n";
671 }
672 die "Tried to launch open3 in $0, got unexpected exception: $@";
673 }
674
675 my $full_text = "";
676 my $error_text = "";
677 my @follow_list = ();
678 my $line;
679
680 # create the select object and add our streamhandle(s)
681 my $sel = new IO::Select;
682 $sel->add($chld_out);
683
684 my $num_consecutive_timedouts = 0;
685 my $error = 0;
686 my $loop = 1;
687 while($loop)
688 {
689 # assume we're going to timeout trying to read from child process
690 $num_consecutive_timedouts++;
691
692 # block until data is available on the registered filehandles or until the timeout specified
693 if(my @readyhandles = $sel->can_read($TIMEOUT)) {
694 $num_consecutive_timedouts = 0; # re-zero, as we didn't timeout reading from child process after all
695 # since we're in this if statement
696
697 foreach my $fh (@readyhandles) {
698 my $len = sysread($fh, $line, 4096); # read up to 4k from current ready filehandle
699 if($len) { # read something
700
701
702 if((defined $blnShow) && $blnShow)
703 {
704 print STDERR "$line";
705 }
706
707 if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
708 my $follow_url = $1;
709 push(@follow_list,$follow_url);
710 }
711
712 if ($line =~ m/ERROR\s+\d+/) {
713 $error_text .= $line;
714 }
715
716 $full_text .= $line;
717 } else { # error or EOF
718 if(!defined $len) { # could be an error reading
719 # On Windows, the socket ends up forcibly closed on the "other" side. It's just the way it's implemented
720 # on Windows when using sockets to our child process' iostreams. So $len not being defined is not an error in that case. Refer to
721 # https://stackoverflow.com/questions/16675950/perl-select-returning-undef-on-sysread-when-using-windows-ipcopen3-and-ios/16676271
722 if(!$!{ECONNRESET}) { # anything other ECONNRESET error means it's a real case of undefined $len being an error
723 print STDERR "WgetDownload: Error reading from child stream: $!\n";
724 # SHOULD THIS 'die "errmsg";' instead? - no, sockets may need closing
725 $error = 1;
726 } else { # $! contains message "An existing connection was forcibly closed by remote host" where "remote" is a reference to the sockets to our wget child process,
727 # NOT to the remote web server we're downloading from. In such a case, the error code is ECONNRESET, and it's not an error, despite $len being undefined.
728 # Happens on Windows when using sockets to a child process' iostreams
729 #print STDERR "WgetDownload: wget finished\n";
730 }
731 }
732 elsif ($len == 0) { # EOF, finished with this filehandle because 0 bytes read
733 #print STDERR "WgetDownload: wget finished\n"; # wget terminated naturally
734 }
735
736 $loop = 0; # error or EOF, either way will need to clean up and break out of outer loop
737
738 # last; # if we have more than one filehandle registered with IO::Select
739
740 $sel->remove($fh); # if more than one filehandle registered, we should unregister all of them here on error
741 } # end else error or EOF
742
743 } # end foreach on readyhandles
744 } # end if on can_read
745
746 if($num_consecutive_timedouts >= $NUM_TRIES) {
747 $error = 1;
748 $loop = 0; # to break out of outer while loop
749
750 $num_consecutive_timedouts = 0;
751
752 #&gsprintf::gsprintf(STDERR, "{WgetDownload.wget_timed_out_warning}\n", $NUM_TRIES);
753 }
754
755 if($loop == 0) { # error or EOF, either way, clean up
756
757 if($error) {
758 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
759
760 if(kill(0, $childpid)) {
761 # If kill(0, $childpid) returns true, then the process is running
762 # and we need to kill it.
763 close($chld_in);
764 close($chld_out);
765 kill("TERM", $childpid); # prefix - to signal to kill process group
766
767 #print STDERR "Perl terminated wget after timing out repeatedly and is about to exit\n";
768 }
769 }
770 else { # wget finished, terminate naturally
771 close($chld_in);
772 close($chld_out);
773 # Program terminates only when the following line is included
774 # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
775 # it prevents the child from turning into a "zombie process".
776 # While the wget process terminates without it, this perl script does not:
777 # the DOS prompt is not returned without it.
778 waitpid $childpid, 0;
779 }
780
781 # error or not:
782 $childpid = undef;
783 if(defined $port) {
784 $read_set->remove($serverSocket);
785 close($serverSocket);
786 }
787 }
788
789 # If we've already terminated, either naturally or on error, we can get out of the while loop
790 next if($loop == 0);
791
792 # Otherwise check for whether Java GLI has attempted to connect to this perl script via socket
793
794 # if we run this script from the command-line (as opposed to from GLI),
795 # then we're not working with sockets and can therefore skip the next bits
796 next unless(defined $port);
797
798 # http://www.perlfect.com/articles/select.shtml
799 # "multiplex between several filehandles within a single thread of control,
800 # thus creating the effect of parallelism in the handling of I/O."
801 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
802
803 # take all readable handles in turn
804 foreach my $rh (@rh_set) {
805 if($rh == $serverSocket) {
806 my $client = $rh->accept();
807 #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
808 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
809
810 # Read from the client (getting rid of trailing newline)
811 # Has the client sent the <<STOP>> signal?
812 my $signal = <$client>;
813 chomp($signal);
814 if($signal eq "<<STOP>>") {
815 print $client "Perl received STOP signal (on port $port): stopping wget\n";
816 $loop = 0; # out of outer while loop
817 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
818
819 # Sometimes the wget process takes some time to start up. If the STOP signal
820 # was sent, don't try to terminate the process until we know it is running.
821 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
822 # for it to start up, checking for whether it is running in order to kill it.
823 for(my $seconds = 1; $seconds <= 5; $seconds++) {
824 if(kill(0, $childpid)) {
825 # If kill(0, $childpid) returns true, then the process is running
826 # and we need to kill it.
827 close($chld_in);
828 close($chld_out);
829 kill("TERM", $childpid); # prefix - to signal to kill process group
830
831 $childpid = undef;
832
833 # Stop monitoring the read_handle and close the serverSocket
834 # (the Java end will close the client socket that Java opened)
835 $read_set->remove($rh); #$read_set->remove($serverSocket);
836 close($rh); #close($serverSocket);
837 print $client "Perl terminated wget and is about to exit\n";
838 last; # out of inner for loop
839 }
840 else { # the process may just be starting up, wait
841 sleep(1);
842 }
843 }
844 last; # out of foreach loop
845 }
846 }
847 }
848 }
849
850 my $command_status = $?;
851 if ($command_status != 0) {
852 $error_text .= "Exit error: $command_status";
853 }
854
855 if ($changed_dir) {
856 chdir $current_dir;
857 }
858
859 my $final_follow = pop(@follow_list); # might be undefined, but that's OK
860
861 return ($full_text,$error_text,$final_follow);
862}
863
864
865# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
866sub checkURL
867{
868 my ($self) = @_;
869 if ($self->{'url'} eq "")
870 {
871 &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
872 }
873}
874
875sub error
876{
877 my ($strFunctionName,$strError) = @_;
878 {
879 print "Error occoured in WgetDownload.pm\n".
880 "In Function:".$strFunctionName."\n".
881 "Error Message:".$strError."\n";
882 exit(-1);
883 }
884}
885
8861;
887
Note: See TracBrowser for help on using the repository browser.