source: main/trunk/greenstone2/perllib/downloaders/WgetDownload.pm@ 31856

Last change on this file since 31856 was 31856, checked in by ak19, 7 years ago

If the proxy settings are wrong or set when not needed, pressing the Server Information button would take forever (freeze GLI GUI), and the wget that java launched through perl would also take forever, blocking. The wget will have to be terminated from Task Manager. To overcome issues of network settings misconfigurations, which Dr Bainbridge said are hard to detect, setting the number of tries on pressing the Server Info button to 2. The number of tries for pressing the Download button were already 2, so this just makes the two wget commands issued more similar (but the wget launched by the Download button now uses the --tries=2 rather than the shorthand -t 2 too, so that the code reads better). Setting the number of wget retries launched by the Server Info Dialog also ensures wget is eventually terminated, as happens when both tries fail. Some more informative messages are now displayed if the server is unavaiable, depending on whether proxying is on or not.

  • Property svn:keywords set to Author Date Id Revision
File size: 18.7 KB
RevLine 
[14657]1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
[17207]34use BaseDownload;
[14657]35use strict;
[17529]36use Cwd;
[28560]37use util;
[17840]38use IPC::Open3;
[17529]39use IO::Select;
40use IO::Socket;
[14657]41
[17529]42
[14657]43sub BEGIN {
[17207]44 @WgetDownload::ISA = ('BaseDownload');
[14657]45}
46
47my $arguments =
48 [ { 'name' => "proxy_on",
49 'desc' => "{WgetDownload.proxy_on}",
50 'type' => "flag",
51 'reqd' => "no",
52 'hiddengli' => "yes"},
53 { 'name' => "proxy_host",
54 'desc' => "{WgetDownload.proxy_host}",
55 'type' => "string",
56 'reqd' => "no",
57 'hiddengli' => "yes"},
58 { 'name' => "proxy_port",
59 'desc' => "{WgetDownload.proxy_port}",
60 'type' => "string",
61 'reqd' => "no",
62 'hiddengli' => "yes"},
63 { 'name' => "user_name",
64 'desc' => "{WgetDownload.user_name}",
65 'type' => "string",
66 'reqd' => "no",
67 'hiddengli' => "yes"},
68 { 'name' => "user_password",
69 'desc' => "{WgetDownload.user_password}",
70 'type' => "string",
71 'reqd' => "no",
72 'hiddengli' => "yes"}];
73
74my $options = { 'name' => "WgetDownload",
75 'desc' => "{WgetDownload.desc}",
76 'abstract' => "yes",
77 'inherits' => "yes",
78 'args' => $arguments };
79
80
[17529]81# Declaring file global variables related to the wget child process so that
82# the termination signal handler for SIGTERM can close the streams and tidy
83# up before ending the child process.
[17354]84my $childpid;
[17531]85my ($chld_out, $chld_in);
[17529]86my ($serverSocket, $read_set);
[17354]87
[17531]88# The port this script's server socket will be listening on, to handle
89# incoming signals from GLI to terminate wget. This is also file global,
90# since OAIDownload.pm will make several calls on wget using the same
91# instance of this script and we want to reuse whatever port GLI gave us.
92my $port;
93
[17529]94# When this script is called from the command line, this handler will be called
95# if this process is killed or abruptly ends due to receiving one of the
96# terminating signals that this handler is registered to deal with.
[17354]97sub abrupt_end_handler {
98 my $termination_signal = shift (@_);
[17549]99
[17529]100 if(defined $childpid) {
101 close($chld_out);
102 close($chld_in);
[17354]103
[17547]104 print STDOUT "Received termination signal: $termination_signal\n";
[17529]105
106 # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
107 # See http://perldoc.perl.org/perlipc.html#Signals
108 # Warning on using kill at http://perldoc.perl.org/perlfork.html
109 kill("TERM", $childpid);
110
111 # If the SIGTERM sent on Linux calls this handler, we want to make
112 # sure any socket connection is closed.
113 # Otherwise sockets are only used when this script is run from GLI
114 # in which case the handlers don't really get called.
115 if(defined $serverSocket) {
116 $read_set->remove($serverSocket) if defined $read_set;
117 close($serverSocket);
[17354]118 }
119 }
[17529]120
[17354]121 exit(0);
122}
123
124# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
125# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
126# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
127# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
128# http://affy.blogspot.com/p5be/ch13.htm
129# http://perldoc.perl.org/perlipc.html#Signals
130$SIG{'INT'} = \&abrupt_end_handler;
131$SIG{'TERM'} = \&abrupt_end_handler;
132
[14657]133sub new {
134 my ($class) = shift (@_);
135 my ($getlist,$inputargs,$hashArgOptLists) = @_;
136 push(@$getlist, $class);
137
[17207]138 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
139 push(@{$hashArgOptLists->{"OptList"}},$options);
[14657]140
[17207]141 my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
[14657]142
[28560]143 # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
144 &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set
145
[14657]146 return bless $self, $class;
147}
148
149sub checkWgetSetup
150{
151 my ($self,$blnGliCall) = @_;
152 #TODO: proxy detection??
153
154 if((!$blnGliCall) && $self->{'proxy_on'})
155 {
156 &checkProxySetup($self);
157 }
158 &checkURL($self);
159}
160
161sub getWgetOptions
162{
163 my ($self) = @_;
164 my $strOptions = "";
165
[16791]166 if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'})
[14657]167 {
[14918]168
[31851]169 if($self->{'url'} =~ m/^https\:/) {
170 $strOptions .= " -e https_proxy=$self->{'proxy_host'}:$self->{'proxy_port'} ";
171 } else {
172 $strOptions .= " -e http_proxy=$self->{'proxy_host'}:$self->{'proxy_port'} ";
173 }
[14918]174
175 if ($self->{'user_name'} && $self->{'user_password'})
176 {
177 $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";
178 }
[14657]179 }
180
[16791]181 if ($self->{'proxy_on'}) {
182 $strOptions .= " --proxy ";
183 }
[14918]184
[31851]185 if($self->{'no_check_certificate'} && $self->{'url'} =~ m/^https\:/) {
186 $strOptions .= " --no-check-certificate ";
187 }
188
[14657]189 return $strOptions;
190}
191
192# Checking for proxy setup: proxy server, proxy port, proxy username and password.
193sub checkProxySetup
194{
195 my ($self) = @_;
196 ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
197 # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
[17529]198 # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
[14657]199
200}
201
[17664]202# Returns true if the wget status needs to be monitored through sockets
203# (if a socket is used to communicate with the Java program on when to
204# terminate wget). True if we are running gli, or if the particular type
205# of WgetDownload is *not* OAIDownload (in that case, the original way of
206# terminating the perl script from Java terminated wget as well).
207sub dealingWithSockets() {
208 my ($self) = @_;
209 return (defined $self->{'gli'} && $self->{'gli'} && !defined $port && ref($self) ne "OAIDownload");
210 # use ref($self) to find the classname of an object
211}
212
[17795]213
[14657]214sub useWget
215{
[17529]216 #local $| = 1; # autoflush stdout buffer
217 #print STDOUT "*** Start of subroutine useWget in $0\n";
218
[14657]219 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
220
221 my ($strReadIn,$strLine,$command);
222 $strReadIn = "" unless defined $strReadIn;
223
224 my $current_dir = cwd();
225 my $changed_dir = 0;
226 if (defined $working_dir && -e $working_dir) {
227 chdir "$working_dir";
228 $changed_dir = 1;
229 }
230
[17529]231 # When we are running this script through GLI, the SIGTERM signal handler
232 # won't get called on Windows when wget is to be prematurely terminated.
233 # Instead, when wget has to be terminated in the middle of execution, GLI will
234 # connect to a serverSocket here to communicate when it's time to stop wget.
[17664]235 if($self->dealingWithSockets()) {
[14657]236
[17529]237 $port = <STDIN>; # gets a port on localhost that's not yet in use
238 chomp($port);
239
240 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
241 LocalPort => $port,
242 Listen => 1,
243 Reuse => 1);
244
245 die "can't setup server" unless $serverSocket;
246 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
247
248 $read_set = new IO::Select(); # create handle set for reading
249 $read_set->add($serverSocket); # add the main socket to the set
250 }
251
[28250]252 my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
[17840]253 $command = "\"$wget_file_path\" $cmdWget";
254 #print STDOUT "Command is: $command\n";
[17354]255
[30520]256 # Wget's output needs to be monitored to find out when it has naturally terminated.
[17840]257 # Wget's output is sent to its STDERR so we can't use open2 without doing 2>&1.
258 # On linux, 2>&1 launches a subshell which then launches wget, meaning that killing
259 # the childpid does not kill wget on Linux but the subshell that launched it instead.
260 # Therefore, we use open3. Though the child process wget sends output only to its stdout,
261 # using open3 says chld_err is undefined and the output of wget only comes in chld_out(!)
262 # However that may be, it works with open3. But to avoid the confusion of managing and
263 # closing an extra unused handle, a single handle is used instead for both the child's
264 # stderr and stdout.
[31856]265 # See http://blog.0x1fff.com/2009/09/howto-execute-system-commands-in-perl.html
266 # for why this is the right thing to do.
[17840]267
268 # Both open2 and open3 don't return on failure, but raise an exception. The handling
269 # of the exception is described on p.568 of the Perl Cookbook
270 eval {
271 $childpid = open3($chld_in, $chld_out, $chld_out, $command);
272 };
273 if ($@) {
274 if($@ =~ m/^open3/) {
275 die "open3 failed in $0: $!\n$@\n";
276 }
277 die "Tried to launch open3 in $0, got unexpected exception: $@";
278 }
279
[17529]280 my $loop = 1;
281 while($loop)
[14657]282 {
[17529]283 if (defined(my $strLine=<$chld_out>)) { # we're reading in from child process' stdout
284 if($blnShow) {
285 print STDERR "$strLine\n";
286 }
287 $strReadIn .= $strLine;
[31856]288 }
[17529]289 else { # wget finished, terminate naturally
[17547]290 #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
[17529]291 close($chld_in);
292 close($chld_out);
293 waitpid $childpid, 0;
294 $loop = 0;
295
296 $childpid = undef;
297 if(defined $port) {
298 $read_set->remove($serverSocket);
299 close($serverSocket);
300 }
[14657]301 }
302
[17529]303 # if we run this script from the command-line (as opposed to from GLI),
304 # then we're not working with sockets and can therefore can skip the next bits
305 next unless(defined $port);
306
307 # http://www.perlfect.com/articles/select.shtml
308 # "multiplex between several filehandles within a single thread of control,
309 # thus creating the effect of parallelism in the handling of I/O."
310 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
311
312 # take all readable handles in turn
313 foreach my $rh (@rh_set) {
314 if($rh == $serverSocket) {
315 my $client = $rh->accept();
316 #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
317 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
318
[17547]319 # Read from the client (getting rid of the trailing newline)
[17529]320 # Has the client sent the <<STOP>> signal?
321 my $signal = <$client>;
322 chomp($signal);
323 if($signal eq "<<STOP>>") {
324 print $client "Perl received STOP signal (on port $port): stopping wget\n";
[17549]325 $loop = 0; # out of outer while loop
326 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
[17529]327
[17547]328 # Sometimes the wget process takes some time to start up. If the STOP signal
329 # was sent, don't try to terminate the process until we know it is running.
330 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
331 # for it to start up, checking for whether it is running in order to kill it.
332 for(my $seconds = 1; $seconds <= 5; $seconds++) {
333 if(kill(0, $childpid)) {
334 # If kill(0, $childpid) returns true, then the process is running
335 # and we need to kill it.
336 close($chld_in);
337 close($chld_out);
338 kill("TERM", $childpid);
339
340 $childpid = undef;
341
342 # Stop monitoring the read_handle and close the serverSocket
343 # (the Java end will close the client socket that Java opened)
344 $read_set->remove($rh); #$read_set->remove($serverSocket);
345 close($rh); #close($serverSocket);
346 print $client "Perl terminated wget and is about to exit\n";
[17840]347 last; # out of inner for loop
[17547]348 }
349 else { # the process may just be starting up, wait
350 sleep(1);
351 }
352 }
[17549]353 last; # out of foreach loop
[17529]354 }
355 }
356 }
[14657]357 }
358
359 if ($changed_dir) {
360 chdir $current_dir;
361 }
362
363 return $strReadIn;
364}
365
[16791]366
367sub useWgetMonitored
368{
[17537]369 #local $| = 1; # autoflush stdout buffer
370 #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
371
[16791]372 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
373
374
375 my $current_dir = cwd();
376 my $changed_dir = 0;
377 if (defined $working_dir && -e $working_dir) {
378 chdir "$working_dir";
379 $changed_dir = 1;
380 }
[17537]381
382 # When we are running this script through GLI, the SIGTERM signal handler
383 # won't get called on Windows when wget is to be prematurely terminated.
384 # Instead, when wget has to be terminated in the middle of execution, GLI will
385 # connect to a serverSocket here to communicate when it's time to stop wget.
[17664]386 if($self->dealingWithSockets()) {
[17537]387
388 $port = <STDIN>; # gets a port on localhost that's not yet in use
389 chomp($port);
390
391 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
392 LocalPort => $port,
393 Listen => 1,
394 Reuse => 1);
395
396 die "can't setup server" unless $serverSocket;
397 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
398
399 $read_set = new IO::Select(); # create handle set for reading
400 $read_set->add($serverSocket); # add the main socket to the set
401 }
402
[28250]403 my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
[17840]404 my $command = "\"$wget_file_path\" $cmdWget";
405 #print STDOUT "Command is: $command\n";
[16791]406
[17840]407 eval { # see p.568 of Perl Cookbook
408 $childpid = open3($chld_in, $chld_out, $chld_out, $command);
409 };
410 if ($@) {
411 if($@ =~ m/^open3/) {
412 die "open3 failed in $0: $!\n$@\n";
413 }
414 die "Tried to launch open3 in $0, got unexpected exception: $@";
415 }
[17795]416
[16791]417 my $full_text = "";
418 my $error_text = "";
419 my @follow_list = ();
420 my $line;
421
[17537]422 my $loop = 1;
423 while($loop)
[16791]424 {
[17537]425 if (defined($line=<$chld_out>)) { # we're reading in from child process' stdout
426 if((defined $blnShow) && $blnShow)
427 {
428 print STDERR "$line";
429 }
430
431 if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
432 my $follow_url = $1;
433 push(@follow_list,$follow_url);
434 }
435
436 if ($line =~ m/ERROR\s+\d+/) {
437 $error_text .= $line;
438 }
439
440 $full_text .= $line;
441 }
442 else { # wget finished, terminate naturally
443 #print STDERR "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
444 close($chld_in);
445 close($chld_out);
446 # Program terminates only when the following line is included
447 # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
448 # it prevents the child from turning into a "zombie process".
449 # While the wget process terminates without it, this perl script does not:
450 # the DOS prompt is not returned without it.
451 waitpid $childpid, 0;
452 $loop = 0;
453
454 $childpid = undef;
455 if(defined $port) {
456 $read_set->remove($serverSocket);
457 close($serverSocket);
458 }
[16791]459 }
460
[17537]461 # if we run this script from the command-line (as opposed to from GLI),
462 # then we're not working with sockets and can therefore can skip the next bits
463 next unless(defined $port);
[16791]464
[17537]465 # http://www.perlfect.com/articles/select.shtml
466 # "multiplex between several filehandles within a single thread of control,
467 # thus creating the effect of parallelism in the handling of I/O."
468 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
469
470 # take all readable handles in turn
471 foreach my $rh (@rh_set) {
472 if($rh == $serverSocket) {
473 my $client = $rh->accept();
474 #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
475 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
476
477 # Read from the client (getting rid of trailing newline)
478 # Has the client sent the <<STOP>> signal?
479 my $signal = <$client>;
480 chomp($signal);
481 if($signal eq "<<STOP>>") {
482 print $client "Perl received STOP signal (on port $port): stopping wget\n";
[17549]483 $loop = 0; # out of outer while loop
484 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
[17537]485
[17547]486 # Sometimes the wget process takes some time to start up. If the STOP signal
487 # was sent, don't try to terminate the process until we know it is running.
488 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
489 # for it to start up, checking for whether it is running in order to kill it.
490 for(my $seconds = 1; $seconds <= 5; $seconds++) {
491 if(kill(0, $childpid)) {
492 # If kill(0, $childpid) returns true, then the process is running
493 # and we need to kill it.
494 close($chld_in);
495 close($chld_out);
496 kill("TERM", $childpid);
497
498 $childpid = undef;
499
500 # Stop monitoring the read_handle and close the serverSocket
501 # (the Java end will close the client socket that Java opened)
502 $read_set->remove($rh); #$read_set->remove($serverSocket);
503 close($rh); #close($serverSocket);
504 print $client "Perl terminated wget and is about to exit\n";
[17840]505 last; # out of inner for loop
[17547]506 }
507 else { # the process may just be starting up, wait
508 sleep(1);
509 }
510 }
[17549]511 last; # out of foreach loop
[17537]512 }
513 }
[16791]514 }
515 }
516
517 my $command_status = $?;
518 if ($command_status != 0) {
519 $error_text .= "Exit error: $command_status";
520 }
521
522 if ($changed_dir) {
523 chdir $current_dir;
524 }
525
526 my $final_follow = pop(@follow_list); # might be undefined, but that's OK
527
528 return ($full_text,$error_text,$final_follow);
529}
530
531
[14657]532# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
533sub checkURL
534{
535 my ($self) = @_;
536 if ($self->{'url'} eq "")
537 {
538 &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
539 }
540}
541
542sub error
543{
544 my ($strFunctionName,$strError) = @_;
545 {
546 print "Error occoured in WgetDownload.pm\n".
547 "In Function:".$strFunctionName."\n".
548 "Error Message:".$strError."\n";
549 exit(-1);
550 }
551}
552
5531;
[28250]554
Note: See TracBrowser for help on using the repository browser.