source: main/trunk/greenstone2/perllib/downloaders/WgetDownload.pm@ 31860

Last change on this file since 31860 was 31860, checked in by ak19, 7 years ago

The changes necessary for getting the new no_check_certificate checkbox to appear and work in GLI and get propagated to the perl code that launches wget. This checkbox controls whether wget is launched with the no-check-certificate flag to retrieve Https URLs despite lack of (valid) certificates.

  • Property svn:keywords set to Author Date Id Revision
File size: 18.9 KB
Line 
1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use BaseDownload;
35use strict;
36use Cwd;
37use util;
38use IPC::Open3;
39use IO::Select;
40use IO::Socket;
41
42
43sub BEGIN {
44 @WgetDownload::ISA = ('BaseDownload');
45}
46
47my $arguments =
48 [ { 'name' => "proxy_on",
49 'desc' => "{WgetDownload.proxy_on}",
50 'type' => "flag",
51 'reqd' => "no",
52 'hiddengli' => "yes"},
53 { 'name' => "proxy_host",
54 'desc' => "{WgetDownload.proxy_host}",
55 'type' => "string",
56 'reqd' => "no",
57 'hiddengli' => "yes"},
58 { 'name' => "proxy_port",
59 'desc' => "{WgetDownload.proxy_port}",
60 'type' => "string",
61 'reqd' => "no",
62 'hiddengli' => "yes"},
63 { 'name' => "user_name",
64 'desc' => "{WgetDownload.user_name}",
65 'type' => "string",
66 'reqd' => "no",
67 'hiddengli' => "yes"},
68 { 'name' => "user_password",
69 'desc' => "{WgetDownload.user_password}",
70 'type' => "string",
71 'reqd' => "no",
72 'hiddengli' => "yes"},
73 { 'name' => "no_check_certificate",
74 'desc' => "{WgetDownload.no_check_certificate}",
75 'type' => "flag",
76 'reqd' => "no",
77 'hiddengli' => "yes"}
78 ];
79
80my $options = { 'name' => "WgetDownload",
81 'desc' => "{WgetDownload.desc}",
82 'abstract' => "yes",
83 'inherits' => "yes",
84 'args' => $arguments };
85
86
87# Declaring file global variables related to the wget child process so that
88# the termination signal handler for SIGTERM can close the streams and tidy
89# up before ending the child process.
90my $childpid;
91my ($chld_out, $chld_in);
92my ($serverSocket, $read_set);
93
94# The port this script's server socket will be listening on, to handle
95# incoming signals from GLI to terminate wget. This is also file global,
96# since OAIDownload.pm will make several calls on wget using the same
97# instance of this script and we want to reuse whatever port GLI gave us.
98my $port;
99
100# When this script is called from the command line, this handler will be called
101# if this process is killed or abruptly ends due to receiving one of the
102# terminating signals that this handler is registered to deal with.
103sub abrupt_end_handler {
104 my $termination_signal = shift (@_);
105
106 if(defined $childpid) {
107 close($chld_out);
108 close($chld_in);
109
110 print STDOUT "Received termination signal: $termination_signal\n";
111
112 # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
113 # See http://perldoc.perl.org/perlipc.html#Signals
114 # Warning on using kill at http://perldoc.perl.org/perlfork.html
115 kill("TERM", $childpid);
116
117 # If the SIGTERM sent on Linux calls this handler, we want to make
118 # sure any socket connection is closed.
119 # Otherwise sockets are only used when this script is run from GLI
120 # in which case the handlers don't really get called.
121 if(defined $serverSocket) {
122 $read_set->remove($serverSocket) if defined $read_set;
123 close($serverSocket);
124 }
125 }
126
127 exit(0);
128}
129
130# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
131# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
132# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
133# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
134# http://affy.blogspot.com/p5be/ch13.htm
135# http://perldoc.perl.org/perlipc.html#Signals
136$SIG{'INT'} = \&abrupt_end_handler;
137$SIG{'TERM'} = \&abrupt_end_handler;
138
139sub new {
140 my ($class) = shift (@_);
141 my ($getlist,$inputargs,$hashArgOptLists) = @_;
142 push(@$getlist, $class);
143
144 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
145 push(@{$hashArgOptLists->{"OptList"}},$options);
146
147 my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
148
149 # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
150 &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set
151
152 return bless $self, $class;
153}
154
155sub checkWgetSetup
156{
157 my ($self,$blnGliCall) = @_;
158 #TODO: proxy detection??
159
160 if((!$blnGliCall) && $self->{'proxy_on'})
161 {
162 &checkProxySetup($self);
163 }
164 &checkURL($self);
165}
166
167sub getWgetOptions
168{
169 my ($self) = @_;
170 my $strOptions = "";
171
172 if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'})
173 {
174
175 if($self->{'url'} =~ m/^https\:/) {
176 $strOptions .= " -e https_proxy=$self->{'proxy_host'}:$self->{'proxy_port'} ";
177 } else {
178 $strOptions .= " -e http_proxy=$self->{'proxy_host'}:$self->{'proxy_port'} ";
179 }
180
181 if ($self->{'user_name'} && $self->{'user_password'})
182 {
183 $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";
184 }
185 }
186
187 if ($self->{'proxy_on'}) {
188 $strOptions .= " --proxy ";
189 }
190
191 if($self->{'no_check_certificate'}) { #&& $self->{'url'} =~ m/^https\:/) { # URL may be http that gets redirected to https
192
193 $strOptions .= " --no-check-certificate ";
194 }
195
196 return $strOptions;
197}
198
199# Checking for proxy setup: proxy server, proxy port, proxy username and password.
200sub checkProxySetup
201{
202 my ($self) = @_;
203 ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
204 # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
205 # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
206
207}
208
209# Returns true if the wget status needs to be monitored through sockets
210# (if a socket is used to communicate with the Java program on when to
211# terminate wget). True if we are running gli, or if the particular type
212# of WgetDownload is *not* OAIDownload (in that case, the original way of
213# terminating the perl script from Java terminated wget as well).
214sub dealingWithSockets() {
215 my ($self) = @_;
216 return (defined $self->{'gli'} && $self->{'gli'} && !defined $port && ref($self) ne "OAIDownload");
217 # use ref($self) to find the classname of an object
218}
219
220
221sub useWget
222{
223 #local $| = 1; # autoflush stdout buffer
224 #print STDOUT "*** Start of subroutine useWget in $0\n";
225
226 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
227
228 my ($strReadIn,$strLine,$command);
229 $strReadIn = "" unless defined $strReadIn;
230
231 my $current_dir = cwd();
232 my $changed_dir = 0;
233 if (defined $working_dir && -e $working_dir) {
234 chdir "$working_dir";
235 $changed_dir = 1;
236 }
237
238 # When we are running this script through GLI, the SIGTERM signal handler
239 # won't get called on Windows when wget is to be prematurely terminated.
240 # Instead, when wget has to be terminated in the middle of execution, GLI will
241 # connect to a serverSocket here to communicate when it's time to stop wget.
242 if($self->dealingWithSockets()) {
243
244 $port = <STDIN>; # gets a port on localhost that's not yet in use
245 chomp($port);
246
247 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
248 LocalPort => $port,
249 Listen => 1,
250 Reuse => 1);
251
252 die "can't setup server" unless $serverSocket;
253 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
254
255 $read_set = new IO::Select(); # create handle set for reading
256 $read_set->add($serverSocket); # add the main socket to the set
257 }
258
259 my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
260 $command = "\"$wget_file_path\" $cmdWget";
261 #print STDOUT "Command is: $command\n";
262
263 # Wget's output needs to be monitored to find out when it has naturally terminated.
264 # Wget's output is sent to its STDERR so we can't use open2 without doing 2>&1.
265 # On linux, 2>&1 launches a subshell which then launches wget, meaning that killing
266 # the childpid does not kill wget on Linux but the subshell that launched it instead.
267 # Therefore, we use open3. Though the child process wget sends output only to its stdout,
268 # using open3 says chld_err is undefined and the output of wget only comes in chld_out(!)
269 # However that may be, it works with open3. But to avoid the confusion of managing and
270 # closing an extra unused handle, a single handle is used instead for both the child's
271 # stderr and stdout.
272 # See http://blog.0x1fff.com/2009/09/howto-execute-system-commands-in-perl.html
273 # for why this is the right thing to do.
274
275 # Both open2 and open3 don't return on failure, but raise an exception. The handling
276 # of the exception is described on p.568 of the Perl Cookbook
277 eval {
278 $childpid = open3($chld_in, $chld_out, $chld_out, $command);
279 };
280 if ($@) {
281 if($@ =~ m/^open3/) {
282 die "open3 failed in $0: $!\n$@\n";
283 }
284 die "Tried to launch open3 in $0, got unexpected exception: $@";
285 }
286
287 my $loop = 1;
288 while($loop)
289 {
290 if (defined(my $strLine=<$chld_out>)) { # we're reading in from child process' stdout
291 if($blnShow) {
292 print STDERR "$strLine\n";
293 }
294 $strReadIn .= $strLine;
295 }
296 else { # wget finished, terminate naturally
297 #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
298 close($chld_in);
299 close($chld_out);
300 waitpid $childpid, 0;
301 $loop = 0;
302
303 $childpid = undef;
304 if(defined $port) {
305 $read_set->remove($serverSocket);
306 close($serverSocket);
307 }
308 }
309
310 # if we run this script from the command-line (as opposed to from GLI),
311 # then we're not working with sockets and can therefore can skip the next bits
312 next unless(defined $port);
313
314 # http://www.perlfect.com/articles/select.shtml
315 # "multiplex between several filehandles within a single thread of control,
316 # thus creating the effect of parallelism in the handling of I/O."
317 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
318
319 # take all readable handles in turn
320 foreach my $rh (@rh_set) {
321 if($rh == $serverSocket) {
322 my $client = $rh->accept();
323 #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
324 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
325
326 # Read from the client (getting rid of the trailing newline)
327 # Has the client sent the <<STOP>> signal?
328 my $signal = <$client>;
329 chomp($signal);
330 if($signal eq "<<STOP>>") {
331 print $client "Perl received STOP signal (on port $port): stopping wget\n";
332 $loop = 0; # out of outer while loop
333 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
334
335 # Sometimes the wget process takes some time to start up. If the STOP signal
336 # was sent, don't try to terminate the process until we know it is running.
337 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
338 # for it to start up, checking for whether it is running in order to kill it.
339 for(my $seconds = 1; $seconds <= 5; $seconds++) {
340 if(kill(0, $childpid)) {
341 # If kill(0, $childpid) returns true, then the process is running
342 # and we need to kill it.
343 close($chld_in);
344 close($chld_out);
345 kill("TERM", $childpid);
346
347 $childpid = undef;
348
349 # Stop monitoring the read_handle and close the serverSocket
350 # (the Java end will close the client socket that Java opened)
351 $read_set->remove($rh); #$read_set->remove($serverSocket);
352 close($rh); #close($serverSocket);
353 print $client "Perl terminated wget and is about to exit\n";
354 last; # out of inner for loop
355 }
356 else { # the process may just be starting up, wait
357 sleep(1);
358 }
359 }
360 last; # out of foreach loop
361 }
362 }
363 }
364 }
365
366 if ($changed_dir) {
367 chdir $current_dir;
368 }
369
370 return $strReadIn;
371}
372
373
374sub useWgetMonitored
375{
376 #local $| = 1; # autoflush stdout buffer
377 #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
378
379 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
380
381
382 my $current_dir = cwd();
383 my $changed_dir = 0;
384 if (defined $working_dir && -e $working_dir) {
385 chdir "$working_dir";
386 $changed_dir = 1;
387 }
388
389 # When we are running this script through GLI, the SIGTERM signal handler
390 # won't get called on Windows when wget is to be prematurely terminated.
391 # Instead, when wget has to be terminated in the middle of execution, GLI will
392 # connect to a serverSocket here to communicate when it's time to stop wget.
393 if($self->dealingWithSockets()) {
394
395 $port = <STDIN>; # gets a port on localhost that's not yet in use
396 chomp($port);
397
398 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
399 LocalPort => $port,
400 Listen => 1,
401 Reuse => 1);
402
403 die "can't setup server" unless $serverSocket;
404 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
405
406 $read_set = new IO::Select(); # create handle set for reading
407 $read_set->add($serverSocket); # add the main socket to the set
408 }
409
410 my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
411 my $command = "\"$wget_file_path\" $cmdWget";
412 #print STDOUT "Command is: $command\n";
413
414 eval { # see p.568 of Perl Cookbook
415 $childpid = open3($chld_in, $chld_out, $chld_out, $command);
416 };
417 if ($@) {
418 if($@ =~ m/^open3/) {
419 die "open3 failed in $0: $!\n$@\n";
420 }
421 die "Tried to launch open3 in $0, got unexpected exception: $@";
422 }
423
424 my $full_text = "";
425 my $error_text = "";
426 my @follow_list = ();
427 my $line;
428
429 my $loop = 1;
430 while($loop)
431 {
432 if (defined($line=<$chld_out>)) { # we're reading in from child process' stdout
433 if((defined $blnShow) && $blnShow)
434 {
435 print STDERR "$line";
436 }
437
438 if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
439 my $follow_url = $1;
440 push(@follow_list,$follow_url);
441 }
442
443 if ($line =~ m/ERROR\s+\d+/) {
444 $error_text .= $line;
445 }
446
447 $full_text .= $line;
448 }
449 else { # wget finished, terminate naturally
450 #print STDERR "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
451 close($chld_in);
452 close($chld_out);
453 # Program terminates only when the following line is included
454 # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
455 # it prevents the child from turning into a "zombie process".
456 # While the wget process terminates without it, this perl script does not:
457 # the DOS prompt is not returned without it.
458 waitpid $childpid, 0;
459 $loop = 0;
460
461 $childpid = undef;
462 if(defined $port) {
463 $read_set->remove($serverSocket);
464 close($serverSocket);
465 }
466 }
467
468 # if we run this script from the command-line (as opposed to from GLI),
469 # then we're not working with sockets and can therefore can skip the next bits
470 next unless(defined $port);
471
472 # http://www.perlfect.com/articles/select.shtml
473 # "multiplex between several filehandles within a single thread of control,
474 # thus creating the effect of parallelism in the handling of I/O."
475 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
476
477 # take all readable handles in turn
478 foreach my $rh (@rh_set) {
479 if($rh == $serverSocket) {
480 my $client = $rh->accept();
481 #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
482 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
483
484 # Read from the client (getting rid of trailing newline)
485 # Has the client sent the <<STOP>> signal?
486 my $signal = <$client>;
487 chomp($signal);
488 if($signal eq "<<STOP>>") {
489 print $client "Perl received STOP signal (on port $port): stopping wget\n";
490 $loop = 0; # out of outer while loop
491 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
492
493 # Sometimes the wget process takes some time to start up. If the STOP signal
494 # was sent, don't try to terminate the process until we know it is running.
495 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
496 # for it to start up, checking for whether it is running in order to kill it.
497 for(my $seconds = 1; $seconds <= 5; $seconds++) {
498 if(kill(0, $childpid)) {
499 # If kill(0, $childpid) returns true, then the process is running
500 # and we need to kill it.
501 close($chld_in);
502 close($chld_out);
503 kill("TERM", $childpid);
504
505 $childpid = undef;
506
507 # Stop monitoring the read_handle and close the serverSocket
508 # (the Java end will close the client socket that Java opened)
509 $read_set->remove($rh); #$read_set->remove($serverSocket);
510 close($rh); #close($serverSocket);
511 print $client "Perl terminated wget and is about to exit\n";
512 last; # out of inner for loop
513 }
514 else { # the process may just be starting up, wait
515 sleep(1);
516 }
517 }
518 last; # out of foreach loop
519 }
520 }
521 }
522 }
523
524 my $command_status = $?;
525 if ($command_status != 0) {
526 $error_text .= "Exit error: $command_status";
527 }
528
529 if ($changed_dir) {
530 chdir $current_dir;
531 }
532
533 my $final_follow = pop(@follow_list); # might be undefined, but that's OK
534
535 return ($full_text,$error_text,$final_follow);
536}
537
538
539# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
540sub checkURL
541{
542 my ($self) = @_;
543 if ($self->{'url'} eq "")
544 {
545 &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
546 }
547}
548
549sub error
550{
551 my ($strFunctionName,$strError) = @_;
552 {
553 print "Error occoured in WgetDownload.pm\n".
554 "In Function:".$strFunctionName."\n".
555 "Error Message:".$strError."\n";
556 exit(-1);
557 }
558}
559
5601;
561
Note: See TracBrowser for help on using the repository browser.