source: main/trunk/greenstone2/perllib/downloaders/WgetDownload.pm@ 31864

Last change on this file since 31864 was 31864, checked in by ak19, 7 years ago
  1. Modified GLI and perl to set proxy_on, proxy_host and proxy_port for unix too, whereas in the past, only http(s)_proxy ENV vars were set by GLI for unix. If the env vars are not set, then perl assumes it's the wget setup for windows and passes the proxy vars that were set by GLI as flags to wget. If the env vars were set, then perl should run wget without setting the proxy vars as flags to wget, and wget will use the proxying info in the environment. So now, the decision as to whether proxy vars are set or not will result in context specific suggestion messages to the user when the Server Info button was pressed and if the URL could not be accessed for whatever reason. 2. Modified GLI code to not use proxy if proxying is toggled off, but use proxy when it's toggled back on. (Previously, if proxying was set at any point during a GLI session, then GLI remembered that even when proxying was turned off thereafter.)
  • Property svn:keywords set to Author Date Id Revision
File size: 19.6 KB
Line 
1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use BaseDownload;
35use strict;
36use Cwd;
37use util;
38use IPC::Open3;
39use IO::Select;
40use IO::Socket;
41
42
43sub BEGIN {
44 @WgetDownload::ISA = ('BaseDownload');
45}
46
47my $arguments =
48 [ { 'name' => "proxy_on",
49 'desc' => "{WgetDownload.proxy_on}",
50 'type' => "flag",
51 'reqd' => "no",
52 'hiddengli' => "yes"},
53 { 'name' => "proxy_host",
54 'desc' => "{WgetDownload.proxy_host}",
55 'type' => "string",
56 'reqd' => "no",
57 'hiddengli' => "yes"},
58 { 'name' => "proxy_port",
59 'desc' => "{WgetDownload.proxy_port}",
60 'type' => "string",
61 'reqd' => "no",
62 'hiddengli' => "yes"},
63 { 'name' => "user_name",
64 'desc' => "{WgetDownload.user_name}",
65 'type' => "string",
66 'reqd' => "no",
67 'hiddengli' => "yes"},
68 { 'name' => "user_password",
69 'desc' => "{WgetDownload.user_password}",
70 'type' => "string",
71 'reqd' => "no",
72 'hiddengli' => "yes"},
73 { 'name' => "no_check_certificate",
74 'desc' => "{WgetDownload.no_check_certificate}",
75 'type' => "flag",
76 'reqd' => "no",
77 'hiddengli' => "yes"}
78 ];
79
80my $options = { 'name' => "WgetDownload",
81 'desc' => "{WgetDownload.desc}",
82 'abstract' => "yes",
83 'inherits' => "yes",
84 'args' => $arguments };
85
86
87# Declaring file global variables related to the wget child process so that
88# the termination signal handler for SIGTERM can close the streams and tidy
89# up before ending the child process.
90my $childpid;
91my ($chld_out, $chld_in);
92my ($serverSocket, $read_set);
93
94# The port this script's server socket will be listening on, to handle
95# incoming signals from GLI to terminate wget. This is also file global,
96# since OAIDownload.pm will make several calls on wget using the same
97# instance of this script and we want to reuse whatever port GLI gave us.
98my $port;
99
100# When this script is called from the command line, this handler will be called
101# if this process is killed or abruptly ends due to receiving one of the
102# terminating signals that this handler is registered to deal with.
103sub abrupt_end_handler {
104 my $termination_signal = shift (@_);
105
106 if(defined $childpid) {
107 close($chld_out);
108 close($chld_in);
109
110 print STDOUT "Received termination signal: $termination_signal\n";
111
112 # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
113 # See http://perldoc.perl.org/perlipc.html#Signals
114 # Warning on using kill at http://perldoc.perl.org/perlfork.html
115 kill("TERM", $childpid);
116
117 # If the SIGTERM sent on Linux calls this handler, we want to make
118 # sure any socket connection is closed.
119 # Otherwise sockets are only used when this script is run from GLI
120 # in which case the handlers don't really get called.
121 if(defined $serverSocket) {
122 $read_set->remove($serverSocket) if defined $read_set;
123 close($serverSocket);
124 }
125 }
126
127 exit(0);
128}
129
130# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
131# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
132# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
133# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
134# http://affy.blogspot.com/p5be/ch13.htm
135# http://perldoc.perl.org/perlipc.html#Signals
136$SIG{'INT'} = \&abrupt_end_handler;
137$SIG{'TERM'} = \&abrupt_end_handler;
138
139sub new {
140 my ($class) = shift (@_);
141 my ($getlist,$inputargs,$hashArgOptLists) = @_;
142 push(@$getlist, $class);
143
144 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
145 push(@{$hashArgOptLists->{"OptList"}},$options);
146
147 my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
148
149 # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
150 &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set
151
152 return bless $self, $class;
153}
154
155sub checkWgetSetup
156{
157 my ($self,$blnGliCall) = @_;
158 #TODO: proxy detection??
159
160 if((!$blnGliCall) && $self->{'proxy_on'})
161 {
162 &checkProxySetup($self);
163 }
164 &checkURL($self);
165}
166
167sub getWgetOptions
168{
169 my ($self) = @_;
170 my $strOptions = "";
171
172 # If http_proxy ENV VARS are not set, but proxy Perl vars are set, then we're on Windows
173 # and need to use the proxy vars as flags to wget
174 # If http_proxy Env Vars are set, then we're on Linux, wget will use the http(s) proxy
175 # env vars and we shouldn't be passing any proxy perl vars as flags
176
177 # Truth in Perl: https://home.ubalt.edu/abento/452/perl/perltruth.html
178 # http://www.perlmonks.org/?node=what%20is%20true%20and%20false%20in%20Perl%3F
179
180 if (!$ENV{'http_proxy'} && !$ENV{'https_proxy'}) {
181 if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'})
182 {
183
184 if($self->{'url'} =~ m/^https\:/) {
185 $strOptions .= " -e https_proxy=$self->{'proxy_host'}:$self->{'proxy_port'} ";
186 } else {
187 $strOptions .= " -e http_proxy=$self->{'proxy_host'}:$self->{'proxy_port'} ";
188 }
189
190 if ($self->{'user_name'} && $self->{'user_password'})
191 {
192 $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";
193 }
194 }
195
196 if ($self->{'proxy_on'}) {
197 $strOptions .= " --proxy ";
198 }
199 }
200
201 if($self->{'no_check_certificate'}) { #&& $self->{'url'} =~ m/^https\:/) { # URL may be http that gets redirected to https, so if no_check_certificate is on, turn it on even if URL is http
202
203 $strOptions .= " --no-check-certificate ";
204 }
205
206 return $strOptions;
207}
208
209# Checking for proxy setup: proxy server, proxy port, proxy username and password.
210sub checkProxySetup
211{
212 my ($self) = @_;
213 ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
214 # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
215 # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
216
217}
218
219# Returns true if the wget status needs to be monitored through sockets
220# (if a socket is used to communicate with the Java program on when to
221# terminate wget). True if we are running gli, or if the particular type
222# of WgetDownload is *not* OAIDownload (in that case, the original way of
223# terminating the perl script from Java terminated wget as well).
224sub dealingWithSockets() {
225 my ($self) = @_;
226 return (defined $self->{'gli'} && $self->{'gli'} && !defined $port && ref($self) ne "OAIDownload");
227 # use ref($self) to find the classname of an object
228}
229
230
231sub useWget
232{
233 #local $| = 1; # autoflush stdout buffer
234 #print STDOUT "*** Start of subroutine useWget in $0\n";
235
236 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
237
238 my ($strReadIn,$strLine,$command);
239 $strReadIn = "" unless defined $strReadIn;
240
241 my $current_dir = cwd();
242 my $changed_dir = 0;
243 if (defined $working_dir && -e $working_dir) {
244 chdir "$working_dir";
245 $changed_dir = 1;
246 }
247
248 # When we are running this script through GLI, the SIGTERM signal handler
249 # won't get called on Windows when wget is to be prematurely terminated.
250 # Instead, when wget has to be terminated in the middle of execution, GLI will
251 # connect to a serverSocket here to communicate when it's time to stop wget.
252 if($self->dealingWithSockets()) {
253
254 $port = <STDIN>; # gets a port on localhost that's not yet in use
255 chomp($port);
256
257 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
258 LocalPort => $port,
259 Listen => 1,
260 Reuse => 1);
261
262 die "can't setup server" unless $serverSocket;
263 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
264
265 $read_set = new IO::Select(); # create handle set for reading
266 $read_set->add($serverSocket); # add the main socket to the set
267 }
268
269 my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
270 $command = "\"$wget_file_path\" $cmdWget";
271 #print STDOUT "Command is: $command\n"; # displayed in GLI output
272 #print STDERR "Command is: $command\n"; # goes into ServerInfoDialog
273
274 # Wget's output needs to be monitored to find out when it has naturally terminated.
275 # Wget's output is sent to its STDERR so we can't use open2 without doing 2>&1.
276 # On linux, 2>&1 launches a subshell which then launches wget, meaning that killing
277 # the childpid does not kill wget on Linux but the subshell that launched it instead.
278 # Therefore, we use open3. Though the child process wget sends output only to its stdout,
279 # using open3 says chld_err is undefined and the output of wget only comes in chld_out(!)
280 # However that may be, it works with open3. But to avoid the confusion of managing and
281 # closing an extra unused handle, a single handle is used instead for both the child's
282 # stderr and stdout.
283 # See http://blog.0x1fff.com/2009/09/howto-execute-system-commands-in-perl.html
284 # for why this is the right thing to do.
285
286 # Both open2 and open3 don't return on failure, but raise an exception. The handling
287 # of the exception is described on p.568 of the Perl Cookbook
288 eval {
289 $childpid = open3($chld_in, $chld_out, $chld_out, $command);
290 };
291 if ($@) {
292 if($@ =~ m/^open3/) {
293 die "open3 failed in $0: $!\n$@\n";
294 }
295 die "Tried to launch open3 in $0, got unexpected exception: $@";
296 }
297
298 my $loop = 1;
299 while($loop)
300 {
301 if (defined(my $strLine=<$chld_out>)) { # we're reading in from child process' stdout
302 if($blnShow) {
303 print STDERR "$strLine\n";
304 }
305 $strReadIn .= $strLine;
306 }
307 else { # wget finished, terminate naturally
308 #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
309 close($chld_in);
310 close($chld_out);
311 waitpid $childpid, 0;
312 $loop = 0;
313
314 $childpid = undef;
315 if(defined $port) {
316 $read_set->remove($serverSocket);
317 close($serverSocket);
318 }
319 }
320
321 # if we run this script from the command-line (as opposed to from GLI),
322 # then we're not working with sockets and can therefore can skip the next bits
323 next unless(defined $port);
324
325 # http://www.perlfect.com/articles/select.shtml
326 # "multiplex between several filehandles within a single thread of control,
327 # thus creating the effect of parallelism in the handling of I/O."
328 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
329
330 # take all readable handles in turn
331 foreach my $rh (@rh_set) {
332 if($rh == $serverSocket) {
333 my $client = $rh->accept();
334 #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
335 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
336
337 # Read from the client (getting rid of the trailing newline)
338 # Has the client sent the <<STOP>> signal?
339 my $signal = <$client>;
340 chomp($signal);
341 if($signal eq "<<STOP>>") {
342 print $client "Perl received STOP signal (on port $port): stopping wget\n";
343 $loop = 0; # out of outer while loop
344 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
345
346 # Sometimes the wget process takes some time to start up. If the STOP signal
347 # was sent, don't try to terminate the process until we know it is running.
348 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
349 # for it to start up, checking for whether it is running in order to kill it.
350 for(my $seconds = 1; $seconds <= 5; $seconds++) {
351 if(kill(0, $childpid)) {
352 # If kill(0, $childpid) returns true, then the process is running
353 # and we need to kill it.
354 close($chld_in);
355 close($chld_out);
356 kill("TERM", $childpid);
357
358 $childpid = undef;
359
360 # Stop monitoring the read_handle and close the serverSocket
361 # (the Java end will close the client socket that Java opened)
362 $read_set->remove($rh); #$read_set->remove($serverSocket);
363 close($rh); #close($serverSocket);
364 print $client "Perl terminated wget and is about to exit\n";
365 last; # out of inner for loop
366 }
367 else { # the process may just be starting up, wait
368 sleep(1);
369 }
370 }
371 last; # out of foreach loop
372 }
373 }
374 }
375 }
376
377 if ($changed_dir) {
378 chdir $current_dir;
379 }
380
381 return $strReadIn;
382}
383
384
385sub useWgetMonitored
386{
387 #local $| = 1; # autoflush stdout buffer
388 #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
389
390 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
391
392
393 my $current_dir = cwd();
394 my $changed_dir = 0;
395 if (defined $working_dir && -e $working_dir) {
396 chdir "$working_dir";
397 $changed_dir = 1;
398 }
399
400 # When we are running this script through GLI, the SIGTERM signal handler
401 # won't get called on Windows when wget is to be prematurely terminated.
402 # Instead, when wget has to be terminated in the middle of execution, GLI will
403 # connect to a serverSocket here to communicate when it's time to stop wget.
404 if($self->dealingWithSockets()) {
405
406 $port = <STDIN>; # gets a port on localhost that's not yet in use
407 chomp($port);
408
409 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
410 LocalPort => $port,
411 Listen => 1,
412 Reuse => 1);
413
414 die "can't setup server" unless $serverSocket;
415 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
416
417 $read_set = new IO::Select(); # create handle set for reading
418 $read_set->add($serverSocket); # add the main socket to the set
419 }
420
421 my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
422 my $command = "\"$wget_file_path\" $cmdWget";
423 #print STDOUT "Command is: $command\n";
424
425 eval { # see p.568 of Perl Cookbook
426 $childpid = open3($chld_in, $chld_out, $chld_out, $command);
427 };
428 if ($@) {
429 if($@ =~ m/^open3/) {
430 die "open3 failed in $0: $!\n$@\n";
431 }
432 die "Tried to launch open3 in $0, got unexpected exception: $@";
433 }
434
435 my $full_text = "";
436 my $error_text = "";
437 my @follow_list = ();
438 my $line;
439
440 my $loop = 1;
441 while($loop)
442 {
443 if (defined($line=<$chld_out>)) { # we're reading in from child process' stdout
444 if((defined $blnShow) && $blnShow)
445 {
446 print STDERR "$line";
447 }
448
449 if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
450 my $follow_url = $1;
451 push(@follow_list,$follow_url);
452 }
453
454 if ($line =~ m/ERROR\s+\d+/) {
455 $error_text .= $line;
456 }
457
458 $full_text .= $line;
459 }
460 else { # wget finished, terminate naturally
461 #print STDERR "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
462 close($chld_in);
463 close($chld_out);
464 # Program terminates only when the following line is included
465 # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
466 # it prevents the child from turning into a "zombie process".
467 # While the wget process terminates without it, this perl script does not:
468 # the DOS prompt is not returned without it.
469 waitpid $childpid, 0;
470 $loop = 0;
471
472 $childpid = undef;
473 if(defined $port) {
474 $read_set->remove($serverSocket);
475 close($serverSocket);
476 }
477 }
478
479 # if we run this script from the command-line (as opposed to from GLI),
480 # then we're not working with sockets and can therefore can skip the next bits
481 next unless(defined $port);
482
483 # http://www.perlfect.com/articles/select.shtml
484 # "multiplex between several filehandles within a single thread of control,
485 # thus creating the effect of parallelism in the handling of I/O."
486 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
487
488 # take all readable handles in turn
489 foreach my $rh (@rh_set) {
490 if($rh == $serverSocket) {
491 my $client = $rh->accept();
492 #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
493 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
494
495 # Read from the client (getting rid of trailing newline)
496 # Has the client sent the <<STOP>> signal?
497 my $signal = <$client>;
498 chomp($signal);
499 if($signal eq "<<STOP>>") {
500 print $client "Perl received STOP signal (on port $port): stopping wget\n";
501 $loop = 0; # out of outer while loop
502 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
503
504 # Sometimes the wget process takes some time to start up. If the STOP signal
505 # was sent, don't try to terminate the process until we know it is running.
506 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
507 # for it to start up, checking for whether it is running in order to kill it.
508 for(my $seconds = 1; $seconds <= 5; $seconds++) {
509 if(kill(0, $childpid)) {
510 # If kill(0, $childpid) returns true, then the process is running
511 # and we need to kill it.
512 close($chld_in);
513 close($chld_out);
514 kill("TERM", $childpid);
515
516 $childpid = undef;
517
518 # Stop monitoring the read_handle and close the serverSocket
519 # (the Java end will close the client socket that Java opened)
520 $read_set->remove($rh); #$read_set->remove($serverSocket);
521 close($rh); #close($serverSocket);
522 print $client "Perl terminated wget and is about to exit\n";
523 last; # out of inner for loop
524 }
525 else { # the process may just be starting up, wait
526 sleep(1);
527 }
528 }
529 last; # out of foreach loop
530 }
531 }
532 }
533 }
534
535 my $command_status = $?;
536 if ($command_status != 0) {
537 $error_text .= "Exit error: $command_status";
538 }
539
540 if ($changed_dir) {
541 chdir $current_dir;
542 }
543
544 my $final_follow = pop(@follow_list); # might be undefined, but that's OK
545
546 return ($full_text,$error_text,$final_follow);
547}
548
549
550# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
551sub checkURL
552{
553 my ($self) = @_;
554 if ($self->{'url'} eq "")
555 {
556 &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
557 }
558}
559
560sub error
561{
562 my ($strFunctionName,$strError) = @_;
563 {
564 print "Error occoured in WgetDownload.pm\n".
565 "In Function:".$strFunctionName."\n".
566 "Error Message:".$strError."\n";
567 exit(-1);
568 }
569}
570
5711;
572
Note: See TracBrowser for help on using the repository browser.