source: main/trunk/greenstone2/perllib/downloaders/WgetDownload.pm@ 30520

Last change on this file since 30520 was 30520, checked in by ak19, 8 years ago

Refactoring activate.pl into activate.pm (class, OOP) and activate.pl. Now buildcolutils.pm uses do_deactivate() from activate.pm.

  • Property svn:keywords set to Author Date Id Revision
File size: 18.3 KB
Line 
1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use BaseDownload;
35use strict;
36use Cwd;
37use util;
38use IPC::Open3;
39use IO::Select;
40use IO::Socket;
41
42
43sub BEGIN {
44 @WgetDownload::ISA = ('BaseDownload');
45}
46
47my $arguments =
48 [ { 'name' => "proxy_on",
49 'desc' => "{WgetDownload.proxy_on}",
50 'type' => "flag",
51 'reqd' => "no",
52 'hiddengli' => "yes"},
53 { 'name' => "proxy_host",
54 'desc' => "{WgetDownload.proxy_host}",
55 'type' => "string",
56 'reqd' => "no",
57 'hiddengli' => "yes"},
58 { 'name' => "proxy_port",
59 'desc' => "{WgetDownload.proxy_port}",
60 'type' => "string",
61 'reqd' => "no",
62 'hiddengli' => "yes"},
63 { 'name' => "user_name",
64 'desc' => "{WgetDownload.user_name}",
65 'type' => "string",
66 'reqd' => "no",
67 'hiddengli' => "yes"},
68 { 'name' => "user_password",
69 'desc' => "{WgetDownload.user_password}",
70 'type' => "string",
71 'reqd' => "no",
72 'hiddengli' => "yes"}];
73
74my $options = { 'name' => "WgetDownload",
75 'desc' => "{WgetDownload.desc}",
76 'abstract' => "yes",
77 'inherits' => "yes",
78 'args' => $arguments };
79
80
81# Declaring file global variables related to the wget child process so that
82# the termination signal handler for SIGTERM can close the streams and tidy
83# up before ending the child process.
84my $childpid;
85my ($chld_out, $chld_in);
86my ($serverSocket, $read_set);
87
88# The port this script's server socket will be listening on, to handle
89# incoming signals from GLI to terminate wget. This is also file global,
90# since OAIDownload.pm will make several calls on wget using the same
91# instance of this script and we want to reuse whatever port GLI gave us.
92my $port;
93
94# When this script is called from the command line, this handler will be called
95# if this process is killed or abruptly ends due to receiving one of the
96# terminating signals that this handler is registered to deal with.
97sub abrupt_end_handler {
98 my $termination_signal = shift (@_);
99
100 if(defined $childpid) {
101 close($chld_out);
102 close($chld_in);
103
104 print STDOUT "Received termination signal: $termination_signal\n";
105
106 # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
107 # See http://perldoc.perl.org/perlipc.html#Signals
108 # Warning on using kill at http://perldoc.perl.org/perlfork.html
109 kill("TERM", $childpid);
110
111 # If the SIGTERM sent on Linux calls this handler, we want to make
112 # sure any socket connection is closed.
113 # Otherwise sockets are only used when this script is run from GLI
114 # in which case the handlers don't really get called.
115 if(defined $serverSocket) {
116 $read_set->remove($serverSocket) if defined $read_set;
117 close($serverSocket);
118 }
119 }
120
121 exit(0);
122}
123
124# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
125# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
126# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
127# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
128# http://affy.blogspot.com/p5be/ch13.htm
129# http://perldoc.perl.org/perlipc.html#Signals
130$SIG{'INT'} = \&abrupt_end_handler;
131$SIG{'TERM'} = \&abrupt_end_handler;
132
133sub new {
134 my ($class) = shift (@_);
135 my ($getlist,$inputargs,$hashArgOptLists) = @_;
136 push(@$getlist, $class);
137
138 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
139 push(@{$hashArgOptLists->{"OptList"}},$options);
140
141 my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
142
143 # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
144 &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set
145
146 return bless $self, $class;
147}
148
149sub checkWgetSetup
150{
151 my ($self,$blnGliCall) = @_;
152 #TODO: proxy detection??
153
154 if((!$blnGliCall) && $self->{'proxy_on'})
155 {
156 &checkProxySetup($self);
157 }
158 &checkURL($self);
159}
160
161sub getWgetOptions
162{
163 my ($self) = @_;
164 my $strOptions = "";
165
166 if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'})
167 {
168
169 $strOptions .= " -e httpproxy=$self->{'proxy_host'}:$self->{'proxy_port'} ";
170
171 if ($self->{'user_name'} && $self->{'user_password'})
172 {
173 $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";
174 }
175 }
176
177 if ($self->{'proxy_on'}) {
178 $strOptions .= " --proxy ";
179 }
180
181 return $strOptions;
182}
183
184# Checking for proxy setup: proxy server, proxy port, proxy username and password.
185sub checkProxySetup
186{
187 my ($self) = @_;
188 ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
189 # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
190 # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
191
192}
193
194# Returns true if the wget status needs to be monitored through sockets
195# (if a socket is used to communicate with the Java program on when to
196# terminate wget). True if we are running gli, or if the particular type
197# of WgetDownload is *not* OAIDownload (in that case, the original way of
198# terminating the perl script from Java terminated wget as well).
199sub dealingWithSockets() {
200 my ($self) = @_;
201 return (defined $self->{'gli'} && $self->{'gli'} && !defined $port && ref($self) ne "OAIDownload");
202 # use ref($self) to find the classname of an object
203}
204
205
206sub useWget
207{
208 #local $| = 1; # autoflush stdout buffer
209 #print STDOUT "*** Start of subroutine useWget in $0\n";
210
211 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
212
213 my ($strReadIn,$strLine,$command);
214 $strReadIn = "" unless defined $strReadIn;
215
216 my $current_dir = cwd();
217 my $changed_dir = 0;
218 if (defined $working_dir && -e $working_dir) {
219 chdir "$working_dir";
220 $changed_dir = 1;
221 }
222
223 # When we are running this script through GLI, the SIGTERM signal handler
224 # won't get called on Windows when wget is to be prematurely terminated.
225 # Instead, when wget has to be terminated in the middle of execution, GLI will
226 # connect to a serverSocket here to communicate when it's time to stop wget.
227 if($self->dealingWithSockets()) {
228
229 $port = <STDIN>; # gets a port on localhost that's not yet in use
230 chomp($port);
231
232 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
233 LocalPort => $port,
234 Listen => 1,
235 Reuse => 1);
236
237 die "can't setup server" unless $serverSocket;
238 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
239
240 $read_set = new IO::Select(); # create handle set for reading
241 $read_set->add($serverSocket); # add the main socket to the set
242 }
243
244 my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
245 $command = "\"$wget_file_path\" $cmdWget";
246 #print STDOUT "Command is: $command\n";
247
248 # Wget's output needs to be monitored to find out when it has naturally terminated.
249 # Wget's output is sent to its STDERR so we can't use open2 without doing 2>&1.
250 # On linux, 2>&1 launches a subshell which then launches wget, meaning that killing
251 # the childpid does not kill wget on Linux but the subshell that launched it instead.
252 # Therefore, we use open3. Though the child process wget sends output only to its stdout,
253 # using open3 says chld_err is undefined and the output of wget only comes in chld_out(!)
254 # However that may be, it works with open3. But to avoid the confusion of managing and
255 # closing an extra unused handle, a single handle is used instead for both the child's
256 # stderr and stdout.
257
258 # Both open2 and open3 don't return on failure, but raise an exception. The handling
259 # of the exception is described on p.568 of the Perl Cookbook
260 eval {
261 $childpid = open3($chld_in, $chld_out, $chld_out, $command);
262 };
263 if ($@) {
264 if($@ =~ m/^open3/) {
265 die "open3 failed in $0: $!\n$@\n";
266 }
267 die "Tried to launch open3 in $0, got unexpected exception: $@";
268 }
269
270 my $loop = 1;
271 while($loop)
272 {
273 if (defined(my $strLine=<$chld_out>)) { # we're reading in from child process' stdout
274 if($blnShow) {
275 print STDERR "$strLine\n";
276 }
277 $strReadIn .= $strLine;
278 }
279 else { # wget finished, terminate naturally
280 #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
281 close($chld_in);
282 close($chld_out);
283 waitpid $childpid, 0;
284 $loop = 0;
285
286 $childpid = undef;
287 if(defined $port) {
288 $read_set->remove($serverSocket);
289 close($serverSocket);
290 }
291 }
292
293 # if we run this script from the command-line (as opposed to from GLI),
294 # then we're not working with sockets and can therefore can skip the next bits
295 next unless(defined $port);
296
297 # http://www.perlfect.com/articles/select.shtml
298 # "multiplex between several filehandles within a single thread of control,
299 # thus creating the effect of parallelism in the handling of I/O."
300 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
301
302 # take all readable handles in turn
303 foreach my $rh (@rh_set) {
304 if($rh == $serverSocket) {
305 my $client = $rh->accept();
306 #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
307 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
308
309 # Read from the client (getting rid of the trailing newline)
310 # Has the client sent the <<STOP>> signal?
311 my $signal = <$client>;
312 chomp($signal);
313 if($signal eq "<<STOP>>") {
314 print $client "Perl received STOP signal (on port $port): stopping wget\n";
315 $loop = 0; # out of outer while loop
316 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
317
318 # Sometimes the wget process takes some time to start up. If the STOP signal
319 # was sent, don't try to terminate the process until we know it is running.
320 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
321 # for it to start up, checking for whether it is running in order to kill it.
322 for(my $seconds = 1; $seconds <= 5; $seconds++) {
323 if(kill(0, $childpid)) {
324 # If kill(0, $childpid) returns true, then the process is running
325 # and we need to kill it.
326 close($chld_in);
327 close($chld_out);
328 kill("TERM", $childpid);
329
330 $childpid = undef;
331
332 # Stop monitoring the read_handle and close the serverSocket
333 # (the Java end will close the client socket that Java opened)
334 $read_set->remove($rh); #$read_set->remove($serverSocket);
335 close($rh); #close($serverSocket);
336 print $client "Perl terminated wget and is about to exit\n";
337 last; # out of inner for loop
338 }
339 else { # the process may just be starting up, wait
340 sleep(1);
341 }
342 }
343 last; # out of foreach loop
344 }
345 }
346 }
347 }
348
349 if ($changed_dir) {
350 chdir $current_dir;
351 }
352
353 return $strReadIn;
354}
355
356
357sub useWgetMonitored
358{
359 #local $| = 1; # autoflush stdout buffer
360 #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
361
362 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
363
364
365 my $current_dir = cwd();
366 my $changed_dir = 0;
367 if (defined $working_dir && -e $working_dir) {
368 chdir "$working_dir";
369 $changed_dir = 1;
370 }
371
372 # When we are running this script through GLI, the SIGTERM signal handler
373 # won't get called on Windows when wget is to be prematurely terminated.
374 # Instead, when wget has to be terminated in the middle of execution, GLI will
375 # connect to a serverSocket here to communicate when it's time to stop wget.
376 if($self->dealingWithSockets()) {
377
378 $port = <STDIN>; # gets a port on localhost that's not yet in use
379 chomp($port);
380
381 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
382 LocalPort => $port,
383 Listen => 1,
384 Reuse => 1);
385
386 die "can't setup server" unless $serverSocket;
387 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
388
389 $read_set = new IO::Select(); # create handle set for reading
390 $read_set->add($serverSocket); # add the main socket to the set
391 }
392
393 my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
394 my $command = "\"$wget_file_path\" $cmdWget";
395 #print STDOUT "Command is: $command\n";
396
397 eval { # see p.568 of Perl Cookbook
398 $childpid = open3($chld_in, $chld_out, $chld_out, $command);
399 };
400 if ($@) {
401 if($@ =~ m/^open3/) {
402 die "open3 failed in $0: $!\n$@\n";
403 }
404 die "Tried to launch open3 in $0, got unexpected exception: $@";
405 }
406
407 my $full_text = "";
408 my $error_text = "";
409 my @follow_list = ();
410 my $line;
411
412 my $loop = 1;
413 while($loop)
414 {
415 if (defined($line=<$chld_out>)) { # we're reading in from child process' stdout
416 if((defined $blnShow) && $blnShow)
417 {
418 print STDERR "$line";
419 }
420
421 if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
422 my $follow_url = $1;
423 push(@follow_list,$follow_url);
424 }
425
426 if ($line =~ m/ERROR\s+\d+/) {
427 $error_text .= $line;
428 }
429
430 $full_text .= $line;
431 }
432 else { # wget finished, terminate naturally
433 #print STDERR "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
434 close($chld_in);
435 close($chld_out);
436 # Program terminates only when the following line is included
437 # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
438 # it prevents the child from turning into a "zombie process".
439 # While the wget process terminates without it, this perl script does not:
440 # the DOS prompt is not returned without it.
441 waitpid $childpid, 0;
442 $loop = 0;
443
444 $childpid = undef;
445 if(defined $port) {
446 $read_set->remove($serverSocket);
447 close($serverSocket);
448 }
449 }
450
451 # if we run this script from the command-line (as opposed to from GLI),
452 # then we're not working with sockets and can therefore can skip the next bits
453 next unless(defined $port);
454
455 # http://www.perlfect.com/articles/select.shtml
456 # "multiplex between several filehandles within a single thread of control,
457 # thus creating the effect of parallelism in the handling of I/O."
458 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
459
460 # take all readable handles in turn
461 foreach my $rh (@rh_set) {
462 if($rh == $serverSocket) {
463 my $client = $rh->accept();
464 #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
465 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
466
467 # Read from the client (getting rid of trailing newline)
468 # Has the client sent the <<STOP>> signal?
469 my $signal = <$client>;
470 chomp($signal);
471 if($signal eq "<<STOP>>") {
472 print $client "Perl received STOP signal (on port $port): stopping wget\n";
473 $loop = 0; # out of outer while loop
474 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
475
476 # Sometimes the wget process takes some time to start up. If the STOP signal
477 # was sent, don't try to terminate the process until we know it is running.
478 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
479 # for it to start up, checking for whether it is running in order to kill it.
480 for(my $seconds = 1; $seconds <= 5; $seconds++) {
481 if(kill(0, $childpid)) {
482 # If kill(0, $childpid) returns true, then the process is running
483 # and we need to kill it.
484 close($chld_in);
485 close($chld_out);
486 kill("TERM", $childpid);
487
488 $childpid = undef;
489
490 # Stop monitoring the read_handle and close the serverSocket
491 # (the Java end will close the client socket that Java opened)
492 $read_set->remove($rh); #$read_set->remove($serverSocket);
493 close($rh); #close($serverSocket);
494 print $client "Perl terminated wget and is about to exit\n";
495 last; # out of inner for loop
496 }
497 else { # the process may just be starting up, wait
498 sleep(1);
499 }
500 }
501 last; # out of foreach loop
502 }
503 }
504 }
505 }
506
507 my $command_status = $?;
508 if ($command_status != 0) {
509 $error_text .= "Exit error: $command_status";
510 }
511
512 if ($changed_dir) {
513 chdir $current_dir;
514 }
515
516 my $final_follow = pop(@follow_list); # might be undefined, but that's OK
517
518 return ($full_text,$error_text,$final_follow);
519}
520
521
522# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
523sub checkURL
524{
525 my ($self) = @_;
526 if ($self->{'url'} eq "")
527 {
528 &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
529 }
530}
531
532sub error
533{
534 my ($strFunctionName,$strError) = @_;
535 {
536 print "Error occoured in WgetDownload.pm\n".
537 "In Function:".$strFunctionName."\n".
538 "Error Message:".$strError."\n";
539 exit(-1);
540 }
541}
542
5431;
544
Note: See TracBrowser for help on using the repository browser.