source: gsdl/trunk/perllib/downloaders/WgetDownload.pm@ 17795

Last change on this file since 17795 was 17795, checked in by ak19, 15 years ago

Bugfix: On Linux, perl launches a subshell (the child process of the perl script) and THAT launches wget - so need to find the pid of wget in order to allow the user of GLI terminate it. This is because we absolutely have to use 2>&1 (otherwise wget blocks until it has finished). Also have tested that the cmdline version of downloadfrom.pl still terminates wget properly when fed Ctrl-C (SIGINT).

  • Property svn:keywords set to Author Date Id Revision
File size: 18.3 KB
RevLine 
[14657]1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
[17207]34use BaseDownload;
[14657]35use strict;
[17529]36use Cwd;
[14657]37use IPC::Open2;
[17529]38use IO::Select;
39use IO::Socket;
[14657]40
[17529]41
[14657]42sub BEGIN {
[17207]43 @WgetDownload::ISA = ('BaseDownload');
[14657]44}
45
46my $arguments =
47 [ { 'name' => "proxy_on",
48 'desc' => "{WgetDownload.proxy_on}",
49 'type' => "flag",
50 'reqd' => "no",
51 'hiddengli' => "yes"},
52 { 'name' => "proxy_host",
53 'desc' => "{WgetDownload.proxy_host}",
54 'type' => "string",
55 'reqd' => "no",
56 'hiddengli' => "yes"},
57 { 'name' => "proxy_port",
58 'desc' => "{WgetDownload.proxy_port}",
59 'type' => "string",
60 'reqd' => "no",
61 'hiddengli' => "yes"},
62 { 'name' => "user_name",
63 'desc' => "{WgetDownload.user_name}",
64 'type' => "string",
65 'reqd' => "no",
66 'hiddengli' => "yes"},
67 { 'name' => "user_password",
68 'desc' => "{WgetDownload.user_password}",
69 'type' => "string",
70 'reqd' => "no",
71 'hiddengli' => "yes"}];
72
73my $options = { 'name' => "WgetDownload",
74 'desc' => "{WgetDownload.desc}",
75 'abstract' => "yes",
76 'inherits' => "yes",
77 'args' => $arguments };
78
79
[17529]80# Declaring file global variables related to the wget child process so that
81# the termination signal handler for SIGTERM can close the streams and tidy
82# up before ending the child process.
[17354]83my $childpid;
[17531]84my ($chld_out, $chld_in);
[17529]85my ($serverSocket, $read_set);
[17354]86
[17531]87# The port this script's server socket will be listening on, to handle
88# incoming signals from GLI to terminate wget. This is also file global,
89# since OAIDownload.pm will make several calls on wget using the same
90# instance of this script and we want to reuse whatever port GLI gave us.
91my $port;
92
[17529]93# When this script is called from the command line, this handler will be called
94# if this process is killed or abruptly ends due to receiving one of the
95# terminating signals that this handler is registered to deal with.
[17354]96sub abrupt_end_handler {
97 my $termination_signal = shift (@_);
[17549]98
[17529]99 if(defined $childpid) {
100 close($chld_out);
101 close($chld_in);
[17354]102
[17547]103 print STDOUT "Received termination signal: $termination_signal\n";
[17529]104
105 # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
106 # See http://perldoc.perl.org/perlipc.html#Signals
107 # Warning on using kill at http://perldoc.perl.org/perlfork.html
108 kill("TERM", $childpid);
109
110 # If the SIGTERM sent on Linux calls this handler, we want to make
111 # sure any socket connection is closed.
112 # Otherwise sockets are only used when this script is run from GLI
113 # in which case the handlers don't really get called.
114 if(defined $serverSocket) {
115 $read_set->remove($serverSocket) if defined $read_set;
116 close($serverSocket);
[17354]117 }
118 }
[17529]119
[17354]120 exit(0);
121}
122
123# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
124# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
125# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
126# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
127# http://affy.blogspot.com/p5be/ch13.htm
128# http://perldoc.perl.org/perlipc.html#Signals
129$SIG{'INT'} = \&abrupt_end_handler;
130$SIG{'TERM'} = \&abrupt_end_handler;
131
[14657]132sub new {
133 my ($class) = shift (@_);
134 my ($getlist,$inputargs,$hashArgOptLists) = @_;
135 push(@$getlist, $class);
136
[17207]137 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
138 push(@{$hashArgOptLists->{"OptList"}},$options);
[14657]139
[17207]140 my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
[14657]141
142 return bless $self, $class;
143}
144
145sub checkWgetSetup
146{
147 my ($self,$blnGliCall) = @_;
148 #TODO: proxy detection??
149
150 if((!$blnGliCall) && $self->{'proxy_on'})
151 {
152 &checkProxySetup($self);
153 }
154 &checkURL($self);
155}
156
157sub getWgetOptions
158{
159 my ($self) = @_;
160 my $strOptions = "";
161
[16791]162 if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'})
[14657]163 {
[14918]164
165 $strOptions .= " -e httpproxy=$self->{'proxy_host'}:$self->{'proxy_port'} ";
166
167 if ($self->{'user_name'} && $self->{'user_password'})
168 {
169 $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";
170 }
[14657]171 }
172
[16791]173 if ($self->{'proxy_on'}) {
174 $strOptions .= " --proxy ";
175 }
[14918]176
[14657]177 return $strOptions;
178}
179
180# Checking for proxy setup: proxy server, proxy port, proxy username and password.
181sub checkProxySetup
182{
183 my ($self) = @_;
184 ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
185 # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
[17529]186 # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
[14657]187
188}
189
[17664]190# Returns true if the wget status needs to be monitored through sockets
191# (if a socket is used to communicate with the Java program on when to
192# terminate wget). True if we are running gli, or if the particular type
193# of WgetDownload is *not* OAIDownload (in that case, the original way of
194# terminating the perl script from Java terminated wget as well).
195sub dealingWithSockets() {
196 my ($self) = @_;
197 return (defined $self->{'gli'} && $self->{'gli'} && !defined $port && ref($self) ne "OAIDownload");
198 # use ref($self) to find the classname of an object
199}
200
[17795]201sub get_childpid {
202 my ($self, $pid) = @_;
203 my $os = $^O;
204 # for windows, we do nothing special
205 if ($os =~ m/mswin/i) {
206 return $pid;
207 }
208
209 # else $os is macos or linux
210 # This means the wget child process may have been spawned from a subshell
211 # (the real child) that was launched by this perl script. However, we want
212 # the pid of the wget process, since that is what we want to terminate.
213
214 sleep(2); # give it some time to start up, else we'd have finished searching beforehand
215
216 # Look through any processes spawned immediately/soon after the subshell.
217 # Look for any pid greater than the subshell's, but within a certain limit (10)
218 my $child = $pid;
219 for(my $i = 1; $i <= 10; $i++) {
220 $child = $child+$i;
221 if(kill(0, $child)) { # this process exists
222 return $child;
223 }
224 }
225
226 return $pid; # could not find any process within limit, so return the original process id?
227}
228
[14657]229sub useWget
230{
[17529]231 #local $| = 1; # autoflush stdout buffer
232 #print STDOUT "*** Start of subroutine useWget in $0\n";
233
[14657]234 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
235
236 my ($strReadIn,$strLine,$command);
237 $strReadIn = "" unless defined $strReadIn;
238
239 my $current_dir = cwd();
240 my $changed_dir = 0;
241 if (defined $working_dir && -e $working_dir) {
242 chdir "$working_dir";
243 $changed_dir = 1;
244 }
245
[17529]246 # When we are running this script through GLI, the SIGTERM signal handler
247 # won't get called on Windows when wget is to be prematurely terminated.
248 # Instead, when wget has to be terminated in the middle of execution, GLI will
249 # connect to a serverSocket here to communicate when it's time to stop wget.
[17664]250 if($self->dealingWithSockets()) {
[14657]251
[17529]252 $port = <STDIN>; # gets a port on localhost that's not yet in use
253 chomp($port);
254
255 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
256 LocalPort => $port,
257 Listen => 1,
258 Reuse => 1);
259
260 die "can't setup server" unless $serverSocket;
261 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
262
263 $read_set = new IO::Select(); # create handle set for reading
264 $read_set->add($serverSocket); # add the main socket to the set
265 }
266
267 my $wget_file_path = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
268 $command = "\"$wget_file_path\" $cmdWget 2>&1";
269 # print STDERR "Command is: $command\n";
[17354]270 $childpid = open2($chld_out, $chld_in, $command) || die "wget request failed: $!\n";
[17795]271
272 # for linux the wget process starts off as achild of a subshell launched as the child of
273 # this perl script. We want the pid of the wget process, not that of the subshell.
274 $childpid = $self->get_childpid($childpid);
[17354]275
[17529]276 my $loop = 1;
277 while($loop)
[14657]278 {
[17529]279 if (defined(my $strLine=<$chld_out>)) { # we're reading in from child process' stdout
280 if($blnShow) {
281 print STDERR "$strLine\n";
282 }
283 $strReadIn .= $strLine;
284 }
285 else { # wget finished, terminate naturally
[17547]286 #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
[17529]287 close($chld_in);
288 close($chld_out);
289 waitpid $childpid, 0;
290 $loop = 0;
291
292 $childpid = undef;
293 if(defined $port) {
294 $read_set->remove($serverSocket);
295 close($serverSocket);
296 }
[14657]297 }
298
[17529]299 # if we run this script from the command-line (as opposed to from GLI),
300 # then we're not working with sockets and can therefore can skip the next bits
301 next unless(defined $port);
302
303 # http://www.perlfect.com/articles/select.shtml
304 # "multiplex between several filehandles within a single thread of control,
305 # thus creating the effect of parallelism in the handling of I/O."
306 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
307
308 # take all readable handles in turn
309 foreach my $rh (@rh_set) {
310 if($rh == $serverSocket) {
311 my $client = $rh->accept();
312 #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
313 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
314
[17547]315 # Read from the client (getting rid of the trailing newline)
[17529]316 # Has the client sent the <<STOP>> signal?
317 my $signal = <$client>;
318 chomp($signal);
319 if($signal eq "<<STOP>>") {
320 print $client "Perl received STOP signal (on port $port): stopping wget\n";
[17549]321 $loop = 0; # out of outer while loop
322 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
[17529]323
[17547]324 # Sometimes the wget process takes some time to start up. If the STOP signal
325 # was sent, don't try to terminate the process until we know it is running.
326 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
327 # for it to start up, checking for whether it is running in order to kill it.
328 for(my $seconds = 1; $seconds <= 5; $seconds++) {
329 if(kill(0, $childpid)) {
330 # If kill(0, $childpid) returns true, then the process is running
331 # and we need to kill it.
332 close($chld_in);
333 close($chld_out);
334 kill("TERM", $childpid);
335
336 $childpid = undef;
337
338 # Stop monitoring the read_handle and close the serverSocket
339 # (the Java end will close the client socket that Java opened)
340 $read_set->remove($rh); #$read_set->remove($serverSocket);
341 close($rh); #close($serverSocket);
342 print $client "Perl terminated wget and is about to exit\n";
[17785]343 last; # out of inner for loop
[17547]344 }
345 else { # the process may just be starting up, wait
346 sleep(1);
347 }
348 }
[17549]349 last; # out of foreach loop
[17529]350 }
351 }
352 }
[14657]353 }
354
355 if ($changed_dir) {
356 chdir $current_dir;
357 }
358
359 return $strReadIn;
360}
361
[16791]362
363sub useWgetMonitored
364{
[17537]365 #local $| = 1; # autoflush stdout buffer
366 #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
367
[16791]368 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
369
370
371 my $current_dir = cwd();
372 my $changed_dir = 0;
373 if (defined $working_dir && -e $working_dir) {
374 chdir "$working_dir";
375 $changed_dir = 1;
376 }
[17537]377
378 # When we are running this script through GLI, the SIGTERM signal handler
379 # won't get called on Windows when wget is to be prematurely terminated.
380 # Instead, when wget has to be terminated in the middle of execution, GLI will
381 # connect to a serverSocket here to communicate when it's time to stop wget.
[17664]382 if($self->dealingWithSockets()) {
[17537]383
384 $port = <STDIN>; # gets a port on localhost that's not yet in use
385 chomp($port);
386
387 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
388 LocalPort => $port,
389 Listen => 1,
390 Reuse => 1);
391
392 die "can't setup server" unless $serverSocket;
393 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
394
395 $read_set = new IO::Select(); # create handle set for reading
396 $read_set->add($serverSocket); # add the main socket to the set
397 }
398
[16791]399 my $wget_file_path = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
[17354]400 #my $command = "\"$wget_file_path\" $cmdWget 2>&1 |"; #open(*WIN,$command) || die "wget request failed: $!\n";
[17537]401 my $command = "\"$wget_file_path\" $cmdWget 2>&1";
402 # print STDERR "Command is: $command\n";
[17354]403 $childpid = open2($chld_out, $chld_in, $command) || die "wget request failed: $!\n";
[16791]404
[17795]405 # for linux the wget process starts off as achild of a subshell launched as the child of
406 # this perl script. We want the pid of the wget process, not that of the subshell.
407 $childpid = $self->get_childpid($childpid);
408
[16791]409 my $full_text = "";
410 my $error_text = "";
411 my @follow_list = ();
412 my $line;
413
[17537]414 my $loop = 1;
415 while($loop)
[16791]416 {
[17537]417 if (defined($line=<$chld_out>)) { # we're reading in from child process' stdout
418 if((defined $blnShow) && $blnShow)
419 {
420 print STDERR "$line";
421 }
422
423 if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
424 my $follow_url = $1;
425 push(@follow_list,$follow_url);
426 }
427
428 if ($line =~ m/ERROR\s+\d+/) {
429 $error_text .= $line;
430 }
431
432 $full_text .= $line;
433 }
434 else { # wget finished, terminate naturally
435 #print STDERR "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
436 close($chld_in);
437 close($chld_out);
438 # Program terminates only when the following line is included
439 # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
440 # it prevents the child from turning into a "zombie process".
441 # While the wget process terminates without it, this perl script does not:
442 # the DOS prompt is not returned without it.
443 waitpid $childpid, 0;
444 $loop = 0;
445
446 $childpid = undef;
447 if(defined $port) {
448 $read_set->remove($serverSocket);
449 close($serverSocket);
450 }
[16791]451 }
452
[17537]453 # if we run this script from the command-line (as opposed to from GLI),
454 # then we're not working with sockets and can therefore can skip the next bits
455 next unless(defined $port);
[16791]456
[17537]457 # http://www.perlfect.com/articles/select.shtml
458 # "multiplex between several filehandles within a single thread of control,
459 # thus creating the effect of parallelism in the handling of I/O."
460 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
461
462 # take all readable handles in turn
463 foreach my $rh (@rh_set) {
464 if($rh == $serverSocket) {
465 my $client = $rh->accept();
466 #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
467 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
468
469 # Read from the client (getting rid of trailing newline)
470 # Has the client sent the <<STOP>> signal?
471 my $signal = <$client>;
472 chomp($signal);
473 if($signal eq "<<STOP>>") {
474 print $client "Perl received STOP signal (on port $port): stopping wget\n";
[17549]475 $loop = 0; # out of outer while loop
476 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
[17537]477
[17547]478 # Sometimes the wget process takes some time to start up. If the STOP signal
479 # was sent, don't try to terminate the process until we know it is running.
480 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
481 # for it to start up, checking for whether it is running in order to kill it.
482 for(my $seconds = 1; $seconds <= 5; $seconds++) {
483 if(kill(0, $childpid)) {
484 # If kill(0, $childpid) returns true, then the process is running
485 # and we need to kill it.
486 close($chld_in);
487 close($chld_out);
488 kill("TERM", $childpid);
489
490 $childpid = undef;
491
492 # Stop monitoring the read_handle and close the serverSocket
493 # (the Java end will close the client socket that Java opened)
494 $read_set->remove($rh); #$read_set->remove($serverSocket);
495 close($rh); #close($serverSocket);
496 print $client "Perl terminated wget and is about to exit\n";
[17785]497 last; # out of inner for loop
[17547]498 }
499 else { # the process may just be starting up, wait
500 sleep(1);
501 }
502 }
[17549]503 last; # out of foreach loop
[17537]504 }
505 }
[16791]506 }
507 }
508
509 my $command_status = $?;
510 if ($command_status != 0) {
511 $error_text .= "Exit error: $command_status";
512 }
513
514 if ($changed_dir) {
515 chdir $current_dir;
516 }
517
518 my $final_follow = pop(@follow_list); # might be undefined, but that's OK
519
520 return ($full_text,$error_text,$final_follow);
521}
522
523
[14657]524# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
525sub checkURL
526{
527 my ($self) = @_;
528 if ($self->{'url'} eq "")
529 {
530 &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
531 }
532}
533
534sub error
535{
536 my ($strFunctionName,$strError) = @_;
537 {
538 print "Error occoured in WgetDownload.pm\n".
539 "In Function:".$strFunctionName."\n".
540 "Error Message:".$strError."\n";
541 exit(-1);
542 }
543}
544
5451;
Note: See TracBrowser for help on using the repository browser.