source: gsdl/trunk/perllib/downloaders/WgetDownload.pm@ 17547

Last change on this file since 17547 was 17547, checked in by ak19, 16 years ago

When subroutines useWget and useWgetMonitored receive the STOP signal from GLI to prematurely terminate wget, these subroutines now wait 5 seconds for wget to have been launched before trying to terminate them. Previously it was possible for the kill signal to have been sent to wget before it started up, in which wget would then start running and continue in the background without anyone noticing that it hadn't gone away as it should have.

  • Property svn:keywords set to Author Date Id Revision
File size: 15.9 KB
Line 
1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use BaseDownload;
35use strict;
36use Cwd;
37use IPC::Open2;
38use IO::Select;
39use IO::Socket;
40
41
42sub BEGIN {
43 @WgetDownload::ISA = ('BaseDownload');
44}
45
46my $arguments =
47 [ { 'name' => "proxy_on",
48 'desc' => "{WgetDownload.proxy_on}",
49 'type' => "flag",
50 'reqd' => "no",
51 'hiddengli' => "yes"},
52 { 'name' => "proxy_host",
53 'desc' => "{WgetDownload.proxy_host}",
54 'type' => "string",
55 'reqd' => "no",
56 'hiddengli' => "yes"},
57 { 'name' => "proxy_port",
58 'desc' => "{WgetDownload.proxy_port}",
59 'type' => "string",
60 'reqd' => "no",
61 'hiddengli' => "yes"},
62 { 'name' => "user_name",
63 'desc' => "{WgetDownload.user_name}",
64 'type' => "string",
65 'reqd' => "no",
66 'hiddengli' => "yes"},
67 { 'name' => "user_password",
68 'desc' => "{WgetDownload.user_password}",
69 'type' => "string",
70 'reqd' => "no",
71 'hiddengli' => "yes"}];
72
73my $options = { 'name' => "WgetDownload",
74 'desc' => "{WgetDownload.desc}",
75 'abstract' => "yes",
76 'inherits' => "yes",
77 'args' => $arguments };
78
79
80# Declaring file global variables related to the wget child process so that
81# the termination signal handler for SIGTERM can close the streams and tidy
82# up before ending the child process.
83my $childpid;
84my ($chld_out, $chld_in);
85my ($serverSocket, $read_set);
86
87# The port this script's server socket will be listening on, to handle
88# incoming signals from GLI to terminate wget. This is also file global,
89# since OAIDownload.pm will make several calls on wget using the same
90# instance of this script and we want to reuse whatever port GLI gave us.
91my $port;
92
93# When this script is called from the command line, this handler will be called
94# if this process is killed or abruptly ends due to receiving one of the
95# terminating signals that this handler is registered to deal with.
96sub abrupt_end_handler {
97 my $termination_signal = shift (@_);
98 if(defined $childpid) {
99 close($chld_out);
100 close($chld_in);
101
102 print STDOUT "Received termination signal: $termination_signal\n";
103
104 # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
105 # See http://perldoc.perl.org/perlipc.html#Signals
106 # Warning on using kill at http://perldoc.perl.org/perlfork.html
107 kill("TERM", $childpid);
108
109 # If the SIGTERM sent on Linux calls this handler, we want to make
110 # sure any socket connection is closed.
111 # Otherwise sockets are only used when this script is run from GLI
112 # in which case the handlers don't really get called.
113 if(defined $serverSocket) {
114 $read_set->remove($serverSocket) if defined $read_set;
115 close($serverSocket);
116 }
117 }
118
119 exit(0);
120}
121
122# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
123# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
124# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
125# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
126# http://affy.blogspot.com/p5be/ch13.htm
127# http://perldoc.perl.org/perlipc.html#Signals
128$SIG{'INT'} = \&abrupt_end_handler;
129$SIG{'TERM'} = \&abrupt_end_handler;
130
131sub new {
132 my ($class) = shift (@_);
133 my ($getlist,$inputargs,$hashArgOptLists) = @_;
134 push(@$getlist, $class);
135
136 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
137 push(@{$hashArgOptLists->{"OptList"}},$options);
138
139 my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
140
141 return bless $self, $class;
142}
143
144sub checkWgetSetup
145{
146 my ($self,$blnGliCall) = @_;
147 #TODO: proxy detection??
148
149 if((!$blnGliCall) && $self->{'proxy_on'})
150 {
151 &checkProxySetup($self);
152 }
153 &checkURL($self);
154}
155
156sub getWgetOptions
157{
158 my ($self) = @_;
159 my $strOptions = "";
160
161 if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'})
162 {
163
164 $strOptions .= " -e httpproxy=$self->{'proxy_host'}:$self->{'proxy_port'} ";
165
166 if ($self->{'user_name'} && $self->{'user_password'})
167 {
168 $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";
169 }
170 }
171
172 if ($self->{'proxy_on'}) {
173 $strOptions .= " --proxy ";
174 }
175
176 return $strOptions;
177}
178
179# Checking for proxy setup: proxy server, proxy port, proxy username and password.
180sub checkProxySetup
181{
182 my ($self) = @_;
183 ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
184 # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
185 # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
186
187}
188
189sub useWget
190{
191 #local $| = 1; # autoflush stdout buffer
192 #print STDOUT "*** Start of subroutine useWget in $0\n";
193
194 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
195
196 my ($strReadIn,$strLine,$command);
197 $strReadIn = "" unless defined $strReadIn;
198
199 my $current_dir = cwd();
200 my $changed_dir = 0;
201 if (defined $working_dir && -e $working_dir) {
202 chdir "$working_dir";
203 $changed_dir = 1;
204 }
205
206 # When we are running this script through GLI, the SIGTERM signal handler
207 # won't get called on Windows when wget is to be prematurely terminated.
208 # Instead, when wget has to be terminated in the middle of execution, GLI will
209 # connect to a serverSocket here to communicate when it's time to stop wget.
210 if(defined $self->{'gli'} && $self->{'gli'} && !defined $port) {
211
212 $port = <STDIN>; # gets a port on localhost that's not yet in use
213 chomp($port);
214
215 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
216 LocalPort => $port,
217 Listen => 1,
218 Reuse => 1);
219
220 die "can't setup server" unless $serverSocket;
221 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
222
223 $read_set = new IO::Select(); # create handle set for reading
224 $read_set->add($serverSocket); # add the main socket to the set
225 }
226
227 my $wget_file_path = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
228 $command = "\"$wget_file_path\" $cmdWget 2>&1";
229 # print STDERR "Command is: $command\n";
230 $childpid = open2($chld_out, $chld_in, $command) || die "wget request failed: $!\n";
231
232 my $loop = 1;
233 while($loop)
234 {
235 if (defined(my $strLine=<$chld_out>)) { # we're reading in from child process' stdout
236 if($blnShow) {
237 print STDERR "$strLine\n";
238 }
239 $strReadIn .= $strLine;
240 }
241 else { # wget finished, terminate naturally
242 #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
243 close($chld_in);
244 close($chld_out);
245 waitpid $childpid, 0;
246 $loop = 0;
247
248 $childpid = undef;
249 if(defined $port) {
250 $read_set->remove($serverSocket);
251 close($serverSocket);
252 }
253 }
254
255 # if we run this script from the command-line (as opposed to from GLI),
256 # then we're not working with sockets and can therefore can skip the next bits
257 next unless(defined $port);
258
259 # http://www.perlfect.com/articles/select.shtml
260 # "multiplex between several filehandles within a single thread of control,
261 # thus creating the effect of parallelism in the handling of I/O."
262 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
263
264 # take all readable handles in turn
265 foreach my $rh (@rh_set) {
266 if($rh == $serverSocket) {
267 my $client = $rh->accept();
268 #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
269 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
270
271 # Read from the client (getting rid of the trailing newline)
272 # Has the client sent the <<STOP>> signal?
273 my $signal = <$client>;
274 chomp($signal);
275 if($signal eq "<<STOP>>") {
276 print $client "Perl received STOP signal (on port $port): stopping wget\n";
277 $loop = 0;
278
279 # Sometimes the wget process takes some time to start up. If the STOP signal
280 # was sent, don't try to terminate the process until we know it is running.
281 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
282 # for it to start up, checking for whether it is running in order to kill it.
283 for(my $seconds = 1; $seconds <= 5; $seconds++) {
284 if(kill(0, $childpid)) {
285 # If kill(0, $childpid) returns true, then the process is running
286 # and we need to kill it.
287 close($chld_in);
288 close($chld_out);
289 kill("TERM", $childpid);
290
291 $childpid = undef;
292
293 # Stop monitoring the read_handle and close the serverSocket
294 # (the Java end will close the client socket that Java opened)
295 $read_set->remove($rh); #$read_set->remove($serverSocket);
296 close($rh); #close($serverSocket);
297 print $client "Perl terminated wget and is about to exit\n";
298 }
299 else { # the process may just be starting up, wait
300 sleep(1);
301 }
302 }
303 last; # we're done here
304 }
305 }
306 }
307 }
308
309 if ($changed_dir) {
310 chdir $current_dir;
311 }
312
313 return $strReadIn;
314}
315
316
317sub useWgetMonitored
318{
319 #local $| = 1; # autoflush stdout buffer
320 #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
321
322 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
323
324
325 my $current_dir = cwd();
326 my $changed_dir = 0;
327 if (defined $working_dir && -e $working_dir) {
328 chdir "$working_dir";
329 $changed_dir = 1;
330 }
331
332 # When we are running this script through GLI, the SIGTERM signal handler
333 # won't get called on Windows when wget is to be prematurely terminated.
334 # Instead, when wget has to be terminated in the middle of execution, GLI will
335 # connect to a serverSocket here to communicate when it's time to stop wget.
336 if(defined $self->{'gli'} && $self->{'gli'} && !defined $port) {
337
338 $port = <STDIN>; # gets a port on localhost that's not yet in use
339 chomp($port);
340
341 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
342 LocalPort => $port,
343 Listen => 1,
344 Reuse => 1);
345
346 die "can't setup server" unless $serverSocket;
347 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
348
349 $read_set = new IO::Select(); # create handle set for reading
350 $read_set->add($serverSocket); # add the main socket to the set
351 }
352
353 my $wget_file_path = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
354 #my $command = "\"$wget_file_path\" $cmdWget 2>&1 |"; #open(*WIN,$command) || die "wget request failed: $!\n";
355 my $command = "\"$wget_file_path\" $cmdWget 2>&1";
356 # print STDERR "Command is: $command\n";
357 $childpid = open2($chld_out, $chld_in, $command) || die "wget request failed: $!\n";
358
359 my $full_text = "";
360 my $error_text = "";
361 my @follow_list = ();
362 my $line;
363
364 my $loop = 1;
365 while($loop)
366 {
367 if (defined($line=<$chld_out>)) { # we're reading in from child process' stdout
368 if((defined $blnShow) && $blnShow)
369 {
370 print STDERR "$line";
371 }
372
373 if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
374 my $follow_url = $1;
375 push(@follow_list,$follow_url);
376 }
377
378 if ($line =~ m/ERROR\s+\d+/) {
379 $error_text .= $line;
380 }
381
382 $full_text .= $line;
383 }
384 else { # wget finished, terminate naturally
385 #print STDERR "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
386 close($chld_in);
387 close($chld_out);
388 # Program terminates only when the following line is included
389 # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
390 # it prevents the child from turning into a "zombie process".
391 # While the wget process terminates without it, this perl script does not:
392 # the DOS prompt is not returned without it.
393 waitpid $childpid, 0;
394 $loop = 0;
395
396 $childpid = undef;
397 if(defined $port) {
398 $read_set->remove($serverSocket);
399 close($serverSocket);
400 }
401 }
402
403 # if we run this script from the command-line (as opposed to from GLI),
404 # then we're not working with sockets and can therefore can skip the next bits
405 next unless(defined $port);
406
407 # http://www.perlfect.com/articles/select.shtml
408 # "multiplex between several filehandles within a single thread of control,
409 # thus creating the effect of parallelism in the handling of I/O."
410 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
411
412 # take all readable handles in turn
413 foreach my $rh (@rh_set) {
414 if($rh == $serverSocket) {
415 my $client = $rh->accept();
416 #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
417 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
418
419 # Read from the client (getting rid of trailing newline)
420 # Has the client sent the <<STOP>> signal?
421 my $signal = <$client>;
422 chomp($signal);
423 if($signal eq "<<STOP>>") {
424 print $client "Perl received STOP signal (on port $port): stopping wget\n";
425 $loop = 0;
426
427 # Sometimes the wget process takes some time to start up. If the STOP signal
428 # was sent, don't try to terminate the process until we know it is running.
429 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
430 # for it to start up, checking for whether it is running in order to kill it.
431 for(my $seconds = 1; $seconds <= 5; $seconds++) {
432 if(kill(0, $childpid)) {
433 # If kill(0, $childpid) returns true, then the process is running
434 # and we need to kill it.
435 close($chld_in);
436 close($chld_out);
437 kill("TERM", $childpid);
438
439 $childpid = undef;
440
441 # Stop monitoring the read_handle and close the serverSocket
442 # (the Java end will close the client socket that Java opened)
443 $read_set->remove($rh); #$read_set->remove($serverSocket);
444 close($rh); #close($serverSocket);
445 print $client "Perl terminated wget and is about to exit\n";
446 }
447 else { # the process may just be starting up, wait
448 sleep(1);
449 }
450 }
451 last; # we're done here
452 }
453 }
454 }
455 }
456
457 my $command_status = $?;
458 if ($command_status != 0) {
459 $error_text .= "Exit error: $command_status";
460 }
461
462 if ($changed_dir) {
463 chdir $current_dir;
464 }
465
466 my $final_follow = pop(@follow_list); # might be undefined, but that's OK
467
468 return ($full_text,$error_text,$final_follow);
469}
470
471
472# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
473sub checkURL
474{
475 my ($self) = @_;
476 if ($self->{'url'} eq "")
477 {
478 &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
479 }
480}
481
482sub error
483{
484 my ($strFunctionName,$strError) = @_;
485 {
486 print "Error occoured in WgetDownload.pm\n".
487 "In Function:".$strFunctionName."\n".
488 "Error Message:".$strError."\n";
489 exit(-1);
490 }
491}
492
4931;
Note: See TracBrowser for help on using the repository browser.