source: gsdl/trunk/perllib/downloaders/WgetDownload.pm@ 17549

Last change on this file since 17549 was 17549, checked in by ak19, 16 years ago

Changes to sudden wget download termination when OAIDownload.pm is used: OAIDownload.pm launches wget several times, one after another (after each previous wget instance has terminated). Therefore when the STOP signal is sent from GLI, the OAIDownload.pm script should stop altogether and issue no more calls to wget. This is now accomplished by checking the new member variable self->force_quit which is set to true when WgetDownload receives a STOP signal from GLI.

  • Property svn:keywords set to Author Date Id Revision
File size: 16.3 KB
Line 
1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use BaseDownload;
35use strict;
36use Cwd;
37use IPC::Open2;
38use IO::Select;
39use IO::Socket;
40
41
42sub BEGIN {
43 @WgetDownload::ISA = ('BaseDownload');
44}
45
46my $arguments =
47 [ { 'name' => "proxy_on",
48 'desc' => "{WgetDownload.proxy_on}",
49 'type' => "flag",
50 'reqd' => "no",
51 'hiddengli' => "yes"},
52 { 'name' => "proxy_host",
53 'desc' => "{WgetDownload.proxy_host}",
54 'type' => "string",
55 'reqd' => "no",
56 'hiddengli' => "yes"},
57 { 'name' => "proxy_port",
58 'desc' => "{WgetDownload.proxy_port}",
59 'type' => "string",
60 'reqd' => "no",
61 'hiddengli' => "yes"},
62 { 'name' => "user_name",
63 'desc' => "{WgetDownload.user_name}",
64 'type' => "string",
65 'reqd' => "no",
66 'hiddengli' => "yes"},
67 { 'name' => "user_password",
68 'desc' => "{WgetDownload.user_password}",
69 'type' => "string",
70 'reqd' => "no",
71 'hiddengli' => "yes"}];
72
73my $options = { 'name' => "WgetDownload",
74 'desc' => "{WgetDownload.desc}",
75 'abstract' => "yes",
76 'inherits' => "yes",
77 'args' => $arguments };
78
79
80# Declaring file global variables related to the wget child process so that
81# the termination signal handler for SIGTERM can close the streams and tidy
82# up before ending the child process.
83my $childpid;
84my ($chld_out, $chld_in);
85my ($serverSocket, $read_set);
86
87# The port this script's server socket will be listening on, to handle
88# incoming signals from GLI to terminate wget. This is also file global,
89# since OAIDownload.pm will make several calls on wget using the same
90# instance of this script and we want to reuse whatever port GLI gave us.
91my $port;
92
93# When this script is called from the command line, this handler will be called
94# if this process is killed or abruptly ends due to receiving one of the
95# terminating signals that this handler is registered to deal with.
96sub abrupt_end_handler {
97 my $termination_signal = shift (@_);
98
99 if(defined $childpid) {
100 close($chld_out);
101 close($chld_in);
102
103 print STDOUT "Received termination signal: $termination_signal\n";
104
105 # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
106 # See http://perldoc.perl.org/perlipc.html#Signals
107 # Warning on using kill at http://perldoc.perl.org/perlfork.html
108 kill("TERM", $childpid);
109
110 # If the SIGTERM sent on Linux calls this handler, we want to make
111 # sure any socket connection is closed.
112 # Otherwise sockets are only used when this script is run from GLI
113 # in which case the handlers don't really get called.
114 if(defined $serverSocket) {
115 $read_set->remove($serverSocket) if defined $read_set;
116 close($serverSocket);
117 }
118 }
119
120 exit(0);
121}
122
123# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
124# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
125# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
126# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
127# http://affy.blogspot.com/p5be/ch13.htm
128# http://perldoc.perl.org/perlipc.html#Signals
129$SIG{'INT'} = \&abrupt_end_handler;
130$SIG{'TERM'} = \&abrupt_end_handler;
131
132sub new {
133 my ($class) = shift (@_);
134 my ($getlist,$inputargs,$hashArgOptLists) = @_;
135 push(@$getlist, $class);
136
137 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
138 push(@{$hashArgOptLists->{"OptList"}},$options);
139
140 my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
141
142 return bless $self, $class;
143}
144
145sub checkWgetSetup
146{
147 my ($self,$blnGliCall) = @_;
148 #TODO: proxy detection??
149
150 if((!$blnGliCall) && $self->{'proxy_on'})
151 {
152 &checkProxySetup($self);
153 }
154 &checkURL($self);
155}
156
157sub getWgetOptions
158{
159 my ($self) = @_;
160 my $strOptions = "";
161
162 if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'})
163 {
164
165 $strOptions .= " -e httpproxy=$self->{'proxy_host'}:$self->{'proxy_port'} ";
166
167 if ($self->{'user_name'} && $self->{'user_password'})
168 {
169 $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";
170 }
171 }
172
173 if ($self->{'proxy_on'}) {
174 $strOptions .= " --proxy ";
175 }
176
177 return $strOptions;
178}
179
180# Checking for proxy setup: proxy server, proxy port, proxy username and password.
181sub checkProxySetup
182{
183 my ($self) = @_;
184 ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
185 # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
186 # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
187
188}
189
190sub useWget
191{
192 #local $| = 1; # autoflush stdout buffer
193 #print STDOUT "*** Start of subroutine useWget in $0\n";
194
195 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
196
197 my ($strReadIn,$strLine,$command);
198 $strReadIn = "" unless defined $strReadIn;
199
200 my $current_dir = cwd();
201 my $changed_dir = 0;
202 if (defined $working_dir && -e $working_dir) {
203 chdir "$working_dir";
204 $changed_dir = 1;
205 }
206
207 # When we are running this script through GLI, the SIGTERM signal handler
208 # won't get called on Windows when wget is to be prematurely terminated.
209 # Instead, when wget has to be terminated in the middle of execution, GLI will
210 # connect to a serverSocket here to communicate when it's time to stop wget.
211 if(defined $self->{'gli'} && $self->{'gli'} && !defined $port) {
212
213 $port = <STDIN>; # gets a port on localhost that's not yet in use
214 chomp($port);
215
216 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
217 LocalPort => $port,
218 Listen => 1,
219 Reuse => 1);
220
221 die "can't setup server" unless $serverSocket;
222 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
223
224 $read_set = new IO::Select(); # create handle set for reading
225 $read_set->add($serverSocket); # add the main socket to the set
226 }
227
228 my $wget_file_path = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
229 $command = "\"$wget_file_path\" $cmdWget 2>&1";
230 # print STDERR "Command is: $command\n";
231 $childpid = open2($chld_out, $chld_in, $command) || die "wget request failed: $!\n";
232
233 my $loop = 1;
234 while($loop)
235 {
236 if (defined(my $strLine=<$chld_out>)) { # we're reading in from child process' stdout
237 if($blnShow) {
238 print STDERR "$strLine\n";
239 }
240 $strReadIn .= $strLine;
241 }
242 else { # wget finished, terminate naturally
243 #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
244 close($chld_in);
245 close($chld_out);
246 waitpid $childpid, 0;
247 $loop = 0;
248
249 $childpid = undef;
250 if(defined $port) {
251 $read_set->remove($serverSocket);
252 close($serverSocket);
253 }
254 }
255
256 # if we run this script from the command-line (as opposed to from GLI),
257 # then we're not working with sockets and can therefore can skip the next bits
258 next unless(defined $port);
259
260 # http://www.perlfect.com/articles/select.shtml
261 # "multiplex between several filehandles within a single thread of control,
262 # thus creating the effect of parallelism in the handling of I/O."
263 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
264
265 # take all readable handles in turn
266 foreach my $rh (@rh_set) {
267 if($rh == $serverSocket) {
268 my $client = $rh->accept();
269 #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
270 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
271
272 # Read from the client (getting rid of the trailing newline)
273 # Has the client sent the <<STOP>> signal?
274 my $signal = <$client>;
275 chomp($signal);
276 if($signal eq "<<STOP>>") {
277 print $client "Perl received STOP signal (on port $port): stopping wget\n";
278 $loop = 0; # out of outer while loop
279 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
280
281 # Sometimes the wget process takes some time to start up. If the STOP signal
282 # was sent, don't try to terminate the process until we know it is running.
283 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
284 # for it to start up, checking for whether it is running in order to kill it.
285 for(my $seconds = 1; $seconds <= 5; $seconds++) {
286 if(kill(0, $childpid)) {
287 # If kill(0, $childpid) returns true, then the process is running
288 # and we need to kill it.
289 close($chld_in);
290 close($chld_out);
291 kill("TERM", $childpid);
292
293 $childpid = undef;
294
295 # Stop monitoring the read_handle and close the serverSocket
296 # (the Java end will close the client socket that Java opened)
297 $read_set->remove($rh); #$read_set->remove($serverSocket);
298 close($rh); #close($serverSocket);
299 print $client "Perl terminated wget and is about to exit\n";
300 }
301 else { # the process may just be starting up, wait
302 sleep(1);
303 }
304 }
305 last; # out of foreach loop
306 }
307 }
308 }
309 }
310
311 if ($changed_dir) {
312 chdir $current_dir;
313 }
314
315 return $strReadIn;
316}
317
318
319sub useWgetMonitored
320{
321 #local $| = 1; # autoflush stdout buffer
322 #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
323
324 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
325
326
327 my $current_dir = cwd();
328 my $changed_dir = 0;
329 if (defined $working_dir && -e $working_dir) {
330 chdir "$working_dir";
331 $changed_dir = 1;
332 }
333
334 # When we are running this script through GLI, the SIGTERM signal handler
335 # won't get called on Windows when wget is to be prematurely terminated.
336 # Instead, when wget has to be terminated in the middle of execution, GLI will
337 # connect to a serverSocket here to communicate when it's time to stop wget.
338 if(defined $self->{'gli'} && $self->{'gli'} && !defined $port) {
339
340 $port = <STDIN>; # gets a port on localhost that's not yet in use
341 chomp($port);
342
343 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
344 LocalPort => $port,
345 Listen => 1,
346 Reuse => 1);
347
348 die "can't setup server" unless $serverSocket;
349 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
350
351 $read_set = new IO::Select(); # create handle set for reading
352 $read_set->add($serverSocket); # add the main socket to the set
353 }
354
355 my $wget_file_path = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
356 #my $command = "\"$wget_file_path\" $cmdWget 2>&1 |"; #open(*WIN,$command) || die "wget request failed: $!\n";
357 my $command = "\"$wget_file_path\" $cmdWget 2>&1";
358 # print STDERR "Command is: $command\n";
359 $childpid = open2($chld_out, $chld_in, $command) || die "wget request failed: $!\n";
360
361 my $full_text = "";
362 my $error_text = "";
363 my @follow_list = ();
364 my $line;
365
366 my $loop = 1;
367 while($loop)
368 {
369 if (defined($line=<$chld_out>)) { # we're reading in from child process' stdout
370 if((defined $blnShow) && $blnShow)
371 {
372 print STDERR "$line";
373 }
374
375 if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
376 my $follow_url = $1;
377 push(@follow_list,$follow_url);
378 }
379
380 if ($line =~ m/ERROR\s+\d+/) {
381 $error_text .= $line;
382 }
383
384 $full_text .= $line;
385 }
386 else { # wget finished, terminate naturally
387 #print STDERR "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
388 close($chld_in);
389 close($chld_out);
390 # Program terminates only when the following line is included
391 # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
392 # it prevents the child from turning into a "zombie process".
393 # While the wget process terminates without it, this perl script does not:
394 # the DOS prompt is not returned without it.
395 waitpid $childpid, 0;
396 $loop = 0;
397
398 $childpid = undef;
399 if(defined $port) {
400 $read_set->remove($serverSocket);
401 close($serverSocket);
402 }
403 }
404
405 # if we run this script from the command-line (as opposed to from GLI),
406 # then we're not working with sockets and can therefore can skip the next bits
407 next unless(defined $port);
408
409 # http://www.perlfect.com/articles/select.shtml
410 # "multiplex between several filehandles within a single thread of control,
411 # thus creating the effect of parallelism in the handling of I/O."
412 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
413
414 # take all readable handles in turn
415 foreach my $rh (@rh_set) {
416 if($rh == $serverSocket) {
417 my $client = $rh->accept();
418 #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
419 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
420
421 # Read from the client (getting rid of trailing newline)
422 # Has the client sent the <<STOP>> signal?
423 my $signal = <$client>;
424 chomp($signal);
425 if($signal eq "<<STOP>>") {
426 print $client "Perl received STOP signal (on port $port): stopping wget\n";
427 $loop = 0; # out of outer while loop
428 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
429
430 # Sometimes the wget process takes some time to start up. If the STOP signal
431 # was sent, don't try to terminate the process until we know it is running.
432 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
433 # for it to start up, checking for whether it is running in order to kill it.
434 for(my $seconds = 1; $seconds <= 5; $seconds++) {
435 if(kill(0, $childpid)) {
436 # If kill(0, $childpid) returns true, then the process is running
437 # and we need to kill it.
438 close($chld_in);
439 close($chld_out);
440 kill("TERM", $childpid);
441
442 $childpid = undef;
443
444 # Stop monitoring the read_handle and close the serverSocket
445 # (the Java end will close the client socket that Java opened)
446 $read_set->remove($rh); #$read_set->remove($serverSocket);
447 close($rh); #close($serverSocket);
448 print $client "Perl terminated wget and is about to exit\n";
449 }
450 else { # the process may just be starting up, wait
451 sleep(1);
452 }
453 }
454 last; # out of foreach loop
455 }
456 }
457 }
458 }
459
460 my $command_status = $?;
461 if ($command_status != 0) {
462 $error_text .= "Exit error: $command_status";
463 }
464
465 if ($changed_dir) {
466 chdir $current_dir;
467 }
468
469 my $final_follow = pop(@follow_list); # might be undefined, but that's OK
470
471 return ($full_text,$error_text,$final_follow);
472}
473
474
475# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
476sub checkURL
477{
478 my ($self) = @_;
479 if ($self->{'url'} eq "")
480 {
481 &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
482 }
483}
484
485sub error
486{
487 my ($strFunctionName,$strError) = @_;
488 {
489 print "Error occoured in WgetDownload.pm\n".
490 "In Function:".$strFunctionName."\n".
491 "Error Message:".$strError."\n";
492 exit(-1);
493 }
494}
495
4961;
Note: See TracBrowser for help on using the repository browser.