source: gsdl/trunk/perllib/downloaders/WgetDownload.pm@ 17800

Last change on this file since 17800 was 17800, checked in by ak19, 15 years ago

BugFix: Fixed the position of call to last to break out of for loop inside useWget and useWgetMonitored subroutines, so that it now works on windows. This needed to be done because the calls to last had been added for making things still work on Linux, but the location of the calls was wrong on Windows and caused OAIDownloads to fail all of a sudden.

  • Property svn:keywords set to Author Date Id Revision
File size: 18.3 KB
RevLine 
[14657]1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
[17207]34use BaseDownload;
[14657]35use strict;
[17529]36use Cwd;
[14657]37use IPC::Open2;
[17529]38use IO::Select;
39use IO::Socket;
[14657]40
[17529]41
[14657]42sub BEGIN {
[17207]43 @WgetDownload::ISA = ('BaseDownload');
[14657]44}
45
46my $arguments =
47 [ { 'name' => "proxy_on",
48 'desc' => "{WgetDownload.proxy_on}",
49 'type' => "flag",
50 'reqd' => "no",
51 'hiddengli' => "yes"},
52 { 'name' => "proxy_host",
53 'desc' => "{WgetDownload.proxy_host}",
54 'type' => "string",
55 'reqd' => "no",
56 'hiddengli' => "yes"},
57 { 'name' => "proxy_port",
58 'desc' => "{WgetDownload.proxy_port}",
59 'type' => "string",
60 'reqd' => "no",
61 'hiddengli' => "yes"},
62 { 'name' => "user_name",
63 'desc' => "{WgetDownload.user_name}",
64 'type' => "string",
65 'reqd' => "no",
66 'hiddengli' => "yes"},
67 { 'name' => "user_password",
68 'desc' => "{WgetDownload.user_password}",
69 'type' => "string",
70 'reqd' => "no",
71 'hiddengli' => "yes"}];
72
73my $options = { 'name' => "WgetDownload",
74 'desc' => "{WgetDownload.desc}",
75 'abstract' => "yes",
76 'inherits' => "yes",
77 'args' => $arguments };
78
79
[17529]80# Declaring file global variables related to the wget child process so that
81# the termination signal handler for SIGTERM can close the streams and tidy
82# up before ending the child process.
[17354]83my $childpid;
[17531]84my ($chld_out, $chld_in);
[17529]85my ($serverSocket, $read_set);
[17354]86
[17531]87# The port this script's server socket will be listening on, to handle
88# incoming signals from GLI to terminate wget. This is also file global,
89# since OAIDownload.pm will make several calls on wget using the same
90# instance of this script and we want to reuse whatever port GLI gave us.
91my $port;
92
[17529]93# When this script is called from the command line, this handler will be called
94# if this process is killed or abruptly ends due to receiving one of the
95# terminating signals that this handler is registered to deal with.
[17354]96sub abrupt_end_handler {
97 my $termination_signal = shift (@_);
[17549]98
[17529]99 if(defined $childpid) {
100 close($chld_out);
101 close($chld_in);
[17354]102
[17547]103 print STDOUT "Received termination signal: $termination_signal\n";
[17529]104
105 # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
106 # See http://perldoc.perl.org/perlipc.html#Signals
107 # Warning on using kill at http://perldoc.perl.org/perlfork.html
108 kill("TERM", $childpid);
109
110 # If the SIGTERM sent on Linux calls this handler, we want to make
111 # sure any socket connection is closed.
112 # Otherwise sockets are only used when this script is run from GLI
113 # in which case the handlers don't really get called.
114 if(defined $serverSocket) {
115 $read_set->remove($serverSocket) if defined $read_set;
116 close($serverSocket);
[17354]117 }
118 }
[17529]119
[17354]120 exit(0);
121}
122
123# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
124# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
125# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
126# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
127# http://affy.blogspot.com/p5be/ch13.htm
128# http://perldoc.perl.org/perlipc.html#Signals
129$SIG{'INT'} = \&abrupt_end_handler;
130$SIG{'TERM'} = \&abrupt_end_handler;
131
[14657]132sub new {
133 my ($class) = shift (@_);
134 my ($getlist,$inputargs,$hashArgOptLists) = @_;
135 push(@$getlist, $class);
136
[17207]137 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
138 push(@{$hashArgOptLists->{"OptList"}},$options);
[14657]139
[17207]140 my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
[14657]141
142 return bless $self, $class;
143}
144
145sub checkWgetSetup
146{
147 my ($self,$blnGliCall) = @_;
148 #TODO: proxy detection??
149
150 if((!$blnGliCall) && $self->{'proxy_on'})
151 {
152 &checkProxySetup($self);
153 }
154 &checkURL($self);
155}
156
157sub getWgetOptions
158{
159 my ($self) = @_;
160 my $strOptions = "";
161
[16791]162 if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'})
[14657]163 {
[14918]164
165 $strOptions .= " -e httpproxy=$self->{'proxy_host'}:$self->{'proxy_port'} ";
166
167 if ($self->{'user_name'} && $self->{'user_password'})
168 {
169 $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";
170 }
[14657]171 }
172
[16791]173 if ($self->{'proxy_on'}) {
174 $strOptions .= " --proxy ";
175 }
[14918]176
[14657]177 return $strOptions;
178}
179
180# Checking for proxy setup: proxy server, proxy port, proxy username and password.
181sub checkProxySetup
182{
183 my ($self) = @_;
184 ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
185 # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
[17529]186 # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
[14657]187
188}
189
[17664]190# Returns true if the wget status needs to be monitored through sockets
191# (if a socket is used to communicate with the Java program on when to
192# terminate wget). True if we are running gli, or if the particular type
193# of WgetDownload is *not* OAIDownload (in that case, the original way of
194# terminating the perl script from Java terminated wget as well).
195sub dealingWithSockets() {
196 my ($self) = @_;
197 return (defined $self->{'gli'} && $self->{'gli'} && !defined $port && ref($self) ne "OAIDownload");
198 # use ref($self) to find the classname of an object
199}
200
[17795]201sub get_childpid {
202 my ($self, $pid) = @_;
203 my $os = $^O;
204 # for windows, we do nothing special
205 if ($os =~ m/mswin/i) {
206 return $pid;
207 }
208
209 # else $os is macos or linux
210 # This means the wget child process may have been spawned from a subshell
211 # (the real child) that was launched by this perl script. However, we want
212 # the pid of the wget process, since that is what we want to terminate.
213
214 sleep(2); # give it some time to start up, else we'd have finished searching beforehand
215
216 # Look through any processes spawned immediately/soon after the subshell.
217 # Look for any pid greater than the subshell's, but within a certain limit (10)
218 my $child = $pid;
219 for(my $i = 1; $i <= 10; $i++) {
220 $child = $child+$i;
221 if(kill(0, $child)) { # this process exists
222 return $child;
223 }
224 }
225
226 return $pid; # could not find any process within limit, so return the original process id?
227}
228
[14657]229sub useWget
230{
[17529]231 #local $| = 1; # autoflush stdout buffer
232 #print STDOUT "*** Start of subroutine useWget in $0\n";
233
[14657]234 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
235
236 my ($strReadIn,$strLine,$command);
237 $strReadIn = "" unless defined $strReadIn;
238
239 my $current_dir = cwd();
240 my $changed_dir = 0;
241 if (defined $working_dir && -e $working_dir) {
242 chdir "$working_dir";
243 $changed_dir = 1;
244 }
245
[17529]246 # When we are running this script through GLI, the SIGTERM signal handler
247 # won't get called on Windows when wget is to be prematurely terminated.
248 # Instead, when wget has to be terminated in the middle of execution, GLI will
249 # connect to a serverSocket here to communicate when it's time to stop wget.
[17664]250 if($self->dealingWithSockets()) {
[14657]251
[17529]252 $port = <STDIN>; # gets a port on localhost that's not yet in use
253 chomp($port);
254
255 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
256 LocalPort => $port,
257 Listen => 1,
258 Reuse => 1);
259
260 die "can't setup server" unless $serverSocket;
261 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
262
263 $read_set = new IO::Select(); # create handle set for reading
264 $read_set->add($serverSocket); # add the main socket to the set
265 }
266
267 my $wget_file_path = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
268 $command = "\"$wget_file_path\" $cmdWget 2>&1";
269 # print STDERR "Command is: $command\n";
[17354]270 $childpid = open2($chld_out, $chld_in, $command) || die "wget request failed: $!\n";
[17795]271
272 # for linux the wget process starts off as achild of a subshell launched as the child of
273 # this perl script. We want the pid of the wget process, not that of the subshell.
274 $childpid = $self->get_childpid($childpid);
[17354]275
[17529]276 my $loop = 1;
277 while($loop)
[14657]278 {
[17529]279 if (defined(my $strLine=<$chld_out>)) { # we're reading in from child process' stdout
280 if($blnShow) {
281 print STDERR "$strLine\n";
282 }
283 $strReadIn .= $strLine;
284 }
285 else { # wget finished, terminate naturally
[17547]286 #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
[17529]287 close($chld_in);
288 close($chld_out);
289 waitpid $childpid, 0;
290 $loop = 0;
291
292 $childpid = undef;
293 if(defined $port) {
294 $read_set->remove($serverSocket);
295 close($serverSocket);
296 }
[14657]297 }
298
[17529]299 # if we run this script from the command-line (as opposed to from GLI),
300 # then we're not working with sockets and can therefore can skip the next bits
301 next unless(defined $port);
302
303 # http://www.perlfect.com/articles/select.shtml
304 # "multiplex between several filehandles within a single thread of control,
305 # thus creating the effect of parallelism in the handling of I/O."
306 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
307
308 # take all readable handles in turn
309 foreach my $rh (@rh_set) {
310 if($rh == $serverSocket) {
311 my $client = $rh->accept();
312 #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
313 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
314
[17547]315 # Read from the client (getting rid of the trailing newline)
[17529]316 # Has the client sent the <<STOP>> signal?
317 my $signal = <$client>;
318 chomp($signal);
319 if($signal eq "<<STOP>>") {
320 print $client "Perl received STOP signal (on port $port): stopping wget\n";
[17549]321 $loop = 0; # out of outer while loop
322 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
[17529]323
[17547]324 # Sometimes the wget process takes some time to start up. If the STOP signal
325 # was sent, don't try to terminate the process until we know it is running.
326 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
327 # for it to start up, checking for whether it is running in order to kill it.
328 for(my $seconds = 1; $seconds <= 5; $seconds++) {
329 if(kill(0, $childpid)) {
330 # If kill(0, $childpid) returns true, then the process is running
331 # and we need to kill it.
332 close($chld_in);
333 close($chld_out);
334 kill("TERM", $childpid);
335
336 $childpid = undef;
337
338 # Stop monitoring the read_handle and close the serverSocket
339 # (the Java end will close the client socket that Java opened)
340 $read_set->remove($rh); #$read_set->remove($serverSocket);
341 close($rh); #close($serverSocket);
342 print $client "Perl terminated wget and is about to exit\n";
343 }
344 else { # the process may just be starting up, wait
345 sleep(1);
346 }
[17800]347 last; # out of inner for loop
[17547]348 }
[17549]349 last; # out of foreach loop
[17529]350 }
351 }
352 }
[14657]353 }
354
355 if ($changed_dir) {
356 chdir $current_dir;
357 }
358
359 return $strReadIn;
360}
361
[16791]362
363sub useWgetMonitored
364{
[17537]365 #local $| = 1; # autoflush stdout buffer
366 #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
367
[16791]368 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
369
370
371 my $current_dir = cwd();
372 my $changed_dir = 0;
373 if (defined $working_dir && -e $working_dir) {
374 chdir "$working_dir";
375 $changed_dir = 1;
376 }
[17537]377
378 # When we are running this script through GLI, the SIGTERM signal handler
379 # won't get called on Windows when wget is to be prematurely terminated.
380 # Instead, when wget has to be terminated in the middle of execution, GLI will
381 # connect to a serverSocket here to communicate when it's time to stop wget.
[17664]382 if($self->dealingWithSockets()) {
[17537]383
384 $port = <STDIN>; # gets a port on localhost that's not yet in use
385 chomp($port);
386
387 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
388 LocalPort => $port,
389 Listen => 1,
390 Reuse => 1);
391
392 die "can't setup server" unless $serverSocket;
393 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
394
395 $read_set = new IO::Select(); # create handle set for reading
396 $read_set->add($serverSocket); # add the main socket to the set
397 }
398
[16791]399 my $wget_file_path = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
[17354]400 #my $command = "\"$wget_file_path\" $cmdWget 2>&1 |"; #open(*WIN,$command) || die "wget request failed: $!\n";
[17537]401 my $command = "\"$wget_file_path\" $cmdWget 2>&1";
402 # print STDERR "Command is: $command\n";
[17354]403 $childpid = open2($chld_out, $chld_in, $command) || die "wget request failed: $!\n";
[16791]404
[17795]405 # for linux the wget process starts off as achild of a subshell launched as the child of
406 # this perl script. We want the pid of the wget process, not that of the subshell.
407 $childpid = $self->get_childpid($childpid);
408
[16791]409 my $full_text = "";
410 my $error_text = "";
411 my @follow_list = ();
412 my $line;
413
[17537]414 my $loop = 1;
415 while($loop)
[16791]416 {
[17537]417 if (defined($line=<$chld_out>)) { # we're reading in from child process' stdout
418 if((defined $blnShow) && $blnShow)
419 {
420 print STDERR "$line";
421 }
422
423 if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
424 my $follow_url = $1;
425 push(@follow_list,$follow_url);
426 }
427
428 if ($line =~ m/ERROR\s+\d+/) {
429 $error_text .= $line;
430 }
431
432 $full_text .= $line;
433 }
434 else { # wget finished, terminate naturally
435 #print STDERR "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
436 close($chld_in);
437 close($chld_out);
438 # Program terminates only when the following line is included
439 # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
440 # it prevents the child from turning into a "zombie process".
441 # While the wget process terminates without it, this perl script does not:
442 # the DOS prompt is not returned without it.
443 waitpid $childpid, 0;
444 $loop = 0;
445
446 $childpid = undef;
447 if(defined $port) {
448 $read_set->remove($serverSocket);
449 close($serverSocket);
450 }
[16791]451 }
452
[17537]453 # if we run this script from the command-line (as opposed to from GLI),
454 # then we're not working with sockets and can therefore can skip the next bits
455 next unless(defined $port);
[16791]456
[17537]457 # http://www.perlfect.com/articles/select.shtml
458 # "multiplex between several filehandles within a single thread of control,
459 # thus creating the effect of parallelism in the handling of I/O."
460 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
461
462 # take all readable handles in turn
463 foreach my $rh (@rh_set) {
464 if($rh == $serverSocket) {
465 my $client = $rh->accept();
466 #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
467 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
468
469 # Read from the client (getting rid of trailing newline)
470 # Has the client sent the <<STOP>> signal?
471 my $signal = <$client>;
472 chomp($signal);
473 if($signal eq "<<STOP>>") {
474 print $client "Perl received STOP signal (on port $port): stopping wget\n";
[17549]475 $loop = 0; # out of outer while loop
476 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
[17537]477
[17547]478 # Sometimes the wget process takes some time to start up. If the STOP signal
479 # was sent, don't try to terminate the process until we know it is running.
480 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
481 # for it to start up, checking for whether it is running in order to kill it.
482 for(my $seconds = 1; $seconds <= 5; $seconds++) {
483 if(kill(0, $childpid)) {
484 # If kill(0, $childpid) returns true, then the process is running
485 # and we need to kill it.
486 close($chld_in);
487 close($chld_out);
488 kill("TERM", $childpid);
489
490 $childpid = undef;
491
492 # Stop monitoring the read_handle and close the serverSocket
493 # (the Java end will close the client socket that Java opened)
494 $read_set->remove($rh); #$read_set->remove($serverSocket);
495 close($rh); #close($serverSocket);
496 print $client "Perl terminated wget and is about to exit\n";
497 }
498 else { # the process may just be starting up, wait
499 sleep(1);
500 }
[17800]501 last; # out of inner for loop
[17547]502 }
[17549]503 last; # out of foreach loop
[17537]504 }
505 }
[16791]506 }
507 }
508
509 my $command_status = $?;
510 if ($command_status != 0) {
511 $error_text .= "Exit error: $command_status";
512 }
513
514 if ($changed_dir) {
515 chdir $current_dir;
516 }
517
518 my $final_follow = pop(@follow_list); # might be undefined, but that's OK
519
520 return ($full_text,$error_text,$final_follow);
521}
522
523
[14657]524# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
525sub checkURL
526{
527 my ($self) = @_;
528 if ($self->{'url'} eq "")
529 {
530 &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
531 }
532}
533
534sub error
535{
536 my ($strFunctionName,$strError) = @_;
537 {
538 print "Error occoured in WgetDownload.pm\n".
539 "In Function:".$strFunctionName."\n".
540 "Error Message:".$strError."\n";
541 exit(-1);
542 }
543}
544
5451;
Note: See TracBrowser for help on using the repository browser.