source: gs2-extensions/parallel-building/trunk/src/perllib/downloaders/WgetDownload.pm@ 24626

Last change on this file since 24626 was 24626, checked in by jmt12, 13 years ago

An (almost) complete copy of the perllib directory from a (circa SEP2011) head checkout from Greenstone 2 trunk - in order to try and make merging in this extension a little easier later on (as there have been some major changes to buildcol.pl commited in the main trunk but not in the x64 branch)

File size: 18.0 KB
Line 
1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use BaseDownload;
35use strict;
36use Cwd;
37use IPC::Open3;
38use IO::Select;
39use IO::Socket;
40
41
42sub BEGIN {
43 @WgetDownload::ISA = ('BaseDownload');
44}
45
46my $arguments =
47 [ { 'name' => "proxy_on",
48 'desc' => "{WgetDownload.proxy_on}",
49 'type' => "flag",
50 'reqd' => "no",
51 'hiddengli' => "yes"},
52 { 'name' => "proxy_host",
53 'desc' => "{WgetDownload.proxy_host}",
54 'type' => "string",
55 'reqd' => "no",
56 'hiddengli' => "yes"},
57 { 'name' => "proxy_port",
58 'desc' => "{WgetDownload.proxy_port}",
59 'type' => "string",
60 'reqd' => "no",
61 'hiddengli' => "yes"},
62 { 'name' => "user_name",
63 'desc' => "{WgetDownload.user_name}",
64 'type' => "string",
65 'reqd' => "no",
66 'hiddengli' => "yes"},
67 { 'name' => "user_password",
68 'desc' => "{WgetDownload.user_password}",
69 'type' => "string",
70 'reqd' => "no",
71 'hiddengli' => "yes"}];
72
73my $options = { 'name' => "WgetDownload",
74 'desc' => "{WgetDownload.desc}",
75 'abstract' => "yes",
76 'inherits' => "yes",
77 'args' => $arguments };
78
79
80# Declaring file global variables related to the wget child process so that
81# the termination signal handler for SIGTERM can close the streams and tidy
82# up before ending the child process.
83my $childpid;
84my ($chld_out, $chld_in);
85my ($serverSocket, $read_set);
86
87# The port this script's server socket will be listening on, to handle
88# incoming signals from GLI to terminate wget. This is also file global,
89# since OAIDownload.pm will make several calls on wget using the same
90# instance of this script and we want to reuse whatever port GLI gave us.
91my $port;
92
93# When this script is called from the command line, this handler will be called
94# if this process is killed or abruptly ends due to receiving one of the
95# terminating signals that this handler is registered to deal with.
96sub abrupt_end_handler {
97 my $termination_signal = shift (@_);
98
99 if(defined $childpid) {
100 close($chld_out);
101 close($chld_in);
102
103 print STDOUT "Received termination signal: $termination_signal\n";
104
105 # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
106 # See http://perldoc.perl.org/perlipc.html#Signals
107 # Warning on using kill at http://perldoc.perl.org/perlfork.html
108 kill("TERM", $childpid);
109
110 # If the SIGTERM sent on Linux calls this handler, we want to make
111 # sure any socket connection is closed.
112 # Otherwise sockets are only used when this script is run from GLI
113 # in which case the handlers don't really get called.
114 if(defined $serverSocket) {
115 $read_set->remove($serverSocket) if defined $read_set;
116 close($serverSocket);
117 }
118 }
119
120 exit(0);
121}
122
123# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
124# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
125# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
126# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
127# http://affy.blogspot.com/p5be/ch13.htm
128# http://perldoc.perl.org/perlipc.html#Signals
129$SIG{'INT'} = \&abrupt_end_handler;
130$SIG{'TERM'} = \&abrupt_end_handler;
131
132sub new {
133 my ($class) = shift (@_);
134 my ($getlist,$inputargs,$hashArgOptLists) = @_;
135 push(@$getlist, $class);
136
137 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
138 push(@{$hashArgOptLists->{"OptList"}},$options);
139
140 my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
141
142 return bless $self, $class;
143}
144
145sub checkWgetSetup
146{
147 my ($self,$blnGliCall) = @_;
148 #TODO: proxy detection??
149
150 if((!$blnGliCall) && $self->{'proxy_on'})
151 {
152 &checkProxySetup($self);
153 }
154 &checkURL($self);
155}
156
157sub getWgetOptions
158{
159 my ($self) = @_;
160 my $strOptions = "";
161
162 if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'})
163 {
164
165 $strOptions .= " -e httpproxy=$self->{'proxy_host'}:$self->{'proxy_port'} ";
166
167 if ($self->{'user_name'} && $self->{'user_password'})
168 {
169 $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";
170 }
171 }
172
173 if ($self->{'proxy_on'}) {
174 $strOptions .= " --proxy ";
175 }
176
177 return $strOptions;
178}
179
180# Checking for proxy setup: proxy server, proxy port, proxy username and password.
181sub checkProxySetup
182{
183 my ($self) = @_;
184 ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
185 # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
186 # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
187
188}
189
190# Returns true if the wget status needs to be monitored through sockets
191# (if a socket is used to communicate with the Java program on when to
192# terminate wget). True if we are running gli, or if the particular type
193# of WgetDownload is *not* OAIDownload (in that case, the original way of
194# terminating the perl script from Java terminated wget as well).
195sub dealingWithSockets() {
196 my ($self) = @_;
197 return (defined $self->{'gli'} && $self->{'gli'} && !defined $port && ref($self) ne "OAIDownload");
198 # use ref($self) to find the classname of an object
199}
200
201
202sub useWget
203{
204 #local $| = 1; # autoflush stdout buffer
205 #print STDOUT "*** Start of subroutine useWget in $0\n";
206
207 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
208
209 my ($strReadIn,$strLine,$command);
210 $strReadIn = "" unless defined $strReadIn;
211
212 my $current_dir = cwd();
213 my $changed_dir = 0;
214 if (defined $working_dir && -e $working_dir) {
215 chdir "$working_dir";
216 $changed_dir = 1;
217 }
218
219 # When we are running this script through GLI, the SIGTERM signal handler
220 # won't get called on Windows when wget is to be prematurely terminated.
221 # Instead, when wget has to be terminated in the middle of execution, GLI will
222 # connect to a serverSocket here to communicate when it's time to stop wget.
223 if($self->dealingWithSockets()) {
224
225 $port = <STDIN>; # gets a port on localhost that's not yet in use
226 chomp($port);
227
228 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
229 LocalPort => $port,
230 Listen => 1,
231 Reuse => 1);
232
233 die "can't setup server" unless $serverSocket;
234 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
235
236 $read_set = new IO::Select(); # create handle set for reading
237 $read_set->add($serverSocket); # add the main socket to the set
238 }
239
240 my $wget_file_path = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
241 $command = "\"$wget_file_path\" $cmdWget";
242 #print STDOUT "Command is: $command\n";
243
244 # Wget's output needs to be monitored to find out when it has naturally termianted.
245 # Wget's output is sent to its STDERR so we can't use open2 without doing 2>&1.
246 # On linux, 2>&1 launches a subshell which then launches wget, meaning that killing
247 # the childpid does not kill wget on Linux but the subshell that launched it instead.
248 # Therefore, we use open3. Though the child process wget sends output only to its stdout,
249 # using open3 says chld_err is undefined and the output of wget only comes in chld_out(!)
250 # However that may be, it works with open3. But to avoid the confusion of managing and
251 # closing an extra unused handle, a single handle is used instead for both the child's
252 # stderr and stdout.
253
254 # Both open2 and open3 don't return on failure, but raise an exception. The handling
255 # of the exception is described on p.568 of the Perl Cookbook
256 eval {
257 $childpid = open3($chld_in, $chld_out, $chld_out, $command);
258 };
259 if ($@) {
260 if($@ =~ m/^open3/) {
261 die "open3 failed in $0: $!\n$@\n";
262 }
263 die "Tried to launch open3 in $0, got unexpected exception: $@";
264 }
265
266 my $loop = 1;
267 while($loop)
268 {
269 if (defined(my $strLine=<$chld_out>)) { # we're reading in from child process' stdout
270 if($blnShow) {
271 print STDERR "$strLine\n";
272 }
273 $strReadIn .= $strLine;
274 }
275 else { # wget finished, terminate naturally
276 #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
277 close($chld_in);
278 close($chld_out);
279 waitpid $childpid, 0;
280 $loop = 0;
281
282 $childpid = undef;
283 if(defined $port) {
284 $read_set->remove($serverSocket);
285 close($serverSocket);
286 }
287 }
288
289 # if we run this script from the command-line (as opposed to from GLI),
290 # then we're not working with sockets and can therefore can skip the next bits
291 next unless(defined $port);
292
293 # http://www.perlfect.com/articles/select.shtml
294 # "multiplex between several filehandles within a single thread of control,
295 # thus creating the effect of parallelism in the handling of I/O."
296 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
297
298 # take all readable handles in turn
299 foreach my $rh (@rh_set) {
300 if($rh == $serverSocket) {
301 my $client = $rh->accept();
302 #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
303 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
304
305 # Read from the client (getting rid of the trailing newline)
306 # Has the client sent the <<STOP>> signal?
307 my $signal = <$client>;
308 chomp($signal);
309 if($signal eq "<<STOP>>") {
310 print $client "Perl received STOP signal (on port $port): stopping wget\n";
311 $loop = 0; # out of outer while loop
312 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
313
314 # Sometimes the wget process takes some time to start up. If the STOP signal
315 # was sent, don't try to terminate the process until we know it is running.
316 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
317 # for it to start up, checking for whether it is running in order to kill it.
318 for(my $seconds = 1; $seconds <= 5; $seconds++) {
319 if(kill(0, $childpid)) {
320 # If kill(0, $childpid) returns true, then the process is running
321 # and we need to kill it.
322 close($chld_in);
323 close($chld_out);
324 kill("TERM", $childpid);
325
326 $childpid = undef;
327
328 # Stop monitoring the read_handle and close the serverSocket
329 # (the Java end will close the client socket that Java opened)
330 $read_set->remove($rh); #$read_set->remove($serverSocket);
331 close($rh); #close($serverSocket);
332 print $client "Perl terminated wget and is about to exit\n";
333 last; # out of inner for loop
334 }
335 else { # the process may just be starting up, wait
336 sleep(1);
337 }
338 }
339 last; # out of foreach loop
340 }
341 }
342 }
343 }
344
345 if ($changed_dir) {
346 chdir $current_dir;
347 }
348
349 return $strReadIn;
350}
351
352
353sub useWgetMonitored
354{
355 #local $| = 1; # autoflush stdout buffer
356 #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
357
358 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
359
360
361 my $current_dir = cwd();
362 my $changed_dir = 0;
363 if (defined $working_dir && -e $working_dir) {
364 chdir "$working_dir";
365 $changed_dir = 1;
366 }
367
368 # When we are running this script through GLI, the SIGTERM signal handler
369 # won't get called on Windows when wget is to be prematurely terminated.
370 # Instead, when wget has to be terminated in the middle of execution, GLI will
371 # connect to a serverSocket here to communicate when it's time to stop wget.
372 if($self->dealingWithSockets()) {
373
374 $port = <STDIN>; # gets a port on localhost that's not yet in use
375 chomp($port);
376
377 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
378 LocalPort => $port,
379 Listen => 1,
380 Reuse => 1);
381
382 die "can't setup server" unless $serverSocket;
383 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
384
385 $read_set = new IO::Select(); # create handle set for reading
386 $read_set->add($serverSocket); # add the main socket to the set
387 }
388
389 my $wget_file_path = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
390 my $command = "\"$wget_file_path\" $cmdWget";
391 #print STDOUT "Command is: $command\n";
392
393 eval { # see p.568 of Perl Cookbook
394 $childpid = open3($chld_in, $chld_out, $chld_out, $command);
395 };
396 if ($@) {
397 if($@ =~ m/^open3/) {
398 die "open3 failed in $0: $!\n$@\n";
399 }
400 die "Tried to launch open3 in $0, got unexpected exception: $@";
401 }
402
403 my $full_text = "";
404 my $error_text = "";
405 my @follow_list = ();
406 my $line;
407
408 my $loop = 1;
409 while($loop)
410 {
411 if (defined($line=<$chld_out>)) { # we're reading in from child process' stdout
412 if((defined $blnShow) && $blnShow)
413 {
414 print STDERR "$line";
415 }
416
417 if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
418 my $follow_url = $1;
419 push(@follow_list,$follow_url);
420 }
421
422 if ($line =~ m/ERROR\s+\d+/) {
423 $error_text .= $line;
424 }
425
426 $full_text .= $line;
427 }
428 else { # wget finished, terminate naturally
429 #print STDERR "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
430 close($chld_in);
431 close($chld_out);
432 # Program terminates only when the following line is included
433 # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
434 # it prevents the child from turning into a "zombie process".
435 # While the wget process terminates without it, this perl script does not:
436 # the DOS prompt is not returned without it.
437 waitpid $childpid, 0;
438 $loop = 0;
439
440 $childpid = undef;
441 if(defined $port) {
442 $read_set->remove($serverSocket);
443 close($serverSocket);
444 }
445 }
446
447 # if we run this script from the command-line (as opposed to from GLI),
448 # then we're not working with sockets and can therefore can skip the next bits
449 next unless(defined $port);
450
451 # http://www.perlfect.com/articles/select.shtml
452 # "multiplex between several filehandles within a single thread of control,
453 # thus creating the effect of parallelism in the handling of I/O."
454 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
455
456 # take all readable handles in turn
457 foreach my $rh (@rh_set) {
458 if($rh == $serverSocket) {
459 my $client = $rh->accept();
460 #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
461 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
462
463 # Read from the client (getting rid of trailing newline)
464 # Has the client sent the <<STOP>> signal?
465 my $signal = <$client>;
466 chomp($signal);
467 if($signal eq "<<STOP>>") {
468 print $client "Perl received STOP signal (on port $port): stopping wget\n";
469 $loop = 0; # out of outer while loop
470 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
471
472 # Sometimes the wget process takes some time to start up. If the STOP signal
473 # was sent, don't try to terminate the process until we know it is running.
474 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
475 # for it to start up, checking for whether it is running in order to kill it.
476 for(my $seconds = 1; $seconds <= 5; $seconds++) {
477 if(kill(0, $childpid)) {
478 # If kill(0, $childpid) returns true, then the process is running
479 # and we need to kill it.
480 close($chld_in);
481 close($chld_out);
482 kill("TERM", $childpid);
483
484 $childpid = undef;
485
486 # Stop monitoring the read_handle and close the serverSocket
487 # (the Java end will close the client socket that Java opened)
488 $read_set->remove($rh); #$read_set->remove($serverSocket);
489 close($rh); #close($serverSocket);
490 print $client "Perl terminated wget and is about to exit\n";
491 last; # out of inner for loop
492 }
493 else { # the process may just be starting up, wait
494 sleep(1);
495 }
496 }
497 last; # out of foreach loop
498 }
499 }
500 }
501 }
502
503 my $command_status = $?;
504 if ($command_status != 0) {
505 $error_text .= "Exit error: $command_status";
506 }
507
508 if ($changed_dir) {
509 chdir $current_dir;
510 }
511
512 my $final_follow = pop(@follow_list); # might be undefined, but that's OK
513
514 return ($full_text,$error_text,$final_follow);
515}
516
517
518# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
519sub checkURL
520{
521 my ($self) = @_;
522 if ($self->{'url'} eq "")
523 {
524 &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
525 }
526}
527
528sub error
529{
530 my ($strFunctionName,$strError) = @_;
531 {
532 print "Error occoured in WgetDownload.pm\n".
533 "In Function:".$strFunctionName."\n".
534 "Error Message:".$strError."\n";
535 exit(-1);
536 }
537}
538
5391;
Note: See TracBrowser for help on using the repository browser.