source: main/trunk/greenstone2/perllib/downloaders/WgetDownload.pm@ 28560

Last change on this file since 28560 was 28560, checked in by ak19, 10 years ago
  1. New subroutine util::set_gnomelib_env that sets the environment for gnomelib needed for running hashfile, suffix and wget which are dependent on the libiconv dll in ext/gnome-lib(-minimal). It's particularly the Mac Lions that need libiconv.2.dylib. 2. Updated the call to hashfile in doc.pm, the call to suffix in Phind.pm and the calls to wget in several perl scripts and modules to call util::set_gnomelib_env, though this will only set the environment once for each subshell.
  • Property svn:keywords set to Author Date Id Revision
File size: 18.3 KB
Line 
1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use BaseDownload;
35use strict;
36use Cwd;
37use util;
38use IPC::Open3;
39use IO::Select;
40use IO::Socket;
41
42
43sub BEGIN {
44 @WgetDownload::ISA = ('BaseDownload');
45}
46
47my $arguments =
48 [ { 'name' => "proxy_on",
49 'desc' => "{WgetDownload.proxy_on}",
50 'type' => "flag",
51 'reqd' => "no",
52 'hiddengli' => "yes"},
53 { 'name' => "proxy_host",
54 'desc' => "{WgetDownload.proxy_host}",
55 'type' => "string",
56 'reqd' => "no",
57 'hiddengli' => "yes"},
58 { 'name' => "proxy_port",
59 'desc' => "{WgetDownload.proxy_port}",
60 'type' => "string",
61 'reqd' => "no",
62 'hiddengli' => "yes"},
63 { 'name' => "user_name",
64 'desc' => "{WgetDownload.user_name}",
65 'type' => "string",
66 'reqd' => "no",
67 'hiddengli' => "yes"},
68 { 'name' => "user_password",
69 'desc' => "{WgetDownload.user_password}",
70 'type' => "string",
71 'reqd' => "no",
72 'hiddengli' => "yes"}];
73
74my $options = { 'name' => "WgetDownload",
75 'desc' => "{WgetDownload.desc}",
76 'abstract' => "yes",
77 'inherits' => "yes",
78 'args' => $arguments };
79
80
81# Declaring file global variables related to the wget child process so that
82# the termination signal handler for SIGTERM can close the streams and tidy
83# up before ending the child process.
84my $childpid;
85my ($chld_out, $chld_in);
86my ($serverSocket, $read_set);
87
88# The port this script's server socket will be listening on, to handle
89# incoming signals from GLI to terminate wget. This is also file global,
90# since OAIDownload.pm will make several calls on wget using the same
91# instance of this script and we want to reuse whatever port GLI gave us.
92my $port;
93
94# When this script is called from the command line, this handler will be called
95# if this process is killed or abruptly ends due to receiving one of the
96# terminating signals that this handler is registered to deal with.
97sub abrupt_end_handler {
98 my $termination_signal = shift (@_);
99
100 if(defined $childpid) {
101 close($chld_out);
102 close($chld_in);
103
104 print STDOUT "Received termination signal: $termination_signal\n";
105
106 # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
107 # See http://perldoc.perl.org/perlipc.html#Signals
108 # Warning on using kill at http://perldoc.perl.org/perlfork.html
109 kill("TERM", $childpid);
110
111 # If the SIGTERM sent on Linux calls this handler, we want to make
112 # sure any socket connection is closed.
113 # Otherwise sockets are only used when this script is run from GLI
114 # in which case the handlers don't really get called.
115 if(defined $serverSocket) {
116 $read_set->remove($serverSocket) if defined $read_set;
117 close($serverSocket);
118 }
119 }
120
121 exit(0);
122}
123
124# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
125# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
126# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
127# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
128# http://affy.blogspot.com/p5be/ch13.htm
129# http://perldoc.perl.org/perlipc.html#Signals
130$SIG{'INT'} = \&abrupt_end_handler;
131$SIG{'TERM'} = \&abrupt_end_handler;
132
133sub new {
134 my ($class) = shift (@_);
135 my ($getlist,$inputargs,$hashArgOptLists) = @_;
136 push(@$getlist, $class);
137
138 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
139 push(@{$hashArgOptLists->{"OptList"}},$options);
140
141 my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
142
143 # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
144 &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set
145
146 return bless $self, $class;
147}
148
149sub checkWgetSetup
150{
151 my ($self,$blnGliCall) = @_;
152 #TODO: proxy detection??
153
154 if((!$blnGliCall) && $self->{'proxy_on'})
155 {
156 &checkProxySetup($self);
157 }
158 &checkURL($self);
159}
160
161sub getWgetOptions
162{
163 my ($self) = @_;
164 my $strOptions = "";
165
166 if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'})
167 {
168
169 $strOptions .= " -e httpproxy=$self->{'proxy_host'}:$self->{'proxy_port'} ";
170
171 if ($self->{'user_name'} && $self->{'user_password'})
172 {
173 $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";
174 }
175 }
176
177 if ($self->{'proxy_on'}) {
178 $strOptions .= " --proxy ";
179 }
180
181 return $strOptions;
182}
183
184# Checking for proxy setup: proxy server, proxy port, proxy username and password.
185sub checkProxySetup
186{
187 my ($self) = @_;
188 ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
189 # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
190 # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
191
192}
193
194# Returns true if the wget status needs to be monitored through sockets
195# (if a socket is used to communicate with the Java program on when to
196# terminate wget). True if we are running gli, or if the particular type
197# of WgetDownload is *not* OAIDownload (in that case, the original way of
198# terminating the perl script from Java terminated wget as well).
199sub dealingWithSockets() {
200 my ($self) = @_;
201 return (defined $self->{'gli'} && $self->{'gli'} && !defined $port && ref($self) ne "OAIDownload");
202 # use ref($self) to find the classname of an object
203}
204
205
206sub useWget
207{
208 #local $| = 1; # autoflush stdout buffer
209 #print STDOUT "*** Start of subroutine useWget in $0\n";
210
211 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
212
213 my ($strReadIn,$strLine,$command);
214 $strReadIn = "" unless defined $strReadIn;
215
216 my $current_dir = cwd();
217 my $changed_dir = 0;
218 if (defined $working_dir && -e $working_dir) {
219 chdir "$working_dir";
220 $changed_dir = 1;
221 }
222
223 # When we are running this script through GLI, the SIGTERM signal handler
224 # won't get called on Windows when wget is to be prematurely terminated.
225 # Instead, when wget has to be terminated in the middle of execution, GLI will
226 # connect to a serverSocket here to communicate when it's time to stop wget.
227 if($self->dealingWithSockets()) {
228
229 $port = <STDIN>; # gets a port on localhost that's not yet in use
230 chomp($port);
231
232 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
233 LocalPort => $port,
234 Listen => 1,
235 Reuse => 1);
236
237 die "can't setup server" unless $serverSocket;
238 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
239
240 $read_set = new IO::Select(); # create handle set for reading
241 $read_set->add($serverSocket); # add the main socket to the set
242 }
243
244 my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
245 $command = "\"$wget_file_path\" $cmdWget";
246 #print STDOUT "Command is: $command\n";
247
248 # Wget's output needs to be monitored to find out when it has naturally termianted.
249 # Wget's output is sent to its STDERR so we can't use open2 without doing 2>&1.
250 # On linux, 2>&1 launches a subshell which then launches wget, meaning that killing
251 # the childpid does not kill wget on Linux but the subshell that launched it instead.
252 # Therefore, we use open3. Though the child process wget sends output only to its stdout,
253 # using open3 says chld_err is undefined and the output of wget only comes in chld_out(!)
254 # However that may be, it works with open3. But to avoid the confusion of managing and
255 # closing an extra unused handle, a single handle is used instead for both the child's
256 # stderr and stdout.
257
258 # Both open2 and open3 don't return on failure, but raise an exception. The handling
259 # of the exception is described on p.568 of the Perl Cookbook
260 eval {
261 $childpid = open3($chld_in, $chld_out, $chld_out, $command);
262 };
263 if ($@) {
264 if($@ =~ m/^open3/) {
265 die "open3 failed in $0: $!\n$@\n";
266 }
267 die "Tried to launch open3 in $0, got unexpected exception: $@";
268 }
269
270 my $loop = 1;
271 while($loop)
272 {
273 if (defined(my $strLine=<$chld_out>)) { # we're reading in from child process' stdout
274 if($blnShow) {
275 print STDERR "$strLine\n";
276 }
277 $strReadIn .= $strLine;
278 }
279 else { # wget finished, terminate naturally
280 #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
281 close($chld_in);
282 close($chld_out);
283 waitpid $childpid, 0;
284 $loop = 0;
285
286 $childpid = undef;
287 if(defined $port) {
288 $read_set->remove($serverSocket);
289 close($serverSocket);
290 }
291 }
292
293 # if we run this script from the command-line (as opposed to from GLI),
294 # then we're not working with sockets and can therefore can skip the next bits
295 next unless(defined $port);
296
297 # http://www.perlfect.com/articles/select.shtml
298 # "multiplex between several filehandles within a single thread of control,
299 # thus creating the effect of parallelism in the handling of I/O."
300 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
301
302 # take all readable handles in turn
303 foreach my $rh (@rh_set) {
304 if($rh == $serverSocket) {
305 my $client = $rh->accept();
306 #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
307 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
308
309 # Read from the client (getting rid of the trailing newline)
310 # Has the client sent the <<STOP>> signal?
311 my $signal = <$client>;
312 chomp($signal);
313 if($signal eq "<<STOP>>") {
314 print $client "Perl received STOP signal (on port $port): stopping wget\n";
315 $loop = 0; # out of outer while loop
316 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
317
318 # Sometimes the wget process takes some time to start up. If the STOP signal
319 # was sent, don't try to terminate the process until we know it is running.
320 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
321 # for it to start up, checking for whether it is running in order to kill it.
322 for(my $seconds = 1; $seconds <= 5; $seconds++) {
323 if(kill(0, $childpid)) {
324 # If kill(0, $childpid) returns true, then the process is running
325 # and we need to kill it.
326 close($chld_in);
327 close($chld_out);
328 kill("TERM", $childpid);
329
330 $childpid = undef;
331
332 # Stop monitoring the read_handle and close the serverSocket
333 # (the Java end will close the client socket that Java opened)
334 $read_set->remove($rh); #$read_set->remove($serverSocket);
335 close($rh); #close($serverSocket);
336 print $client "Perl terminated wget and is about to exit\n";
337 last; # out of inner for loop
338 }
339 else { # the process may just be starting up, wait
340 sleep(1);
341 }
342 }
343 last; # out of foreach loop
344 }
345 }
346 }
347 }
348
349 if ($changed_dir) {
350 chdir $current_dir;
351 }
352
353 return $strReadIn;
354}
355
356
357sub useWgetMonitored
358{
359 #local $| = 1; # autoflush stdout buffer
360 #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
361
362 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
363
364
365 my $current_dir = cwd();
366 my $changed_dir = 0;
367 if (defined $working_dir && -e $working_dir) {
368 chdir "$working_dir";
369 $changed_dir = 1;
370 }
371
372 # When we are running this script through GLI, the SIGTERM signal handler
373 # won't get called on Windows when wget is to be prematurely terminated.
374 # Instead, when wget has to be terminated in the middle of execution, GLI will
375 # connect to a serverSocket here to communicate when it's time to stop wget.
376 if($self->dealingWithSockets()) {
377
378 $port = <STDIN>; # gets a port on localhost that's not yet in use
379 chomp($port);
380
381 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
382 LocalPort => $port,
383 Listen => 1,
384 Reuse => 1);
385
386 die "can't setup server" unless $serverSocket;
387 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
388
389 $read_set = new IO::Select(); # create handle set for reading
390 $read_set->add($serverSocket); # add the main socket to the set
391 }
392
393 my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
394 my $command = "\"$wget_file_path\" $cmdWget";
395 #print STDOUT "Command is: $command\n";
396
397 eval { # see p.568 of Perl Cookbook
398 $childpid = open3($chld_in, $chld_out, $chld_out, $command);
399 };
400 if ($@) {
401 if($@ =~ m/^open3/) {
402 die "open3 failed in $0: $!\n$@\n";
403 }
404 die "Tried to launch open3 in $0, got unexpected exception: $@";
405 }
406
407 my $full_text = "";
408 my $error_text = "";
409 my @follow_list = ();
410 my $line;
411
412 my $loop = 1;
413 while($loop)
414 {
415 if (defined($line=<$chld_out>)) { # we're reading in from child process' stdout
416 if((defined $blnShow) && $blnShow)
417 {
418 print STDERR "$line";
419 }
420
421 if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
422 my $follow_url = $1;
423 push(@follow_list,$follow_url);
424 }
425
426 if ($line =~ m/ERROR\s+\d+/) {
427 $error_text .= $line;
428 }
429
430 $full_text .= $line;
431 }
432 else { # wget finished, terminate naturally
433 #print STDERR "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
434 close($chld_in);
435 close($chld_out);
436 # Program terminates only when the following line is included
437 # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
438 # it prevents the child from turning into a "zombie process".
439 # While the wget process terminates without it, this perl script does not:
440 # the DOS prompt is not returned without it.
441 waitpid $childpid, 0;
442 $loop = 0;
443
444 $childpid = undef;
445 if(defined $port) {
446 $read_set->remove($serverSocket);
447 close($serverSocket);
448 }
449 }
450
451 # if we run this script from the command-line (as opposed to from GLI),
452 # then we're not working with sockets and can therefore can skip the next bits
453 next unless(defined $port);
454
455 # http://www.perlfect.com/articles/select.shtml
456 # "multiplex between several filehandles within a single thread of control,
457 # thus creating the effect of parallelism in the handling of I/O."
458 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
459
460 # take all readable handles in turn
461 foreach my $rh (@rh_set) {
462 if($rh == $serverSocket) {
463 my $client = $rh->accept();
464 #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
465 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
466
467 # Read from the client (getting rid of trailing newline)
468 # Has the client sent the <<STOP>> signal?
469 my $signal = <$client>;
470 chomp($signal);
471 if($signal eq "<<STOP>>") {
472 print $client "Perl received STOP signal (on port $port): stopping wget\n";
473 $loop = 0; # out of outer while loop
474 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
475
476 # Sometimes the wget process takes some time to start up. If the STOP signal
477 # was sent, don't try to terminate the process until we know it is running.
478 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
479 # for it to start up, checking for whether it is running in order to kill it.
480 for(my $seconds = 1; $seconds <= 5; $seconds++) {
481 if(kill(0, $childpid)) {
482 # If kill(0, $childpid) returns true, then the process is running
483 # and we need to kill it.
484 close($chld_in);
485 close($chld_out);
486 kill("TERM", $childpid);
487
488 $childpid = undef;
489
490 # Stop monitoring the read_handle and close the serverSocket
491 # (the Java end will close the client socket that Java opened)
492 $read_set->remove($rh); #$read_set->remove($serverSocket);
493 close($rh); #close($serverSocket);
494 print $client "Perl terminated wget and is about to exit\n";
495 last; # out of inner for loop
496 }
497 else { # the process may just be starting up, wait
498 sleep(1);
499 }
500 }
501 last; # out of foreach loop
502 }
503 }
504 }
505 }
506
507 my $command_status = $?;
508 if ($command_status != 0) {
509 $error_text .= "Exit error: $command_status";
510 }
511
512 if ($changed_dir) {
513 chdir $current_dir;
514 }
515
516 my $final_follow = pop(@follow_list); # might be undefined, but that's OK
517
518 return ($full_text,$error_text,$final_follow);
519}
520
521
522# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
523sub checkURL
524{
525 my ($self) = @_;
526 if ($self->{'url'} eq "")
527 {
528 &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
529 }
530}
531
532sub error
533{
534 my ($strFunctionName,$strError) = @_;
535 {
536 print "Error occoured in WgetDownload.pm\n".
537 "In Function:".$strFunctionName."\n".
538 "Error Message:".$strError."\n";
539 exit(-1);
540 }
541}
542
5431;
544
Note: See TracBrowser for help on using the repository browser.