source: main/trunk/greenstone2/perllib/downloaders/WgetDownload.pm@ 28250

Last change on this file since 28250 was 28250, checked in by ak19, 11 years ago
  1. Jenny and I fixed an oversight in OAIDownload, thanks to Kathy's suggestion, where an html page's contents used to be inspected for file types specified to be downloaded with GetDocument ONLY if the file extension was htm(l). Now the test is whether the header specifies Content-Type text/html. 2. Deprecated utils functions replaced with their FileUtils equivalents.
  • Property svn:keywords set to Author Date Id Revision
File size: 18.0 KB
Line 
1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use BaseDownload;
35use strict;
36use Cwd;
37use IPC::Open3;
38use IO::Select;
39use IO::Socket;
40
41
42sub BEGIN {
43 @WgetDownload::ISA = ('BaseDownload');
44}
45
46my $arguments =
47 [ { 'name' => "proxy_on",
48 'desc' => "{WgetDownload.proxy_on}",
49 'type' => "flag",
50 'reqd' => "no",
51 'hiddengli' => "yes"},
52 { 'name' => "proxy_host",
53 'desc' => "{WgetDownload.proxy_host}",
54 'type' => "string",
55 'reqd' => "no",
56 'hiddengli' => "yes"},
57 { 'name' => "proxy_port",
58 'desc' => "{WgetDownload.proxy_port}",
59 'type' => "string",
60 'reqd' => "no",
61 'hiddengli' => "yes"},
62 { 'name' => "user_name",
63 'desc' => "{WgetDownload.user_name}",
64 'type' => "string",
65 'reqd' => "no",
66 'hiddengli' => "yes"},
67 { 'name' => "user_password",
68 'desc' => "{WgetDownload.user_password}",
69 'type' => "string",
70 'reqd' => "no",
71 'hiddengli' => "yes"}];
72
73my $options = { 'name' => "WgetDownload",
74 'desc' => "{WgetDownload.desc}",
75 'abstract' => "yes",
76 'inherits' => "yes",
77 'args' => $arguments };
78
79
80# Declaring file global variables related to the wget child process so that
81# the termination signal handler for SIGTERM can close the streams and tidy
82# up before ending the child process.
83my $childpid;
84my ($chld_out, $chld_in);
85my ($serverSocket, $read_set);
86
87# The port this script's server socket will be listening on, to handle
88# incoming signals from GLI to terminate wget. This is also file global,
89# since OAIDownload.pm will make several calls on wget using the same
90# instance of this script and we want to reuse whatever port GLI gave us.
91my $port;
92
93# When this script is called from the command line, this handler will be called
94# if this process is killed or abruptly ends due to receiving one of the
95# terminating signals that this handler is registered to deal with.
96sub abrupt_end_handler {
97 my $termination_signal = shift (@_);
98
99 if(defined $childpid) {
100 close($chld_out);
101 close($chld_in);
102
103 print STDOUT "Received termination signal: $termination_signal\n";
104
105 # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
106 # See http://perldoc.perl.org/perlipc.html#Signals
107 # Warning on using kill at http://perldoc.perl.org/perlfork.html
108 kill("TERM", $childpid);
109
110 # If the SIGTERM sent on Linux calls this handler, we want to make
111 # sure any socket connection is closed.
112 # Otherwise sockets are only used when this script is run from GLI
113 # in which case the handlers don't really get called.
114 if(defined $serverSocket) {
115 $read_set->remove($serverSocket) if defined $read_set;
116 close($serverSocket);
117 }
118 }
119
120 exit(0);
121}
122
123# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
124# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
125# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
126# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
127# http://affy.blogspot.com/p5be/ch13.htm
128# http://perldoc.perl.org/perlipc.html#Signals
129$SIG{'INT'} = \&abrupt_end_handler;
130$SIG{'TERM'} = \&abrupt_end_handler;
131
132sub new {
133 my ($class) = shift (@_);
134 my ($getlist,$inputargs,$hashArgOptLists) = @_;
135 push(@$getlist, $class);
136
137 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
138 push(@{$hashArgOptLists->{"OptList"}},$options);
139
140 my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
141
142 return bless $self, $class;
143}
144
145sub checkWgetSetup
146{
147 my ($self,$blnGliCall) = @_;
148 #TODO: proxy detection??
149
150 if((!$blnGliCall) && $self->{'proxy_on'})
151 {
152 &checkProxySetup($self);
153 }
154 &checkURL($self);
155}
156
157sub getWgetOptions
158{
159 my ($self) = @_;
160 my $strOptions = "";
161
162 if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'})
163 {
164
165 $strOptions .= " -e httpproxy=$self->{'proxy_host'}:$self->{'proxy_port'} ";
166
167 if ($self->{'user_name'} && $self->{'user_password'})
168 {
169 $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";
170 }
171 }
172
173 if ($self->{'proxy_on'}) {
174 $strOptions .= " --proxy ";
175 }
176
177 return $strOptions;
178}
179
180# Checking for proxy setup: proxy server, proxy port, proxy username and password.
181sub checkProxySetup
182{
183 my ($self) = @_;
184 ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
185 # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
186 # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
187
188}
189
190# Returns true if the wget status needs to be monitored through sockets
191# (if a socket is used to communicate with the Java program on when to
192# terminate wget). True if we are running gli, or if the particular type
193# of WgetDownload is *not* OAIDownload (in that case, the original way of
194# terminating the perl script from Java terminated wget as well).
195sub dealingWithSockets() {
196 my ($self) = @_;
197 return (defined $self->{'gli'} && $self->{'gli'} && !defined $port && ref($self) ne "OAIDownload");
198 # use ref($self) to find the classname of an object
199}
200
201
202sub useWget
203{
204 #local $| = 1; # autoflush stdout buffer
205 #print STDOUT "*** Start of subroutine useWget in $0\n";
206
207 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
208
209 my ($strReadIn,$strLine,$command);
210 $strReadIn = "" unless defined $strReadIn;
211
212 my $current_dir = cwd();
213 my $changed_dir = 0;
214 if (defined $working_dir && -e $working_dir) {
215 chdir "$working_dir";
216 $changed_dir = 1;
217 }
218
219 # When we are running this script through GLI, the SIGTERM signal handler
220 # won't get called on Windows when wget is to be prematurely terminated.
221 # Instead, when wget has to be terminated in the middle of execution, GLI will
222 # connect to a serverSocket here to communicate when it's time to stop wget.
223 if($self->dealingWithSockets()) {
224
225 $port = <STDIN>; # gets a port on localhost that's not yet in use
226 chomp($port);
227
228 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
229 LocalPort => $port,
230 Listen => 1,
231 Reuse => 1);
232
233 die "can't setup server" unless $serverSocket;
234 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
235
236 $read_set = new IO::Select(); # create handle set for reading
237 $read_set->add($serverSocket); # add the main socket to the set
238 }
239
240 my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
241 $command = "\"$wget_file_path\" $cmdWget";
242 #print STDOUT "Command is: $command\n";
243
244 # Wget's output needs to be monitored to find out when it has naturally termianted.
245 # Wget's output is sent to its STDERR so we can't use open2 without doing 2>&1.
246 # On linux, 2>&1 launches a subshell which then launches wget, meaning that killing
247 # the childpid does not kill wget on Linux but the subshell that launched it instead.
248 # Therefore, we use open3. Though the child process wget sends output only to its stdout,
249 # using open3 says chld_err is undefined and the output of wget only comes in chld_out(!)
250 # However that may be, it works with open3. But to avoid the confusion of managing and
251 # closing an extra unused handle, a single handle is used instead for both the child's
252 # stderr and stdout.
253
254 # Both open2 and open3 don't return on failure, but raise an exception. The handling
255 # of the exception is described on p.568 of the Perl Cookbook
256 eval {
257 $childpid = open3($chld_in, $chld_out, $chld_out, $command);
258 };
259 if ($@) {
260 if($@ =~ m/^open3/) {
261 die "open3 failed in $0: $!\n$@\n";
262 }
263 die "Tried to launch open3 in $0, got unexpected exception: $@";
264 }
265
266 my $loop = 1;
267 while($loop)
268 {
269 if (defined(my $strLine=<$chld_out>)) { # we're reading in from child process' stdout
270 if($blnShow) {
271 print STDERR "$strLine\n";
272 }
273 $strReadIn .= $strLine;
274 }
275 else { # wget finished, terminate naturally
276 #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
277 close($chld_in);
278 close($chld_out);
279 waitpid $childpid, 0;
280 $loop = 0;
281
282 $childpid = undef;
283 if(defined $port) {
284 $read_set->remove($serverSocket);
285 close($serverSocket);
286 }
287 }
288
289 # if we run this script from the command-line (as opposed to from GLI),
290 # then we're not working with sockets and can therefore can skip the next bits
291 next unless(defined $port);
292
293 # http://www.perlfect.com/articles/select.shtml
294 # "multiplex between several filehandles within a single thread of control,
295 # thus creating the effect of parallelism in the handling of I/O."
296 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
297
298 # take all readable handles in turn
299 foreach my $rh (@rh_set) {
300 if($rh == $serverSocket) {
301 my $client = $rh->accept();
302 #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
303 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
304
305 # Read from the client (getting rid of the trailing newline)
306 # Has the client sent the <<STOP>> signal?
307 my $signal = <$client>;
308 chomp($signal);
309 if($signal eq "<<STOP>>") {
310 print $client "Perl received STOP signal (on port $port): stopping wget\n";
311 $loop = 0; # out of outer while loop
312 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
313
314 # Sometimes the wget process takes some time to start up. If the STOP signal
315 # was sent, don't try to terminate the process until we know it is running.
316 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
317 # for it to start up, checking for whether it is running in order to kill it.
318 for(my $seconds = 1; $seconds <= 5; $seconds++) {
319 if(kill(0, $childpid)) {
320 # If kill(0, $childpid) returns true, then the process is running
321 # and we need to kill it.
322 close($chld_in);
323 close($chld_out);
324 kill("TERM", $childpid);
325
326 $childpid = undef;
327
328 # Stop monitoring the read_handle and close the serverSocket
329 # (the Java end will close the client socket that Java opened)
330 $read_set->remove($rh); #$read_set->remove($serverSocket);
331 close($rh); #close($serverSocket);
332 print $client "Perl terminated wget and is about to exit\n";
333 last; # out of inner for loop
334 }
335 else { # the process may just be starting up, wait
336 sleep(1);
337 }
338 }
339 last; # out of foreach loop
340 }
341 }
342 }
343 }
344
345 if ($changed_dir) {
346 chdir $current_dir;
347 }
348
349 return $strReadIn;
350}
351
352
353sub useWgetMonitored
354{
355 #local $| = 1; # autoflush stdout buffer
356 #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
357
358 my ($self, $cmdWget,$blnShow, $working_dir) = @_;
359
360
361 my $current_dir = cwd();
362 my $changed_dir = 0;
363 if (defined $working_dir && -e $working_dir) {
364 chdir "$working_dir";
365 $changed_dir = 1;
366 }
367
368 # When we are running this script through GLI, the SIGTERM signal handler
369 # won't get called on Windows when wget is to be prematurely terminated.
370 # Instead, when wget has to be terminated in the middle of execution, GLI will
371 # connect to a serverSocket here to communicate when it's time to stop wget.
372 if($self->dealingWithSockets()) {
373
374 $port = <STDIN>; # gets a port on localhost that's not yet in use
375 chomp($port);
376
377 $serverSocket = IO::Socket::INET->new( Proto => 'tcp',
378 LocalPort => $port,
379 Listen => 1,
380 Reuse => 1);
381
382 die "can't setup server" unless $serverSocket;
383 #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
384
385 $read_set = new IO::Select(); # create handle set for reading
386 $read_set->add($serverSocket); # add the main socket to the set
387 }
388
389 my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
390 my $command = "\"$wget_file_path\" $cmdWget";
391 #print STDOUT "Command is: $command\n";
392
393 eval { # see p.568 of Perl Cookbook
394 $childpid = open3($chld_in, $chld_out, $chld_out, $command);
395 };
396 if ($@) {
397 if($@ =~ m/^open3/) {
398 die "open3 failed in $0: $!\n$@\n";
399 }
400 die "Tried to launch open3 in $0, got unexpected exception: $@";
401 }
402
403 my $full_text = "";
404 my $error_text = "";
405 my @follow_list = ();
406 my $line;
407
408 my $loop = 1;
409 while($loop)
410 {
411 if (defined($line=<$chld_out>)) { # we're reading in from child process' stdout
412 if((defined $blnShow) && $blnShow)
413 {
414 print STDERR "$line";
415 }
416
417 if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
418 my $follow_url = $1;
419 push(@follow_list,$follow_url);
420 }
421
422 if ($line =~ m/ERROR\s+\d+/) {
423 $error_text .= $line;
424 }
425
426 $full_text .= $line;
427 }
428 else { # wget finished, terminate naturally
429 #print STDERR "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
430 close($chld_in);
431 close($chld_out);
432 # Program terminates only when the following line is included
433 # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
434 # it prevents the child from turning into a "zombie process".
435 # While the wget process terminates without it, this perl script does not:
436 # the DOS prompt is not returned without it.
437 waitpid $childpid, 0;
438 $loop = 0;
439
440 $childpid = undef;
441 if(defined $port) {
442 $read_set->remove($serverSocket);
443 close($serverSocket);
444 }
445 }
446
447 # if we run this script from the command-line (as opposed to from GLI),
448 # then we're not working with sockets and can therefore can skip the next bits
449 next unless(defined $port);
450
451 # http://www.perlfect.com/articles/select.shtml
452 # "multiplex between several filehandles within a single thread of control,
453 # thus creating the effect of parallelism in the handling of I/O."
454 my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
455
456 # take all readable handles in turn
457 foreach my $rh (@rh_set) {
458 if($rh == $serverSocket) {
459 my $client = $rh->accept();
460 #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
461 print $client "Talked to ServerSocket (port $port). Connection accepted\n";
462
463 # Read from the client (getting rid of trailing newline)
464 # Has the client sent the <<STOP>> signal?
465 my $signal = <$client>;
466 chomp($signal);
467 if($signal eq "<<STOP>>") {
468 print $client "Perl received STOP signal (on port $port): stopping wget\n";
469 $loop = 0; # out of outer while loop
470 $self->{'forced_quit'} = 1; # subclasses need to know we're quitting
471
472 # Sometimes the wget process takes some time to start up. If the STOP signal
473 # was sent, don't try to terminate the process until we know it is running.
474 # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
475 # for it to start up, checking for whether it is running in order to kill it.
476 for(my $seconds = 1; $seconds <= 5; $seconds++) {
477 if(kill(0, $childpid)) {
478 # If kill(0, $childpid) returns true, then the process is running
479 # and we need to kill it.
480 close($chld_in);
481 close($chld_out);
482 kill("TERM", $childpid);
483
484 $childpid = undef;
485
486 # Stop monitoring the read_handle and close the serverSocket
487 # (the Java end will close the client socket that Java opened)
488 $read_set->remove($rh); #$read_set->remove($serverSocket);
489 close($rh); #close($serverSocket);
490 print $client "Perl terminated wget and is about to exit\n";
491 last; # out of inner for loop
492 }
493 else { # the process may just be starting up, wait
494 sleep(1);
495 }
496 }
497 last; # out of foreach loop
498 }
499 }
500 }
501 }
502
503 my $command_status = $?;
504 if ($command_status != 0) {
505 $error_text .= "Exit error: $command_status";
506 }
507
508 if ($changed_dir) {
509 chdir $current_dir;
510 }
511
512 my $final_follow = pop(@follow_list); # might be undefined, but that's OK
513
514 return ($full_text,$error_text,$final_follow);
515}
516
517
518# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
519sub checkURL
520{
521 my ($self) = @_;
522 if ($self->{'url'} eq "")
523 {
524 &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
525 }
526}
527
528sub error
529{
530 my ($strFunctionName,$strError) = @_;
531 {
532 print "Error occoured in WgetDownload.pm\n".
533 "In Function:".$strFunctionName."\n".
534 "Error Message:".$strError."\n";
535 exit(-1);
536 }
537}
538
5391;
540
Note: See TracBrowser for help on using the repository browser.