root/main/trunk/greenstone2/perllib/downloaders/WgetDownload.pm @ 31851

Revision 31851, 18.5 KB (checked in by ak19, 2 years ago)

1. Fixes to get proxying to work on Windows. 2. Fixes to timeout if a page doesn't exist and it takes forever to read. Both for downloading from a URL and getting server info (perl code), and also in Java code, when doing a getRedirectURL(). Generally, a URL is correct and when wget is launched, a cancel operation in the Java GUI successfully causes and interrupt which then terminates wget. However, if the URL doesn't exist, either when getting serer info or when downloading, the wget launched by the perl seems to block or something, and the interrupt is not noticed until the wget is manually terminated through the task manager. Then the interrupt is finally noticed. If pages would indicate they don't exist, then it wouldn't have been a problem. This issue is now circumvented through setting a read-timeout, to stop retrieving pages that don't exist but that take forever to access anyway as they don't indicate that they don't exist. A connect timeout is for if you get proxy details wrong or something like that and it takes forever to connect.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use BaseDownload;
35use strict;
36use Cwd;
37use util;
38use IPC::Open3;
39use IO::Select;
40use IO::Socket;
41
42
43sub BEGIN {
44    @WgetDownload::ISA = ('BaseDownload');
45}
46
47my $arguments =
48     [ { 'name' => "proxy_on",
49    'desc' => "{WgetDownload.proxy_on}",
50    'type' => "flag",
51    'reqd' => "no",
52    'hiddengli' => "yes"},
53      { 'name' => "proxy_host", 
54    'desc' => "{WgetDownload.proxy_host}",         
55    'type' => "string",
56    'reqd' => "no",
57    'hiddengli' => "yes"},
58      { 'name' => "proxy_port",
59    'desc' => "{WgetDownload.proxy_port}",         
60    'type' => "string",
61    'reqd' => "no",
62    'hiddengli' => "yes"},
63      { 'name' => "user_name", 
64    'desc' => "{WgetDownload.user_name}",         
65    'type' => "string",
66    'reqd' => "no",
67    'hiddengli' => "yes"},
68      { 'name' => "user_password",
69    'desc' => "{WgetDownload.user_password}",         
70    'type' => "string",
71    'reqd' => "no",
72    'hiddengli' => "yes"}];
73
74my $options = { 'name'     => "WgetDownload",
75        'desc'     => "{WgetDownload.desc}",
76        'abstract' => "yes",
77        'inherits' => "yes",
78        'args'     => $arguments };
79
80
81# Declaring file global variables related to the wget child process so that
82# the termination signal handler for SIGTERM can close the streams and tidy
83# up before ending the child process.
84my $childpid;
85my ($chld_out, $chld_in);
86my ($serverSocket, $read_set);
87
88# The port this script's server socket will be listening on, to handle
89# incoming signals from GLI to terminate wget. This is also file global,
90# since OAIDownload.pm will make several calls on wget using the same
91# instance of this script and we want to reuse whatever port GLI gave us.
92my $port;
93
94# When this script is called from the command line, this handler will be called
95# if this process is killed or abruptly ends due to receiving one of the
96# terminating signals that this handler is registered to deal with.
97sub abrupt_end_handler {
98    my $termination_signal = shift (@_);
99
100    if(defined $childpid) {
101    close($chld_out);
102    close($chld_in);
103   
104    print STDOUT "Received termination signal: $termination_signal\n";
105
106    # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
107    # See http://perldoc.perl.org/perlipc.html#Signals
108    # Warning on using kill at http://perldoc.perl.org/perlfork.html
109    kill("TERM", $childpid);
110
111    # If the SIGTERM sent on Linux calls this handler, we want to make
112    # sure any socket connection is closed.
113    # Otherwise sockets are only used when this script is run from GLI
114    # in which case the handlers don't really get called.
115    if(defined $serverSocket) {
116        $read_set->remove($serverSocket) if defined $read_set;
117        close($serverSocket);
118    }
119    }
120
121    exit(0);
122}
123
124# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
125# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
126# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
127# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
128# http://affy.blogspot.com/p5be/ch13.htm
129# http://perldoc.perl.org/perlipc.html#Signals
130$SIG{'INT'} = \&abrupt_end_handler;
131$SIG{'TERM'} = \&abrupt_end_handler;
132
133sub new {
134    my ($class) = shift (@_);
135    my ($getlist,$inputargs,$hashArgOptLists) = @_;
136    push(@$getlist, $class);
137
138    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
139    push(@{$hashArgOptLists->{"OptList"}},$options);
140
141    my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
142
143    # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
144    &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set
145
146    return bless $self, $class;
147}
148
149sub checkWgetSetup
150{
151    my ($self,$blnGliCall) = @_;
152    #TODO: proxy detection??
153   
154    if((!$blnGliCall) && $self->{'proxy_on'})
155    {
156    &checkProxySetup($self);
157    }
158    &checkURL($self);
159}
160
161sub getWgetOptions
162{
163    my ($self) = @_;
164    my $strOptions = "";
165   
166    if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'})
167    {
168
169    if($self->{'url'} =~ m/^https\:/) {
170        $strOptions .= " -e https_proxy=$self->{'proxy_host'}:$self->{'proxy_port'} "; 
171    } else {
172        $strOptions .= " -e http_proxy=$self->{'proxy_host'}:$self->{'proxy_port'} ";
173    }   
174
175    if ($self->{'user_name'} && $self->{'user_password'})
176    {
177        $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";
178    }
179    }
180
181    if ($self->{'proxy_on'}) {
182    $strOptions .= " --proxy ";
183    }
184
185    if($self->{'no_check_certificate'} && $self->{'url'} =~ m/^https\:/) {
186        $strOptions .= " --no-check-certificate ";
187    }
188   
189    return $strOptions;
190}
191
192# Checking for proxy setup: proxy server, proxy port, proxy username and password.
193sub checkProxySetup
194{
195    my ($self) = @_;
196    ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
197    # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
198    # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
199
200}
201
202# Returns true if the wget status needs to be monitored through sockets
203# (if a socket is used to communicate with the Java program on when to
204# terminate wget). True if we are running gli, or if the particular type
205# of WgetDownload is *not* OAIDownload (in that case, the original way of
206# terminating the perl script from Java terminated wget as well).
207sub dealingWithSockets() {
208    my ($self) = @_;
209    return (defined $self->{'gli'} && $self->{'gli'} && !defined $port && ref($self) ne "OAIDownload");
210                       # use ref($self) to find the classname of an object
211}
212
213
214sub useWget
215{
216    #local $| = 1; # autoflush stdout buffer
217    #print STDOUT "*** Start of subroutine useWget in $0\n";
218
219    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
220
221    my ($strReadIn,$strLine,$command);
222    $strReadIn = "" unless defined $strReadIn;
223
224    my $current_dir = cwd();
225    my $changed_dir = 0;
226    if (defined $working_dir && -e $working_dir) {
227    chdir "$working_dir";
228    $changed_dir = 1;
229    }
230
231    # When we are running this script through GLI, the SIGTERM signal handler
232    # won't get called on Windows when wget is to be prematurely terminated.
233    # Instead, when wget has to be terminated in the middle of execution, GLI will
234    # connect to a serverSocket here to communicate when it's time to stop wget.
235    if($self->dealingWithSockets()) {
236
237    $port = <STDIN>; # gets a port on localhost that's not yet in use
238    chomp($port);
239   
240    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
241                           LocalPort => $port,
242                           Listen    => 1,
243                           Reuse     => 1);
244   
245    die "can't setup server" unless $serverSocket;
246    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
247
248    $read_set = new IO::Select();         # create handle set for reading
249    $read_set->add($serverSocket);        # add the main socket to the set
250    }
251
252    my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
253    $command = "\"$wget_file_path\" $cmdWget";
254    #print STDOUT "Command is: $command\n";
255
256    # Wget's output needs to be monitored to find out when it has naturally terminated.
257    # Wget's output is sent to its STDERR so we can't use open2 without doing 2>&1.
258    # On linux, 2>&1 launches a subshell which then launches wget, meaning that killing
259    # the childpid does not kill wget on Linux but the subshell that launched it instead.
260    # Therefore, we use open3. Though the child process wget sends output only to its stdout,
261    # using open3 says chld_err is undefined and the output of wget only comes in chld_out(!)
262    # However that may be, it works with open3. But to avoid the confusion of managing and
263    # closing an extra unused handle, a single handle is used instead for both the child's
264    # stderr and stdout.
265
266    # Both open2 and open3 don't return on failure, but raise an exception. The handling
267    # of the exception is described on p.568 of the Perl Cookbook
268    eval {
269    $childpid = open3($chld_in, $chld_out, $chld_out, $command);
270    };
271    if ($@) {
272    if($@ =~ m/^open3/) {
273        die "open3 failed in $0: $!\n$@\n";     
274    }
275    die "Tried to launch open3 in $0, got unexpected exception: $@";
276    }
277
278    my $loop = 1;
279    while($loop)
280    {
281    if (defined(my $strLine=<$chld_out>)) { # we're reading in from child process' stdout
282        if($blnShow) {
283        print STDERR "$strLine\n";
284        }
285        $strReadIn .= $strLine;
286    }
287    else { # wget finished, terminate naturally
288        #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
289        close($chld_in);
290        close($chld_out);
291        waitpid $childpid, 0;
292        $loop = 0;
293       
294        $childpid = undef;
295        if(defined $port) {
296        $read_set->remove($serverSocket);
297        close($serverSocket);
298        }
299    }
300
301    # if we run this script from the command-line (as opposed to from GLI),
302    # then we're not working with sockets and can therefore can skip the next bits
303    next unless(defined $port);
304
305    # http://www.perlfect.com/articles/select.shtml
306    # "multiplex between several filehandles within a single thread of control,
307    # thus creating the effect of parallelism in the handling of I/O."
308    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
309
310    # take all readable handles in turn
311    foreach my $rh (@rh_set) {
312        if($rh == $serverSocket) {
313        my $client = $rh->accept();
314        #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
315        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
316       
317        # Read from the client (getting rid of the trailing newline)
318        # Has the client sent the <<STOP>> signal?
319        my $signal = <$client>;
320        chomp($signal);
321        if($signal eq "<<STOP>>") {
322            print $client "Perl received STOP signal (on port $port): stopping wget\n";
323            $loop = 0;                          # out of outer while loop
324            $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
325           
326            # Sometimes the wget process takes some time to start up. If the STOP signal
327            # was sent, don't try to terminate the process until we know it is running.
328            # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
329            # for it to start up, checking for whether it is running in order to kill it.
330            for(my $seconds = 1; $seconds <= 5; $seconds++) {
331            if(kill(0, $childpid)) {
332                # If kill(0, $childpid) returns true, then the process is running
333                # and we need to kill it.
334                close($chld_in);
335                close($chld_out);
336                kill("TERM", $childpid);
337               
338                $childpid = undef;
339               
340                # Stop monitoring the read_handle and close the serverSocket
341                # (the Java end will close the client socket that Java opened)
342                $read_set->remove($rh);     #$read_set->remove($serverSocket);
343                close($rh);             #close($serverSocket);
344                print $client "Perl terminated wget and is about to exit\n";
345                last;                           # out of inner for loop
346            }
347            else { # the process may just be starting up, wait
348                sleep(1);
349            }
350            }
351            last;                               # out of foreach loop
352        }
353        }
354    }
355    }
356
357    if ($changed_dir) {
358    chdir $current_dir;
359    }
360   
361    return $strReadIn;
362}
363
364
365sub useWgetMonitored
366{
367    #local $| = 1; # autoflush stdout buffer
368    #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
369
370    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
371
372
373    my $current_dir = cwd();
374    my $changed_dir = 0;
375    if (defined $working_dir && -e $working_dir) {
376    chdir "$working_dir";
377    $changed_dir = 1;
378    }
379
380    # When we are running this script through GLI, the SIGTERM signal handler
381    # won't get called on Windows when wget is to be prematurely terminated.
382    # Instead, when wget has to be terminated in the middle of execution, GLI will
383    # connect to a serverSocket here to communicate when it's time to stop wget.
384    if($self->dealingWithSockets()) {
385
386    $port = <STDIN>; # gets a port on localhost that's not yet in use
387    chomp($port);
388   
389    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
390                           LocalPort => $port,
391                           Listen    => 1,
392                           Reuse     => 1);
393   
394    die "can't setup server" unless $serverSocket;
395    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
396
397    $read_set = new IO::Select();         # create handle set for reading
398    $read_set->add($serverSocket);        # add the main socket to the set
399    }
400
401    my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
402    my $command = "\"$wget_file_path\" $cmdWget";
403    #print STDOUT "Command is: $command\n";
404
405    eval {     # see p.568 of Perl Cookbook
406    $childpid = open3($chld_in, $chld_out, $chld_out, $command);
407    };
408    if ($@) {
409    if($@ =~ m/^open3/) {
410        die "open3 failed in $0: $!\n$@\n";     
411    }
412    die "Tried to launch open3 in $0, got unexpected exception: $@";
413    }
414
415    my $full_text = "";
416    my $error_text = "";
417    my @follow_list = ();
418    my $line;
419
420    my $loop = 1;
421    while($loop)
422    {
423    if (defined($line=<$chld_out>)) { # we're reading in from child process' stdout
424        if((defined $blnShow) && $blnShow)
425        {
426        print STDERR "$line";
427        }
428       
429        if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
430        my $follow_url = $1;
431        push(@follow_list,$follow_url);
432        }
433       
434        if ($line =~ m/ERROR\s+\d+/) {
435        $error_text .= $line;
436        }
437       
438        $full_text .= $line;
439    }
440    else { # wget finished, terminate naturally
441        #print STDERR "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
442        close($chld_in);
443        close($chld_out);
444        # Program terminates only when the following line is included
445        # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
446        # it prevents the child from turning into a "zombie process".
447        # While the wget process terminates without it, this perl script does not:
448        # the DOS prompt is not returned without it.
449        waitpid $childpid, 0;
450        $loop = 0;
451       
452        $childpid = undef;
453        if(defined $port) {
454        $read_set->remove($serverSocket);
455        close($serverSocket);
456        }
457    }
458
459    # if we run this script from the command-line (as opposed to from GLI),
460    # then we're not working with sockets and can therefore can skip the next bits
461    next unless(defined $port);
462
463    # http://www.perlfect.com/articles/select.shtml
464    # "multiplex between several filehandles within a single thread of control,
465    # thus creating the effect of parallelism in the handling of I/O."
466    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
467
468    # take all readable handles in turn
469    foreach my $rh (@rh_set) {
470        if($rh == $serverSocket) {
471        my $client = $rh->accept();
472        #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
473        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
474       
475        # Read from the client (getting rid of trailing newline)
476        # Has the client sent the <<STOP>> signal?
477        my $signal = <$client>;
478        chomp($signal);
479        if($signal eq "<<STOP>>") {
480            print $client "Perl received STOP signal (on port $port): stopping wget\n";
481            $loop = 0;                          # out of outer while loop
482            $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
483           
484            # Sometimes the wget process takes some time to start up. If the STOP signal
485            # was sent, don't try to terminate the process until we know it is running.
486            # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
487            # for it to start up, checking for whether it is running in order to kill it.
488            for(my $seconds = 1; $seconds <= 5; $seconds++) {
489            if(kill(0, $childpid)) {
490                # If kill(0, $childpid) returns true, then the process is running
491                # and we need to kill it.
492                close($chld_in);
493                close($chld_out);
494                kill("TERM", $childpid);
495               
496                $childpid = undef;
497               
498                # Stop monitoring the read_handle and close the serverSocket
499                # (the Java end will close the client socket that Java opened)
500                $read_set->remove($rh);     #$read_set->remove($serverSocket);
501                close($rh);             #close($serverSocket);
502                print $client "Perl terminated wget and is about to exit\n";
503                last;                           # out of inner for loop
504            }
505            else { # the process may just be starting up, wait
506                sleep(1);
507            }
508            }
509            last;                               # out of foreach loop
510        }
511        }
512    }
513    }
514
515    my $command_status = $?;
516    if ($command_status != 0) {
517    $error_text .= "Exit error: $command_status";
518    }
519
520    if ($changed_dir) {
521    chdir $current_dir;
522    }
523   
524    my $final_follow = pop(@follow_list); # might be undefined, but that's OK
525   
526    return ($full_text,$error_text,$final_follow);
527}
528
529
530# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
531sub checkURL
532{
533    my ($self) = @_;
534    if ($self->{'url'} eq "")
535    {
536    &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
537    }
538}
539
540sub error
541{
542    my ($strFunctionName,$strError) = @_;
543    {
544    print "Error occoured in WgetDownload.pm\n".
545        "In Function:".$strFunctionName."\n".
546        "Error Message:".$strError."\n";
547    exit(-1);
548    }
549}
550
5511;
552
Note: See TracBrowser for help on using the browser.