root/main/trunk/greenstone2/perllib/downloaders/WgetDownload.pm @ 31856

Revision 31856, 18.7 KB (checked in by ak19, 3 years ago)

If the proxy settings are wrong or set when not needed, pressing the Server Information button would take forever (freeze GLI GUI), and the wget that java launched through perl would also take forever, blocking. The wget will have to be terminated from Task Manager. To overcome issues of network settings misconfigurations, which Dr Bainbridge said are hard to detect, setting the number of tries on pressing the Server Info button to 2. The number of tries for pressing the Download button were already 2, so this just makes the two wget commands issued more similar (but the wget launched by the Download button now uses the --tries=2 rather than the shorthand -t 2 too, so that the code reads better). Setting the number of wget retries launched by the Server Info Dialog also ensures wget is eventually terminated, as happens when both tries fail. Some more informative messages are now displayed if the server is unavaiable, depending on whether proxying is on or not.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use BaseDownload;
35use strict;
36use Cwd;
37use util;
38use IPC::Open3;
39use IO::Select;
40use IO::Socket;
41
42
43sub BEGIN {
44    @WgetDownload::ISA = ('BaseDownload');
45}
46
47my $arguments =
48     [ { 'name' => "proxy_on",
49    'desc' => "{WgetDownload.proxy_on}",
50    'type' => "flag",
51    'reqd' => "no",
52    'hiddengli' => "yes"},
53      { 'name' => "proxy_host", 
54    'desc' => "{WgetDownload.proxy_host}",         
55    'type' => "string",
56    'reqd' => "no",
57    'hiddengli' => "yes"},
58      { 'name' => "proxy_port",
59    'desc' => "{WgetDownload.proxy_port}",         
60    'type' => "string",
61    'reqd' => "no",
62    'hiddengli' => "yes"},
63      { 'name' => "user_name", 
64    'desc' => "{WgetDownload.user_name}",         
65    'type' => "string",
66    'reqd' => "no",
67    'hiddengli' => "yes"},
68      { 'name' => "user_password",
69    'desc' => "{WgetDownload.user_password}",         
70    'type' => "string",
71    'reqd' => "no",
72    'hiddengli' => "yes"}];
73
74my $options = { 'name'     => "WgetDownload",
75        'desc'     => "{WgetDownload.desc}",
76        'abstract' => "yes",
77        'inherits' => "yes",
78        'args'     => $arguments };
79
80
81# Declaring file global variables related to the wget child process so that
82# the termination signal handler for SIGTERM can close the streams and tidy
83# up before ending the child process.
84my $childpid;
85my ($chld_out, $chld_in);
86my ($serverSocket, $read_set);
87
88# The port this script's server socket will be listening on, to handle
89# incoming signals from GLI to terminate wget. This is also file global,
90# since OAIDownload.pm will make several calls on wget using the same
91# instance of this script and we want to reuse whatever port GLI gave us.
92my $port;
93
94# When this script is called from the command line, this handler will be called
95# if this process is killed or abruptly ends due to receiving one of the
96# terminating signals that this handler is registered to deal with.
97sub abrupt_end_handler {
98    my $termination_signal = shift (@_);
99
100    if(defined $childpid) {
101    close($chld_out);
102    close($chld_in);
103   
104    print STDOUT "Received termination signal: $termination_signal\n";
105
106    # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
107    # See http://perldoc.perl.org/perlipc.html#Signals
108    # Warning on using kill at http://perldoc.perl.org/perlfork.html
109    kill("TERM", $childpid);
110
111    # If the SIGTERM sent on Linux calls this handler, we want to make
112    # sure any socket connection is closed.
113    # Otherwise sockets are only used when this script is run from GLI
114    # in which case the handlers don't really get called.
115    if(defined $serverSocket) {
116        $read_set->remove($serverSocket) if defined $read_set;
117        close($serverSocket);
118    }
119    }
120
121    exit(0);
122}
123
124# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
125# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
126# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
127# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
128# http://affy.blogspot.com/p5be/ch13.htm
129# http://perldoc.perl.org/perlipc.html#Signals
130$SIG{'INT'} = \&abrupt_end_handler;
131$SIG{'TERM'} = \&abrupt_end_handler;
132
133sub new {
134    my ($class) = shift (@_);
135    my ($getlist,$inputargs,$hashArgOptLists) = @_;
136    push(@$getlist, $class);
137
138    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
139    push(@{$hashArgOptLists->{"OptList"}},$options);
140
141    my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
142
143    # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
144    &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set
145
146    return bless $self, $class;
147}
148
149sub checkWgetSetup
150{
151    my ($self,$blnGliCall) = @_;
152    #TODO: proxy detection??
153   
154    if((!$blnGliCall) && $self->{'proxy_on'})
155    {
156    &checkProxySetup($self);
157    }
158    &checkURL($self);
159}
160
161sub getWgetOptions
162{
163    my ($self) = @_;
164    my $strOptions = "";
165   
166    if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'})
167    {
168
169    if($self->{'url'} =~ m/^https\:/) {
170        $strOptions .= " -e https_proxy=$self->{'proxy_host'}:$self->{'proxy_port'} "; 
171    } else {
172        $strOptions .= " -e http_proxy=$self->{'proxy_host'}:$self->{'proxy_port'} ";
173    }   
174
175    if ($self->{'user_name'} && $self->{'user_password'})
176    {
177        $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";
178    }
179    }
180
181    if ($self->{'proxy_on'}) {
182    $strOptions .= " --proxy ";
183    }
184
185    if($self->{'no_check_certificate'} && $self->{'url'} =~ m/^https\:/) {
186        $strOptions .= " --no-check-certificate ";
187    }
188   
189    return $strOptions;
190}
191
192# Checking for proxy setup: proxy server, proxy port, proxy username and password.
193sub checkProxySetup
194{
195    my ($self) = @_;
196    ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
197    # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
198    # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
199
200}
201
202# Returns true if the wget status needs to be monitored through sockets
203# (if a socket is used to communicate with the Java program on when to
204# terminate wget). True if we are running gli, or if the particular type
205# of WgetDownload is *not* OAIDownload (in that case, the original way of
206# terminating the perl script from Java terminated wget as well).
207sub dealingWithSockets() {
208    my ($self) = @_;
209    return (defined $self->{'gli'} && $self->{'gli'} && !defined $port && ref($self) ne "OAIDownload");
210                       # use ref($self) to find the classname of an object
211}
212
213
214sub useWget
215{
216    #local $| = 1; # autoflush stdout buffer
217    #print STDOUT "*** Start of subroutine useWget in $0\n";
218
219    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
220
221    my ($strReadIn,$strLine,$command);
222    $strReadIn = "" unless defined $strReadIn;
223
224    my $current_dir = cwd();
225    my $changed_dir = 0;
226    if (defined $working_dir && -e $working_dir) {
227    chdir "$working_dir";
228    $changed_dir = 1;
229    }
230
231    # When we are running this script through GLI, the SIGTERM signal handler
232    # won't get called on Windows when wget is to be prematurely terminated.
233    # Instead, when wget has to be terminated in the middle of execution, GLI will
234    # connect to a serverSocket here to communicate when it's time to stop wget.
235    if($self->dealingWithSockets()) {
236
237    $port = <STDIN>; # gets a port on localhost that's not yet in use
238    chomp($port);
239   
240    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
241                           LocalPort => $port,
242                           Listen    => 1,
243                           Reuse     => 1);
244   
245    die "can't setup server" unless $serverSocket;
246    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
247
248    $read_set = new IO::Select();         # create handle set for reading
249    $read_set->add($serverSocket);        # add the main socket to the set
250    }
251
252    my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
253    $command = "\"$wget_file_path\" $cmdWget";
254    #print STDOUT "Command is: $command\n";
255
256    # Wget's output needs to be monitored to find out when it has naturally terminated.
257    # Wget's output is sent to its STDERR so we can't use open2 without doing 2>&1.
258    # On linux, 2>&1 launches a subshell which then launches wget, meaning that killing
259    # the childpid does not kill wget on Linux but the subshell that launched it instead.
260    # Therefore, we use open3. Though the child process wget sends output only to its stdout,
261    # using open3 says chld_err is undefined and the output of wget only comes in chld_out(!)
262    # However that may be, it works with open3. But to avoid the confusion of managing and
263    # closing an extra unused handle, a single handle is used instead for both the child's
264    # stderr and stdout.
265    # See http://blog.0x1fff.com/2009/09/howto-execute-system-commands-in-perl.html
266    # for why this is the right thing to do.
267
268    # Both open2 and open3 don't return on failure, but raise an exception. The handling
269    # of the exception is described on p.568 of the Perl Cookbook
270    eval {
271    $childpid = open3($chld_in, $chld_out, $chld_out, $command);
272    };
273    if ($@) {
274    if($@ =~ m/^open3/) {
275        die "open3 failed in $0: $!\n$@\n";     
276    }
277    die "Tried to launch open3 in $0, got unexpected exception: $@";
278    }
279
280    my $loop = 1;
281    while($loop)
282    {
283    if (defined(my $strLine=<$chld_out>)) { # we're reading in from child process' stdout
284        if($blnShow) {
285        print STDERR "$strLine\n";
286        }
287        $strReadIn .= $strLine;
288    }
289    else { # wget finished, terminate naturally
290        #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
291        close($chld_in);
292        close($chld_out);
293        waitpid $childpid, 0;
294        $loop = 0;
295       
296        $childpid = undef;
297        if(defined $port) {
298        $read_set->remove($serverSocket);
299        close($serverSocket);
300        }
301    }
302
303    # if we run this script from the command-line (as opposed to from GLI),
304    # then we're not working with sockets and can therefore can skip the next bits
305    next unless(defined $port);
306
307    # http://www.perlfect.com/articles/select.shtml
308    # "multiplex between several filehandles within a single thread of control,
309    # thus creating the effect of parallelism in the handling of I/O."
310    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
311
312    # take all readable handles in turn
313    foreach my $rh (@rh_set) {
314        if($rh == $serverSocket) {
315        my $client = $rh->accept();
316        #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
317        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
318       
319        # Read from the client (getting rid of the trailing newline)
320        # Has the client sent the <<STOP>> signal?
321        my $signal = <$client>;
322        chomp($signal);
323        if($signal eq "<<STOP>>") {
324            print $client "Perl received STOP signal (on port $port): stopping wget\n";
325            $loop = 0;                          # out of outer while loop
326            $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
327           
328            # Sometimes the wget process takes some time to start up. If the STOP signal
329            # was sent, don't try to terminate the process until we know it is running.
330            # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
331            # for it to start up, checking for whether it is running in order to kill it.
332            for(my $seconds = 1; $seconds <= 5; $seconds++) {
333            if(kill(0, $childpid)) {
334                # If kill(0, $childpid) returns true, then the process is running
335                # and we need to kill it.
336                close($chld_in);
337                close($chld_out);
338                kill("TERM", $childpid);
339               
340                $childpid = undef;
341               
342                # Stop monitoring the read_handle and close the serverSocket
343                # (the Java end will close the client socket that Java opened)
344                $read_set->remove($rh);     #$read_set->remove($serverSocket);
345                close($rh);             #close($serverSocket);
346                print $client "Perl terminated wget and is about to exit\n";
347                last;                           # out of inner for loop
348            }
349            else { # the process may just be starting up, wait
350                sleep(1);
351            }
352            }
353            last;                               # out of foreach loop
354        }
355        }
356    }
357    }
358
359    if ($changed_dir) {
360    chdir $current_dir;
361    }
362   
363    return $strReadIn;
364}
365
366
367sub useWgetMonitored
368{
369    #local $| = 1; # autoflush stdout buffer
370    #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
371
372    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
373
374
375    my $current_dir = cwd();
376    my $changed_dir = 0;
377    if (defined $working_dir && -e $working_dir) {
378    chdir "$working_dir";
379    $changed_dir = 1;
380    }
381
382    # When we are running this script through GLI, the SIGTERM signal handler
383    # won't get called on Windows when wget is to be prematurely terminated.
384    # Instead, when wget has to be terminated in the middle of execution, GLI will
385    # connect to a serverSocket here to communicate when it's time to stop wget.
386    if($self->dealingWithSockets()) {
387
388    $port = <STDIN>; # gets a port on localhost that's not yet in use
389    chomp($port);
390   
391    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
392                           LocalPort => $port,
393                           Listen    => 1,
394                           Reuse     => 1);
395   
396    die "can't setup server" unless $serverSocket;
397    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
398
399    $read_set = new IO::Select();         # create handle set for reading
400    $read_set->add($serverSocket);        # add the main socket to the set
401    }
402
403    my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
404    my $command = "\"$wget_file_path\" $cmdWget";
405    #print STDOUT "Command is: $command\n";
406
407    eval {     # see p.568 of Perl Cookbook
408    $childpid = open3($chld_in, $chld_out, $chld_out, $command);
409    };
410    if ($@) {
411    if($@ =~ m/^open3/) {
412        die "open3 failed in $0: $!\n$@\n";     
413    }
414    die "Tried to launch open3 in $0, got unexpected exception: $@";
415    }
416
417    my $full_text = "";
418    my $error_text = "";
419    my @follow_list = ();
420    my $line;
421
422    my $loop = 1;
423    while($loop)
424    {
425    if (defined($line=<$chld_out>)) { # we're reading in from child process' stdout
426        if((defined $blnShow) && $blnShow)
427        {
428        print STDERR "$line";
429        }
430       
431        if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
432        my $follow_url = $1;
433        push(@follow_list,$follow_url);
434        }
435       
436        if ($line =~ m/ERROR\s+\d+/) {
437        $error_text .= $line;
438        }
439       
440        $full_text .= $line;
441    }
442    else { # wget finished, terminate naturally
443        #print STDERR "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
444        close($chld_in);
445        close($chld_out);
446        # Program terminates only when the following line is included
447        # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
448        # it prevents the child from turning into a "zombie process".
449        # While the wget process terminates without it, this perl script does not:
450        # the DOS prompt is not returned without it.
451        waitpid $childpid, 0;
452        $loop = 0;
453       
454        $childpid = undef;
455        if(defined $port) {
456        $read_set->remove($serverSocket);
457        close($serverSocket);
458        }
459    }
460
461    # if we run this script from the command-line (as opposed to from GLI),
462    # then we're not working with sockets and can therefore can skip the next bits
463    next unless(defined $port);
464
465    # http://www.perlfect.com/articles/select.shtml
466    # "multiplex between several filehandles within a single thread of control,
467    # thus creating the effect of parallelism in the handling of I/O."
468    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
469
470    # take all readable handles in turn
471    foreach my $rh (@rh_set) {
472        if($rh == $serverSocket) {
473        my $client = $rh->accept();
474        #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
475        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
476       
477        # Read from the client (getting rid of trailing newline)
478        # Has the client sent the <<STOP>> signal?
479        my $signal = <$client>;
480        chomp($signal);
481        if($signal eq "<<STOP>>") {
482            print $client "Perl received STOP signal (on port $port): stopping wget\n";
483            $loop = 0;                          # out of outer while loop
484            $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
485           
486            # Sometimes the wget process takes some time to start up. If the STOP signal
487            # was sent, don't try to terminate the process until we know it is running.
488            # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
489            # for it to start up, checking for whether it is running in order to kill it.
490            for(my $seconds = 1; $seconds <= 5; $seconds++) {
491            if(kill(0, $childpid)) {
492                # If kill(0, $childpid) returns true, then the process is running
493                # and we need to kill it.
494                close($chld_in);
495                close($chld_out);
496                kill("TERM", $childpid);
497               
498                $childpid = undef;
499               
500                # Stop monitoring the read_handle and close the serverSocket
501                # (the Java end will close the client socket that Java opened)
502                $read_set->remove($rh);     #$read_set->remove($serverSocket);
503                close($rh);             #close($serverSocket);
504                print $client "Perl terminated wget and is about to exit\n";
505                last;                           # out of inner for loop
506            }
507            else { # the process may just be starting up, wait
508                sleep(1);
509            }
510            }
511            last;                               # out of foreach loop
512        }
513        }
514    }
515    }
516
517    my $command_status = $?;
518    if ($command_status != 0) {
519    $error_text .= "Exit error: $command_status";
520    }
521
522    if ($changed_dir) {
523    chdir $current_dir;
524    }
525   
526    my $final_follow = pop(@follow_list); # might be undefined, but that's OK
527   
528    return ($full_text,$error_text,$final_follow);
529}
530
531
532# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
533sub checkURL
534{
535    my ($self) = @_;
536    if ($self->{'url'} eq "")
537    {
538    &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
539    }
540}
541
542sub error
543{
544    my ($strFunctionName,$strError) = @_;
545    {
546    print "Error occoured in WgetDownload.pm\n".
547        "In Function:".$strFunctionName."\n".
548        "Error Message:".$strError."\n";
549    exit(-1);
550    }
551}
552
5531;
554
Note: See TracBrowser for help on using the browser.