root/main/trunk/greenstone2/perllib/downloaders/WgetDownload.pm @ 31864

Revision 31864, 19.6 KB (checked in by ak19, 3 years ago)

1. Modified GLI and perl to set proxy_on, proxy_host and proxy_port for unix too, whereas in the past, only http(s)_proxy ENV vars were set by GLI for unix. If the env vars are not set, then perl assumes it's the wget setup for windows and passes the proxy vars that were set by GLI as flags to wget. If the env vars were set, then perl should run wget without setting the proxy vars as flags to wget, and wget will use the proxying info in the environment. So now, the decision as to whether proxy vars are set or not will result in context specific suggestion messages to the user when the Server Info button was pressed and if the URL could not be accessed for whatever reason. 2. Modified GLI code to not use proxy if proxying is toggled off, but use proxy when it's toggled back on. (Previously, if proxying was set at any point during a GLI session, then GLI remembered that even when proxying was turned off thereafter.)

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use BaseDownload;
35use strict;
36use Cwd;
37use util;
38use IPC::Open3;
39use IO::Select;
40use IO::Socket;
41
42
43sub BEGIN {
44    @WgetDownload::ISA = ('BaseDownload');
45}
46
47my $arguments =
48     [ { 'name' => "proxy_on",
49    'desc' => "{WgetDownload.proxy_on}",
50    'type' => "flag",
51    'reqd' => "no",
52    'hiddengli' => "yes"},
53      { 'name' => "proxy_host", 
54    'desc' => "{WgetDownload.proxy_host}",         
55    'type' => "string",
56    'reqd' => "no",
57    'hiddengli' => "yes"},
58      { 'name' => "proxy_port",
59    'desc' => "{WgetDownload.proxy_port}",         
60    'type' => "string",
61    'reqd' => "no",
62    'hiddengli' => "yes"},
63      { 'name' => "user_name", 
64    'desc' => "{WgetDownload.user_name}",         
65    'type' => "string",
66    'reqd' => "no",
67    'hiddengli' => "yes"},
68      { 'name' => "user_password",
69    'desc' => "{WgetDownload.user_password}",         
70    'type' => "string",
71    'reqd' => "no",
72    'hiddengli' => "yes"},
73      { 'name' => "no_check_certificate",
74    'desc' => "{WgetDownload.no_check_certificate}",
75    'type' => "flag",
76    'reqd' => "no",
77    'hiddengli' => "yes"}
78     ];
79
80my $options = { 'name'     => "WgetDownload",
81        'desc'     => "{WgetDownload.desc}",
82        'abstract' => "yes",
83        'inherits' => "yes",
84        'args'     => $arguments };
85
86
87# Declaring file global variables related to the wget child process so that
88# the termination signal handler for SIGTERM can close the streams and tidy
89# up before ending the child process.
90my $childpid;
91my ($chld_out, $chld_in);
92my ($serverSocket, $read_set);
93
94# The port this script's server socket will be listening on, to handle
95# incoming signals from GLI to terminate wget. This is also file global,
96# since OAIDownload.pm will make several calls on wget using the same
97# instance of this script and we want to reuse whatever port GLI gave us.
98my $port;
99
100# When this script is called from the command line, this handler will be called
101# if this process is killed or abruptly ends due to receiving one of the
102# terminating signals that this handler is registered to deal with.
103sub abrupt_end_handler {
104    my $termination_signal = shift (@_);
105
106    if(defined $childpid) {
107    close($chld_out);
108    close($chld_in);
109   
110    print STDOUT "Received termination signal: $termination_signal\n";
111
112    # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
113    # See http://perldoc.perl.org/perlipc.html#Signals
114    # Warning on using kill at http://perldoc.perl.org/perlfork.html
115    kill("TERM", $childpid);
116
117    # If the SIGTERM sent on Linux calls this handler, we want to make
118    # sure any socket connection is closed.
119    # Otherwise sockets are only used when this script is run from GLI
120    # in which case the handlers don't really get called.
121    if(defined $serverSocket) {
122        $read_set->remove($serverSocket) if defined $read_set;
123        close($serverSocket);
124    }
125    }
126
127    exit(0);
128}
129
130# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
131# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
132# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
133# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
134# http://affy.blogspot.com/p5be/ch13.htm
135# http://perldoc.perl.org/perlipc.html#Signals
136$SIG{'INT'} = \&abrupt_end_handler;
137$SIG{'TERM'} = \&abrupt_end_handler;
138
139sub new {
140    my ($class) = shift (@_);
141    my ($getlist,$inputargs,$hashArgOptLists) = @_;
142    push(@$getlist, $class);
143
144    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
145    push(@{$hashArgOptLists->{"OptList"}},$options);
146
147    my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
148
149    # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
150    &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set
151
152    return bless $self, $class;
153}
154
155sub checkWgetSetup
156{
157    my ($self,$blnGliCall) = @_;
158    #TODO: proxy detection??
159   
160    if((!$blnGliCall) && $self->{'proxy_on'})
161    {
162    &checkProxySetup($self);
163    }
164    &checkURL($self);
165}
166
167sub getWgetOptions
168{
169    my ($self) = @_;
170    my $strOptions = "";
171
172    # If http_proxy ENV VARS are not set, but proxy Perl vars are set, then we're on Windows
173    # and need to use the proxy vars as flags to wget
174    # If http_proxy Env Vars are set, then we're on Linux, wget will use the http(s) proxy
175    # env vars and we shouldn't be passing any proxy perl vars as flags
176   
177    # Truth in Perl: https://home.ubalt.edu/abento/452/perl/perltruth.html
178    # http://www.perlmonks.org/?node=what%20is%20true%20and%20false%20in%20Perl%3F
179   
180    if (!$ENV{'http_proxy'} && !$ENV{'https_proxy'}) {
181    if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'})
182    {
183       
184        if($self->{'url'} =~ m/^https\:/) {
185        $strOptions .= " -e https_proxy=$self->{'proxy_host'}:$self->{'proxy_port'} "; 
186        } else {
187        $strOptions .= " -e http_proxy=$self->{'proxy_host'}:$self->{'proxy_port'} ";
188        }   
189       
190        if ($self->{'user_name'} && $self->{'user_password'})
191        {
192        $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";
193        }
194    }
195
196    if ($self->{'proxy_on'}) {
197        $strOptions .= " --proxy ";
198    }
199    }
200
201    if($self->{'no_check_certificate'}) { #&& $self->{'url'} =~ m/^https\:/) { # URL may be http that gets redirected to https, so if no_check_certificate is on, turn it on even if URL is http
202
203    $strOptions .= " --no-check-certificate ";
204    }
205   
206    return $strOptions;
207}
208
209# Checking for proxy setup: proxy server, proxy port, proxy username and password.
210sub checkProxySetup
211{
212    my ($self) = @_;
213    ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
214    # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
215    # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
216
217}
218
219# Returns true if the wget status needs to be monitored through sockets
220# (if a socket is used to communicate with the Java program on when to
221# terminate wget). True if we are running gli, or if the particular type
222# of WgetDownload is *not* OAIDownload (in that case, the original way of
223# terminating the perl script from Java terminated wget as well).
224sub dealingWithSockets() {
225    my ($self) = @_;
226    return (defined $self->{'gli'} && $self->{'gli'} && !defined $port && ref($self) ne "OAIDownload");
227                       # use ref($self) to find the classname of an object
228}
229
230
231sub useWget
232{
233    #local $| = 1; # autoflush stdout buffer
234    #print STDOUT "*** Start of subroutine useWget in $0\n";
235
236    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
237
238    my ($strReadIn,$strLine,$command);
239    $strReadIn = "" unless defined $strReadIn;
240
241    my $current_dir = cwd();
242    my $changed_dir = 0;
243    if (defined $working_dir && -e $working_dir) {
244    chdir "$working_dir";
245    $changed_dir = 1;
246    }
247
248    # When we are running this script through GLI, the SIGTERM signal handler
249    # won't get called on Windows when wget is to be prematurely terminated.
250    # Instead, when wget has to be terminated in the middle of execution, GLI will
251    # connect to a serverSocket here to communicate when it's time to stop wget.
252    if($self->dealingWithSockets()) {
253
254    $port = <STDIN>; # gets a port on localhost that's not yet in use
255    chomp($port);
256   
257    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
258                           LocalPort => $port,
259                           Listen    => 1,
260                           Reuse     => 1);
261   
262    die "can't setup server" unless $serverSocket;
263    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
264
265    $read_set = new IO::Select();         # create handle set for reading
266    $read_set->add($serverSocket);        # add the main socket to the set
267    }
268
269    my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
270    $command = "\"$wget_file_path\" $cmdWget";
271    #print STDOUT "Command is: $command\n"; # displayed in GLI output
272    #print STDERR "Command is: $command\n"; # goes into ServerInfoDialog
273   
274    # Wget's output needs to be monitored to find out when it has naturally terminated.
275    # Wget's output is sent to its STDERR so we can't use open2 without doing 2>&1.
276    # On linux, 2>&1 launches a subshell which then launches wget, meaning that killing
277    # the childpid does not kill wget on Linux but the subshell that launched it instead.
278    # Therefore, we use open3. Though the child process wget sends output only to its stdout,
279    # using open3 says chld_err is undefined and the output of wget only comes in chld_out(!)
280    # However that may be, it works with open3. But to avoid the confusion of managing and
281    # closing an extra unused handle, a single handle is used instead for both the child's
282    # stderr and stdout.
283    # See http://blog.0x1fff.com/2009/09/howto-execute-system-commands-in-perl.html
284    # for why this is the right thing to do.
285
286    # Both open2 and open3 don't return on failure, but raise an exception. The handling
287    # of the exception is described on p.568 of the Perl Cookbook
288    eval {
289    $childpid = open3($chld_in, $chld_out, $chld_out, $command);
290    };
291    if ($@) {
292    if($@ =~ m/^open3/) {
293        die "open3 failed in $0: $!\n$@\n";     
294    }
295    die "Tried to launch open3 in $0, got unexpected exception: $@";
296    }
297
298    my $loop = 1;
299    while($loop)
300    {
301    if (defined(my $strLine=<$chld_out>)) { # we're reading in from child process' stdout
302        if($blnShow) {
303        print STDERR "$strLine\n";
304        }
305        $strReadIn .= $strLine;
306    }
307    else { # wget finished, terminate naturally
308        #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
309        close($chld_in);
310        close($chld_out);
311        waitpid $childpid, 0;
312        $loop = 0;
313       
314        $childpid = undef;
315        if(defined $port) {
316        $read_set->remove($serverSocket);
317        close($serverSocket);
318        }
319    }
320
321    # if we run this script from the command-line (as opposed to from GLI),
322    # then we're not working with sockets and can therefore can skip the next bits
323    next unless(defined $port);
324
325    # http://www.perlfect.com/articles/select.shtml
326    # "multiplex between several filehandles within a single thread of control,
327    # thus creating the effect of parallelism in the handling of I/O."
328    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
329
330    # take all readable handles in turn
331    foreach my $rh (@rh_set) {
332        if($rh == $serverSocket) {
333        my $client = $rh->accept();
334        #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
335        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
336       
337        # Read from the client (getting rid of the trailing newline)
338        # Has the client sent the <<STOP>> signal?
339        my $signal = <$client>;
340        chomp($signal);
341        if($signal eq "<<STOP>>") {
342            print $client "Perl received STOP signal (on port $port): stopping wget\n";
343            $loop = 0;                          # out of outer while loop
344            $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
345           
346            # Sometimes the wget process takes some time to start up. If the STOP signal
347            # was sent, don't try to terminate the process until we know it is running.
348            # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
349            # for it to start up, checking for whether it is running in order to kill it.
350            for(my $seconds = 1; $seconds <= 5; $seconds++) {
351            if(kill(0, $childpid)) {
352                # If kill(0, $childpid) returns true, then the process is running
353                # and we need to kill it.
354                close($chld_in);
355                close($chld_out);
356                kill("TERM", $childpid);
357               
358                $childpid = undef;
359               
360                # Stop monitoring the read_handle and close the serverSocket
361                # (the Java end will close the client socket that Java opened)
362                $read_set->remove($rh);     #$read_set->remove($serverSocket);
363                close($rh);             #close($serverSocket);
364                print $client "Perl terminated wget and is about to exit\n";
365                last;                           # out of inner for loop
366            }
367            else { # the process may just be starting up, wait
368                sleep(1);
369            }
370            }
371            last;                               # out of foreach loop
372        }
373        }
374    }
375    }
376
377    if ($changed_dir) {
378    chdir $current_dir;
379    }
380   
381    return $strReadIn;
382}
383
384
385sub useWgetMonitored
386{
387    #local $| = 1; # autoflush stdout buffer
388    #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
389
390    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
391
392
393    my $current_dir = cwd();
394    my $changed_dir = 0;
395    if (defined $working_dir && -e $working_dir) {
396    chdir "$working_dir";
397    $changed_dir = 1;
398    }
399
400    # When we are running this script through GLI, the SIGTERM signal handler
401    # won't get called on Windows when wget is to be prematurely terminated.
402    # Instead, when wget has to be terminated in the middle of execution, GLI will
403    # connect to a serverSocket here to communicate when it's time to stop wget.
404    if($self->dealingWithSockets()) {
405
406    $port = <STDIN>; # gets a port on localhost that's not yet in use
407    chomp($port);
408   
409    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
410                           LocalPort => $port,
411                           Listen    => 1,
412                           Reuse     => 1);
413   
414    die "can't setup server" unless $serverSocket;
415    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
416
417    $read_set = new IO::Select();         # create handle set for reading
418    $read_set->add($serverSocket);        # add the main socket to the set
419    }
420
421    my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
422    my $command = "\"$wget_file_path\" $cmdWget";
423    #print STDOUT "Command is: $command\n";
424
425    eval {     # see p.568 of Perl Cookbook
426    $childpid = open3($chld_in, $chld_out, $chld_out, $command);
427    };
428    if ($@) {
429    if($@ =~ m/^open3/) {
430        die "open3 failed in $0: $!\n$@\n";     
431    }
432    die "Tried to launch open3 in $0, got unexpected exception: $@";
433    }
434
435    my $full_text = "";
436    my $error_text = "";
437    my @follow_list = ();
438    my $line;
439
440    my $loop = 1;
441    while($loop)
442    {
443    if (defined($line=<$chld_out>)) { # we're reading in from child process' stdout
444        if((defined $blnShow) && $blnShow)
445        {
446        print STDERR "$line";
447        }
448       
449        if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
450        my $follow_url = $1;
451        push(@follow_list,$follow_url);
452        }
453       
454        if ($line =~ m/ERROR\s+\d+/) {
455        $error_text .= $line;
456        }
457       
458        $full_text .= $line;
459    }
460    else { # wget finished, terminate naturally
461        #print STDERR "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
462        close($chld_in);
463        close($chld_out);
464        # Program terminates only when the following line is included
465        # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
466        # it prevents the child from turning into a "zombie process".
467        # While the wget process terminates without it, this perl script does not:
468        # the DOS prompt is not returned without it.
469        waitpid $childpid, 0;
470        $loop = 0;
471       
472        $childpid = undef;
473        if(defined $port) {
474        $read_set->remove($serverSocket);
475        close($serverSocket);
476        }
477    }
478
479    # if we run this script from the command-line (as opposed to from GLI),
480    # then we're not working with sockets and can therefore can skip the next bits
481    next unless(defined $port);
482
483    # http://www.perlfect.com/articles/select.shtml
484    # "multiplex between several filehandles within a single thread of control,
485    # thus creating the effect of parallelism in the handling of I/O."
486    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
487
488    # take all readable handles in turn
489    foreach my $rh (@rh_set) {
490        if($rh == $serverSocket) {
491        my $client = $rh->accept();
492        #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
493        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
494       
495        # Read from the client (getting rid of trailing newline)
496        # Has the client sent the <<STOP>> signal?
497        my $signal = <$client>;
498        chomp($signal);
499        if($signal eq "<<STOP>>") {
500            print $client "Perl received STOP signal (on port $port): stopping wget\n";
501            $loop = 0;                          # out of outer while loop
502            $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
503           
504            # Sometimes the wget process takes some time to start up. If the STOP signal
505            # was sent, don't try to terminate the process until we know it is running.
506            # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
507            # for it to start up, checking for whether it is running in order to kill it.
508            for(my $seconds = 1; $seconds <= 5; $seconds++) {
509            if(kill(0, $childpid)) {
510                # If kill(0, $childpid) returns true, then the process is running
511                # and we need to kill it.
512                close($chld_in);
513                close($chld_out);
514                kill("TERM", $childpid);
515               
516                $childpid = undef;
517               
518                # Stop monitoring the read_handle and close the serverSocket
519                # (the Java end will close the client socket that Java opened)
520                $read_set->remove($rh);     #$read_set->remove($serverSocket);
521                close($rh);             #close($serverSocket);
522                print $client "Perl terminated wget and is about to exit\n";
523                last;                           # out of inner for loop
524            }
525            else { # the process may just be starting up, wait
526                sleep(1);
527            }
528            }
529            last;                               # out of foreach loop
530        }
531        }
532    }
533    }
534
535    my $command_status = $?;
536    if ($command_status != 0) {
537    $error_text .= "Exit error: $command_status";
538    }
539
540    if ($changed_dir) {
541    chdir $current_dir;
542    }
543   
544    my $final_follow = pop(@follow_list); # might be undefined, but that's OK
545   
546    return ($full_text,$error_text,$final_follow);
547}
548
549
550# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
551sub checkURL
552{
553    my ($self) = @_;
554    if ($self->{'url'} eq "")
555    {
556    &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
557    }
558}
559
560sub error
561{
562    my ($strFunctionName,$strError) = @_;
563    {
564    print "Error occoured in WgetDownload.pm\n".
565        "In Function:".$strFunctionName."\n".
566        "Error Message:".$strError."\n";
567    exit(-1);
568    }
569}
570
5711;
572
Note: See TracBrowser for help on using the browser.