root/main/trunk/greenstone2/perllib/downloaders/WgetDownload.pm @ 31877

Revision 31877, 19.9 KB (checked in by ak19, 2 years ago)

Getting Windows wget to use proxy settings in environment. They didn't need to be in CAPS, as Windows has the side effect of setting env vars in both the original case of the letters of the env variable name as well as in all caps, and unsetting either version unsets both. On Windows however, I noticed that Perl was not on the PATH after open3() in WgetDownload?.pm::useWget() failed with an unclear error message. So now the PATH is also propagated from Java to the perl code for downloading using wget.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use BaseDownload;
35use strict;
36use Cwd;
37use util;
38use IPC::Open3;
39use IO::Select;
40use IO::Socket;
41
42
43sub BEGIN {
44    @WgetDownload::ISA = ('BaseDownload');
45}
46
47my $arguments =
48     [ { 'name' => "proxy_on",
49    'desc' => "{WgetDownload.proxy_on}",
50    'type' => "flag",
51    'reqd' => "no",
52    'hiddengli' => "yes"},
53      { 'name' => "proxy_host", 
54    'desc' => "{WgetDownload.proxy_host}",         
55    'type' => "string",
56    'reqd' => "no",
57    'hiddengli' => "yes"},
58      { 'name' => "proxy_port",
59    'desc' => "{WgetDownload.proxy_port}",         
60    'type' => "string",
61    'reqd' => "no",
62    'hiddengli' => "yes"},
63      { 'name' => "user_name", 
64    'desc' => "{WgetDownload.user_name}",         
65    'type' => "string",
66    'reqd' => "no",
67    'hiddengli' => "yes"},
68      { 'name' => "user_password",
69    'desc' => "{WgetDownload.user_password}",         
70    'type' => "string",
71    'reqd' => "no",
72    'hiddengli' => "yes"},
73      { 'name' => "no_check_certificate",
74    'desc' => "{WgetDownload.no_check_certificate}",
75    'type' => "flag",
76    'reqd' => "no",
77    'hiddengli' => "yes"}
78     ];
79
80my $options = { 'name'     => "WgetDownload",
81        'desc'     => "{WgetDownload.desc}",
82        'abstract' => "yes",
83        'inherits' => "yes",
84        'args'     => $arguments };
85
86
87# Declaring file global variables related to the wget child process so that
88# the termination signal handler for SIGTERM can close the streams and tidy
89# up before ending the child process.
90my $childpid;
91my ($chld_out, $chld_in);
92my ($serverSocket, $read_set);
93
94# The port this script's server socket will be listening on, to handle
95# incoming signals from GLI to terminate wget. This is also file global,
96# since OAIDownload.pm will make several calls on wget using the same
97# instance of this script and we want to reuse whatever port GLI gave us.
98my $port;
99
100# When this script is called from the command line, this handler will be called
101# if this process is killed or abruptly ends due to receiving one of the
102# terminating signals that this handler is registered to deal with.
103sub abrupt_end_handler {
104    my $termination_signal = shift (@_);
105
106    if(defined $childpid) {
107    close($chld_out);
108    close($chld_in);
109   
110    print STDOUT "Received termination signal: $termination_signal\n";
111
112    # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
113    # See http://perldoc.perl.org/perlipc.html#Signals
114    # Warning on using kill at http://perldoc.perl.org/perlfork.html
115    kill("TERM", $childpid);
116
117    # If the SIGTERM sent on Linux calls this handler, we want to make
118    # sure any socket connection is closed.
119    # Otherwise sockets are only used when this script is run from GLI
120    # in which case the handlers don't really get called.
121    if(defined $serverSocket) {
122        $read_set->remove($serverSocket) if defined $read_set;
123        close($serverSocket);
124    }
125    }
126
127    exit(0);
128}
129
130# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
131# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
132# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
133# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
134# http://affy.blogspot.com/p5be/ch13.htm
135# http://perldoc.perl.org/perlipc.html#Signals
136$SIG{'INT'} = \&abrupt_end_handler;
137$SIG{'TERM'} = \&abrupt_end_handler;
138
139sub new {
140    my ($class) = shift (@_);
141    my ($getlist,$inputargs,$hashArgOptLists) = @_;
142    push(@$getlist, $class);
143
144    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
145    push(@{$hashArgOptLists->{"OptList"}},$options);
146
147    my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
148
149    # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
150    &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set
151
152    return bless $self, $class;
153}
154
155sub checkWgetSetup
156{
157    my ($self,$blnGliCall) = @_;
158    #TODO: proxy detection??
159   
160    if((!$blnGliCall) && $self->{'proxy_on'})
161    {
162    &checkProxySetup($self);
163    }
164    &checkURL($self);
165}
166
167# Not using this. On Windows, we used to pass proxying settings as flags to wget. But, as that can be
168# seen with Task Manager, we now have the proxy settings set in the environment and are no longer passing it
169sub addProxySettingsAsWgetFlags
170{
171    my ($self) = @_;
172    my $strOptions = "";
173   
174    if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'})
175    {
176        $strOptions .= " -e https_proxy=$self->{'proxy_host'}:$self->{'proxy_port'} "; 
177        $strOptions .= " -e http_proxy=$self->{'proxy_host'}:$self->{'proxy_port'} ";       
178       
179        if ($self->{'user_name'} && $self->{'user_password'})
180        {
181        $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";
182        }
183    }
184
185    if ($self->{'proxy_on'}) {
186        $strOptions .= " --proxy ";
187    }
188   
189    return $strOptions;
190}
191
192sub getWgetOptions
193{
194    my ($self) = @_;
195    my $strOptions = "";
196   
197    # If proxy settings are set up in the environment, wget is ready to use them. More secure.
198    # But if proxy settings are not set up in the environment, pass them as flags to wget
199    # This is less secure, as pwd etc visible in task manager, but it was the original way in
200    # which wget was run on windows.
201    # Truth in Perl: https://home.ubalt.edu/abento/452/perl/perltruth.html
202    # http://www.perlmonks.org/?node=what%20is%20true%20and%20false%20in%20Perl%3F
203
204    if (!$ENV{'http_proxy'} && !$ENV{'https_proxy'}) {
205        $strOptions .= $self->addProxySettingsAsWgetFlags();
206    } # else wget will use proxy settings in environment, assume enough settings have been provided
207   
208    if($self->{'no_check_certificate'}) { # URL may be http that gets redirected to https, so if no_check_certificate is on, turn it on even if URL is http
209
210    $strOptions .= " --no-check-certificate ";
211    }
212   
213    return $strOptions;
214}
215
216# Checking for proxy setup: proxy server, proxy port, proxy username and password.
217sub checkProxySetup
218{
219    my ($self) = @_;
220    ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
221    # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
222    # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
223
224}
225
226# Returns true if the wget status needs to be monitored through sockets
227# (if a socket is used to communicate with the Java program on when to
228# terminate wget). True if we are running gli, or if the particular type
229# of WgetDownload is *not* OAIDownload (in that case, the original way of
230# terminating the perl script from Java terminated wget as well).
231sub dealingWithSockets() {
232    my ($self) = @_;
233    return (defined $self->{'gli'} && $self->{'gli'} && !defined $port && ref($self) ne "OAIDownload");
234                       # use ref($self) to find the classname of an object
235}
236
237
238sub useWget
239{
240    #local $| = 1; # autoflush stdout buffer
241    #print STDOUT "*** Start of subroutine useWget in $0\n";
242
243    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
244
245    my ($strReadIn,$strLine,$command);
246    $strReadIn = "" unless defined $strReadIn;
247
248    my $current_dir = cwd();
249    my $changed_dir = 0;
250    if (defined $working_dir && -e $working_dir) {
251    chdir "$working_dir";
252    $changed_dir = 1;
253    }
254
255    # When we are running this script through GLI, the SIGTERM signal handler
256    # won't get called on Windows when wget is to be prematurely terminated.
257    # Instead, when wget has to be terminated in the middle of execution, GLI will
258    # connect to a serverSocket here to communicate when it's time to stop wget.
259    if($self->dealingWithSockets()) {
260
261    $port = <STDIN>; # gets a port on localhost that's not yet in use
262    chomp($port);
263   
264    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
265                           LocalPort => $port,
266                           Listen    => 1,
267                           Reuse     => 1);
268   
269    die "can't setup server" unless $serverSocket;
270    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
271
272    $read_set = new IO::Select();         # create handle set for reading
273    $read_set->add($serverSocket);        # add the main socket to the set
274    }
275
276    my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
277    $command = "\"$wget_file_path\" $cmdWget";
278    #print STDOUT "Command is: $command\n"; # displayed in GLI output
279    #print STDERR "Command is: $command\n"; # goes into ServerInfoDialog
280   
281    # Wget's output needs to be monitored to find out when it has naturally terminated.
282    # Wget's output is sent to its STDERR so we can't use open2 without doing 2>&1.
283    # On linux, 2>&1 launches a subshell which then launches wget, meaning that killing
284    # the childpid does not kill wget on Linux but the subshell that launched it instead.
285    # Therefore, we use open3. Though the child process wget sends output only to its stdout,
286    # using open3 says chld_err is undefined and the output of wget only comes in chld_out(!)
287    # However that may be, it works with open3. But to avoid the confusion of managing and
288    # closing an extra unused handle, a single handle is used instead for both the child's
289    # stderr and stdout.
290    # See http://blog.0x1fff.com/2009/09/howto-execute-system-commands-in-perl.html
291    # for why this is the right thing to do.
292
293    # Both open2 and open3 don't return on failure, but raise an exception. The handling
294    # of the exception is described on p.568 of the Perl Cookbook
295    eval {
296    $childpid = open3($chld_in, $chld_out, $chld_out, $command);
297    };
298    if ($@) {
299    if($@ =~ m/^open3/) {
300        die "open3 failed in $0: $!\n$@\n";     
301    }
302    die "Tried to launch open3 in $0, got unexpected exception: $@";
303    }
304
305    my $loop = 1;
306    while($loop)
307    {
308    if (defined(my $strLine=<$chld_out>)) { # we're reading in from child process' stdout
309        if($blnShow) {
310        print STDERR "$strLine\n";
311        }
312        $strReadIn .= $strLine;
313    }
314    else { # wget finished, terminate naturally
315        #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
316        close($chld_in);
317        close($chld_out);
318        waitpid $childpid, 0;
319        $loop = 0;
320       
321        $childpid = undef;
322        if(defined $port) {
323        $read_set->remove($serverSocket);
324        close($serverSocket);
325        }
326    }
327
328    # if we run this script from the command-line (as opposed to from GLI),
329    # then we're not working with sockets and can therefore can skip the next bits
330    next unless(defined $port);
331
332    # http://www.perlfect.com/articles/select.shtml
333    # "multiplex between several filehandles within a single thread of control,
334    # thus creating the effect of parallelism in the handling of I/O."
335    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
336
337    # take all readable handles in turn
338    foreach my $rh (@rh_set) {
339        if($rh == $serverSocket) {
340        my $client = $rh->accept();
341        #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
342        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
343       
344        # Read from the client (getting rid of the trailing newline)
345        # Has the client sent the <<STOP>> signal?
346        my $signal = <$client>;
347        chomp($signal);
348        if($signal eq "<<STOP>>") {
349            print $client "Perl received STOP signal (on port $port): stopping wget\n";
350            $loop = 0;                          # out of outer while loop
351            $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
352           
353            # Sometimes the wget process takes some time to start up. If the STOP signal
354            # was sent, don't try to terminate the process until we know it is running.
355            # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
356            # for it to start up, checking for whether it is running in order to kill it.
357            for(my $seconds = 1; $seconds <= 5; $seconds++) {
358            if(kill(0, $childpid)) {
359                # If kill(0, $childpid) returns true, then the process is running
360                # and we need to kill it.
361                close($chld_in);
362                close($chld_out);
363                kill("TERM", $childpid);
364               
365                $childpid = undef;
366               
367                # Stop monitoring the read_handle and close the serverSocket
368                # (the Java end will close the client socket that Java opened)
369                $read_set->remove($rh);     #$read_set->remove($serverSocket);
370                close($rh);             #close($serverSocket);
371                print $client "Perl terminated wget and is about to exit\n";
372                last;                           # out of inner for loop
373            }
374            else { # the process may just be starting up, wait
375                sleep(1);
376            }
377            }
378            last;                               # out of foreach loop
379        }
380        }
381    }
382    }
383
384    if ($changed_dir) {
385    chdir $current_dir;
386    }
387   
388    return $strReadIn;
389}
390
391
392sub useWgetMonitored
393{
394    #local $| = 1; # autoflush stdout buffer
395    #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
396
397    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
398
399
400    my $current_dir = cwd();
401    my $changed_dir = 0;
402    if (defined $working_dir && -e $working_dir) {
403    chdir "$working_dir";
404    $changed_dir = 1;
405    }
406
407    # When we are running this script through GLI, the SIGTERM signal handler
408    # won't get called on Windows when wget is to be prematurely terminated.
409    # Instead, when wget has to be terminated in the middle of execution, GLI will
410    # connect to a serverSocket here to communicate when it's time to stop wget.
411    if($self->dealingWithSockets()) {
412
413    $port = <STDIN>; # gets a port on localhost that's not yet in use
414    chomp($port);
415   
416    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
417                           LocalPort => $port,
418                           Listen    => 1,
419                           Reuse     => 1);
420   
421    die "can't setup server" unless $serverSocket;
422    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
423
424    $read_set = new IO::Select();         # create handle set for reading
425    $read_set->add($serverSocket);        # add the main socket to the set
426    }
427
428    my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
429    my $command = "\"$wget_file_path\" $cmdWget";
430    #print STDOUT "Command is: $command\n";
431
432    eval {     # see p.568 of Perl Cookbook
433    $childpid = open3($chld_in, $chld_out, $chld_out, $command);
434    };
435    if ($@) {
436    if($@ =~ m/^open3/) {
437        die "open3 failed in $0: $!\n$@\n";     
438    }
439    die "Tried to launch open3 in $0, got unexpected exception: $@";
440    }
441
442    my $full_text = "";
443    my $error_text = "";
444    my @follow_list = ();
445    my $line;
446
447    my $loop = 1;
448    while($loop)
449    {
450    if (defined($line=<$chld_out>)) { # we're reading in from child process' stdout
451        if((defined $blnShow) && $blnShow)
452        {
453        print STDERR "$line";
454        }
455       
456        if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
457        my $follow_url = $1;
458        push(@follow_list,$follow_url);
459        }
460       
461        if ($line =~ m/ERROR\s+\d+/) {
462        $error_text .= $line;
463        }
464       
465        $full_text .= $line;
466    }
467    else { # wget finished, terminate naturally
468        #print STDERR "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
469        close($chld_in);
470        close($chld_out);
471        # Program terminates only when the following line is included
472        # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
473        # it prevents the child from turning into a "zombie process".
474        # While the wget process terminates without it, this perl script does not:
475        # the DOS prompt is not returned without it.
476        waitpid $childpid, 0;
477        $loop = 0;
478       
479        $childpid = undef;
480        if(defined $port) {
481        $read_set->remove($serverSocket);
482        close($serverSocket);
483        }
484    }
485
486    # if we run this script from the command-line (as opposed to from GLI),
487    # then we're not working with sockets and can therefore can skip the next bits
488    next unless(defined $port);
489
490    # http://www.perlfect.com/articles/select.shtml
491    # "multiplex between several filehandles within a single thread of control,
492    # thus creating the effect of parallelism in the handling of I/O."
493    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
494
495    # take all readable handles in turn
496    foreach my $rh (@rh_set) {
497        if($rh == $serverSocket) {
498        my $client = $rh->accept();
499        #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
500        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
501       
502        # Read from the client (getting rid of trailing newline)
503        # Has the client sent the <<STOP>> signal?
504        my $signal = <$client>;
505        chomp($signal);
506        if($signal eq "<<STOP>>") {
507            print $client "Perl received STOP signal (on port $port): stopping wget\n";
508            $loop = 0;                          # out of outer while loop
509            $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
510           
511            # Sometimes the wget process takes some time to start up. If the STOP signal
512            # was sent, don't try to terminate the process until we know it is running.
513            # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
514            # for it to start up, checking for whether it is running in order to kill it.
515            for(my $seconds = 1; $seconds <= 5; $seconds++) {
516            if(kill(0, $childpid)) {
517                # If kill(0, $childpid) returns true, then the process is running
518                # and we need to kill it.
519                close($chld_in);
520                close($chld_out);
521                kill("TERM", $childpid);
522               
523                $childpid = undef;
524               
525                # Stop monitoring the read_handle and close the serverSocket
526                # (the Java end will close the client socket that Java opened)
527                $read_set->remove($rh);     #$read_set->remove($serverSocket);
528                close($rh);             #close($serverSocket);
529                print $client "Perl terminated wget and is about to exit\n";
530                last;                           # out of inner for loop
531            }
532            else { # the process may just be starting up, wait
533                sleep(1);
534            }
535            }
536            last;                               # out of foreach loop
537        }
538        }
539    }
540    }
541
542    my $command_status = $?;
543    if ($command_status != 0) {
544    $error_text .= "Exit error: $command_status";
545    }
546
547    if ($changed_dir) {
548    chdir $current_dir;
549    }
550   
551    my $final_follow = pop(@follow_list); # might be undefined, but that's OK
552   
553    return ($full_text,$error_text,$final_follow);
554}
555
556
557# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
558sub checkURL
559{
560    my ($self) = @_;
561    if ($self->{'url'} eq "")
562    {
563    &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
564    }
565}
566
567sub error
568{
569    my ($strFunctionName,$strError) = @_;
570    {
571    print "Error occoured in WgetDownload.pm\n".
572        "In Function:".$strFunctionName."\n".
573        "Error Message:".$strError."\n";
574    exit(-1);
575    }
576}
577
5781;
579
Note: See TracBrowser for help on using the browser.