root/main/trunk/greenstone2/perllib/downloaders/WgetDownload.pm @ 31880

Revision 31880, 21.5 KB (checked in by ak19, 2 years ago)

All the changes that were required to set up multiple proxy servers, one for HTTP, one for HTTPS, one for FTP. Still need to test on Windows

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use BaseDownload;
35use strict;
36use Cwd;
37use util;
38use IPC::Open3;
39use IO::Select;
40use IO::Socket;
41
42
43sub BEGIN {
44    @WgetDownload::ISA = ('BaseDownload');
45}
46
47my $arguments =
48     [ { 'name' => "proxy_on",
49    'desc' => "{WgetDownload.proxy_on}",
50    'type' => "flag",
51    'reqd' => "no",
52    'hiddengli' => "yes"},
53      { 'name' => "http_proxy_host",
54    'desc' => "{WgetDownload.http_proxy_host}",
55    'type' => "string",
56    'reqd' => "no",
57    'hiddengli' => "yes"},
58      { 'name' => "http_proxy_port",
59    'desc' => "{WgetDownload.http_proxy_port}",
60    'type' => "string",
61    'reqd' => "no",
62    'hiddengli' => "yes"},
63      { 'name' => "https_proxy_host", 
64    'desc' => "{WgetDownload.https_proxy_host}",         
65    'type' => "string",
66    'reqd' => "no",
67    'hiddengli' => "yes"},
68      { 'name' => "https_proxy_port",
69    'desc' => "{WgetDownload.https_proxy_port}",         
70    'type' => "string",
71    'reqd' => "no",
72    'hiddengli' => "yes"},
73      { 'name' => "ftp_proxy_host", 
74    'desc' => "{WgetDownload.ftp_proxy_host}",         
75    'type' => "string",
76    'reqd' => "no",
77    'hiddengli' => "yes"},
78      { 'name' => "ftp_proxy_port",
79    'desc' => "{WgetDownload.ftp_proxy_port}",         
80    'type' => "string",
81    'reqd' => "no",
82    'hiddengli' => "yes"},
83      { 'name' => "user_name", 
84    'desc' => "{WgetDownload.user_name}",
85    'type' => "string",
86    'reqd' => "no",
87    'hiddengli' => "yes"},
88      { 'name' => "user_password",
89    'desc' => "{WgetDownload.user_password}",
90    'type' => "string",
91    'reqd' => "no",
92    'hiddengli' => "yes"},
93      { 'name' => "no_check_certificate",
94    'desc' => "{WgetDownload.no_check_certificate}",
95    'type' => "flag",
96    'reqd' => "no",
97    'hiddengli' => "yes"}
98     ];
99
100my $options = { 'name'     => "WgetDownload",
101        'desc'     => "{WgetDownload.desc}",
102        'abstract' => "yes",
103        'inherits' => "yes",
104        'args'     => $arguments };
105
106
107# Declaring file global variables related to the wget child process so that
108# the termination signal handler for SIGTERM can close the streams and tidy
109# up before ending the child process.
110my $childpid;
111my ($chld_out, $chld_in);
112my ($serverSocket, $read_set);
113
114# The port this script's server socket will be listening on, to handle
115# incoming signals from GLI to terminate wget. This is also file global,
116# since OAIDownload.pm will make several calls on wget using the same
117# instance of this script and we want to reuse whatever port GLI gave us.
118my $port;
119
120# When this script is called from the command line, this handler will be called
121# if this process is killed or abruptly ends due to receiving one of the
122# terminating signals that this handler is registered to deal with.
123sub abrupt_end_handler {
124    my $termination_signal = shift (@_);
125
126    if(defined $childpid) {
127    close($chld_out);
128    close($chld_in);
129   
130    print STDOUT "Received termination signal: $termination_signal\n";
131
132    # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
133    # See http://perldoc.perl.org/perlipc.html#Signals
134    # Warning on using kill at http://perldoc.perl.org/perlfork.html
135    kill("TERM", $childpid);
136
137    # If the SIGTERM sent on Linux calls this handler, we want to make
138    # sure any socket connection is closed.
139    # Otherwise sockets are only used when this script is run from GLI
140    # in which case the handlers don't really get called.
141    if(defined $serverSocket) {
142        $read_set->remove($serverSocket) if defined $read_set;
143        close($serverSocket);
144    }
145    }
146
147    exit(0);
148}
149
150# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
151# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
152# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
153# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
154# http://affy.blogspot.com/p5be/ch13.htm
155# http://perldoc.perl.org/perlipc.html#Signals
156$SIG{'INT'} = \&abrupt_end_handler;
157$SIG{'TERM'} = \&abrupt_end_handler;
158
159sub new {
160    my ($class) = shift (@_);
161    my ($getlist,$inputargs,$hashArgOptLists) = @_;
162    push(@$getlist, $class);
163
164    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
165    push(@{$hashArgOptLists->{"OptList"}},$options);
166
167    my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
168
169    # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
170    &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set
171
172    return bless $self, $class;
173}
174
175sub checkWgetSetup
176{
177    my ($self,$blnGliCall) = @_;
178    #TODO: proxy detection??
179   
180    if((!$blnGliCall) && $self->{'proxy_on'})
181    {
182    &checkProxySetup($self);
183    }
184    &checkURL($self);
185}
186
187# Not using this. On Windows, we used to pass proxying settings as flags to wget. But, as that can be
188# seen with Task Manager, we now have the proxy settings set in the environment and are no longer passing it
189sub addProxySettingsAsWgetFlags
190{
191    my ($self) = @_;
192    my $strOptions = "";
193
194    if($self->{'http_proxy_host'} && $self->{'http_proxy_port'}) {
195    $strOptions .= " -e http_proxy=$self->{'http_proxy_host'}:$self->{'http_proxy_port'} ";
196    }
197    if($self->{'https_proxy_host'} && $self->{'https_proxy_port'}) {
198    $strOptions .= " -e https_proxy=$self->{'https_proxy_host'}:$self->{'https_proxy_port'} ";
199    }
200    if($self->{'ftp_proxy_host'} && $self->{'ftp_proxy_port'}) {
201    $strOptions .= " -e ftp_proxy=$self->{'ftp_proxy_host'}:$self->{'ftp_proxy_port'} ";
202    }
203   
204    # For wget, there is only one set pair of proxy-user and proxy-passwd, so wget seems to assume
205    # that all 3 proxy protocols (http|https|ftp) will use the same username and pwd combination?
206    # Note that this only matters when passing the proxying details as flags to wget, not when
207    # the proxies are setup as environment variables.
208    if ($self->{'user_name'} && $self->{'user_password'})
209    {
210    $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";     
211    # how is "--proxy-passwd" instead of "--proxy-password" even working????       
212    # see https://www.gnu.org/software/wget/manual/html_node/Proxies.html
213    # and https://www.gnu.org/software/wget/manual/wget.html
214    # Not touching this, in case the manual is simply wrong. Since our code works in
215    # practice (when we were still using wget proxy username/pwd flags for windows).
216    }   
217   
218    return $strOptions;
219}
220
221sub getWgetOptions
222{
223    my ($self) = @_;
224    my $strOptions = "";
225   
226    # If proxy settings are set up in the environment, wget is ready to use them. More secure.
227    # But if proxy settings are not set up in the environment, pass them as flags to wget
228    # This is less secure, as pwd etc visible in task manager, but it was the original way in
229    # which wget was run on windows.
230    # Truth in Perl: https://home.ubalt.edu/abento/452/perl/perltruth.html
231    # http://www.perlmonks.org/?node=what%20is%20true%20and%20false%20in%20Perl%3F
232   
233    if ($self->{'proxy_on'}) {
234    if(!$ENV{'http_proxy'} && !$ENV{'https_proxy'} && !$ENV{'ftp_proxy'}) {
235        $strOptions .= $self->addProxySettingsAsWgetFlags();
236    } # else wget will use proxy settings in environment, assume enough settings have been provided
237    # either way, we're using the proxy
238    $strOptions .= " --proxy ";
239    }
240   
241    if($self->{'no_check_certificate'}) { # URL may be http that gets redirected to https, so if no_check_certificate is on, turn it on even if URL is http
242   
243    $strOptions .= " --no-check-certificate ";
244    }
245   
246    return $strOptions;
247}
248
249# Checking for proxy setup: proxy server, proxy port, proxy username and password.
250sub checkProxySetup
251{
252    my ($self) = @_;
253    ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
254    # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
255    # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
256
257}
258
259# Returns true if the wget status needs to be monitored through sockets
260# (if a socket is used to communicate with the Java program on when to
261# terminate wget). True if we are running gli, or if the particular type
262# of WgetDownload is *not* OAIDownload (in that case, the original way of
263# terminating the perl script from Java terminated wget as well).
264sub dealingWithSockets() {
265    my ($self) = @_;
266    return (defined $self->{'gli'} && $self->{'gli'} && !defined $port && ref($self) ne "OAIDownload");
267                       # use ref($self) to find the classname of an object
268}
269
270
271sub useWget
272{
273    #local $| = 1; # autoflush stdout buffer
274    #print STDOUT "*** Start of subroutine useWget in $0\n";
275
276    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
277
278    my ($strReadIn,$strLine,$command);
279    $strReadIn = "" unless defined $strReadIn;
280
281    my $current_dir = cwd();
282    my $changed_dir = 0;
283    if (defined $working_dir && -e $working_dir) {
284    chdir "$working_dir";
285    $changed_dir = 1;
286    }
287
288    # When we are running this script through GLI, the SIGTERM signal handler
289    # won't get called on Windows when wget is to be prematurely terminated.
290    # Instead, when wget has to be terminated in the middle of execution, GLI will
291    # connect to a serverSocket here to communicate when it's time to stop wget.
292    if($self->dealingWithSockets()) {
293
294    $port = <STDIN>; # gets a port on localhost that's not yet in use
295    chomp($port);
296   
297    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
298                           LocalPort => $port,
299                           Listen    => 1,
300                           Reuse     => 1);
301   
302    die "can't setup server" unless $serverSocket;
303    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
304
305    $read_set = new IO::Select();         # create handle set for reading
306    $read_set->add($serverSocket);        # add the main socket to the set
307    }
308
309    my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
310    $command = "\"$wget_file_path\" $cmdWget";
311    #print STDOUT "Command is: $command\n"; # displayed in GLI output
312    #print STDERR "Command is: $command\n"; # goes into ServerInfoDialog
313   
314    # Wget's output needs to be monitored to find out when it has naturally terminated.
315    # Wget's output is sent to its STDERR so we can't use open2 without doing 2>&1.
316    # On linux, 2>&1 launches a subshell which then launches wget, meaning that killing
317    # the childpid does not kill wget on Linux but the subshell that launched it instead.
318    # Therefore, we use open3. Though the child process wget sends output only to its stdout,
319    # using open3 says chld_err is undefined and the output of wget only comes in chld_out(!)
320    # However that may be, it works with open3. But to avoid the confusion of managing and
321    # closing an extra unused handle, a single handle is used instead for both the child's
322    # stderr and stdout.
323    # See http://blog.0x1fff.com/2009/09/howto-execute-system-commands-in-perl.html
324    # for why this is the right thing to do.
325
326    # Both open2 and open3 don't return on failure, but raise an exception. The handling
327    # of the exception is described on p.568 of the Perl Cookbook
328    eval {
329    $childpid = open3($chld_in, $chld_out, $chld_out, $command);
330    };
331    if ($@) {
332    if($@ =~ m/^open3/) {
333        die "open3 failed in $0: $!\n$@\n";     
334    }
335    die "Tried to launch open3 in $0, got unexpected exception: $@";
336    }
337
338    my $loop = 1;
339    while($loop)
340    {
341    if (defined(my $strLine=<$chld_out>)) { # we're reading in from child process' stdout
342        if($blnShow) {
343        print STDERR "$strLine\n";
344        }
345        $strReadIn .= $strLine;
346    }
347    else { # wget finished, terminate naturally
348        #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
349        close($chld_in);
350        close($chld_out);
351        waitpid $childpid, 0;
352        $loop = 0;
353       
354        $childpid = undef;
355        if(defined $port) {
356        $read_set->remove($serverSocket);
357        close($serverSocket);
358        }
359    }
360
361    # if we run this script from the command-line (as opposed to from GLI),
362    # then we're not working with sockets and can therefore can skip the next bits
363    next unless(defined $port);
364
365    # http://www.perlfect.com/articles/select.shtml
366    # "multiplex between several filehandles within a single thread of control,
367    # thus creating the effect of parallelism in the handling of I/O."
368    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
369
370    # take all readable handles in turn
371    foreach my $rh (@rh_set) {
372        if($rh == $serverSocket) {
373        my $client = $rh->accept();
374        #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
375        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
376       
377        # Read from the client (getting rid of the trailing newline)
378        # Has the client sent the <<STOP>> signal?
379        my $signal = <$client>;
380        chomp($signal);
381        if($signal eq "<<STOP>>") {
382            print $client "Perl received STOP signal (on port $port): stopping wget\n";
383            $loop = 0;                          # out of outer while loop
384            $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
385           
386            # Sometimes the wget process takes some time to start up. If the STOP signal
387            # was sent, don't try to terminate the process until we know it is running.
388            # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
389            # for it to start up, checking for whether it is running in order to kill it.
390            for(my $seconds = 1; $seconds <= 5; $seconds++) {
391            if(kill(0, $childpid)) {
392                # If kill(0, $childpid) returns true, then the process is running
393                # and we need to kill it.
394                close($chld_in);
395                close($chld_out);
396                kill("TERM", $childpid);
397               
398                $childpid = undef;
399               
400                # Stop monitoring the read_handle and close the serverSocket
401                # (the Java end will close the client socket that Java opened)
402                $read_set->remove($rh);     #$read_set->remove($serverSocket);
403                close($rh);             #close($serverSocket);
404                print $client "Perl terminated wget and is about to exit\n";
405                last;                           # out of inner for loop
406            }
407            else { # the process may just be starting up, wait
408                sleep(1);
409            }
410            }
411            last;                               # out of foreach loop
412        }
413        }
414    }
415    }
416
417    if ($changed_dir) {
418    chdir $current_dir;
419    }
420   
421    return $strReadIn;
422}
423
424
425sub useWgetMonitored
426{
427    #local $| = 1; # autoflush stdout buffer
428    #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
429
430    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
431
432
433    my $current_dir = cwd();
434    my $changed_dir = 0;
435    if (defined $working_dir && -e $working_dir) {
436    chdir "$working_dir";
437    $changed_dir = 1;
438    }
439
440    # When we are running this script through GLI, the SIGTERM signal handler
441    # won't get called on Windows when wget is to be prematurely terminated.
442    # Instead, when wget has to be terminated in the middle of execution, GLI will
443    # connect to a serverSocket here to communicate when it's time to stop wget.
444    if($self->dealingWithSockets()) {
445
446    $port = <STDIN>; # gets a port on localhost that's not yet in use
447    chomp($port);
448   
449    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
450                           LocalPort => $port,
451                           Listen    => 1,
452                           Reuse     => 1);
453   
454    die "can't setup server" unless $serverSocket;
455    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
456
457    $read_set = new IO::Select();         # create handle set for reading
458    $read_set->add($serverSocket);        # add the main socket to the set
459    }
460
461    my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
462    my $command = "\"$wget_file_path\" $cmdWget";
463    #print STDOUT "Command is: $command\n";
464
465    eval {     # see p.568 of Perl Cookbook
466    $childpid = open3($chld_in, $chld_out, $chld_out, $command);
467    };
468    if ($@) {
469    if($@ =~ m/^open3/) {
470        die "open3 failed in $0: $!\n$@\n";     
471    }
472    die "Tried to launch open3 in $0, got unexpected exception: $@";
473    }
474
475    my $full_text = "";
476    my $error_text = "";
477    my @follow_list = ();
478    my $line;
479
480    my $loop = 1;
481    while($loop)
482    {
483    if (defined($line=<$chld_out>)) { # we're reading in from child process' stdout
484        if((defined $blnShow) && $blnShow)
485        {
486        print STDERR "$line";
487        }
488       
489        if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
490        my $follow_url = $1;
491        push(@follow_list,$follow_url);
492        }
493       
494        if ($line =~ m/ERROR\s+\d+/) {
495        $error_text .= $line;
496        }
497       
498        $full_text .= $line;
499    }
500    else { # wget finished, terminate naturally
501        #print STDERR "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
502        close($chld_in);
503        close($chld_out);
504        # Program terminates only when the following line is included
505        # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
506        # it prevents the child from turning into a "zombie process".
507        # While the wget process terminates without it, this perl script does not:
508        # the DOS prompt is not returned without it.
509        waitpid $childpid, 0;
510        $loop = 0;
511       
512        $childpid = undef;
513        if(defined $port) {
514        $read_set->remove($serverSocket);
515        close($serverSocket);
516        }
517    }
518
519    # if we run this script from the command-line (as opposed to from GLI),
520    # then we're not working with sockets and can therefore can skip the next bits
521    next unless(defined $port);
522
523    # http://www.perlfect.com/articles/select.shtml
524    # "multiplex between several filehandles within a single thread of control,
525    # thus creating the effect of parallelism in the handling of I/O."
526    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
527
528    # take all readable handles in turn
529    foreach my $rh (@rh_set) {
530        if($rh == $serverSocket) {
531        my $client = $rh->accept();
532        #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
533        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
534       
535        # Read from the client (getting rid of trailing newline)
536        # Has the client sent the <<STOP>> signal?
537        my $signal = <$client>;
538        chomp($signal);
539        if($signal eq "<<STOP>>") {
540            print $client "Perl received STOP signal (on port $port): stopping wget\n";
541            $loop = 0;                          # out of outer while loop
542            $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
543           
544            # Sometimes the wget process takes some time to start up. If the STOP signal
545            # was sent, don't try to terminate the process until we know it is running.
546            # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
547            # for it to start up, checking for whether it is running in order to kill it.
548            for(my $seconds = 1; $seconds <= 5; $seconds++) {
549            if(kill(0, $childpid)) {
550                # If kill(0, $childpid) returns true, then the process is running
551                # and we need to kill it.
552                close($chld_in);
553                close($chld_out);
554                kill("TERM", $childpid);
555               
556                $childpid = undef;
557               
558                # Stop monitoring the read_handle and close the serverSocket
559                # (the Java end will close the client socket that Java opened)
560                $read_set->remove($rh);     #$read_set->remove($serverSocket);
561                close($rh);             #close($serverSocket);
562                print $client "Perl terminated wget and is about to exit\n";
563                last;                           # out of inner for loop
564            }
565            else { # the process may just be starting up, wait
566                sleep(1);
567            }
568            }
569            last;                               # out of foreach loop
570        }
571        }
572    }
573    }
574
575    my $command_status = $?;
576    if ($command_status != 0) {
577    $error_text .= "Exit error: $command_status";
578    }
579
580    if ($changed_dir) {
581    chdir $current_dir;
582    }
583   
584    my $final_follow = pop(@follow_list); # might be undefined, but that's OK
585   
586    return ($full_text,$error_text,$final_follow);
587}
588
589
590# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
591sub checkURL
592{
593    my ($self) = @_;
594    if ($self->{'url'} eq "")
595    {
596    &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
597    }
598}
599
600sub error
601{
602    my ($strFunctionName,$strError) = @_;
603    {
604    print "Error occoured in WgetDownload.pm\n".
605        "In Function:".$strFunctionName."\n".
606        "Error Message:".$strError."\n";
607    exit(-1);
608    }
609}
610
6111;
612
Note: See TracBrowser for help on using the browser.