root/main/trunk/greenstone2/perllib/downloaders/WgetDownload.pm @ 31878

Revision 31878, 20.0 KB (checked in by ak19, 2 years ago)

1. Previous commit message was incorrect: it wasn't that perl wasn't found that resulted in the open3() failure error message, but that something on the PATH wasn't available to it, possible wget itself. 2. Updating unused DownloadJob?.old_callDownload() to have the recently committed changes in callDownload(). 3. Emacs tabbing for recently committed files.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use BaseDownload;
35use strict;
36use Cwd;
37use util;
38use IPC::Open3;
39use IO::Select;
40use IO::Socket;
41
42
43sub BEGIN {
44    @WgetDownload::ISA = ('BaseDownload');
45}
46
47my $arguments =
48     [ { 'name' => "proxy_on",
49    'desc' => "{WgetDownload.proxy_on}",
50    'type' => "flag",
51    'reqd' => "no",
52    'hiddengli' => "yes"},
53      { 'name' => "proxy_host", 
54    'desc' => "{WgetDownload.proxy_host}",         
55    'type' => "string",
56    'reqd' => "no",
57    'hiddengli' => "yes"},
58      { 'name' => "proxy_port",
59    'desc' => "{WgetDownload.proxy_port}",         
60    'type' => "string",
61    'reqd' => "no",
62    'hiddengli' => "yes"},
63      { 'name' => "user_name", 
64    'desc' => "{WgetDownload.user_name}",         
65    'type' => "string",
66    'reqd' => "no",
67    'hiddengli' => "yes"},
68      { 'name' => "user_password",
69    'desc' => "{WgetDownload.user_password}",         
70    'type' => "string",
71    'reqd' => "no",
72    'hiddengli' => "yes"},
73      { 'name' => "no_check_certificate",
74    'desc' => "{WgetDownload.no_check_certificate}",
75    'type' => "flag",
76    'reqd' => "no",
77    'hiddengli' => "yes"}
78     ];
79
80my $options = { 'name'     => "WgetDownload",
81        'desc'     => "{WgetDownload.desc}",
82        'abstract' => "yes",
83        'inherits' => "yes",
84        'args'     => $arguments };
85
86
87# Declaring file global variables related to the wget child process so that
88# the termination signal handler for SIGTERM can close the streams and tidy
89# up before ending the child process.
90my $childpid;
91my ($chld_out, $chld_in);
92my ($serverSocket, $read_set);
93
94# The port this script's server socket will be listening on, to handle
95# incoming signals from GLI to terminate wget. This is also file global,
96# since OAIDownload.pm will make several calls on wget using the same
97# instance of this script and we want to reuse whatever port GLI gave us.
98my $port;
99
100# When this script is called from the command line, this handler will be called
101# if this process is killed or abruptly ends due to receiving one of the
102# terminating signals that this handler is registered to deal with.
103sub abrupt_end_handler {
104    my $termination_signal = shift (@_);
105
106    if(defined $childpid) {
107    close($chld_out);
108    close($chld_in);
109   
110    print STDOUT "Received termination signal: $termination_signal\n";
111
112    # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
113    # See http://perldoc.perl.org/perlipc.html#Signals
114    # Warning on using kill at http://perldoc.perl.org/perlfork.html
115    kill("TERM", $childpid);
116
117    # If the SIGTERM sent on Linux calls this handler, we want to make
118    # sure any socket connection is closed.
119    # Otherwise sockets are only used when this script is run from GLI
120    # in which case the handlers don't really get called.
121    if(defined $serverSocket) {
122        $read_set->remove($serverSocket) if defined $read_set;
123        close($serverSocket);
124    }
125    }
126
127    exit(0);
128}
129
130# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
131# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
132# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
133# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
134# http://affy.blogspot.com/p5be/ch13.htm
135# http://perldoc.perl.org/perlipc.html#Signals
136$SIG{'INT'} = \&abrupt_end_handler;
137$SIG{'TERM'} = \&abrupt_end_handler;
138
139sub new {
140    my ($class) = shift (@_);
141    my ($getlist,$inputargs,$hashArgOptLists) = @_;
142    push(@$getlist, $class);
143
144    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
145    push(@{$hashArgOptLists->{"OptList"}},$options);
146
147    my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
148
149    # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
150    &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set
151
152    return bless $self, $class;
153}
154
155sub checkWgetSetup
156{
157    my ($self,$blnGliCall) = @_;
158    #TODO: proxy detection??
159   
160    if((!$blnGliCall) && $self->{'proxy_on'})
161    {
162    &checkProxySetup($self);
163    }
164    &checkURL($self);
165}
166
167# Not using this. On Windows, we used to pass proxying settings as flags to wget. But, as that can be
168# seen with Task Manager, we now have the proxy settings set in the environment and are no longer passing it
169sub addProxySettingsAsWgetFlags
170{
171    my ($self) = @_;
172    my $strOptions = "";
173   
174    if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'})
175    {
176    $strOptions .= " -e https_proxy=$self->{'proxy_host'}:$self->{'proxy_port'} "; 
177    $strOptions .= " -e http_proxy=$self->{'proxy_host'}:$self->{'proxy_port'} ";       
178   
179    if ($self->{'user_name'} && $self->{'user_password'})
180    {
181        $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";
182    }
183    }
184   
185    if ($self->{'proxy_on'}) {
186    $strOptions .= " --proxy ";
187    }
188   
189    return $strOptions;
190}
191
192sub getWgetOptions
193{
194    my ($self) = @_;
195    my $strOptions = "";
196   
197    # If proxy settings are set up in the environment, wget is ready to use them. More secure.
198    # But if proxy settings are not set up in the environment, pass them as flags to wget
199    # This is less secure, as pwd etc visible in task manager, but it was the original way in
200    # which wget was run on windows.
201    # Truth in Perl: https://home.ubalt.edu/abento/452/perl/perltruth.html
202    # http://www.perlmonks.org/?node=what%20is%20true%20and%20false%20in%20Perl%3F
203   
204    if (!$ENV{'http_proxy'} && !$ENV{'https_proxy'}) {
205    $strOptions .= $self->addProxySettingsAsWgetFlags();
206    } # else wget will use proxy settings in environment, assume enough settings have been provided
207   
208    if($self->{'no_check_certificate'}) { # URL may be http that gets redirected to https, so if no_check_certificate is on, turn it on even if URL is http
209   
210    $strOptions .= " --no-check-certificate ";
211    }
212   
213    return $strOptions;
214}
215
216# Checking for proxy setup: proxy server, proxy port, proxy username and password.
217sub checkProxySetup
218{
219    my ($self) = @_;
220    ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
221    # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
222    # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
223
224}
225
226# Returns true if the wget status needs to be monitored through sockets
227# (if a socket is used to communicate with the Java program on when to
228# terminate wget). True if we are running gli, or if the particular type
229# of WgetDownload is *not* OAIDownload (in that case, the original way of
230# terminating the perl script from Java terminated wget as well).
231sub dealingWithSockets() {
232    my ($self) = @_;
233    return (defined $self->{'gli'} && $self->{'gli'} && !defined $port && ref($self) ne "OAIDownload");
234                       # use ref($self) to find the classname of an object
235}
236
237
238sub useWget
239{
240    #local $| = 1; # autoflush stdout buffer
241    #print STDOUT "*** Start of subroutine useWget in $0\n";
242
243    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
244
245    my ($strReadIn,$strLine,$command);
246    $strReadIn = "" unless defined $strReadIn;
247
248    my $current_dir = cwd();
249    my $changed_dir = 0;
250    if (defined $working_dir && -e $working_dir) {
251    chdir "$working_dir";
252    $changed_dir = 1;
253    }
254
255    # When we are running this script through GLI, the SIGTERM signal handler
256    # won't get called on Windows when wget is to be prematurely terminated.
257    # Instead, when wget has to be terminated in the middle of execution, GLI will
258    # connect to a serverSocket here to communicate when it's time to stop wget.
259    if($self->dealingWithSockets()) {
260
261    $port = <STDIN>; # gets a port on localhost that's not yet in use
262    chomp($port);
263   
264    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
265                           LocalPort => $port,
266                           Listen    => 1,
267                           Reuse     => 1);
268   
269    die "can't setup server" unless $serverSocket;
270    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
271
272    $read_set = new IO::Select();         # create handle set for reading
273    $read_set->add($serverSocket);        # add the main socket to the set
274    }
275
276    my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
277    $command = "\"$wget_file_path\" $cmdWget";
278    #print STDOUT "Command is: $command\n"; # displayed in GLI output
279    #print STDERR "Command is: $command\n"; # goes into ServerInfoDialog
280   
281    # Wget's output needs to be monitored to find out when it has naturally terminated.
282    # Wget's output is sent to its STDERR so we can't use open2 without doing 2>&1.
283    # On linux, 2>&1 launches a subshell which then launches wget, meaning that killing
284    # the childpid does not kill wget on Linux but the subshell that launched it instead.
285    # Therefore, we use open3. Though the child process wget sends output only to its stdout,
286    # using open3 says chld_err is undefined and the output of wget only comes in chld_out(!)
287    # However that may be, it works with open3. But to avoid the confusion of managing and
288    # closing an extra unused handle, a single handle is used instead for both the child's
289    # stderr and stdout.
290    # See http://blog.0x1fff.com/2009/09/howto-execute-system-commands-in-perl.html
291    # for why this is the right thing to do.
292
293    # Both open2 and open3 don't return on failure, but raise an exception. The handling
294    # of the exception is described on p.568 of the Perl Cookbook
295    eval {
296    $childpid = open3($chld_in, $chld_out, $chld_out, $command);
297    };
298    if ($@) {
299    if($@ =~ m/^open3/) {
300        die "open3 failed in $0: $!\n$@\n";     
301    }
302    die "Tried to launch open3 in $0, got unexpected exception: $@";
303    }
304
305    my $loop = 1;
306    while($loop)
307    {
308    if (defined(my $strLine=<$chld_out>)) { # we're reading in from child process' stdout
309        if($blnShow) {
310        print STDERR "$strLine\n";
311        }
312        $strReadIn .= $strLine;
313    }
314    else { # wget finished, terminate naturally
315        #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
316        close($chld_in);
317        close($chld_out);
318        waitpid $childpid, 0;
319        $loop = 0;
320       
321        $childpid = undef;
322        if(defined $port) {
323        $read_set->remove($serverSocket);
324        close($serverSocket);
325        }
326    }
327
328    # if we run this script from the command-line (as opposed to from GLI),
329    # then we're not working with sockets and can therefore can skip the next bits
330    next unless(defined $port);
331
332    # http://www.perlfect.com/articles/select.shtml
333    # "multiplex between several filehandles within a single thread of control,
334    # thus creating the effect of parallelism in the handling of I/O."
335    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
336
337    # take all readable handles in turn
338    foreach my $rh (@rh_set) {
339        if($rh == $serverSocket) {
340        my $client = $rh->accept();
341        #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
342        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
343       
344        # Read from the client (getting rid of the trailing newline)
345        # Has the client sent the <<STOP>> signal?
346        my $signal = <$client>;
347        chomp($signal);
348        if($signal eq "<<STOP>>") {
349            print $client "Perl received STOP signal (on port $port): stopping wget\n";
350            $loop = 0;                          # out of outer while loop
351            $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
352           
353            # Sometimes the wget process takes some time to start up. If the STOP signal
354            # was sent, don't try to terminate the process until we know it is running.
355            # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
356            # for it to start up, checking for whether it is running in order to kill it.
357            for(my $seconds = 1; $seconds <= 5; $seconds++) {
358            if(kill(0, $childpid)) {
359                # If kill(0, $childpid) returns true, then the process is running
360                # and we need to kill it.
361                close($chld_in);
362                close($chld_out);
363                kill("TERM", $childpid);
364               
365                $childpid = undef;
366               
367                # Stop monitoring the read_handle and close the serverSocket
368                # (the Java end will close the client socket that Java opened)
369                $read_set->remove($rh);     #$read_set->remove($serverSocket);
370                close($rh);             #close($serverSocket);
371                print $client "Perl terminated wget and is about to exit\n";
372                last;                           # out of inner for loop
373            }
374            else { # the process may just be starting up, wait
375                sleep(1);
376            }
377            }
378            last;                               # out of foreach loop
379        }
380        }
381    }
382    }
383
384    if ($changed_dir) {
385    chdir $current_dir;
386    }
387   
388    return $strReadIn;
389}
390
391
392sub useWgetMonitored
393{
394    #local $| = 1; # autoflush stdout buffer
395    #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
396
397    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
398
399
400    my $current_dir = cwd();
401    my $changed_dir = 0;
402    if (defined $working_dir && -e $working_dir) {
403    chdir "$working_dir";
404    $changed_dir = 1;
405    }
406
407    # When we are running this script through GLI, the SIGTERM signal handler
408    # won't get called on Windows when wget is to be prematurely terminated.
409    # Instead, when wget has to be terminated in the middle of execution, GLI will
410    # connect to a serverSocket here to communicate when it's time to stop wget.
411    if($self->dealingWithSockets()) {
412
413    $port = <STDIN>; # gets a port on localhost that's not yet in use
414    chomp($port);
415   
416    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
417                           LocalPort => $port,
418                           Listen    => 1,
419                           Reuse     => 1);
420   
421    die "can't setup server" unless $serverSocket;
422    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
423
424    $read_set = new IO::Select();         # create handle set for reading
425    $read_set->add($serverSocket);        # add the main socket to the set
426    }
427
428    my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
429    my $command = "\"$wget_file_path\" $cmdWget";
430    #print STDOUT "Command is: $command\n";
431
432    eval {     # see p.568 of Perl Cookbook
433    $childpid = open3($chld_in, $chld_out, $chld_out, $command);
434    };
435    if ($@) {
436    if($@ =~ m/^open3/) {
437        die "open3 failed in $0: $!\n$@\n";     
438    }
439    die "Tried to launch open3 in $0, got unexpected exception: $@";
440    }
441
442    my $full_text = "";
443    my $error_text = "";
444    my @follow_list = ();
445    my $line;
446
447    my $loop = 1;
448    while($loop)
449    {
450    if (defined($line=<$chld_out>)) { # we're reading in from child process' stdout
451        if((defined $blnShow) && $blnShow)
452        {
453        print STDERR "$line";
454        }
455       
456        if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
457        my $follow_url = $1;
458        push(@follow_list,$follow_url);
459        }
460       
461        if ($line =~ m/ERROR\s+\d+/) {
462        $error_text .= $line;
463        }
464       
465        $full_text .= $line;
466    }
467    else { # wget finished, terminate naturally
468        #print STDERR "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
469        close($chld_in);
470        close($chld_out);
471        # Program terminates only when the following line is included
472        # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
473        # it prevents the child from turning into a "zombie process".
474        # While the wget process terminates without it, this perl script does not:
475        # the DOS prompt is not returned without it.
476        waitpid $childpid, 0;
477        $loop = 0;
478       
479        $childpid = undef;
480        if(defined $port) {
481        $read_set->remove($serverSocket);
482        close($serverSocket);
483        }
484    }
485
486    # if we run this script from the command-line (as opposed to from GLI),
487    # then we're not working with sockets and can therefore can skip the next bits
488    next unless(defined $port);
489
490    # http://www.perlfect.com/articles/select.shtml
491    # "multiplex between several filehandles within a single thread of control,
492    # thus creating the effect of parallelism in the handling of I/O."
493    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
494
495    # take all readable handles in turn
496    foreach my $rh (@rh_set) {
497        if($rh == $serverSocket) {
498        my $client = $rh->accept();
499        #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
500        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
501       
502        # Read from the client (getting rid of trailing newline)
503        # Has the client sent the <<STOP>> signal?
504        my $signal = <$client>;
505        chomp($signal);
506        if($signal eq "<<STOP>>") {
507            print $client "Perl received STOP signal (on port $port): stopping wget\n";
508            $loop = 0;                          # out of outer while loop
509            $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
510           
511            # Sometimes the wget process takes some time to start up. If the STOP signal
512            # was sent, don't try to terminate the process until we know it is running.
513            # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
514            # for it to start up, checking for whether it is running in order to kill it.
515            for(my $seconds = 1; $seconds <= 5; $seconds++) {
516            if(kill(0, $childpid)) {
517                # If kill(0, $childpid) returns true, then the process is running
518                # and we need to kill it.
519                close($chld_in);
520                close($chld_out);
521                kill("TERM", $childpid);
522               
523                $childpid = undef;
524               
525                # Stop monitoring the read_handle and close the serverSocket
526                # (the Java end will close the client socket that Java opened)
527                $read_set->remove($rh);     #$read_set->remove($serverSocket);
528                close($rh);             #close($serverSocket);
529                print $client "Perl terminated wget and is about to exit\n";
530                last;                           # out of inner for loop
531            }
532            else { # the process may just be starting up, wait
533                sleep(1);
534            }
535            }
536            last;                               # out of foreach loop
537        }
538        }
539    }
540    }
541
542    my $command_status = $?;
543    if ($command_status != 0) {
544    $error_text .= "Exit error: $command_status";
545    }
546
547    if ($changed_dir) {
548    chdir $current_dir;
549    }
550   
551    my $final_follow = pop(@follow_list); # might be undefined, but that's OK
552   
553    return ($full_text,$error_text,$final_follow);
554}
555
556
557# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
558sub checkURL
559{
560    my ($self) = @_;
561    if ($self->{'url'} eq "")
562    {
563    &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
564    }
565}
566
567sub error
568{
569    my ($strFunctionName,$strError) = @_;
570    {
571    print "Error occoured in WgetDownload.pm\n".
572        "In Function:".$strFunctionName."\n".
573        "Error Message:".$strError."\n";
574    exit(-1);
575    }
576}
577
5781;
579
Note: See TracBrowser for help on using the browser.