root/main/trunk/greenstone2/perllib/downloaders/WgetDownload.pm @ 30520

Revision 30520, 18.3 KB (checked in by ak19, 4 years ago)

Refactoring activate.pl into activate.pm (class, OOP) and activate.pl. Now buildcolutils.pm uses do_deactivate() from activate.pm.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use BaseDownload;
35use strict;
36use Cwd;
37use util;
38use IPC::Open3;
39use IO::Select;
40use IO::Socket;
41
42
43sub BEGIN {
44    @WgetDownload::ISA = ('BaseDownload');
45}
46
47my $arguments =
48     [ { 'name' => "proxy_on",
49    'desc' => "{WgetDownload.proxy_on}",
50    'type' => "flag",
51    'reqd' => "no",
52    'hiddengli' => "yes"},
53      { 'name' => "proxy_host", 
54    'desc' => "{WgetDownload.proxy_host}",         
55    'type' => "string",
56    'reqd' => "no",
57    'hiddengli' => "yes"},
58      { 'name' => "proxy_port",
59    'desc' => "{WgetDownload.proxy_port}",         
60    'type' => "string",
61    'reqd' => "no",
62    'hiddengli' => "yes"},
63      { 'name' => "user_name", 
64    'desc' => "{WgetDownload.user_name}",         
65    'type' => "string",
66    'reqd' => "no",
67    'hiddengli' => "yes"},
68      { 'name' => "user_password",
69    'desc' => "{WgetDownload.user_password}",         
70    'type' => "string",
71    'reqd' => "no",
72    'hiddengli' => "yes"}];
73
74my $options = { 'name'     => "WgetDownload",
75        'desc'     => "{WgetDownload.desc}",
76        'abstract' => "yes",
77        'inherits' => "yes",
78        'args'     => $arguments };
79
80
81# Declaring file global variables related to the wget child process so that
82# the termination signal handler for SIGTERM can close the streams and tidy
83# up before ending the child process.
84my $childpid;
85my ($chld_out, $chld_in);
86my ($serverSocket, $read_set);
87
88# The port this script's server socket will be listening on, to handle
89# incoming signals from GLI to terminate wget. This is also file global,
90# since OAIDownload.pm will make several calls on wget using the same
91# instance of this script and we want to reuse whatever port GLI gave us.
92my $port;
93
94# When this script is called from the command line, this handler will be called
95# if this process is killed or abruptly ends due to receiving one of the
96# terminating signals that this handler is registered to deal with.
97sub abrupt_end_handler {
98    my $termination_signal = shift (@_);
99
100    if(defined $childpid) {
101    close($chld_out);
102    close($chld_in);
103   
104    print STDOUT "Received termination signal: $termination_signal\n";
105
106    # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
107    # See http://perldoc.perl.org/perlipc.html#Signals
108    # Warning on using kill at http://perldoc.perl.org/perlfork.html
109    kill("TERM", $childpid);
110
111    # If the SIGTERM sent on Linux calls this handler, we want to make
112    # sure any socket connection is closed.
113    # Otherwise sockets are only used when this script is run from GLI
114    # in which case the handlers don't really get called.
115    if(defined $serverSocket) {
116        $read_set->remove($serverSocket) if defined $read_set;
117        close($serverSocket);
118    }
119    }
120
121    exit(0);
122}
123
124# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
125# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
126# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
127# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
128# http://affy.blogspot.com/p5be/ch13.htm
129# http://perldoc.perl.org/perlipc.html#Signals
130$SIG{'INT'} = \&abrupt_end_handler;
131$SIG{'TERM'} = \&abrupt_end_handler;
132
133sub new {
134    my ($class) = shift (@_);
135    my ($getlist,$inputargs,$hashArgOptLists) = @_;
136    push(@$getlist, $class);
137
138    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
139    push(@{$hashArgOptLists->{"OptList"}},$options);
140
141    my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
142
143    # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
144    &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set
145
146    return bless $self, $class;
147}
148
149sub checkWgetSetup
150{
151    my ($self,$blnGliCall) = @_;
152    #TODO: proxy detection??
153   
154    if((!$blnGliCall) && $self->{'proxy_on'})
155    {
156    &checkProxySetup($self);
157    }
158    &checkURL($self);
159}
160
161sub getWgetOptions
162{
163    my ($self) = @_;
164    my $strOptions = "";
165   
166    if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'})
167    {
168
169    $strOptions .= " -e httpproxy=$self->{'proxy_host'}:$self->{'proxy_port'} ";
170
171    if ($self->{'user_name'} && $self->{'user_password'})
172    {
173        $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";
174    }
175    }
176
177    if ($self->{'proxy_on'}) {
178    $strOptions .= " --proxy ";
179    }
180
181    return $strOptions;
182}
183
184# Checking for proxy setup: proxy server, proxy port, proxy username and password.
185sub checkProxySetup
186{
187    my ($self) = @_;
188    ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
189    # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
190    # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
191
192}
193
194# Returns true if the wget status needs to be monitored through sockets
195# (if a socket is used to communicate with the Java program on when to
196# terminate wget). True if we are running gli, or if the particular type
197# of WgetDownload is *not* OAIDownload (in that case, the original way of
198# terminating the perl script from Java terminated wget as well).
199sub dealingWithSockets() {
200    my ($self) = @_;
201    return (defined $self->{'gli'} && $self->{'gli'} && !defined $port && ref($self) ne "OAIDownload");
202                       # use ref($self) to find the classname of an object
203}
204
205
206sub useWget
207{
208    #local $| = 1; # autoflush stdout buffer
209    #print STDOUT "*** Start of subroutine useWget in $0\n";
210
211    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
212
213    my ($strReadIn,$strLine,$command);
214    $strReadIn = "" unless defined $strReadIn;
215
216    my $current_dir = cwd();
217    my $changed_dir = 0;
218    if (defined $working_dir && -e $working_dir) {
219    chdir "$working_dir";
220    $changed_dir = 1;
221    }
222
223    # When we are running this script through GLI, the SIGTERM signal handler
224    # won't get called on Windows when wget is to be prematurely terminated.
225    # Instead, when wget has to be terminated in the middle of execution, GLI will
226    # connect to a serverSocket here to communicate when it's time to stop wget.
227    if($self->dealingWithSockets()) {
228
229    $port = <STDIN>; # gets a port on localhost that's not yet in use
230    chomp($port);
231   
232    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
233                           LocalPort => $port,
234                           Listen    => 1,
235                           Reuse     => 1);
236   
237    die "can't setup server" unless $serverSocket;
238    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
239
240    $read_set = new IO::Select();         # create handle set for reading
241    $read_set->add($serverSocket);        # add the main socket to the set
242    }
243
244    my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
245    $command = "\"$wget_file_path\" $cmdWget";
246    #print STDOUT "Command is: $command\n";
247
248    # Wget's output needs to be monitored to find out when it has naturally terminated.
249    # Wget's output is sent to its STDERR so we can't use open2 without doing 2>&1.
250    # On linux, 2>&1 launches a subshell which then launches wget, meaning that killing
251    # the childpid does not kill wget on Linux but the subshell that launched it instead.
252    # Therefore, we use open3. Though the child process wget sends output only to its stdout,
253    # using open3 says chld_err is undefined and the output of wget only comes in chld_out(!)
254    # However that may be, it works with open3. But to avoid the confusion of managing and
255    # closing an extra unused handle, a single handle is used instead for both the child's
256    # stderr and stdout.
257
258    # Both open2 and open3 don't return on failure, but raise an exception. The handling
259    # of the exception is described on p.568 of the Perl Cookbook
260    eval {
261    $childpid = open3($chld_in, $chld_out, $chld_out, $command);
262    };
263    if ($@) {
264    if($@ =~ m/^open3/) {
265        die "open3 failed in $0: $!\n$@\n";     
266    }
267    die "Tried to launch open3 in $0, got unexpected exception: $@";
268    }
269
270    my $loop = 1;
271    while($loop)
272    {
273    if (defined(my $strLine=<$chld_out>)) { # we're reading in from child process' stdout
274        if($blnShow) {
275        print STDERR "$strLine\n";
276        }
277        $strReadIn .= $strLine;
278    }
279    else { # wget finished, terminate naturally
280        #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
281        close($chld_in);
282        close($chld_out);
283        waitpid $childpid, 0;
284        $loop = 0;
285       
286        $childpid = undef;
287        if(defined $port) {
288        $read_set->remove($serverSocket);
289        close($serverSocket);
290        }
291    }
292
293    # if we run this script from the command-line (as opposed to from GLI),
294    # then we're not working with sockets and can therefore can skip the next bits
295    next unless(defined $port);
296
297    # http://www.perlfect.com/articles/select.shtml
298    # "multiplex between several filehandles within a single thread of control,
299    # thus creating the effect of parallelism in the handling of I/O."
300    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
301
302    # take all readable handles in turn
303    foreach my $rh (@rh_set) {
304        if($rh == $serverSocket) {
305        my $client = $rh->accept();
306        #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
307        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
308       
309        # Read from the client (getting rid of the trailing newline)
310        # Has the client sent the <<STOP>> signal?
311        my $signal = <$client>;
312        chomp($signal);
313        if($signal eq "<<STOP>>") {
314            print $client "Perl received STOP signal (on port $port): stopping wget\n";
315            $loop = 0;                          # out of outer while loop
316            $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
317           
318            # Sometimes the wget process takes some time to start up. If the STOP signal
319            # was sent, don't try to terminate the process until we know it is running.
320            # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
321            # for it to start up, checking for whether it is running in order to kill it.
322            for(my $seconds = 1; $seconds <= 5; $seconds++) {
323            if(kill(0, $childpid)) {
324                # If kill(0, $childpid) returns true, then the process is running
325                # and we need to kill it.
326                close($chld_in);
327                close($chld_out);
328                kill("TERM", $childpid);
329               
330                $childpid = undef;
331               
332                # Stop monitoring the read_handle and close the serverSocket
333                # (the Java end will close the client socket that Java opened)
334                $read_set->remove($rh);     #$read_set->remove($serverSocket);
335                close($rh);             #close($serverSocket);
336                print $client "Perl terminated wget and is about to exit\n";
337                last;                           # out of inner for loop
338            }
339            else { # the process may just be starting up, wait
340                sleep(1);
341            }
342            }
343            last;                               # out of foreach loop
344        }
345        }
346    }
347    }
348
349    if ($changed_dir) {
350    chdir $current_dir;
351    }
352   
353    return $strReadIn;
354}
355
356
357sub useWgetMonitored
358{
359    #local $| = 1; # autoflush stdout buffer
360    #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
361
362    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
363
364
365    my $current_dir = cwd();
366    my $changed_dir = 0;
367    if (defined $working_dir && -e $working_dir) {
368    chdir "$working_dir";
369    $changed_dir = 1;
370    }
371
372    # When we are running this script through GLI, the SIGTERM signal handler
373    # won't get called on Windows when wget is to be prematurely terminated.
374    # Instead, when wget has to be terminated in the middle of execution, GLI will
375    # connect to a serverSocket here to communicate when it's time to stop wget.
376    if($self->dealingWithSockets()) {
377
378    $port = <STDIN>; # gets a port on localhost that's not yet in use
379    chomp($port);
380   
381    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
382                           LocalPort => $port,
383                           Listen    => 1,
384                           Reuse     => 1);
385   
386    die "can't setup server" unless $serverSocket;
387    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
388
389    $read_set = new IO::Select();         # create handle set for reading
390    $read_set->add($serverSocket);        # add the main socket to the set
391    }
392
393    my $wget_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
394    my $command = "\"$wget_file_path\" $cmdWget";
395    #print STDOUT "Command is: $command\n";
396
397    eval {     # see p.568 of Perl Cookbook
398    $childpid = open3($chld_in, $chld_out, $chld_out, $command);
399    };
400    if ($@) {
401    if($@ =~ m/^open3/) {
402        die "open3 failed in $0: $!\n$@\n";     
403    }
404    die "Tried to launch open3 in $0, got unexpected exception: $@";
405    }
406
407    my $full_text = "";
408    my $error_text = "";
409    my @follow_list = ();
410    my $line;
411
412    my $loop = 1;
413    while($loop)
414    {
415    if (defined($line=<$chld_out>)) { # we're reading in from child process' stdout
416        if((defined $blnShow) && $blnShow)
417        {
418        print STDERR "$line";
419        }
420       
421        if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
422        my $follow_url = $1;
423        push(@follow_list,$follow_url);
424        }
425       
426        if ($line =~ m/ERROR\s+\d+/) {
427        $error_text .= $line;
428        }
429       
430        $full_text .= $line;
431    }
432    else { # wget finished, terminate naturally
433        #print STDERR "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
434        close($chld_in);
435        close($chld_out);
436        # Program terminates only when the following line is included
437        # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
438        # it prevents the child from turning into a "zombie process".
439        # While the wget process terminates without it, this perl script does not:
440        # the DOS prompt is not returned without it.
441        waitpid $childpid, 0;
442        $loop = 0;
443       
444        $childpid = undef;
445        if(defined $port) {
446        $read_set->remove($serverSocket);
447        close($serverSocket);
448        }
449    }
450
451    # if we run this script from the command-line (as opposed to from GLI),
452    # then we're not working with sockets and can therefore can skip the next bits
453    next unless(defined $port);
454
455    # http://www.perlfect.com/articles/select.shtml
456    # "multiplex between several filehandles within a single thread of control,
457    # thus creating the effect of parallelism in the handling of I/O."
458    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
459
460    # take all readable handles in turn
461    foreach my $rh (@rh_set) {
462        if($rh == $serverSocket) {
463        my $client = $rh->accept();
464        #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
465        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
466       
467        # Read from the client (getting rid of trailing newline)
468        # Has the client sent the <<STOP>> signal?
469        my $signal = <$client>;
470        chomp($signal);
471        if($signal eq "<<STOP>>") {
472            print $client "Perl received STOP signal (on port $port): stopping wget\n";
473            $loop = 0;                          # out of outer while loop
474            $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
475           
476            # Sometimes the wget process takes some time to start up. If the STOP signal
477            # was sent, don't try to terminate the process until we know it is running.
478            # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
479            # for it to start up, checking for whether it is running in order to kill it.
480            for(my $seconds = 1; $seconds <= 5; $seconds++) {
481            if(kill(0, $childpid)) {
482                # If kill(0, $childpid) returns true, then the process is running
483                # and we need to kill it.
484                close($chld_in);
485                close($chld_out);
486                kill("TERM", $childpid);
487               
488                $childpid = undef;
489               
490                # Stop monitoring the read_handle and close the serverSocket
491                # (the Java end will close the client socket that Java opened)
492                $read_set->remove($rh);     #$read_set->remove($serverSocket);
493                close($rh);             #close($serverSocket);
494                print $client "Perl terminated wget and is about to exit\n";
495                last;                           # out of inner for loop
496            }
497            else { # the process may just be starting up, wait
498                sleep(1);
499            }
500            }
501            last;                               # out of foreach loop
502        }
503        }
504    }
505    }
506
507    my $command_status = $?;
508    if ($command_status != 0) {
509    $error_text .= "Exit error: $command_status";
510    }
511
512    if ($changed_dir) {
513    chdir $current_dir;
514    }
515   
516    my $final_follow = pop(@follow_list); # might be undefined, but that's OK
517   
518    return ($full_text,$error_text,$final_follow);
519}
520
521
522# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
523sub checkURL
524{
525    my ($self) = @_;
526    if ($self->{'url'} eq "")
527    {
528    &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
529    }
530}
531
532sub error
533{
534    my ($strFunctionName,$strError) = @_;
535    {
536    print "Error occoured in WgetDownload.pm\n".
537        "In Function:".$strFunctionName."\n".
538        "Error Message:".$strError."\n";
539    exit(-1);
540    }
541}
542
5431;
544
Note: See TracBrowser for help on using the browser.