root/gsdl/trunk/perllib/downloaders/WgetDownload.pm @ 17537

Revision 17537, 14.6 KB (checked in by ak19, 11 years ago)

Subroutine useWgetMonitored updated to include the modifications made recently to subroutine useWget: uses a serverSocket to monitor any signals from GLI indicating that wget should be prematurely terminated.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use BaseDownload;
35use strict;
36use Cwd;
37use IPC::Open2;
38use IO::Select;
39use IO::Socket;
40
41
42sub BEGIN {
43    @WgetDownload::ISA = ('BaseDownload');
44}
45
46my $arguments =
47     [ { 'name' => "proxy_on",
48    'desc' => "{WgetDownload.proxy_on}",
49    'type' => "flag",
50    'reqd' => "no",
51    'hiddengli' => "yes"},
52      { 'name' => "proxy_host", 
53    'desc' => "{WgetDownload.proxy_host}",         
54    'type' => "string",
55    'reqd' => "no",
56    'hiddengli' => "yes"},
57      { 'name' => "proxy_port",
58    'desc' => "{WgetDownload.proxy_port}",         
59    'type' => "string",
60    'reqd' => "no",
61    'hiddengli' => "yes"},
62      { 'name' => "user_name", 
63    'desc' => "{WgetDownload.user_name}",         
64    'type' => "string",
65    'reqd' => "no",
66    'hiddengli' => "yes"},
67      { 'name' => "user_password",
68    'desc' => "{WgetDownload.user_password}",         
69    'type' => "string",
70    'reqd' => "no",
71    'hiddengli' => "yes"}];
72
73my $options = { 'name'     => "WgetDownload",
74        'desc'     => "{WgetDownload.desc}",
75        'abstract' => "yes",
76        'inherits' => "yes",
77        'args'     => $arguments };
78
79
80# Declaring file global variables related to the wget child process so that
81# the termination signal handler for SIGTERM can close the streams and tidy
82# up before ending the child process.
83my $childpid;
84my ($chld_out, $chld_in);
85my ($serverSocket, $read_set);
86
87# The port this script's server socket will be listening on, to handle
88# incoming signals from GLI to terminate wget. This is also file global,
89# since OAIDownload.pm will make several calls on wget using the same
90# instance of this script and we want to reuse whatever port GLI gave us.
91my $port;
92
93# When this script is called from the command line, this handler will be called
94# if this process is killed or abruptly ends due to receiving one of the
95# terminating signals that this handler is registered to deal with.
96sub abrupt_end_handler {
97    my $termination_signal = shift (@_);
98    if(defined $childpid) {
99    close($chld_out);
100    close($chld_in);
101   
102    #print STDOUT "Received termination signal: $termination_signal\n";
103
104    # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
105    # See http://perldoc.perl.org/perlipc.html#Signals
106    # Warning on using kill at http://perldoc.perl.org/perlfork.html
107    kill("TERM", $childpid);
108
109    # If the SIGTERM sent on Linux calls this handler, we want to make
110    # sure any socket connection is closed.
111    # Otherwise sockets are only used when this script is run from GLI
112    # in which case the handlers don't really get called.
113    if(defined $serverSocket) {
114        $read_set->remove($serverSocket) if defined $read_set;
115        close($serverSocket);
116    }
117    }
118
119    exit(0);
120}
121
122# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
123# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
124# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
125# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
126# http://affy.blogspot.com/p5be/ch13.htm
127# http://perldoc.perl.org/perlipc.html#Signals
128$SIG{'INT'} = \&abrupt_end_handler;
129$SIG{'TERM'} = \&abrupt_end_handler;
130
131sub new {
132    my ($class) = shift (@_);
133    my ($getlist,$inputargs,$hashArgOptLists) = @_;
134    push(@$getlist, $class);
135
136    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
137    push(@{$hashArgOptLists->{"OptList"}},$options);
138
139    my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
140
141    return bless $self, $class;
142}
143
144sub checkWgetSetup
145{
146    my ($self,$blnGliCall) = @_;
147    #TODO: proxy detection??
148   
149    if((!$blnGliCall) && $self->{'proxy_on'})
150    {
151    &checkProxySetup($self);
152    }
153    &checkURL($self);
154}
155
156sub getWgetOptions
157{
158    my ($self) = @_;
159    my $strOptions = "";
160   
161    if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'})
162    {
163
164    $strOptions .= " -e httpproxy=$self->{'proxy_host'}:$self->{'proxy_port'} ";
165
166    if ($self->{'user_name'} && $self->{'user_password'})
167    {
168        $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";
169    }
170    }
171
172    if ($self->{'proxy_on'}) {
173    $strOptions .= " --proxy ";
174    }
175
176    return $strOptions;
177}
178
179# Checking for proxy setup: proxy server, proxy port, proxy username and password.
180sub checkProxySetup
181{
182    my ($self) = @_;
183    ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
184    # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
185    # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
186
187}
188
189sub useWget
190{
191    #local $| = 1; # autoflush stdout buffer
192    #print STDOUT "*** Start of subroutine useWget in $0\n";
193
194    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
195
196    my ($strReadIn,$strLine,$command);
197    $strReadIn = "" unless defined $strReadIn;
198
199    my $current_dir = cwd();
200    my $changed_dir = 0;
201    if (defined $working_dir && -e $working_dir) {
202    chdir "$working_dir";
203    $changed_dir = 1;
204    }
205
206    # When we are running this script through GLI, the SIGTERM signal handler
207    # won't get called on Windows when wget is to be prematurely terminated.
208    # Instead, when wget has to be terminated in the middle of execution, GLI will
209    # connect to a serverSocket here to communicate when it's time to stop wget.
210    if(defined $self->{'gli'} && $self->{'gli'} && !defined $port) {
211
212    $port = <STDIN>; # gets a port on localhost that's not yet in use
213    chomp($port);
214   
215    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
216                           LocalPort => $port,
217                           Listen    => 1,
218                           Reuse     => 1);
219   
220    die "can't setup server" unless $serverSocket;
221    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
222
223    $read_set = new IO::Select();         # create handle set for reading
224    $read_set->add($serverSocket);        # add the main socket to the set
225    }
226
227    my $wget_file_path = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
228    $command = "\"$wget_file_path\" $cmdWget 2>&1";
229    # print STDERR "Command is: $command\n";
230    $childpid = open2($chld_out, $chld_in, $command) || die "wget request failed: $!\n";
231
232    my $loop = 1;
233    while($loop)
234    {
235    if (defined(my $strLine=<$chld_out>)) { # we're reading in from child process' stdout
236        if($blnShow) {
237        print STDERR "$strLine\n";
238        }
239        $strReadIn .= $strLine;
240    }
241    else { # wget finished, terminate naturally
242        #print STDERR "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
243        close($chld_in);
244        close($chld_out);
245        waitpid $childpid, 0;
246        $loop = 0;
247       
248        $childpid = undef;
249        if(defined $port) {
250        $read_set->remove($serverSocket);
251        close($serverSocket);
252        }
253    }
254
255    # if we run this script from the command-line (as opposed to from GLI),
256    # then we're not working with sockets and can therefore can skip the next bits
257    next unless(defined $port);
258
259    # http://www.perlfect.com/articles/select.shtml
260    # "multiplex between several filehandles within a single thread of control,
261    # thus creating the effect of parallelism in the handling of I/O."
262    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
263
264    # take all readable handles in turn
265    foreach my $rh (@rh_set) {
266        if($rh == $serverSocket) {
267        my $client = $rh->accept();
268        #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
269        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
270       
271        # Read from the client (getting rid of trailing newline)
272        # Has the client sent the <<STOP>> signal?
273        my $signal = <$client>;
274        chomp($signal);
275        if($signal eq "<<STOP>>") {
276            print $client "Perl received STOP signal (on port $port): stopping wget\n";
277           
278            $loop = 0;
279            close($chld_in);
280            close($chld_out);
281            kill("TERM", $childpid);
282           
283            $childpid = undef;
284
285            # Stop monitoring the read_handle
286            # close the serverSocket (the Java end will close the client socket that Java opened)
287            $read_set->remove($rh); #$read_set->remove($serverSocket);
288            close($rh);         #close($serverSocket);
289            #print $client "Perl is about to exit\n";
290            last;
291        }
292        }
293    }
294    }
295
296    if ($changed_dir) {
297    chdir $current_dir;
298    }
299   
300    return $strReadIn;
301}
302
303
304sub useWgetMonitored
305{
306    #local $| = 1; # autoflush stdout buffer
307    #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
308
309    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
310
311
312    my $current_dir = cwd();
313    my $changed_dir = 0;
314    if (defined $working_dir && -e $working_dir) {
315    chdir "$working_dir";
316    $changed_dir = 1;
317    }
318
319    # When we are running this script through GLI, the SIGTERM signal handler
320    # won't get called on Windows when wget is to be prematurely terminated.
321    # Instead, when wget has to be terminated in the middle of execution, GLI will
322    # connect to a serverSocket here to communicate when it's time to stop wget.
323    if(defined $self->{'gli'} && $self->{'gli'} && !defined $port) {
324
325    $port = <STDIN>; # gets a port on localhost that's not yet in use
326    chomp($port);
327   
328    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
329                           LocalPort => $port,
330                           Listen    => 1,
331                           Reuse     => 1);
332   
333    die "can't setup server" unless $serverSocket;
334    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
335
336    $read_set = new IO::Select();         # create handle set for reading
337    $read_set->add($serverSocket);        # add the main socket to the set
338    }
339
340    my $wget_file_path = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
341    #my $command = "\"$wget_file_path\" $cmdWget 2>&1 |"; #open(*WIN,$command) || die "wget request failed: $!\n";
342    my $command = "\"$wget_file_path\" $cmdWget 2>&1";
343    # print STDERR "Command is: $command\n";
344    $childpid = open2($chld_out, $chld_in, $command) || die "wget request failed: $!\n";
345
346    my $full_text = "";
347    my $error_text = "";
348    my @follow_list = ();
349    my $line;
350
351    my $loop = 1;
352    while($loop)
353    {
354    if (defined($line=<$chld_out>)) { # we're reading in from child process' stdout
355        if((defined $blnShow) && $blnShow)
356        {
357        print STDERR "$line";
358        }
359       
360        if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
361        my $follow_url = $1;
362        push(@follow_list,$follow_url);
363        }
364       
365        if ($line =~ m/ERROR\s+\d+/) {
366        $error_text .= $line;
367        }
368       
369        $full_text .= $line;
370    }
371    else { # wget finished, terminate naturally
372        #print STDERR "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
373        close($chld_in);
374        close($chld_out);
375        # Program terminates only when the following line is included
376        # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
377        # it prevents the child from turning into a "zombie process".
378        # While the wget process terminates without it, this perl script does not:
379        # the DOS prompt is not returned without it.
380        waitpid $childpid, 0;
381        $loop = 0;
382       
383        $childpid = undef;
384        if(defined $port) {
385        $read_set->remove($serverSocket);
386        close($serverSocket);
387        }
388    }
389
390    # if we run this script from the command-line (as opposed to from GLI),
391    # then we're not working with sockets and can therefore can skip the next bits
392    next unless(defined $port);
393
394    # http://www.perlfect.com/articles/select.shtml
395    # "multiplex between several filehandles within a single thread of control,
396    # thus creating the effect of parallelism in the handling of I/O."
397    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
398
399    # take all readable handles in turn
400    foreach my $rh (@rh_set) {
401        if($rh == $serverSocket) {
402        my $client = $rh->accept();
403        #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
404        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
405       
406        # Read from the client (getting rid of trailing newline)
407        # Has the client sent the <<STOP>> signal?
408        my $signal = <$client>;
409        chomp($signal);
410        if($signal eq "<<STOP>>") {
411            print $client "Perl received STOP signal (on port $port): stopping wget\n";
412           
413            $loop = 0;
414            close($chld_in);
415            close($chld_out);
416            kill("TERM", $childpid);
417           
418            $childpid = undef;
419
420            # Stop monitoring the read_handle
421            # close the serverSocket (the Java end will close the client socket that Java opened)
422            $read_set->remove($rh); #$read_set->remove($serverSocket);
423            close($rh);         #close($serverSocket);
424            #print $client "Perl is about to exit\n";
425            last;
426        }
427        }
428    }
429    }
430
431    my $command_status = $?;
432    if ($command_status != 0) {
433    $error_text .= "Exit error: $command_status";
434    }
435
436    if ($changed_dir) {
437    chdir $current_dir;
438    }
439   
440    my $final_follow = pop(@follow_list); # might be undefined, but that's OK
441   
442    return ($full_text,$error_text,$final_follow);
443}
444
445
446# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
447sub checkURL
448{
449    my ($self) = @_;
450    if ($self->{'url'} eq "")
451    {
452    &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
453    }
454}
455
456sub error
457{
458    my ($strFunctionName,$strError) = @_;
459    {
460    print "Error occoured in WgetDownload.pm\n".
461        "In Function:".$strFunctionName."\n".
462        "Error Message:".$strError."\n";
463    exit(-1);
464    }
465}
466
4671;
Note: See TracBrowser for help on using the browser.