root/gsdl/trunk/perllib/downloaders/WgetDownload.pm @ 17795

Revision 17795, 18.3 KB (checked in by ak19, 12 years ago)

Bugfix: On Linux, perl launches a subshell (the child process of the perl script) and THAT launches wget - so need to find the pid of wget in order to allow the user of GLI terminate it. This is because we absolutely have to use 2>&1 (otherwise wget blocks until it has finished). Also have tested that the cmdline version of downloadfrom.pl still terminates wget properly when fed Ctrl-C (SIGINT).

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use BaseDownload;
35use strict;
36use Cwd;
37use IPC::Open2;
38use IO::Select;
39use IO::Socket;
40
41
42sub BEGIN {
43    @WgetDownload::ISA = ('BaseDownload');
44}
45
46my $arguments =
47     [ { 'name' => "proxy_on",
48    'desc' => "{WgetDownload.proxy_on}",
49    'type' => "flag",
50    'reqd' => "no",
51    'hiddengli' => "yes"},
52      { 'name' => "proxy_host", 
53    'desc' => "{WgetDownload.proxy_host}",         
54    'type' => "string",
55    'reqd' => "no",
56    'hiddengli' => "yes"},
57      { 'name' => "proxy_port",
58    'desc' => "{WgetDownload.proxy_port}",         
59    'type' => "string",
60    'reqd' => "no",
61    'hiddengli' => "yes"},
62      { 'name' => "user_name", 
63    'desc' => "{WgetDownload.user_name}",         
64    'type' => "string",
65    'reqd' => "no",
66    'hiddengli' => "yes"},
67      { 'name' => "user_password",
68    'desc' => "{WgetDownload.user_password}",         
69    'type' => "string",
70    'reqd' => "no",
71    'hiddengli' => "yes"}];
72
73my $options = { 'name'     => "WgetDownload",
74        'desc'     => "{WgetDownload.desc}",
75        'abstract' => "yes",
76        'inherits' => "yes",
77        'args'     => $arguments };
78
79
80# Declaring file global variables related to the wget child process so that
81# the termination signal handler for SIGTERM can close the streams and tidy
82# up before ending the child process.
83my $childpid;
84my ($chld_out, $chld_in);
85my ($serverSocket, $read_set);
86
87# The port this script's server socket will be listening on, to handle
88# incoming signals from GLI to terminate wget. This is also file global,
89# since OAIDownload.pm will make several calls on wget using the same
90# instance of this script and we want to reuse whatever port GLI gave us.
91my $port;
92
93# When this script is called from the command line, this handler will be called
94# if this process is killed or abruptly ends due to receiving one of the
95# terminating signals that this handler is registered to deal with.
96sub abrupt_end_handler {
97    my $termination_signal = shift (@_);
98
99    if(defined $childpid) {
100    close($chld_out);
101    close($chld_in);
102   
103    print STDOUT "Received termination signal: $termination_signal\n";
104
105    # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
106    # See http://perldoc.perl.org/perlipc.html#Signals
107    # Warning on using kill at http://perldoc.perl.org/perlfork.html
108    kill("TERM", $childpid);
109
110    # If the SIGTERM sent on Linux calls this handler, we want to make
111    # sure any socket connection is closed.
112    # Otherwise sockets are only used when this script is run from GLI
113    # in which case the handlers don't really get called.
114    if(defined $serverSocket) {
115        $read_set->remove($serverSocket) if defined $read_set;
116        close($serverSocket);
117    }
118    }
119
120    exit(0);
121}
122
123# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
124# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
125# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
126# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
127# http://affy.blogspot.com/p5be/ch13.htm
128# http://perldoc.perl.org/perlipc.html#Signals
129$SIG{'INT'} = \&abrupt_end_handler;
130$SIG{'TERM'} = \&abrupt_end_handler;
131
132sub new {
133    my ($class) = shift (@_);
134    my ($getlist,$inputargs,$hashArgOptLists) = @_;
135    push(@$getlist, $class);
136
137    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
138    push(@{$hashArgOptLists->{"OptList"}},$options);
139
140    my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
141
142    return bless $self, $class;
143}
144
145sub checkWgetSetup
146{
147    my ($self,$blnGliCall) = @_;
148    #TODO: proxy detection??
149   
150    if((!$blnGliCall) && $self->{'proxy_on'})
151    {
152    &checkProxySetup($self);
153    }
154    &checkURL($self);
155}
156
157sub getWgetOptions
158{
159    my ($self) = @_;
160    my $strOptions = "";
161   
162    if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'})
163    {
164
165    $strOptions .= " -e httpproxy=$self->{'proxy_host'}:$self->{'proxy_port'} ";
166
167    if ($self->{'user_name'} && $self->{'user_password'})
168    {
169        $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";
170    }
171    }
172
173    if ($self->{'proxy_on'}) {
174    $strOptions .= " --proxy ";
175    }
176
177    return $strOptions;
178}
179
180# Checking for proxy setup: proxy server, proxy port, proxy username and password.
181sub checkProxySetup
182{
183    my ($self) = @_;
184    ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
185    # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
186    # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
187
188}
189
190# Returns true if the wget status needs to be monitored through sockets
191# (if a socket is used to communicate with the Java program on when to
192# terminate wget). True if we are running gli, or if the particular type
193# of WgetDownload is *not* OAIDownload (in that case, the original way of
194# terminating the perl script from Java terminated wget as well).
195sub dealingWithSockets() {
196    my ($self) = @_;
197    return (defined $self->{'gli'} && $self->{'gli'} && !defined $port && ref($self) ne "OAIDownload");
198                       # use ref($self) to find the classname of an object
199}
200
201sub get_childpid {
202    my ($self, $pid) = @_;
203    my $os = $^O;
204    # for windows, we do nothing special
205    if ($os =~ m/mswin/i) {
206    return $pid;
207    }
208   
209    # else $os is macos or linux
210    # This means the wget child process may have been spawned from a subshell
211    # (the real child) that was launched by this perl script. However, we want
212    # the pid of the wget process, since that is what we want to terminate.
213
214    sleep(2); # give it some time to start up, else we'd have finished searching beforehand
215
216    # Look through any processes spawned immediately/soon after the subshell.
217    # Look for any pid greater than the subshell's, but within a certain limit (10)
218    my $child = $pid;
219    for(my $i = 1; $i <= 10; $i++) {
220    $child = $child+$i;
221    if(kill(0, $child)) { # this process exists
222        return $child;
223    }
224    }
225   
226    return $pid; # could not find any process within limit, so return the original process id?
227}
228
229sub useWget
230{
231    #local $| = 1; # autoflush stdout buffer
232    #print STDOUT "*** Start of subroutine useWget in $0\n";
233
234    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
235
236    my ($strReadIn,$strLine,$command);
237    $strReadIn = "" unless defined $strReadIn;
238
239    my $current_dir = cwd();
240    my $changed_dir = 0;
241    if (defined $working_dir && -e $working_dir) {
242    chdir "$working_dir";
243    $changed_dir = 1;
244    }
245
246    # When we are running this script through GLI, the SIGTERM signal handler
247    # won't get called on Windows when wget is to be prematurely terminated.
248    # Instead, when wget has to be terminated in the middle of execution, GLI will
249    # connect to a serverSocket here to communicate when it's time to stop wget.
250    if($self->dealingWithSockets()) {
251
252    $port = <STDIN>; # gets a port on localhost that's not yet in use
253    chomp($port);
254   
255    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
256                           LocalPort => $port,
257                           Listen    => 1,
258                           Reuse     => 1);
259   
260    die "can't setup server" unless $serverSocket;
261    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
262
263    $read_set = new IO::Select();         # create handle set for reading
264    $read_set->add($serverSocket);        # add the main socket to the set
265    }
266
267    my $wget_file_path = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
268    $command = "\"$wget_file_path\" $cmdWget 2>&1";
269    # print STDERR "Command is: $command\n";
270    $childpid = open2($chld_out, $chld_in, $command) || die "wget request failed: $!\n";
271   
272    # for linux the wget process starts off as achild of a subshell launched as the child of
273    # this perl script. We want the pid of the wget process, not that of the subshell.
274    $childpid = $self->get_childpid($childpid);
275
276    my $loop = 1;
277    while($loop)
278    {
279    if (defined(my $strLine=<$chld_out>)) { # we're reading in from child process' stdout
280        if($blnShow) {
281        print STDERR "$strLine\n";
282        }
283        $strReadIn .= $strLine;
284    }
285    else { # wget finished, terminate naturally
286        #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
287        close($chld_in);
288        close($chld_out);
289        waitpid $childpid, 0;
290        $loop = 0;
291       
292        $childpid = undef;
293        if(defined $port) {
294        $read_set->remove($serverSocket);
295        close($serverSocket);
296        }
297    }
298
299    # if we run this script from the command-line (as opposed to from GLI),
300    # then we're not working with sockets and can therefore can skip the next bits
301    next unless(defined $port);
302
303    # http://www.perlfect.com/articles/select.shtml
304    # "multiplex between several filehandles within a single thread of control,
305    # thus creating the effect of parallelism in the handling of I/O."
306    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
307
308    # take all readable handles in turn
309    foreach my $rh (@rh_set) {
310        if($rh == $serverSocket) {
311        my $client = $rh->accept();
312        #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
313        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
314       
315        # Read from the client (getting rid of the trailing newline)
316        # Has the client sent the <<STOP>> signal?
317        my $signal = <$client>;
318        chomp($signal);
319        if($signal eq "<<STOP>>") {
320            print $client "Perl received STOP signal (on port $port): stopping wget\n";
321            $loop = 0;                          # out of outer while loop
322            $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
323           
324            # Sometimes the wget process takes some time to start up. If the STOP signal
325            # was sent, don't try to terminate the process until we know it is running.
326            # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
327            # for it to start up, checking for whether it is running in order to kill it.
328            for(my $seconds = 1; $seconds <= 5; $seconds++) {
329            if(kill(0, $childpid)) {
330                # If kill(0, $childpid) returns true, then the process is running
331                # and we need to kill it.
332                close($chld_in);
333                close($chld_out);
334                kill("TERM", $childpid);
335               
336                $childpid = undef;
337               
338                # Stop monitoring the read_handle and close the serverSocket
339                # (the Java end will close the client socket that Java opened)
340                $read_set->remove($rh);     #$read_set->remove($serverSocket);
341                close($rh);             #close($serverSocket);
342                print $client "Perl terminated wget and is about to exit\n";
343                last;                       # out of inner for loop
344            }
345            else { # the process may just be starting up, wait
346                sleep(1);
347            }
348            }
349            last;                               # out of foreach loop
350        }
351        }
352    }
353    }
354
355    if ($changed_dir) {
356    chdir $current_dir;
357    }
358   
359    return $strReadIn;
360}
361
362
363sub useWgetMonitored
364{
365    #local $| = 1; # autoflush stdout buffer
366    #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
367
368    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
369
370
371    my $current_dir = cwd();
372    my $changed_dir = 0;
373    if (defined $working_dir && -e $working_dir) {
374    chdir "$working_dir";
375    $changed_dir = 1;
376    }
377
378    # When we are running this script through GLI, the SIGTERM signal handler
379    # won't get called on Windows when wget is to be prematurely terminated.
380    # Instead, when wget has to be terminated in the middle of execution, GLI will
381    # connect to a serverSocket here to communicate when it's time to stop wget.
382    if($self->dealingWithSockets()) {
383
384    $port = <STDIN>; # gets a port on localhost that's not yet in use
385    chomp($port);
386   
387    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
388                           LocalPort => $port,
389                           Listen    => 1,
390                           Reuse     => 1);
391   
392    die "can't setup server" unless $serverSocket;
393    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
394
395    $read_set = new IO::Select();         # create handle set for reading
396    $read_set->add($serverSocket);        # add the main socket to the set
397    }
398
399    my $wget_file_path = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
400    #my $command = "\"$wget_file_path\" $cmdWget 2>&1 |"; #open(*WIN,$command) || die "wget request failed: $!\n";
401    my $command = "\"$wget_file_path\" $cmdWget 2>&1";
402    # print STDERR "Command is: $command\n";
403    $childpid = open2($chld_out, $chld_in, $command) || die "wget request failed: $!\n";
404
405    # for linux the wget process starts off as achild of a subshell launched as the child of
406    # this perl script. We want the pid of the wget process, not that of the subshell.
407    $childpid = $self->get_childpid($childpid);
408
409    my $full_text = "";
410    my $error_text = "";
411    my @follow_list = ();
412    my $line;
413
414    my $loop = 1;
415    while($loop)
416    {
417    if (defined($line=<$chld_out>)) { # we're reading in from child process' stdout
418        if((defined $blnShow) && $blnShow)
419        {
420        print STDERR "$line";
421        }
422       
423        if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
424        my $follow_url = $1;
425        push(@follow_list,$follow_url);
426        }
427       
428        if ($line =~ m/ERROR\s+\d+/) {
429        $error_text .= $line;
430        }
431       
432        $full_text .= $line;
433    }
434    else { # wget finished, terminate naturally
435        #print STDERR "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
436        close($chld_in);
437        close($chld_out);
438        # Program terminates only when the following line is included
439        # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
440        # it prevents the child from turning into a "zombie process".
441        # While the wget process terminates without it, this perl script does not:
442        # the DOS prompt is not returned without it.
443        waitpid $childpid, 0;
444        $loop = 0;
445       
446        $childpid = undef;
447        if(defined $port) {
448        $read_set->remove($serverSocket);
449        close($serverSocket);
450        }
451    }
452
453    # if we run this script from the command-line (as opposed to from GLI),
454    # then we're not working with sockets and can therefore can skip the next bits
455    next unless(defined $port);
456
457    # http://www.perlfect.com/articles/select.shtml
458    # "multiplex between several filehandles within a single thread of control,
459    # thus creating the effect of parallelism in the handling of I/O."
460    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
461
462    # take all readable handles in turn
463    foreach my $rh (@rh_set) {
464        if($rh == $serverSocket) {
465        my $client = $rh->accept();
466        #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
467        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
468       
469        # Read from the client (getting rid of trailing newline)
470        # Has the client sent the <<STOP>> signal?
471        my $signal = <$client>;
472        chomp($signal);
473        if($signal eq "<<STOP>>") {
474            print $client "Perl received STOP signal (on port $port): stopping wget\n";
475            $loop = 0;                          # out of outer while loop
476            $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
477           
478            # Sometimes the wget process takes some time to start up. If the STOP signal
479            # was sent, don't try to terminate the process until we know it is running.
480            # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
481            # for it to start up, checking for whether it is running in order to kill it.
482            for(my $seconds = 1; $seconds <= 5; $seconds++) {
483            if(kill(0, $childpid)) {
484                # If kill(0, $childpid) returns true, then the process is running
485                # and we need to kill it.
486                close($chld_in);
487                close($chld_out);
488                kill("TERM", $childpid);
489               
490                $childpid = undef;
491               
492                # Stop monitoring the read_handle and close the serverSocket
493                # (the Java end will close the client socket that Java opened)
494                $read_set->remove($rh);     #$read_set->remove($serverSocket);
495                close($rh);             #close($serverSocket);
496                print $client "Perl terminated wget and is about to exit\n";
497                last;                       # out of inner for loop
498            }
499            else { # the process may just be starting up, wait
500                sleep(1);
501            }
502            }
503            last;                               # out of foreach loop
504        }
505        }
506    }
507    }
508
509    my $command_status = $?;
510    if ($command_status != 0) {
511    $error_text .= "Exit error: $command_status";
512    }
513
514    if ($changed_dir) {
515    chdir $current_dir;
516    }
517   
518    my $final_follow = pop(@follow_list); # might be undefined, but that's OK
519   
520    return ($full_text,$error_text,$final_follow);
521}
522
523
524# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
525sub checkURL
526{
527    my ($self) = @_;
528    if ($self->{'url'} eq "")
529    {
530    &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
531    }
532}
533
534sub error
535{
536    my ($strFunctionName,$strError) = @_;
537    {
538    print "Error occoured in WgetDownload.pm\n".
539        "In Function:".$strFunctionName."\n".
540        "Error Message:".$strError."\n";
541    exit(-1);
542    }
543}
544
5451;
Note: See TracBrowser for help on using the browser.