root/gsdl/trunk/perllib/downloaders/WgetDownload.pm @ 17800

Revision 17800, 18.3 KB (checked in by ak19, 12 years ago)

BugFix?: Fixed the position of call to last to break out of for loop inside useWget and useWgetMonitored subroutines, so that it now works on windows. This needed to be done because the calls to last had been added for making things still work on Linux, but the location of the calls was wrong on Windows and caused OAIDownloads to fail all of a sudden.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use BaseDownload;
35use strict;
36use Cwd;
37use IPC::Open2;
38use IO::Select;
39use IO::Socket;
40
41
42sub BEGIN {
43    @WgetDownload::ISA = ('BaseDownload');
44}
45
46my $arguments =
47     [ { 'name' => "proxy_on",
48    'desc' => "{WgetDownload.proxy_on}",
49    'type' => "flag",
50    'reqd' => "no",
51    'hiddengli' => "yes"},
52      { 'name' => "proxy_host", 
53    'desc' => "{WgetDownload.proxy_host}",         
54    'type' => "string",
55    'reqd' => "no",
56    'hiddengli' => "yes"},
57      { 'name' => "proxy_port",
58    'desc' => "{WgetDownload.proxy_port}",         
59    'type' => "string",
60    'reqd' => "no",
61    'hiddengli' => "yes"},
62      { 'name' => "user_name", 
63    'desc' => "{WgetDownload.user_name}",         
64    'type' => "string",
65    'reqd' => "no",
66    'hiddengli' => "yes"},
67      { 'name' => "user_password",
68    'desc' => "{WgetDownload.user_password}",         
69    'type' => "string",
70    'reqd' => "no",
71    'hiddengli' => "yes"}];
72
73my $options = { 'name'     => "WgetDownload",
74        'desc'     => "{WgetDownload.desc}",
75        'abstract' => "yes",
76        'inherits' => "yes",
77        'args'     => $arguments };
78
79
80# Declaring file global variables related to the wget child process so that
81# the termination signal handler for SIGTERM can close the streams and tidy
82# up before ending the child process.
83my $childpid;
84my ($chld_out, $chld_in);
85my ($serverSocket, $read_set);
86
87# The port this script's server socket will be listening on, to handle
88# incoming signals from GLI to terminate wget. This is also file global,
89# since OAIDownload.pm will make several calls on wget using the same
90# instance of this script and we want to reuse whatever port GLI gave us.
91my $port;
92
93# When this script is called from the command line, this handler will be called
94# if this process is killed or abruptly ends due to receiving one of the
95# terminating signals that this handler is registered to deal with.
96sub abrupt_end_handler {
97    my $termination_signal = shift (@_);
98
99    if(defined $childpid) {
100    close($chld_out);
101    close($chld_in);
102   
103    print STDOUT "Received termination signal: $termination_signal\n";
104
105    # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
106    # See http://perldoc.perl.org/perlipc.html#Signals
107    # Warning on using kill at http://perldoc.perl.org/perlfork.html
108    kill("TERM", $childpid);
109
110    # If the SIGTERM sent on Linux calls this handler, we want to make
111    # sure any socket connection is closed.
112    # Otherwise sockets are only used when this script is run from GLI
113    # in which case the handlers don't really get called.
114    if(defined $serverSocket) {
115        $read_set->remove($serverSocket) if defined $read_set;
116        close($serverSocket);
117    }
118    }
119
120    exit(0);
121}
122
123# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
124# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
125# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
126# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
127# http://affy.blogspot.com/p5be/ch13.htm
128# http://perldoc.perl.org/perlipc.html#Signals
129$SIG{'INT'} = \&abrupt_end_handler;
130$SIG{'TERM'} = \&abrupt_end_handler;
131
132sub new {
133    my ($class) = shift (@_);
134    my ($getlist,$inputargs,$hashArgOptLists) = @_;
135    push(@$getlist, $class);
136
137    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
138    push(@{$hashArgOptLists->{"OptList"}},$options);
139
140    my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
141
142    return bless $self, $class;
143}
144
145sub checkWgetSetup
146{
147    my ($self,$blnGliCall) = @_;
148    #TODO: proxy detection??
149   
150    if((!$blnGliCall) && $self->{'proxy_on'})
151    {
152    &checkProxySetup($self);
153    }
154    &checkURL($self);
155}
156
157sub getWgetOptions
158{
159    my ($self) = @_;
160    my $strOptions = "";
161   
162    if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'})
163    {
164
165    $strOptions .= " -e httpproxy=$self->{'proxy_host'}:$self->{'proxy_port'} ";
166
167    if ($self->{'user_name'} && $self->{'user_password'})
168    {
169        $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";
170    }
171    }
172
173    if ($self->{'proxy_on'}) {
174    $strOptions .= " --proxy ";
175    }
176
177    return $strOptions;
178}
179
180# Checking for proxy setup: proxy server, proxy port, proxy username and password.
181sub checkProxySetup
182{
183    my ($self) = @_;
184    ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
185    # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
186    # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
187
188}
189
190# Returns true if the wget status needs to be monitored through sockets
191# (if a socket is used to communicate with the Java program on when to
192# terminate wget). True if we are running gli, or if the particular type
193# of WgetDownload is *not* OAIDownload (in that case, the original way of
194# terminating the perl script from Java terminated wget as well).
195sub dealingWithSockets() {
196    my ($self) = @_;
197    return (defined $self->{'gli'} && $self->{'gli'} && !defined $port && ref($self) ne "OAIDownload");
198                       # use ref($self) to find the classname of an object
199}
200
201sub get_childpid {
202    my ($self, $pid) = @_;
203    my $os = $^O;
204    # for windows, we do nothing special
205    if ($os =~ m/mswin/i) {
206    return $pid;
207    }
208   
209    # else $os is macos or linux
210    # This means the wget child process may have been spawned from a subshell
211    # (the real child) that was launched by this perl script. However, we want
212    # the pid of the wget process, since that is what we want to terminate.
213
214    sleep(2); # give it some time to start up, else we'd have finished searching beforehand
215
216    # Look through any processes spawned immediately/soon after the subshell.
217    # Look for any pid greater than the subshell's, but within a certain limit (10)
218    my $child = $pid;
219    for(my $i = 1; $i <= 10; $i++) {
220    $child = $child+$i;
221    if(kill(0, $child)) { # this process exists
222        return $child;
223    }
224    }
225   
226    return $pid; # could not find any process within limit, so return the original process id?
227}
228
229sub useWget
230{
231    #local $| = 1; # autoflush stdout buffer
232    #print STDOUT "*** Start of subroutine useWget in $0\n";
233
234    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
235
236    my ($strReadIn,$strLine,$command);
237    $strReadIn = "" unless defined $strReadIn;
238
239    my $current_dir = cwd();
240    my $changed_dir = 0;
241    if (defined $working_dir && -e $working_dir) {
242    chdir "$working_dir";
243    $changed_dir = 1;
244    }
245
246    # When we are running this script through GLI, the SIGTERM signal handler
247    # won't get called on Windows when wget is to be prematurely terminated.
248    # Instead, when wget has to be terminated in the middle of execution, GLI will
249    # connect to a serverSocket here to communicate when it's time to stop wget.
250    if($self->dealingWithSockets()) {
251
252    $port = <STDIN>; # gets a port on localhost that's not yet in use
253    chomp($port);
254   
255    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
256                           LocalPort => $port,
257                           Listen    => 1,
258                           Reuse     => 1);
259   
260    die "can't setup server" unless $serverSocket;
261    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
262
263    $read_set = new IO::Select();         # create handle set for reading
264    $read_set->add($serverSocket);        # add the main socket to the set
265    }
266
267    my $wget_file_path = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
268    $command = "\"$wget_file_path\" $cmdWget 2>&1";
269    # print STDERR "Command is: $command\n";
270    $childpid = open2($chld_out, $chld_in, $command) || die "wget request failed: $!\n";
271   
272    # for linux the wget process starts off as achild of a subshell launched as the child of
273    # this perl script. We want the pid of the wget process, not that of the subshell.
274    $childpid = $self->get_childpid($childpid);
275
276    my $loop = 1;
277    while($loop)
278    {
279    if (defined(my $strLine=<$chld_out>)) { # we're reading in from child process' stdout
280        if($blnShow) {
281        print STDERR "$strLine\n";
282        }
283        $strReadIn .= $strLine;
284    }
285    else { # wget finished, terminate naturally
286        #print STDOUT "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
287        close($chld_in);
288        close($chld_out);
289        waitpid $childpid, 0;
290        $loop = 0;
291       
292        $childpid = undef;
293        if(defined $port) {
294        $read_set->remove($serverSocket);
295        close($serverSocket);
296        }
297    }
298
299    # if we run this script from the command-line (as opposed to from GLI),
300    # then we're not working with sockets and can therefore can skip the next bits
301    next unless(defined $port);
302
303    # http://www.perlfect.com/articles/select.shtml
304    # "multiplex between several filehandles within a single thread of control,
305    # thus creating the effect of parallelism in the handling of I/O."
306    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
307
308    # take all readable handles in turn
309    foreach my $rh (@rh_set) {
310        if($rh == $serverSocket) {
311        my $client = $rh->accept();
312        #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
313        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
314       
315        # Read from the client (getting rid of the trailing newline)
316        # Has the client sent the <<STOP>> signal?
317        my $signal = <$client>;
318        chomp($signal);
319        if($signal eq "<<STOP>>") {
320            print $client "Perl received STOP signal (on port $port): stopping wget\n";
321            $loop = 0;                          # out of outer while loop
322            $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
323           
324            # Sometimes the wget process takes some time to start up. If the STOP signal
325            # was sent, don't try to terminate the process until we know it is running.
326            # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
327            # for it to start up, checking for whether it is running in order to kill it.
328            for(my $seconds = 1; $seconds <= 5; $seconds++) {
329            if(kill(0, $childpid)) {
330                # If kill(0, $childpid) returns true, then the process is running
331                # and we need to kill it.
332                close($chld_in);
333                close($chld_out);
334                kill("TERM", $childpid);
335               
336                $childpid = undef;
337               
338                # Stop monitoring the read_handle and close the serverSocket
339                # (the Java end will close the client socket that Java opened)
340                $read_set->remove($rh);     #$read_set->remove($serverSocket);
341                close($rh);             #close($serverSocket);
342                print $client "Perl terminated wget and is about to exit\n";
343            }
344            else { # the process may just be starting up, wait
345                sleep(1);
346            }
347            last;                           # out of inner for loop
348            }
349            last;                               # out of foreach loop
350        }
351        }
352    }
353    }
354
355    if ($changed_dir) {
356    chdir $current_dir;
357    }
358   
359    return $strReadIn;
360}
361
362
363sub useWgetMonitored
364{
365    #local $| = 1; # autoflush stdout buffer
366    #print STDOUT "*** Start of subroutine useWgetMonitored in $0\n";
367
368    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
369
370
371    my $current_dir = cwd();
372    my $changed_dir = 0;
373    if (defined $working_dir && -e $working_dir) {
374    chdir "$working_dir";
375    $changed_dir = 1;
376    }
377
378    # When we are running this script through GLI, the SIGTERM signal handler
379    # won't get called on Windows when wget is to be prematurely terminated.
380    # Instead, when wget has to be terminated in the middle of execution, GLI will
381    # connect to a serverSocket here to communicate when it's time to stop wget.
382    if($self->dealingWithSockets()) {
383
384    $port = <STDIN>; # gets a port on localhost that's not yet in use
385    chomp($port);
386   
387    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
388                           LocalPort => $port,
389                           Listen    => 1,
390                           Reuse     => 1);
391   
392    die "can't setup server" unless $serverSocket;
393    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
394
395    $read_set = new IO::Select();         # create handle set for reading
396    $read_set->add($serverSocket);        # add the main socket to the set
397    }
398
399    my $wget_file_path = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
400    #my $command = "\"$wget_file_path\" $cmdWget 2>&1 |"; #open(*WIN,$command) || die "wget request failed: $!\n";
401    my $command = "\"$wget_file_path\" $cmdWget 2>&1";
402    # print STDERR "Command is: $command\n";
403    $childpid = open2($chld_out, $chld_in, $command) || die "wget request failed: $!\n";
404
405    # for linux the wget process starts off as achild of a subshell launched as the child of
406    # this perl script. We want the pid of the wget process, not that of the subshell.
407    $childpid = $self->get_childpid($childpid);
408
409    my $full_text = "";
410    my $error_text = "";
411    my @follow_list = ();
412    my $line;
413
414    my $loop = 1;
415    while($loop)
416    {
417    if (defined($line=<$chld_out>)) { # we're reading in from child process' stdout
418        if((defined $blnShow) && $blnShow)
419        {
420        print STDERR "$line";
421        }
422       
423        if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
424        my $follow_url = $1;
425        push(@follow_list,$follow_url);
426        }
427       
428        if ($line =~ m/ERROR\s+\d+/) {
429        $error_text .= $line;
430        }
431       
432        $full_text .= $line;
433    }
434    else { # wget finished, terminate naturally
435        #print STDERR "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
436        close($chld_in);
437        close($chld_out);
438        # Program terminates only when the following line is included
439        # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
440        # it prevents the child from turning into a "zombie process".
441        # While the wget process terminates without it, this perl script does not:
442        # the DOS prompt is not returned without it.
443        waitpid $childpid, 0;
444        $loop = 0;
445       
446        $childpid = undef;
447        if(defined $port) {
448        $read_set->remove($serverSocket);
449        close($serverSocket);
450        }
451    }
452
453    # if we run this script from the command-line (as opposed to from GLI),
454    # then we're not working with sockets and can therefore can skip the next bits
455    next unless(defined $port);
456
457    # http://www.perlfect.com/articles/select.shtml
458    # "multiplex between several filehandles within a single thread of control,
459    # thus creating the effect of parallelism in the handling of I/O."
460    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
461
462    # take all readable handles in turn
463    foreach my $rh (@rh_set) {
464        if($rh == $serverSocket) {
465        my $client = $rh->accept();
466        #$client->autoflush(1); # autoflush output buffer - don't put this back in: splits output irregularly over multilines
467        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
468       
469        # Read from the client (getting rid of trailing newline)
470        # Has the client sent the <<STOP>> signal?
471        my $signal = <$client>;
472        chomp($signal);
473        if($signal eq "<<STOP>>") {
474            print $client "Perl received STOP signal (on port $port): stopping wget\n";
475            $loop = 0;                          # out of outer while loop
476            $self->{'forced_quit'} = 1;         # subclasses need to know we're quitting
477           
478            # Sometimes the wget process takes some time to start up. If the STOP signal
479            # was sent, don't try to terminate the process until we know it is running.
480            # Otherwise wget may start running after we tried to kill it. Wait 5 seconds
481            # for it to start up, checking for whether it is running in order to kill it.
482            for(my $seconds = 1; $seconds <= 5; $seconds++) {
483            if(kill(0, $childpid)) {
484                # If kill(0, $childpid) returns true, then the process is running
485                # and we need to kill it.
486                close($chld_in);
487                close($chld_out);
488                kill("TERM", $childpid);
489               
490                $childpid = undef;
491               
492                # Stop monitoring the read_handle and close the serverSocket
493                # (the Java end will close the client socket that Java opened)
494                $read_set->remove($rh);     #$read_set->remove($serverSocket);
495                close($rh);             #close($serverSocket);
496                print $client "Perl terminated wget and is about to exit\n";
497            }
498            else { # the process may just be starting up, wait
499                sleep(1);
500            }
501            last;                           # out of inner for loop
502            }
503            last;                               # out of foreach loop
504        }
505        }
506    }
507    }
508
509    my $command_status = $?;
510    if ($command_status != 0) {
511    $error_text .= "Exit error: $command_status";
512    }
513
514    if ($changed_dir) {
515    chdir $current_dir;
516    }
517   
518    my $final_follow = pop(@follow_list); # might be undefined, but that's OK
519   
520    return ($full_text,$error_text,$final_follow);
521}
522
523
524# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
525sub checkURL
526{
527    my ($self) = @_;
528    if ($self->{'url'} eq "")
529    {
530    &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
531    }
532}
533
534sub error
535{
536    my ($strFunctionName,$strError) = @_;
537    {
538    print "Error occoured in WgetDownload.pm\n".
539        "In Function:".$strFunctionName."\n".
540        "Error Message:".$strError."\n";
541    exit(-1);
542    }
543}
544
5451;
Note: See TracBrowser for help on using the browser.