root/gsdl/trunk/perllib/downloaders/WgetDownload.pm @ 17531

Revision 17531, 11.8 KB (checked in by ak19, 11 years ago)

Now works with OAIDownload.pm for downloading over OAI. The variable port (the port at which this script's serversocket is to listen) is now made global to the file because OAIDownload.pm will call the subroutine usewget several times (launching a wget several times) per instance of this WgetDownload?.pm script. This means that the port number needs to be stored for reuse as it will not be sent over and over again by GLI.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# WgetDownload.pm -- Download base module that handles calling Wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WgetDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use BaseDownload;
35use strict;
36use Cwd;
37use IPC::Open2;
38use IO::Select;
39use IO::Socket;
40
41
42sub BEGIN {
43    @WgetDownload::ISA = ('BaseDownload');
44}
45
46my $arguments =
47     [ { 'name' => "proxy_on",
48    'desc' => "{WgetDownload.proxy_on}",
49    'type' => "flag",
50    'reqd' => "no",
51    'hiddengli' => "yes"},
52      { 'name' => "proxy_host", 
53    'desc' => "{WgetDownload.proxy_host}",         
54    'type' => "string",
55    'reqd' => "no",
56    'hiddengli' => "yes"},
57      { 'name' => "proxy_port",
58    'desc' => "{WgetDownload.proxy_port}",         
59    'type' => "string",
60    'reqd' => "no",
61    'hiddengli' => "yes"},
62      { 'name' => "user_name", 
63    'desc' => "{WgetDownload.user_name}",         
64    'type' => "string",
65    'reqd' => "no",
66    'hiddengli' => "yes"},
67      { 'name' => "user_password",
68    'desc' => "{WgetDownload.user_password}",         
69    'type' => "string",
70    'reqd' => "no",
71    'hiddengli' => "yes"}];
72
73my $options = { 'name'     => "WgetDownload",
74        'desc'     => "{WgetDownload.desc}",
75        'abstract' => "yes",
76        'inherits' => "yes",
77        'args'     => $arguments };
78
79
80# Declaring file global variables related to the wget child process so that
81# the termination signal handler for SIGTERM can close the streams and tidy
82# up before ending the child process.
83my $childpid;
84my ($chld_out, $chld_in);
85my ($serverSocket, $read_set);
86
87# The port this script's server socket will be listening on, to handle
88# incoming signals from GLI to terminate wget. This is also file global,
89# since OAIDownload.pm will make several calls on wget using the same
90# instance of this script and we want to reuse whatever port GLI gave us.
91my $port;
92
93# When this script is called from the command line, this handler will be called
94# if this process is killed or abruptly ends due to receiving one of the
95# terminating signals that this handler is registered to deal with.
96sub abrupt_end_handler {
97    my $termination_signal = shift (@_);
98    if(defined $childpid) {
99    close($chld_out);
100    close($chld_in);
101   
102    #print STDOUT "Received termination signal: $termination_signal\n";
103
104    # Send TERM signal to child process to terminate it. Sending the INT signal doesn't work
105    # See http://perldoc.perl.org/perlipc.html#Signals
106    # Warning on using kill at http://perldoc.perl.org/perlfork.html
107    kill("TERM", $childpid);
108
109    # If the SIGTERM sent on Linux calls this handler, we want to make
110    # sure any socket connection is closed.
111    # Otherwise sockets are only used when this script is run from GLI
112    # in which case the handlers don't really get called.
113    if(defined $serverSocket) {
114        $read_set->remove($serverSocket) if defined $read_set;
115        close($serverSocket);
116    }
117    }
118
119    exit(0);
120}
121
122# Registering a handler for when termination signals SIGINT and SIGTERM are received to stop
123# the wget child process. SIGTERM--generated by Java's Process.destroy()--is the default kill
124# signal (kill -15) on Linux, while SIGINT is generated upon Ctrl-C (also on Windows).
125# Note that SIGKILL can't be handled as the handler won't get called for it. More information:
126# http://affy.blogspot.com/p5be/ch13.htm
127# http://perldoc.perl.org/perlipc.html#Signals
128$SIG{'INT'} = \&abrupt_end_handler;
129$SIG{'TERM'} = \&abrupt_end_handler;
130
131sub new {
132    my ($class) = shift (@_);
133    my ($getlist,$inputargs,$hashArgOptLists) = @_;
134    push(@$getlist, $class);
135
136    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
137    push(@{$hashArgOptLists->{"OptList"}},$options);
138
139    my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
140
141    return bless $self, $class;
142}
143
144sub checkWgetSetup
145{
146    my ($self,$blnGliCall) = @_;
147    #TODO: proxy detection??
148   
149    if((!$blnGliCall) && $self->{'proxy_on'})
150    {
151    &checkProxySetup($self);
152    }
153    &checkURL($self);
154}
155
156sub getWgetOptions
157{
158    my ($self) = @_;
159    my $strOptions = "";
160   
161    if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'})
162    {
163
164    $strOptions .= " -e httpproxy=$self->{'proxy_host'}:$self->{'proxy_port'} ";
165
166    if ($self->{'user_name'} && $self->{'user_password'})
167    {
168        $strOptions .= "--proxy-user=$self->{'user_name'}"." --proxy-passwd=$self->{'user_password'}";
169    }
170    }
171
172    if ($self->{'proxy_on'}) {
173    $strOptions .= " --proxy ";
174    }
175
176    return $strOptions;
177}
178
179# Checking for proxy setup: proxy server, proxy port, proxy username and password.
180sub checkProxySetup
181{
182    my ($self) = @_;
183    ($self->{'proxy_on'}) || &error("checkProxySetup","The proxy is not on? How could that be happening?");
184    # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'}
185    # Test if the connection is successful. If the connection wasn't successful then ask user to supply username and password.
186
187}
188
189sub useWget
190{
191    #local $| = 1; # autoflush stdout buffer
192    #print STDOUT "*** Start of subroutine useWget in $0\n";
193
194    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
195
196    my ($strReadIn,$strLine,$command);
197    $strReadIn = "" unless defined $strReadIn;
198
199    my $current_dir = cwd();
200    my $changed_dir = 0;
201    if (defined $working_dir && -e $working_dir) {
202    chdir "$working_dir";
203    $changed_dir = 1;
204    }
205
206    # When we are running this script through GLI, the SIGTERM signal handler
207    # won't get called on Windows when wget is to be prematurely terminated.
208    # Instead, when wget has to be terminated in the middle of execution, GLI will
209    # connect to a serverSocket here to communicate when it's time to stop wget.
210    if(defined $self->{'gli'} && $self->{'gli'} && !defined $port) {
211
212    $port = <STDIN>; # gets a port on localhost that's not yet in use
213    chomp($port);
214   
215    $serverSocket = IO::Socket::INET->new( Proto     => 'tcp',
216                           LocalPort => $port,
217                           Listen    => 1,
218                           Reuse     => 1);
219   
220    die "can't setup server" unless $serverSocket;
221    #print STDOUT "[Serversocket $0 accepting clients at port $port]\n";
222
223    $read_set = new IO::Select();         # create handle set for reading
224    $read_set->add($serverSocket);        # add the main socket to the set
225    }
226
227    my $wget_file_path = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
228    $command = "\"$wget_file_path\" $cmdWget 2>&1";
229    # print STDERR "Command is: $command\n";
230    $childpid = open2($chld_out, $chld_in, $command) || die "wget request failed: $!\n";
231
232    my $loop = 1;
233    while($loop)
234    {
235    if (defined(my $strLine=<$chld_out>)) { # we're reading in from child process' stdout
236        if($blnShow) {
237        print STDERR "$strLine\n";
238        }
239        $strReadIn .= $strLine;
240    }
241    else { # wget finished, terminate naturally
242        #print STDERR "\nPerl: open2 command, input stream closed. Wget terminated naturally.\n";
243        close($chld_in);
244        close($chld_out);
245        waitpid $childpid, 0;
246        $loop = 0;
247       
248        $childpid = undef;
249        if(defined $port) {
250        $read_set->remove($serverSocket);
251        close($serverSocket);
252        }
253    }
254
255    # if we run this script from the command-line (as opposed to from GLI),
256    # then we're not working with sockets and can therefore can skip the next bits
257    next unless(defined $port);
258
259    # http://www.perlfect.com/articles/select.shtml
260    # "multiplex between several filehandles within a single thread of control,
261    # thus creating the effect of parallelism in the handling of I/O."
262    my @rh_set = $read_set->can_read(0.002); # every 2 ms check if there's a client socket connecting
263
264    # take all readable handles in turn
265    foreach my $rh (@rh_set) {
266        if($rh == $serverSocket) {
267        my $client = $rh->accept();
268        #$client->autoflush(1); # autoflush output buffer - don't put this back in: output split irregularly over lines
269        print $client "Talked to ServerSocket (port $port). Connection accepted\n";
270       
271        # Read from the client (getting rid of trailing newline)
272        # Has the client sent the <<STOP>> signal?
273        my $signal = <$client>;
274        chomp($signal);
275        if($signal eq "<<STOP>>") {
276            print $client "Perl received STOP signal (on port $port): stopping wget\n";
277           
278            $loop = 0;
279            close($chld_in);
280            close($chld_out);
281            kill("TERM", $childpid);
282           
283            $childpid = undef;
284
285            # Stop monitoring the read_handle
286            # close the serverSocket (the Java end will close the client socket that Java opened)
287            $read_set->remove($rh); #$read_set->remove($serverSocket);
288            close($rh);         #close($serverSocket);
289            #print $client "Perl is about to exit\n";
290            last;
291        }
292        }
293    }
294    }
295
296    if ($changed_dir) {
297    chdir $current_dir;
298    }
299   
300    return $strReadIn;
301}
302
303
304sub useWgetMonitored
305{
306    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
307
308
309    my $current_dir = cwd();
310    my $changed_dir = 0;
311    if (defined $working_dir && -e $working_dir) {
312    chdir "$working_dir";
313    $changed_dir = 1;
314    }
315    my $wget_file_path = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
316    #my $command = "\"$wget_file_path\" $cmdWget 2>&1 |"; #open(*WIN,$command) || die "wget request failed: $!\n";
317###    print STDERR "**** wget cmd = $command\n";
318    #open(*WIN,$command) || die "wget request failed: $!\n";
319
320    my $command = "\"$wget_file_path\" $cmdWget";
321    $childpid = open2($chld_out, $chld_in, $command) || die "wget request failed: $!\n";
322
323    my $full_text = "";
324    my $error_text = "";
325    my @follow_list = ();
326    my $line;
327
328    while (defined($line=<$chld_out>)) # we're reading in from child process' stdout
329    {
330    if((defined $blnShow) && $blnShow)
331    {
332        print STDERR "$line";
333    }
334
335    if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
336        my $follow_url = $1;
337        push(@follow_list,$follow_url);
338    }
339
340    if ($line =~ m/ERROR\s+\d+/) {
341        $error_text .= $line;
342    }
343
344    $full_text .= $line;
345    }
346
347    close($chld_in);
348    close($chld_out);
349
350    # Program terminates only when the following line is included
351    # http://perldoc.perl.org/IPC/Open2.html explains why this is necessary
352    # it prevents the child from turning into a "zombie process".
353    # While the wget process terminates without it, this perl script does not:
354    # the DOS prompt is not returned without it.
355    waitpid $childpid, 0;
356
357    my $command_status = $?;
358    if ($command_status != 0) {
359    $error_text .= "Exit error: $command_status";
360    }
361
362    if ($changed_dir) {
363    chdir $current_dir;
364    }
365   
366    my $final_follow = pop(@follow_list); # might be undefined, but that's OK
367   
368    return ($full_text,$error_text,$final_follow);
369}
370
371
372# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
373sub checkURL
374{
375    my ($self) = @_;
376    if ($self->{'url'} eq "")
377    {
378    &error("checkURL","no URL is specified!! Please specifies the URL for downloading.");
379    }
380}
381
382sub error
383{
384    my ($strFunctionName,$strError) = @_;
385    {
386    print "Error occoured in WgetDownload.pm\n".
387        "In Function:".$strFunctionName."\n".
388        "Error Message:".$strError."\n";
389    exit(-1);
390    }
391}
392
3931;
Note: See TracBrowser for help on using the browser.