root/main/trunk/greenstone2/perllib/downloaders/WebDownload.pm @ 31920

Revision 31920, 7.8 KB (checked in by ak19, 2 years ago)

Untested on Windows as yet. 1. Major overhaul to WgetDownload's useWget() and useWgetMonitored() subroutines. Their use of open3 was wrong and would cause blocking if the proxy was set wrong, or if https_proxy was not set (or set wrong) and the URL entered was http but resolved to https. The problem was more fundamental than the symptoms indicated: the open3() calls were used incorrectly and resulted in blocking, which could be indefinite. To generally avoid blocking, we needed to use IO::Select to loop, checking for any child streams that are ready. To avoid possibly indefinite blocking, we needed to use IO::Select with a timeout on the can_read() method. The need for all of this, and its use, is explained in the links added to the committed version of this module. 2. After the use of select() worked in principle, there was still the large problem that terminating unnaturally did not stop a second wget that had been launched. This unexpectedly had to do with the double quotes around wget's path, which attempted to preserve any spaces in the path but behaved differently with open3(): any double quotes caused a subshell to be launched to run the command passed to open3(). The wget command launched by the subshell wasn't actually a child process, so it could not be terminated via the parent pid used as a process-group pid when doing the kill TERM -processgroupid. The solution lay with the unexpected cause of the problem, which was the double quotes. Now the command passed to open3() is an array of parameters with no double quotes; the array is meant to preserve spaces in any filepaths. 3. Removed the 2-tries parameter passed to wget, since we now loop a certain number of times, trying to read from the child process' streams, each time this times out. If it times out n times, then we give up and assume that the URL could not be read.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
# WebDownload.pm -- web-page downloader (drives wget via WgetDownload)
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WebDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use WgetDownload;
35
36sub BEGIN {
37    @WebDownload::ISA = ('WgetDownload');
38}
39
40use strict; # every perl program should have this!
41no strict 'refs'; # make an exception so we can use variables as filehandles
42no strict 'subs'; # to pass STDERR/STDOUT to functions
43
44use gsprintf 'gsprintf';
45
46my $arguments =
47    [ { 'name' => "url",
48    'disp' => "{WebDownload.url_disp}",
49    'desc' => "{WebDownload.url}",
50    'type' => "string",
51    'reqd' => "yes"},
52      { 'name' => "depth",
53    'disp' => "{WebDownload.depth_disp}",
54    'desc' => "{WebDownload.depth}",
55    'type' => "int",
56    'deft' => "0",
57    "range" => "0,",
58    'reqd' => "no"},
59      { 'name' => "below",
60    'disp' => "{WebDownload.below_disp}",
61    'desc' => "{WebDownload.below}",
62    'type' => "flag",
63    'reqd' => "no"},
64      { 'name' => "within",
65    'disp' => "{WebDownload.within_disp}",
66    'desc' => "{WebDownload.within}",
67    'type' => "flag",
68    'reqd' => "no"},
69      { 'name' => "html_only",
70    'disp' => "{WebDownload.html_only_disp}",
71    'desc' => "{WebDownload.html_only}",
72    'type' => "flag",
73    'reqd' => "no"}
74      ];
75
76my $options = { 'name'     => "WebDownload",
77        'desc'     => "{WebDownload.desc}",
78        'abstract' => "no",
79        'inherits' => "yes",
80        'args'     => $arguments };
81
82
83my $self;
84
85sub new
86{
87    my ($class) = shift (@_);
88    my ($getlist,$inputargs,$hashArgOptLists) = @_;
89    push(@$getlist, $class);
90
91    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92    push(@{$hashArgOptLists->{"OptList"}},$options);
93
94    my $self = new WgetDownload($getlist,$inputargs,$hashArgOptLists);
95   
96    return bless $self, $class;
97}
98
99sub download
100{
101    my ($self) = shift (@_);
102    my ($hashGeneralOptions) = @_;
103
104 
105    # Download options
106    my $strOptions = $self->generateOptionsString();
107    my $strWgetOptions = $self->getWgetOptions();
108     
109    # Setup the command for using wget
110    my $cache_dir = "";
111    if($hashGeneralOptions->{'cache_dir'}) { # don't provide the prefix-dir flag to wget unless the cache_dir is specified
112    if ($ENV{'GSDLOS'} eq "windows") {   
113        $cache_dir = "-P \"".$hashGeneralOptions->{'cache_dir'}."\" ";
114    }
115    else {
116        $cache_dir = "-P ".$hashGeneralOptions->{'cache_dir'};
117    }
118    }
119    #my $cmdWget = "-N -k -x -t 2 -P \"".$hashGeneralOptions->{"cache_dir"}."\" $strWgetOptions $strOptions ".$self->{'url'};
120    #my $cmdWget = "-N -k -x --tries=2 $strWgetOptions $strOptions $cache_dir " .$self->{'url'};
121    my $cmdWget = "-N -k -x $strWgetOptions $strOptions $cache_dir " .$self->{'url'};
122
123    #print STDOUT "\n@@@@ RUNNING WGET CMD: $cmdWget\n\n";
124   
125    # Download the web pages
126    # print "Start download from $self->{'url'}...\n";
127    print STDERR "<<Undefined Maximum>>\n";
128   
129    if ($ENV{'GSDLOS'} eq "windows") {
130    my $strResponse = $self->useWget($cmdWget,1);
131    } else {
132    my $strResponse = $self->useWget($cmdWget,1,$hashGeneralOptions->{"cache_dir"} );
133   
134    }
135   
136    # if ($strResponse ne ""){print "$strResponse\n";}
137     print STDERR "Finish download from $self->{'url'}\n";
138
139    print STDERR "<<Finished>>\n";
140 
141    return 1;
142}
143
144sub generateOptionsString
145{
146    my ($self) = @_;
147    my $strOptions;
148
149    (defined $self) || &error("generateOptionsString","No \$self is defined!!\n");
150    (defined $self->{'depth'})|| &error("generateOptionsString","No depth is defined!!\n");
151   
152   
153    if($self->{'depth'} == 0)
154    {
155    $strOptions .= " ";
156    }
157    elsif($self->{'depth'} > 0)
158    {
159    $strOptions .= "-r -l ".$self->{'depth'}." ";
160    }
161    else
162    {
163    $self->error("setupOptions","Incorrect Depth is defined!!\n");
164    }
165
166    if($self->{'below'})
167    {
168    $strOptions .="-np ";
169    }
170
171     if($self->{'html_only'})
172    {
173    $strOptions .="-A .html,.htm,.shm,.shtml,.asp,.php,.cgi,*?*=* ";
174    }
175    else{
176
177    $strOptions .="-p ";
178    }
179
180    if (!$self->{'within'}){
181    $strOptions .="-H ";
182    }
183
184    return $strOptions;
185 
186}
187
188sub url_information
189{
190    my ($self) = shift (@_);
191
192    my $strOptions = $self->getWgetOptions();
193
194    #my $strBaseCMD = $strOptions." --tries=2 -q -O - \"$self->{'url'}\"";
195    my $strBaseCMD = $strOptions." -q -O - $self->{'url'}";
196
197    #&util::print_env(STDERR, "https_proxy", "http_proxy", "ftp_proxy");
198    #&util::print_env(STDERR); 
199   
200    my $strIdentifyText = $self->useWget($strBaseCMD);
201   
202    if (!defined $strIdentifyText or $strIdentifyText eq ""  ){
203   
204    print STDERR "Server information is unavailable.\n";
205   
206    if ($self->{'proxy_on'}) { # if proxying set, the settings may be wrong
207        &gsprintf::gsprintf(STDERR, "{WebDownload.proxied_connect_failed_info}\n");
208       
209        if($self->{'http_proxy_host'} && defined $self->{'http_proxy_port'}) {
210        &gsprintf::gsprintf(STDERR, "{WebDownload.http_proxy_settings}\n", $self->{'http_proxy_host'}, $self->{'http_proxy_port'});
211        }
212        if($self->{'https_proxy_host'} && defined $self->{'https_proxy_port'}) {
213        &gsprintf::gsprintf(STDERR, "{WebDownload.https_proxy_settings}\n", $self->{'https_proxy_host'}, $self->{'https_proxy_port'});
214        }
215        if($self->{'ftp_proxy_host'} && defined $self->{'ftp_proxy_port'}) {
216        &gsprintf::gsprintf(STDERR, "{WebDownload.ftp_proxy_settings}\n", $self->{'ftp_proxy_host'}, $self->{'ftp_proxy_port'});
217        }
218    } else { # else no proxy set, the user may need proxy settings
219        &gsprintf::gsprintf(STDERR, "{WebDownload.proxyless_connect_failed_info}\n");
220    }
221   
222    # with or without proxying set, getting server info may have failed if the URL was https
223    # but the site had no valid certificate and no_check_certificate wasn't turned on
224    # suggest to the user to try turning it on
225    &gsprintf::gsprintf(STDERR, "{WebDownload.connect_failed_info}\n");
226   
227    print STDERR "<<Finished>>\n";
228    return; 
229    }
230
231    while ($strIdentifyText =~ m/^(.*)<title>(.*?)<\/title>(.*)$/si)
232    {
233    $strIdentifyText = $1.$3;
234    print STDERR "Page Title: $2\n";
235    }
236 
237    while ($strIdentifyText =~ m/^(.*)<meta (.*?)>(.*)$/si)
238    {
239    $strIdentifyText = $1.$3;
240    my $strTempString = $2;
241    print STDERR "\n";
242
243    while($strTempString =~ m/(.*?)=[\"|\'](.*?)[\"|\'](.*?)$/si)
244    {
245        # Store the infromation in to variable, since next time when we do
246        # regular expression, we will lost all the $1, $2, $X....
247        $strTempString = $3;
248        my $strMetaName = $1;
249        my $strMetaContain = $2;
250       
251        # Take out the extra space in the beginning of the string.
252        $strMetaName =~ s/^([" "])+//m;
253        $strMetaContain =~ s/^([" "])+//m;
254             
255        print STDERR "$strMetaName: $strMetaContain\n\n";
256           
257    }
258
259    }
260
261    print STDERR "<<Finished>>\n";
262
263}
264
265
266sub error
267{
268    my ($strFunctionName,$strError) = @_;
269    {
270    print "Error occoured in WebDownload.pm\n".
271        "In Function:".$strFunctionName."\n".
272        "Error Message:".$strError."\n";
273    exit(-1);
274    }
275}
276
2771;
278
Note: See TracBrowser for help on using the browser.