root/main/trunk/greenstone2/perllib/downloaders/WebDownload.pm @ 31877

Revision 31877, 7.3 KB (checked in by ak19, 2 years ago)

Getting Windows wget to use proxy settings in environment. They didn't need to be in CAPS, as Windows has the side effect of setting env vars in both the original case of the letters of the env variable name as well as in all caps, and unsetting either version unsets both. On Windows however, I noticed that Perl was not on the PATH after open3() in WgetDownload?.pm::useWget() failed with an unclear error message. So now the PATH is also propagated from Java to the perl code for downloading using wget.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# WebDownload.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WebDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use WgetDownload;
35
36sub BEGIN {
37    @WebDownload::ISA = ('WgetDownload');
38}
39
40use strict; # every perl program should have this!
41no strict 'refs'; # make an exception so we can use variables as filehandles
42no strict 'subs'; # to pass STDERR/STDOUT to functions
43
44use gsprintf 'gsprintf';
45
46my $arguments =
47    [ { 'name' => "url",
48    'disp' => "{WebDownload.url_disp}",
49    'desc' => "{WebDownload.url}",
50    'type' => "string",
51    'reqd' => "yes"},
52      { 'name' => "depth",
53    'disp' => "{WebDownload.depth_disp}",
54    'desc' => "{WebDownload.depth}",
55    'type' => "int",
56    'deft' => "0",
57    "range" => "0,",
58    'reqd' => "no"},
59      { 'name' => "below",
60    'disp' => "{WebDownload.below_disp}",
61    'desc' => "{WebDownload.below}",
62    'type' => "flag",
63    'reqd' => "no"},
64      { 'name' => "within",
65    'disp' => "{WebDownload.within_disp}",
66    'desc' => "{WebDownload.within}",
67    'type' => "flag",
68    'reqd' => "no"},
69      { 'name' => "html_only",
70    'disp' => "{WebDownload.html_only_disp}",
71    'desc' => "{WebDownload.html_only}",
72    'type' => "flag",
73    'reqd' => "no"}
74      ];
75
76my $options = { 'name'     => "WebDownload",
77        'desc'     => "{WebDownload.desc}",
78        'abstract' => "no",
79        'inherits' => "yes",
80        'args'     => $arguments };
81
82
83my $self;
84
85sub new
86{
87    my ($class) = shift (@_);
88    my ($getlist,$inputargs,$hashArgOptLists) = @_;
89    push(@$getlist, $class);
90
91    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92    push(@{$hashArgOptLists->{"OptList"}},$options);
93
94    my $self = new WgetDownload($getlist,$inputargs,$hashArgOptLists);
95   
96    return bless $self, $class;
97}
98
99sub download
100{
101    my ($self) = shift (@_);
102    my ($hashGeneralOptions) = @_;
103
104 
105    # Download options
106    my $strOptions = $self->generateOptionsString();
107    my $strWgetOptions = $self->getWgetOptions();
108     
109    # Setup the command for using wget
110    my $cache_dir = "";
111    if($hashGeneralOptions->{'cache_dir'}) { # don't provide the prefix-dir flag to wget unless the cache_dir is specified
112    if ($ENV{'GSDLOS'} eq "windows") {   
113        $cache_dir = "-P \"".$hashGeneralOptions->{'cache_dir'}."\" ";
114    }
115    else {
116        $cache_dir = "-P ".$hashGeneralOptions->{'cache_dir'};
117    }
118    }
119    #my $cmdWget = "-N -k -x -t 2 -P \"".$hashGeneralOptions->{"cache_dir"}."\" $strWgetOptions $strOptions ".$self->{'url'};
120    my $cmdWget = "-N -k -x --tries=2 $strWgetOptions $strOptions $cache_dir " .$self->{'url'};
121
122    #print STDOUT "\n@@@@ RUNNING WGET CMD: $cmdWget\n\n";
123   
124    # Download the web pages
125    # print "Start download from $self->{'url'}...\n";
126    print STDERR "<<Undefined Maximum>>\n";
127   
128    if ($ENV{'GSDLOS'} eq "windows") {
129    my $strResponse = $self->useWget($cmdWget,1);
130    } else {
131    my $strResponse = $self->useWget($cmdWget,1,$hashGeneralOptions->{"cache_dir"} );
132   
133    }
134   
135    # if ($strResponse ne ""){print "$strResponse\n";}
136     print STDERR "Finish download from $self->{'url'}\n";
137
138    print STDERR "<<Finished>>\n";
139 
140    return 1;
141}
142
143sub generateOptionsString
144{
145    my ($self) = @_;
146    my $strOptions;
147
148    (defined $self) || &error("generateOptionsString","No \$self is defined!!\n");
149    (defined $self->{'depth'})|| &error("generateOptionsString","No depth is defined!!\n");
150   
151   
152    if($self->{'depth'} == 0)
153    {
154    $strOptions .= " ";
155    }
156    elsif($self->{'depth'} > 0)
157    {
158    $strOptions .= "-r -l ".$self->{'depth'}." ";
159    }
160    else
161    {
162    $self->error("setupOptions","Incorrect Depth is defined!!\n");
163    }
164
165    if($self->{'below'})
166    {
167    $strOptions .="-np ";
168    }
169
170     if($self->{'html_only'})
171    {
172    $strOptions .="-A .html,.htm,.shm,.shtml,.asp,.php,.cgi,*?*=* ";
173    }
174    else{
175
176    $strOptions .="-p ";
177    }
178
179    if (!$self->{'within'}){
180    $strOptions .="-H ";
181    }
182
183    return $strOptions;
184 
185}
186
187sub url_information
188{
189    my ($self) = shift (@_);
190
191    my $strOptions = $self->getWgetOptions();
192
193    my $strBaseCMD = $strOptions." --tries=2 -q -O - \"$self->{'url'}\"";
194
195    #&util::print_env(STDERR, "https_proxy", "http_proxy", "HTTPS_PROXY", "HTTP_PROXY", "ftp_proxy", "FTP_PROXY");
196    #&util::print_env(STDERR); 
197   
198    my $strIdentifyText = $self->useWget($strBaseCMD);
199   
200    if (!defined $strIdentifyText or $strIdentifyText eq ""  ){
201   
202    print STDERR "Server information is unavailable.\n";
203
204    #&util::print_env(STDERR, "https_proxy", "http_proxy");
205   
206    if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'}) { # if proxying set, the settings may be wrong
207        &gsprintf::gsprintf_multiline(STDERR, "{WebDownload.proxied_connect_failed_info}\n", $self->{'proxy_host'}, $self->{'proxy_port'});
208    } else { # else no proxy set, the user may need proxy settings
209        &gsprintf::gsprintf_multiline(STDERR, "{WebDownload.proxyless_connect_failed_info}\n");
210    }
211   
212    # with or without proxying set, getting server info may have failed if the URL was https
213    # but the site had no valid certificate and no_check_certificate wasn't turned on
214    # suggest to the user to try turning it on
215    &gsprintf::gsprintf_multiline(STDERR, "{WebDownload.connect_failed_info}\n");
216   
217    print STDERR "<<Finished>>\n";
218    return; 
219    }
220
221    while ($strIdentifyText =~ m/^(.*)<title>(.*?)<\/title>(.*)$/si)
222    {
223    $strIdentifyText = $1.$3;
224    print STDERR "Page Title: $2\n";
225    }
226 
227    while ($strIdentifyText =~ m/^(.*)<meta (.*?)>(.*)$/si)
228    {
229    $strIdentifyText = $1.$3;
230    my $strTempString = $2;
231    print STDERR "\n";
232
233    while($strTempString =~ m/(.*?)=[\"|\'](.*?)[\"|\'](.*?)$/si)
234    {
235        # Store the infromation in to variable, since next time when we do
236        # regular expression, we will lost all the $1, $2, $X....
237        $strTempString = $3;
238        my $strMetaName = $1;
239        my $strMetaContain = $2;
240       
241        # Take out the extra space in the beginning of the string.
242        $strMetaName =~ s/^([" "])+//m;
243        $strMetaContain =~ s/^([" "])+//m;
244             
245        print STDERR "$strMetaName: $strMetaContain\n\n";
246           
247    }
248
249    }
250
251    print STDERR "<<Finished>>\n";
252
253}
254
255
256sub error
257{
258    my ($strFunctionName,$strError) = @_;
259    {
260    print "Error occoured in WebDownload.pm\n".
261        "In Function:".$strFunctionName."\n".
262        "Error Message:".$strError."\n";
263    exit(-1);
264    }
265}
266
2671;
268
Note: See TracBrowser for help on using the browser.