root/main/trunk/greenstone2/perllib/downloaders/WebDownload.pm @ 31898

Revision 31898, 7.7 KB (checked in by ak19, 2 years ago)

With Kathy's commit 31896 doing away with the need for the recently introduced gsprintf_multiline() by moving the regex replacements of backslash-n (and backslash-t) with newline and tab into gsprint::lookup_string, can now shift WebDownload? calls to gsprintf_multine to use regular gsprintf. Putting back the original gsprintf method as gsprintf_multiline is no longer needed.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# WebDownload.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WebDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use WgetDownload;
35
36sub BEGIN {
37    @WebDownload::ISA = ('WgetDownload');
38}
39
40use strict; # every perl program should have this!
41no strict 'refs'; # make an exception so we can use variables as filehandles
42no strict 'subs'; # to pass STDERR/STDOUT to functions
43
44use gsprintf 'gsprintf';
45
46my $arguments =
47    [ { 'name' => "url",
48    'disp' => "{WebDownload.url_disp}",
49    'desc' => "{WebDownload.url}",
50    'type' => "string",
51    'reqd' => "yes"},
52      { 'name' => "depth",
53    'disp' => "{WebDownload.depth_disp}",
54    'desc' => "{WebDownload.depth}",
55    'type' => "int",
56    'deft' => "0",
57    "range" => "0,",
58    'reqd' => "no"},
59      { 'name' => "below",
60    'disp' => "{WebDownload.below_disp}",
61    'desc' => "{WebDownload.below}",
62    'type' => "flag",
63    'reqd' => "no"},
64      { 'name' => "within",
65    'disp' => "{WebDownload.within_disp}",
66    'desc' => "{WebDownload.within}",
67    'type' => "flag",
68    'reqd' => "no"},
69      { 'name' => "html_only",
70    'disp' => "{WebDownload.html_only_disp}",
71    'desc' => "{WebDownload.html_only}",
72    'type' => "flag",
73    'reqd' => "no"}
74      ];
75
76my $options = { 'name'     => "WebDownload",
77        'desc'     => "{WebDownload.desc}",
78        'abstract' => "no",
79        'inherits' => "yes",
80        'args'     => $arguments };
81
82
83my $self;
84
85sub new
86{
87    my ($class) = shift (@_);
88    my ($getlist,$inputargs,$hashArgOptLists) = @_;
89    push(@$getlist, $class);
90
91    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92    push(@{$hashArgOptLists->{"OptList"}},$options);
93
94    my $self = new WgetDownload($getlist,$inputargs,$hashArgOptLists);
95   
96    return bless $self, $class;
97}
98
99sub download
100{
101    my ($self) = shift (@_);
102    my ($hashGeneralOptions) = @_;
103
104 
105    # Download options
106    my $strOptions = $self->generateOptionsString();
107    my $strWgetOptions = $self->getWgetOptions();
108     
109    # Setup the command for using wget
110    my $cache_dir = "";
111    if($hashGeneralOptions->{'cache_dir'}) { # don't provide the prefix-dir flag to wget unless the cache_dir is specified
112    if ($ENV{'GSDLOS'} eq "windows") {   
113        $cache_dir = "-P \"".$hashGeneralOptions->{'cache_dir'}."\" ";
114    }
115    else {
116        $cache_dir = "-P ".$hashGeneralOptions->{'cache_dir'};
117    }
118    }
119    #my $cmdWget = "-N -k -x -t 2 -P \"".$hashGeneralOptions->{"cache_dir"}."\" $strWgetOptions $strOptions ".$self->{'url'};
120    my $cmdWget = "-N -k -x --tries=2 $strWgetOptions $strOptions $cache_dir " .$self->{'url'};
121
122    #print STDOUT "\n@@@@ RUNNING WGET CMD: $cmdWget\n\n";
123   
124    # Download the web pages
125    # print "Start download from $self->{'url'}...\n";
126    print STDERR "<<Undefined Maximum>>\n";
127   
128    if ($ENV{'GSDLOS'} eq "windows") {
129    my $strResponse = $self->useWget($cmdWget,1);
130    } else {
131    my $strResponse = $self->useWget($cmdWget,1,$hashGeneralOptions->{"cache_dir"} );
132   
133    }
134   
135    # if ($strResponse ne ""){print "$strResponse\n";}
136     print STDERR "Finish download from $self->{'url'}\n";
137
138    print STDERR "<<Finished>>\n";
139 
140    return 1;
141}
142
143sub generateOptionsString
144{
145    my ($self) = @_;
146    my $strOptions;
147
148    (defined $self) || &error("generateOptionsString","No \$self is defined!!\n");
149    (defined $self->{'depth'})|| &error("generateOptionsString","No depth is defined!!\n");
150   
151   
152    if($self->{'depth'} == 0)
153    {
154    $strOptions .= " ";
155    }
156    elsif($self->{'depth'} > 0)
157    {
158    $strOptions .= "-r -l ".$self->{'depth'}." ";
159    }
160    else
161    {
162    $self->error("setupOptions","Incorrect Depth is defined!!\n");
163    }
164
165    if($self->{'below'})
166    {
167    $strOptions .="-np ";
168    }
169
170     if($self->{'html_only'})
171    {
172    $strOptions .="-A .html,.htm,.shm,.shtml,.asp,.php,.cgi,*?*=* ";
173    }
174    else{
175
176    $strOptions .="-p ";
177    }
178
179    if (!$self->{'within'}){
180    $strOptions .="-H ";
181    }
182
183    return $strOptions;
184 
185}
186
187sub url_information
188{
189    my ($self) = shift (@_);
190
191    my $strOptions = $self->getWgetOptions();
192
193    my $strBaseCMD = $strOptions." --tries=2 -q -O - \"$self->{'url'}\"";
194
195    #&util::print_env(STDERR, "https_proxy", "http_proxy", "ftp_proxy");
196    #&util::print_env(STDERR); 
197   
198    my $strIdentifyText = $self->useWget($strBaseCMD);
199   
200    if (!defined $strIdentifyText or $strIdentifyText eq ""  ){
201   
202    print STDERR "Server information is unavailable.\n";
203   
204    if ($self->{'proxy_on'}) { # if proxying set, the settings may be wrong
205        &gsprintf::gsprintf(STDERR, "{WebDownload.proxied_connect_failed_info}\n");
206       
207        if($self->{'http_proxy_host'} && defined $self->{'http_proxy_port'}) {
208        &gsprintf::gsprintf(STDERR, "{WebDownload.http_proxy_settings}\n", $self->{'http_proxy_host'}, $self->{'http_proxy_port'});
209        }
210        if($self->{'https_proxy_host'} && defined $self->{'https_proxy_port'}) {
211        &gsprintf::gsprintf(STDERR, "{WebDownload.https_proxy_settings}\n", $self->{'https_proxy_host'}, $self->{'https_proxy_port'});
212        }
213        if($self->{'ftp_proxy_host'} && defined $self->{'ftp_proxy_port'}) {
214        &gsprintf::gsprintf(STDERR, "{WebDownload.ftp_proxy_settings}\n", $self->{'ftp_proxy_host'}, $self->{'ftp_proxy_port'});
215        }
216    } else { # else no proxy set, the user may need proxy settings
217        &gsprintf::gsprintf(STDERR, "{WebDownload.proxyless_connect_failed_info}\n");
218    }
219   
220    # with or without proxying set, getting server info may have failed if the URL was https
221    # but the site had no valid certificate and no_check_certificate wasn't turned on
222    # suggest to the user to try turning it on
223    &gsprintf::gsprintf(STDERR, "{WebDownload.connect_failed_info}\n");
224   
225    print STDERR "<<Finished>>\n";
226    return; 
227    }
228
229    while ($strIdentifyText =~ m/^(.*)<title>(.*?)<\/title>(.*)$/si)
230    {
231    $strIdentifyText = $1.$3;
232    print STDERR "Page Title: $2\n";
233    }
234 
235    while ($strIdentifyText =~ m/^(.*)<meta (.*?)>(.*)$/si)
236    {
237    $strIdentifyText = $1.$3;
238    my $strTempString = $2;
239    print STDERR "\n";
240
241    while($strTempString =~ m/(.*?)=[\"|\'](.*?)[\"|\'](.*?)$/si)
242    {
243        # Store the infromation in to variable, since next time when we do
244        # regular expression, we will lost all the $1, $2, $X....
245        $strTempString = $3;
246        my $strMetaName = $1;
247        my $strMetaContain = $2;
248       
249        # Take out the extra space in the beginning of the string.
250        $strMetaName =~ s/^([" "])+//m;
251        $strMetaContain =~ s/^([" "])+//m;
252             
253        print STDERR "$strMetaName: $strMetaContain\n\n";
254           
255    }
256
257    }
258
259    print STDERR "<<Finished>>\n";
260
261}
262
263
264sub error
265{
266    my ($strFunctionName,$strError) = @_;
267    {
268    print "Error occoured in WebDownload.pm\n".
269        "In Function:".$strFunctionName."\n".
270        "Error Message:".$strError."\n";
271    exit(-1);
272    }
273}
274
2751;
276
Note: See TracBrowser for help on using the browser.