source: main/trunk/greenstone2/perllib/downloaders/WebDownload.pm

Last change on this file was 31957, checked in by ak19, 7 years ago

Tidying up. 1. Shifting informative error message to strings.properties. 2. Commenting out unnecessary message that was printed out for debugging when things went well. 3. To use gsprintf passing in STDERR/STDOUT as a parameter when use strict is on, need to make some exceptions to strict.

  • Property svn:keywords set to Author Date Id Revision
File size: 7.8 KB
Line 
1###########################################################################
2#
3# WebDownload.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WebDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use WgetDownload;
35
36sub BEGIN {
37 @WebDownload::ISA = ('WgetDownload');
38}
39
40use strict; # every perl program should have this!
41no strict 'refs'; # make an exception so we can use variables as filehandles
42no strict 'subs'; # to pass STDERR/STDOUT to functions
43
44use gsprintf 'gsprintf';
45
46my $arguments =
47 [ { 'name' => "url",
48 'disp' => "{WebDownload.url_disp}",
49 'desc' => "{WebDownload.url}",
50 'type' => "string",
51 'reqd' => "yes"},
52 { 'name' => "depth",
53 'disp' => "{WebDownload.depth_disp}",
54 'desc' => "{WebDownload.depth}",
55 'type' => "int",
56 'deft' => "0",
57 "range" => "0,",
58 'reqd' => "no"},
59 { 'name' => "below",
60 'disp' => "{WebDownload.below_disp}",
61 'desc' => "{WebDownload.below}",
62 'type' => "flag",
63 'reqd' => "no"},
64 { 'name' => "within",
65 'disp' => "{WebDownload.within_disp}",
66 'desc' => "{WebDownload.within}",
67 'type' => "flag",
68 'reqd' => "no"},
69 { 'name' => "html_only",
70 'disp' => "{WebDownload.html_only_disp}",
71 'desc' => "{WebDownload.html_only}",
72 'type' => "flag",
73 'reqd' => "no"}
74 ];
75
76my $options = { 'name' => "WebDownload",
77 'desc' => "{WebDownload.desc}",
78 'abstract' => "no",
79 'inherits' => "yes",
80 'args' => $arguments };
81
82
83my $self;
84
85sub new
86{
87 my ($class) = shift (@_);
88 my ($getlist,$inputargs,$hashArgOptLists) = @_;
89 push(@$getlist, $class);
90
91 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92 push(@{$hashArgOptLists->{"OptList"}},$options);
93
94 my $self = new WgetDownload($getlist,$inputargs,$hashArgOptLists);
95
96 return bless $self, $class;
97}
98
99sub download
100{
101 my ($self) = shift (@_);
102 my ($hashGeneralOptions) = @_;
103
104
105 # Download options
106 my $strOptions = $self->generateOptionsString();
107 my $strWgetOptions = $self->getWgetOptions();
108
109 # Setup the command for using wget
110 my $cache_dir = "";
111 if($hashGeneralOptions->{'cache_dir'}) { # don't provide the prefix-dir flag to wget unless the cache_dir is specified
112 if ($ENV{'GSDLOS'} eq "windows") {
113 $cache_dir = "-P \"".$hashGeneralOptions->{'cache_dir'}."\" ";
114 }
115 else {
116 $cache_dir = "-P ".$hashGeneralOptions->{'cache_dir'};
117 }
118 }
119 #my $cmdWget = "-N -k -x -t 2 -P \"".$hashGeneralOptions->{"cache_dir"}."\" $strWgetOptions $strOptions ".$self->{'url'};
120 #my $cmdWget = "-N -k -x --tries=2 $strWgetOptions $strOptions $cache_dir " .$self->{'url'};
121 my $cmdWget = "-N -k -x $strWgetOptions $strOptions $cache_dir " .$self->{'url'};
122
123 #print STDOUT "\n@@@@ RUNNING WGET CMD: $cmdWget\n\n";
124
125 # Download the web pages
126 # print "Start download from $self->{'url'}...\n";
127 print STDERR "<<Undefined Maximum>>\n";
128
129 if ($ENV{'GSDLOS'} eq "windows") {
130 my $strResponse = $self->useWget($cmdWget,1);
131 } else {
132 my $strResponse = $self->useWget($cmdWget,1,$hashGeneralOptions->{"cache_dir"} );
133
134 }
135
136 # if ($strResponse ne ""){print "$strResponse\n";}
137 print STDERR "Finished downloading from $self->{'url'}\n";
138
139 print STDERR "<<Finished>>\n";
140
141 return 1;
142}
143
144sub generateOptionsString
145{
146 my ($self) = @_;
147 my $strOptions;
148
149 (defined $self) || &error("generateOptionsString","No \$self is defined!!\n");
150 (defined $self->{'depth'})|| &error("generateOptionsString","No depth is defined!!\n");
151
152
153 if($self->{'depth'} == 0)
154 {
155 $strOptions .= " ";
156 }
157 elsif($self->{'depth'} > 0)
158 {
159 $strOptions .= "-r -l ".$self->{'depth'}." ";
160 }
161 else
162 {
163 $self->error("setupOptions","Incorrect Depth is defined!!\n");
164 }
165
166 if($self->{'below'})
167 {
168 $strOptions .="-np ";
169 }
170
171 if($self->{'html_only'})
172 {
173 $strOptions .="-A .html,.htm,.shm,.shtml,.asp,.php,.cgi,*?*=* ";
174 }
175 else{
176
177 $strOptions .="-p ";
178 }
179
180 if (!$self->{'within'}){
181 $strOptions .="-H ";
182 }
183
184 return $strOptions;
185
186}
187
188sub url_information
189{
190 my ($self) = shift (@_);
191
192 my $strOptions = $self->getWgetOptions();
193
194 #my $strBaseCMD = $strOptions." --tries=2 -q -O - \"$self->{'url'}\"";
195 my $strBaseCMD = $strOptions." -q -O - $self->{'url'}";
196
197 #&util::print_env(STDERR, "https_proxy", "http_proxy", "ftp_proxy");
198 #&util::print_env(STDERR);
199
200 my $strIdentifyText = $self->useWget($strBaseCMD);
201
202 if (!defined $strIdentifyText or $strIdentifyText eq "" ){
203
204 print STDERR "Server information is unavailable.\n";
205
206 if ($self->{'proxy_on'}) { # if proxying set, the settings may be wrong
207 &gsprintf::gsprintf(STDERR, "{WebDownload.proxied_connect_failed_info}\n");
208
209 if($self->{'http_proxy_host'} && defined $self->{'http_proxy_port'}) {
210 &gsprintf::gsprintf(STDERR, "{WebDownload.http_proxy_settings}\n", $self->{'http_proxy_host'}, $self->{'http_proxy_port'});
211 }
212 if($self->{'https_proxy_host'} && defined $self->{'https_proxy_port'}) {
213 &gsprintf::gsprintf(STDERR, "{WebDownload.https_proxy_settings}\n", $self->{'https_proxy_host'}, $self->{'https_proxy_port'});
214 }
215 if($self->{'ftp_proxy_host'} && defined $self->{'ftp_proxy_port'}) {
216 &gsprintf::gsprintf(STDERR, "{WebDownload.ftp_proxy_settings}\n", $self->{'ftp_proxy_host'}, $self->{'ftp_proxy_port'});
217 }
218 } else { # else no proxy set, the user may need proxy settings
219 &gsprintf::gsprintf(STDERR, "{WebDownload.proxyless_connect_failed_info}\n");
220 }
221
222 # with or without proxying set, getting server info may have failed if the URL was https
223 # but the site had no valid certificate and no_check_certificate wasn't turned on
224 # suggest to the user to try turning it on
225 &gsprintf::gsprintf(STDERR, "{WebDownload.connect_failed_info}\n");
226
227 print STDERR "<<Finished>>\n";
228 return;
229 }
230
231 while ($strIdentifyText =~ m/^(.*)<title>(.*?)<\/title>(.*)$/si)
232 {
233 $strIdentifyText = $1.$3;
234 print STDERR "Page Title: $2\n";
235 }
236
237 while ($strIdentifyText =~ m/^(.*)<meta (.*?)>(.*)$/si)
238 {
239 $strIdentifyText = $1.$3;
240 my $strTempString = $2;
241 print STDERR "\n";
242
243 while($strTempString =~ m/(.*?)=[\"|\'](.*?)[\"|\'](.*?)$/si)
244 {
245 # Store the infromation in to variable, since next time when we do
246 # regular expression, we will lost all the $1, $2, $X....
247 $strTempString = $3;
248 my $strMetaName = $1;
249 my $strMetaContain = $2;
250
251 # Take out the extra space in the beginning of the string.
252 $strMetaName =~ s/^([" "])+//m;
253 $strMetaContain =~ s/^([" "])+//m;
254
255 print STDERR "$strMetaName: $strMetaContain\n\n";
256
257 }
258
259 }
260
261 print STDERR "<<Finished>>\n";
262
263}
264
265
266sub error
267{
268 my ($strFunctionName,$strError) = @_;
269 {
270 print "Error occoured in WebDownload.pm\n".
271 "In Function:".$strFunctionName."\n".
272 "Error Message:".$strError."\n";
273 exit(-1);
274 }
275}
276
2771;
278
Note: See TracBrowser for help on using the repository browser.