source: gsdl/trunk/perllib/downloaders/WebDownload.pm@ 14248

Last change on this file since 14248 was 13902, checked in by shaoqun, 17 years ago

Fixed the bug that caused downloaded web files to be stored in the wrong place on Linux.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.5 KB
Line 
1###########################################################################
2#
3# WebDownload.pm -- downloader that fetches web pages with wget (subclass of WgetDownload)
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
package WebDownload;

# Pull in the "bytes" module when this perl provides it; wrapping the
# require in eval keeps a failure from being fatal.
eval { require bytes; };

# Filter warnings: pass everything through except perl's
# "Subroutine ... redefined" notice.
$SIG{__WARN__} = sub {
    my ($message) = @_;
    if ($message !~ /Subroutine\s+\S+\sredefined/) {
        warn($message);
    }
};

use WgetDownload;

# Set up inheritance from WgetDownload at compile time.
BEGIN {
    @WebDownload::ISA = ('WgetDownload');
}

use strict;
no strict 'refs';   # symbolic references stay allowed (variables used as filehandles)
42
# Argument table for this downloader.  Each entry gives the argument's
# name, the resource keys for its display label ('disp') and description
# ('desc'), its type, and whether it is required.
my $arguments = [
    {
        'name' => "url",
        'disp' => "{WebDownload.url_disp}",
        'desc' => "{WebDownload.url}",
        'type' => "string",
        'reqd' => "yes",
    },
    {
        'name'  => "depth",
        'disp'  => "{WebDownload.depth_disp}",
        'desc'  => "{WebDownload.depth}",
        'type'  => "int",
        'deft'  => "0",
        'range' => "0,",
        'reqd'  => "no",
    },
    {
        'name' => "below",
        'disp' => "{WebDownload.below_disp}",
        'desc' => "{WebDownload.below}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "within",
        'disp' => "{WebDownload.within_disp}",
        'desc' => "{WebDownload.within}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "html_only",
        'disp' => "{WebDownload.html_only_disp}",
        'desc' => "{WebDownload.html_only}",
        'type' => "flag",
        'reqd' => "no",
    },
];

# Option record describing this class; 'args' points at the table above.
my $options = {
    'name'     => "WebDownload",
    'desc'     => "{WebDownload.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};

# File-level lexical; every sub visible in this file declares its own $self.
my $self;
81
# Constructor.
#
# Arguments (after the class name):
#   $getlist         - array ref; this class's name is appended to it
#   $inputargs       - passed through unchanged to WgetDownload->new
#   $hashArgOptLists - hash ref; this class's argument table is appended to
#                      its "ArgList" entry and its option record to "OptList"
#
# Returns the object built by WgetDownload->new, re-blessed into $class.
sub new
{
    my ($class) = shift (@_);
    my ($getlist,$inputargs,$hashArgOptLists) = @_;
    push(@$getlist, $class);

    # Note: pushing through $hashArgOptLists autovivifies it into a hash
    # reference when the caller passed nothing for it.
    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}}, $options); }

    # Use an explicit class-method call instead of indirect object syntax
    # ("new WgetDownload(...)"), which perl has to guess how to parse and
    # which is especially fragile inside a package that defines its own new().
    my $self = (defined $hashArgOptLists)
        ? WgetDownload->new($getlist, $inputargs, $hashArgOptLists)
        : WgetDownload->new($getlist, $inputargs);

    return bless $self, $class;
}
95
# Download $self->{'url'} with wget into the cache directory named by
# $hashGeneralOptions->{"cache_dir"}.
#
# Progress markers ("<<Undefined Maximum>>", "<<Finished>>") and a completion
# message are written to STDERR.  Always returns 1.
sub download
{
    my $self = shift;
    my ($hashGeneralOptions) = @_;

    # Options derived from this object's settings, then the wget options
    # supplied by getWgetOptions().
    my $strOptions     = $self->generateOptionsString();
    my $strWgetOptions = $self->getWgetOptions();

    my $on_windows = ($ENV{'GSDLOS'} eq "windows");

    # -P selects wget's download directory; only the Windows form is
    # quoted and carries a trailing space.
    my $cache_switch = $on_windows
        ? "-P \"$hashGeneralOptions->{'cache_dir'}\" "
        : "-P $hashGeneralOptions->{'cache_dir'}";

    my $cmdWget = join(" ",
                       "-N -k -x -t 2",
                       $strWgetOptions,
                       $strOptions,
                       $cache_switch,
                       $self->{'url'});

    print STDERR "<<Undefined Maximum>>\n";

    # On non-Windows systems the cache directory is also handed to useWget
    # as a third argument.  The response is not used afterwards.
    my $strResponse = $on_windows
        ? $self->useWget($cmdWget, 1)
        : $self->useWget($cmdWget, 1, $hashGeneralOptions->{"cache_dir"});

    print STDERR "Finish download from $self->{'url'}\n";
    print STDERR "<<Finished>>\n";

    return 1;
}
136
# Build the wget option string that corresponds to this object's settings.
#
# Reads from $self:
#   depth     - 0 adds a single space; a positive value adds "-r -l <depth> ";
#               a negative value is reported through error()
#   below     - when true, adds "-np "
#   html_only - when true, adds an "-A" accept list of page-like suffixes;
#               otherwise adds "-p "
#   within    - when false, adds "-H "
#
# Returns the option string (each option is followed by a space).
sub generateOptionsString
{
    my ($self) = @_;
    my $strOptions = "";

    (defined $self) || error("generateOptionsString","No \$self is defined!!\n");
    (defined $self->{'depth'}) || error("generateOptionsString","No depth is defined!!\n");

    if ($self->{'depth'} == 0)
    {
        $strOptions .= " ";
    }
    elsif ($self->{'depth'} > 0)
    {
        $strOptions .= "-r -l ".$self->{'depth'}." ";
    }
    else
    {
        # error() takes (function name, message) as plain arguments.  Calling
        # it as a method would shift the invocant into the function-name slot
        # and lose the real message, so call it as a function and report the
        # name of this sub.
        error("generateOptionsString","Incorrect Depth is defined!!\n");
    }

    if ($self->{'below'})
    {
        $strOptions .= "-np ";
    }

    if ($self->{'html_only'})
    {
        # NOTE(review): the accept list contains shell wildcard characters and
        # is not quoted here; whether that matters depends on how useWget runs
        # the command - confirm.
        $strOptions .= "-A .html,.htm,.shm,.shtml,.asp,.php,.cgi,*?*=* ";
    }
    else
    {
        $strOptions .= "-p ";
    }

    if (!$self->{'within'})
    {
        $strOptions .= "-H ";
    }

    return $strOptions;
}
180
# Fetch the page at $self->{'url'} through useWget and report what it says
# about itself on STDERR:
#   - one "Page Title: ..." line per <title> element
#   - for every <meta ...> tag, a blank line followed by one
#     "<attribute>: <value>" line (plus a blank line) per quoted attribute
# and finally the "<<Finished>>" marker.  Tags are matched case-insensitively
# and reported in document order.
#
# When nothing could be fetched, "Server information is unavailable." and the
# "<<Finished>>" marker are printed and the sub returns early.
sub url_information
{
    my ($self) = shift (@_);

    my $strOptions = $self->getWgetOptions();

    # -q -O - : run wget quietly and write the document to standard output.
    my $strBaseCMD = $strOptions." -q -O - \"$self->{'url'}\"";

    my $strIdentifyText = $self->useWget($strBaseCMD);

    if (!defined $strIdentifyText or $strIdentifyText eq "") {
        print STDERR "Server information is unavailable.\n";
        print STDERR "<<Finished>>\n";
        return;
    }

    # Report every title, then drop the title elements so their text is not
    # scanned for meta tags below.
    while ($strIdentifyText =~ m/<title>(.*?)<\/title>/gis) {
        print STDERR "Page Title: $1\n";
    }
    $strIdentifyText =~ s/<title>.*?<\/title>//gis;

    while ($strIdentifyText =~ m/<meta\s(.*?)>/gis) {
        my $strMetaAttributes = $1;
        print STDERR "\n";

        # name="value" or name='value'; the backreference makes the closing
        # quote match the opening one, so the other quote character (or a
        # "|") inside the value no longer ends it early.
        while ($strMetaAttributes =~ m/(.*?)=(["'])(.*?)\2/gs) {
            # Copy the captures first: the substitutions below reset them.
            my $strMetaName    = $1;
            my $strMetaContain = $3;

            # Take out leading whitespace only.
            $strMetaName    =~ s/^\s+//;
            $strMetaContain =~ s/^\s+//;

            print STDERR "$strMetaName: $strMetaContain\n\n";
        }
    }

    print STDERR "<<Finished>>\n";
}
231
232
# Report a fatal error and terminate the program.
#
# Arguments:
#   $strFunctionName - name of the sub in which the problem was detected
#   $strError        - the error message
#
# May be called as a plain function, error($name, $message), or as a method,
# $obj->error($name, $message); a leading invocant is discarded.
# Prints the report to standard output and exits with status -1.
sub error
{
    # Drop the invocant when called method-style, so the function name and
    # message do not shift by one position.
    shift(@_) if (@_ > 2 || ref($_[0]));

    my ($strFunctionName,$strError) = @_;

    print "Error occoured in WebDownload.pm\n".
          "In Function:".$strFunctionName."\n".
          "Error Message:".$strError."\n";
    exit(-1);
}
243
2441;
245
Note: See TracBrowser for help on using the repository browser.