source: main/trunk/greenstone2/perllib/downloaders/WebDownload.pm@ 31851

Last change on this file since 31851 was 31851, checked in by ak19, 7 years ago
  1. Fixes to get proxying to work on Windows. 2. Fixes to timeout if a page doesn't exist and it takes forever to read. Both for downloading from a URL and getting server info (perl code), and also in Java code, when doing a getRedirectURL(). Generally, a URL is correct and when wget is launched, a cancel operation in the Java GUI successfully causes an interrupt which then terminates wget. However, if the URL doesn't exist, either when getting server info or when downloading, the wget launched by the perl seems to block or something, and the interrupt is not noticed until the wget is manually terminated through the task manager. Then the interrupt is finally noticed. If pages would indicate they don't exist, then it wouldn't have been a problem. This issue is now circumvented through setting a read-timeout, to stop retrieving pages that don't exist but that take forever to access anyway as they don't indicate that they don't exist. A connect timeout is for if you get proxy details wrong or something like that and it takes forever to connect.
  • Property svn:keywords set to Author Date Id Revision
File size: 6.3 KB
Line 
1###########################################################################
2#
3# WebDownload.pm -- downloader for retrieving web pages via wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WebDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use WgetDownload;
35
36sub BEGIN {
37 @WebDownload::ISA = ('WgetDownload');
38}
39
40use strict; # every perl program should have this!
41no strict 'refs'; # make an exception so we can use variables as filehandles
42
43my $arguments =
44 [ { 'name' => "url",
45 'disp' => "{WebDownload.url_disp}",
46 'desc' => "{WebDownload.url}",
47 'type' => "string",
48 'reqd' => "yes"},
49 { 'name' => "depth",
50 'disp' => "{WebDownload.depth_disp}",
51 'desc' => "{WebDownload.depth}",
52 'type' => "int",
53 'deft' => "0",
54 "range" => "0,",
55 'reqd' => "no"},
56 { 'name' => "below",
57 'disp' => "{WebDownload.below_disp}",
58 'desc' => "{WebDownload.below}",
59 'type' => "flag",
60 'reqd' => "no"},
61 { 'name' => "within",
62 'disp' => "{WebDownload.within_disp}",
63 'desc' => "{WebDownload.within}",
64 'type' => "flag",
65 'reqd' => "no"},
66 { 'name' => "html_only",
67 'disp' => "{WebDownload.html_only_disp}",
68 'desc' => "{WebDownload.html_only}",
69 'type' => "flag",
70 'reqd' => "no"}
71 ];
72
73my $options = { 'name' => "WebDownload",
74 'desc' => "{WebDownload.desc}",
75 'abstract' => "no",
76 'inherits' => "yes",
77 'args' => $arguments };
78
79
80my $self;
81
82sub new
83{
84 my ($class) = shift (@_);
85 my ($getlist,$inputargs,$hashArgOptLists) = @_;
86 push(@$getlist, $class);
87
88 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
89 push(@{$hashArgOptLists->{"OptList"}},$options);
90
91 my $self = new WgetDownload($getlist,$inputargs,$hashArgOptLists);
92
93 return bless $self, $class;
94}
95
96sub download
97{
98 my ($self) = shift (@_);
99 my ($hashGeneralOptions) = @_;
100
101
102 # Download options
103 my $strOptions = $self->generateOptionsString();
104 my $strWgetOptions = $self->getWgetOptions();
105
106 # Setup the command for using wget
107 my $cache_dir = "";
108 if($hashGeneralOptions->{'cache_dir'}) { # don't provide the prefix-dir flag to wget unless the cache_dir is specified
109 if ($ENV{'GSDLOS'} eq "windows") {
110 $cache_dir = "-P \"".$hashGeneralOptions->{'cache_dir'}."\" ";
111 }
112 else {
113 $cache_dir = "-P ".$hashGeneralOptions->{'cache_dir'};
114 }
115 }
116 #my $cmdWget = "-N -k -x -t 2 -P \"".$hashGeneralOptions->{"cache_dir"}."\" $strWgetOptions $strOptions ".$self->{'url'};
117 my $cmdWget = "-N -k -x -t 2 --read-timeout=2 --connect-timeout=2 $strWgetOptions $strOptions $cache_dir " .$self->{'url'};
118
119 #print STDOUT "\n@@@@ RUNNING WGET CMD: $cmdWget\n\n";
120
121 # Download the web pages
122 # print "Start download from $self->{'url'}...\n";
123 print STDERR "<<Undefined Maximum>>\n";
124
125 if ($ENV{'GSDLOS'} eq "windows") {
126 my $strResponse = $self->useWget($cmdWget,1);
127 } else {
128 my $strResponse = $self->useWget($cmdWget,1,$hashGeneralOptions->{"cache_dir"} );
129
130 }
131
132 # if ($strResponse ne ""){print "$strResponse\n";}
133 print STDERR "Finish download from $self->{'url'}\n";
134
135 print STDERR "<<Finished>>\n";
136
137 return 1;
138}
139
140sub generateOptionsString
141{
142 my ($self) = @_;
143 my $strOptions;
144
145 (defined $self) || &error("generateOptionsString","No \$self is defined!!\n");
146 (defined $self->{'depth'})|| &error("generateOptionsString","No depth is defined!!\n");
147
148
149 if($self->{'depth'} == 0)
150 {
151 $strOptions .= " ";
152 }
153 elsif($self->{'depth'} > 0)
154 {
155 $strOptions .= "-r -l ".$self->{'depth'}." ";
156 }
157 else
158 {
159 $self->error("setupOptions","Incorrect Depth is defined!!\n");
160 }
161
162 if($self->{'below'})
163 {
164 $strOptions .="-np ";
165 }
166
167 if($self->{'html_only'})
168 {
169 $strOptions .="-A .html,.htm,.shm,.shtml,.asp,.php,.cgi,*?*=* ";
170 }
171 else{
172
173 $strOptions .="-p ";
174 }
175
176 if (!$self->{'within'}){
177 $strOptions .="-H ";
178 }
179
180 return $strOptions;
181
182}
183
184sub url_information
185{
186 my ($self) = shift (@_);
187
188 my $strOptions = $self->getWgetOptions();
189
190 my $strBaseCMD = $strOptions." --timeout=4 --tries=1 -q -O - \"$self->{'url'}\"";
191
192
193 my $strIdentifyText = $self->useWget($strBaseCMD);
194
195 if (!defined $strIdentifyText or $strIdentifyText eq "" ){
196 print STDERR "Server information is unavailable.\n";
197 print STDERR "<<Finished>>\n";
198 return;
199 }
200
201 while ($strIdentifyText =~ m/^(.*)<title>(.*?)<\/title>(.*)$/si)
202 {
203 $strIdentifyText = $1.$3;
204 print STDERR "Page Title: $2\n";
205 }
206
207 while ($strIdentifyText =~ m/^(.*)<meta (.*?)>(.*)$/si)
208 {
209 $strIdentifyText = $1.$3;
210 my $strTempString = $2;
211 print STDERR "\n";
212
213 while($strTempString =~ m/(.*?)=[\"|\'](.*?)[\"|\'](.*?)$/si)
214 {
215 # Store the infromation in to variable, since next time when we do
216 # regular expression, we will lost all the $1, $2, $X....
217 $strTempString = $3;
218 my $strMetaName = $1;
219 my $strMetaContain = $2;
220
221 # Take out the extra space in the beginning of the string.
222 $strMetaName =~ s/^([" "])+//m;
223 $strMetaContain =~ s/^([" "])+//m;
224
225 print STDERR "$strMetaName: $strMetaContain\n\n";
226
227 }
228
229 }
230
231 print STDERR "<<Finished>>\n";
232
233}
234
235
236sub error
237{
238 my ($strFunctionName,$strError) = @_;
239 {
240 print "Error occoured in WebDownload.pm\n".
241 "In Function:".$strFunctionName."\n".
242 "Error Message:".$strError."\n";
243 exit(-1);
244 }
245}
246
2471;
248
Note: See TracBrowser for help on using the repository browser.