source: main/trunk/greenstone2/perllib/downloaders/WebDownload.pm@ 31864

Last change on this file since 31864 was 31864, checked in by ak19, 6 years ago
  1. Modified GLI and perl to set proxy_on, proxy_host and proxy_port for unix too, whereas in the past, only http(s)_proxy ENV vars were set by GLI for unix. If the env vars are not set, then perl assumes it's the wget setup for windows and passes the proxy vars that were set by GLI as flags to wget. If the env vars were set, then perl should run wget without setting the proxy vars as flags to wget, and wget will use the proxying info in the environment. So now, the decision as to whether proxy vars are set or not will result in context specific suggestion messages to the user when the Server Info button was pressed and if the URL could not be accessed for whatever reason. 2. Modified GLI code to not use proxy if proxying is toggled off, but use proxy when it's toggled back on. (Previously, if proxying was set at any point during a GLI session, then GLI remembered that even when proxying was turned off thereafter.)
  • Property svn:keywords set to Author Date Id Revision
File size: 7.1 KB
Line 
1###########################################################################
2#
3# WebDownload.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WebDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use WgetDownload;
35
36sub BEGIN {
37 @WebDownload::ISA = ('WgetDownload');
38}
39
40use strict; # every perl program should have this!
41no strict 'refs'; # make an exception so we can use variables as filehandles
42no strict 'subs'; # to pass STDERR/STDOUT to functions
43
44use gsprintf 'gsprintf';
45
46my $arguments =
47 [ { 'name' => "url",
48 'disp' => "{WebDownload.url_disp}",
49 'desc' => "{WebDownload.url}",
50 'type' => "string",
51 'reqd' => "yes"},
52 { 'name' => "depth",
53 'disp' => "{WebDownload.depth_disp}",
54 'desc' => "{WebDownload.depth}",
55 'type' => "int",
56 'deft' => "0",
57 "range" => "0,",
58 'reqd' => "no"},
59 { 'name' => "below",
60 'disp' => "{WebDownload.below_disp}",
61 'desc' => "{WebDownload.below}",
62 'type' => "flag",
63 'reqd' => "no"},
64 { 'name' => "within",
65 'disp' => "{WebDownload.within_disp}",
66 'desc' => "{WebDownload.within}",
67 'type' => "flag",
68 'reqd' => "no"},
69 { 'name' => "html_only",
70 'disp' => "{WebDownload.html_only_disp}",
71 'desc' => "{WebDownload.html_only}",
72 'type' => "flag",
73 'reqd' => "no"}
74 ];
75
76my $options = { 'name' => "WebDownload",
77 'desc' => "{WebDownload.desc}",
78 'abstract' => "no",
79 'inherits' => "yes",
80 'args' => $arguments };
81
82
83my $self;
84
85sub new
86{
87 my ($class) = shift (@_);
88 my ($getlist,$inputargs,$hashArgOptLists) = @_;
89 push(@$getlist, $class);
90
91 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92 push(@{$hashArgOptLists->{"OptList"}},$options);
93
94 my $self = new WgetDownload($getlist,$inputargs,$hashArgOptLists);
95
96 return bless $self, $class;
97}
98
99sub download
100{
101 my ($self) = shift (@_);
102 my ($hashGeneralOptions) = @_;
103
104
105 # Download options
106 my $strOptions = $self->generateOptionsString();
107 my $strWgetOptions = $self->getWgetOptions();
108
109 # Setup the command for using wget
110 my $cache_dir = "";
111 if($hashGeneralOptions->{'cache_dir'}) { # don't provide the prefix-dir flag to wget unless the cache_dir is specified
112 if ($ENV{'GSDLOS'} eq "windows") {
113 $cache_dir = "-P \"".$hashGeneralOptions->{'cache_dir'}."\" ";
114 }
115 else {
116 $cache_dir = "-P ".$hashGeneralOptions->{'cache_dir'};
117 }
118 }
119 #my $cmdWget = "-N -k -x -t 2 -P \"".$hashGeneralOptions->{"cache_dir"}."\" $strWgetOptions $strOptions ".$self->{'url'};
120 my $cmdWget = "-N -k -x --tries=2 $strWgetOptions $strOptions $cache_dir " .$self->{'url'};
121
122 #print STDOUT "\n@@@@ RUNNING WGET CMD: $cmdWget\n\n";
123
124 # Download the web pages
125 # print "Start download from $self->{'url'}...\n";
126 print STDERR "<<Undefined Maximum>>\n";
127
128 if ($ENV{'GSDLOS'} eq "windows") {
129 my $strResponse = $self->useWget($cmdWget,1);
130 } else {
131 my $strResponse = $self->useWget($cmdWget,1,$hashGeneralOptions->{"cache_dir"} );
132
133 }
134
135 # if ($strResponse ne ""){print "$strResponse\n";}
136 print STDERR "Finish download from $self->{'url'}\n";
137
138 print STDERR "<<Finished>>\n";
139
140 return 1;
141}
142
143sub generateOptionsString
144{
145 my ($self) = @_;
146 my $strOptions;
147
148 (defined $self) || &error("generateOptionsString","No \$self is defined!!\n");
149 (defined $self->{'depth'})|| &error("generateOptionsString","No depth is defined!!\n");
150
151
152 if($self->{'depth'} == 0)
153 {
154 $strOptions .= " ";
155 }
156 elsif($self->{'depth'} > 0)
157 {
158 $strOptions .= "-r -l ".$self->{'depth'}." ";
159 }
160 else
161 {
162 $self->error("setupOptions","Incorrect Depth is defined!!\n");
163 }
164
165 if($self->{'below'})
166 {
167 $strOptions .="-np ";
168 }
169
170 if($self->{'html_only'})
171 {
172 $strOptions .="-A .html,.htm,.shm,.shtml,.asp,.php,.cgi,*?*=* ";
173 }
174 else{
175
176 $strOptions .="-p ";
177 }
178
179 if (!$self->{'within'}){
180 $strOptions .="-H ";
181 }
182
183 return $strOptions;
184
185}
186
187sub url_information
188{
189 my ($self) = shift (@_);
190
191 my $strOptions = $self->getWgetOptions();
192
193 my $strBaseCMD = $strOptions." --tries=2 -q -O - \"$self->{'url'}\"";
194
195 my $strIdentifyText = $self->useWget($strBaseCMD);
196
197 if (!defined $strIdentifyText or $strIdentifyText eq "" ){
198
199 print STDERR "Server information is unavailable.\n";
200
201 #&util::print_env(STDERR, "https_proxy", "http_proxy");
202
203 if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'}) { # if proxying set, the settings may be wrong
204 &gsprintf::gsprintf_multiline(STDERR, "{WebDownload.proxied_connect_failed_info}\n", $self->{'proxy_host'}, $self->{'proxy_port'});
205 } else { # else no proxy set, the user may need proxy settings
206 &gsprintf::gsprintf_multiline(STDERR, "{WebDownload.proxyless_connect_failed_info}\n");
207 }
208
209 # with or without proxying set, getting server info may have failed if the URL was https
210 # but the site had no valid certificate and no_check_certificate wasn't turned on
211 # suggest to the user to try turning it on
212 &gsprintf::gsprintf_multiline(STDERR, "{WebDownload.connect_failed_info}\n");
213
214 print STDERR "<<Finished>>\n";
215 return;
216 }
217
218 while ($strIdentifyText =~ m/^(.*)<title>(.*?)<\/title>(.*)$/si)
219 {
220 $strIdentifyText = $1.$3;
221 print STDERR "Page Title: $2\n";
222 }
223
224 while ($strIdentifyText =~ m/^(.*)<meta (.*?)>(.*)$/si)
225 {
226 $strIdentifyText = $1.$3;
227 my $strTempString = $2;
228 print STDERR "\n";
229
230 while($strTempString =~ m/(.*?)=[\"|\'](.*?)[\"|\'](.*?)$/si)
231 {
232 # Store the infromation in to variable, since next time when we do
233 # regular expression, we will lost all the $1, $2, $X....
234 $strTempString = $3;
235 my $strMetaName = $1;
236 my $strMetaContain = $2;
237
238 # Take out the extra space in the beginning of the string.
239 $strMetaName =~ s/^([" "])+//m;
240 $strMetaContain =~ s/^([" "])+//m;
241
242 print STDERR "$strMetaName: $strMetaContain\n\n";
243
244 }
245
246 }
247
248 print STDERR "<<Finished>>\n";
249
250}
251
252
253sub error
254{
255 my ($strFunctionName,$strError) = @_;
256 {
257 print "Error occoured in WebDownload.pm\n".
258 "In Function:".$strFunctionName."\n".
259 "Error Message:".$strError."\n";
260 exit(-1);
261 }
262}
263
2641;
265
Note: See TracBrowser for help on using the repository browser.