root/main/trunk/greenstone2/perllib/downloaders/WebDownload.pm @ 31860

Revision 31860, 7.0 KB (checked in by ak19, 2 years ago)

The changes necessary for getting the new no_check_certificate checkbox to appear and work in GLI and get propagated to the perl code that launches wget. This checkbox controls whether wget is launched with the no-check-certificate flag to retrieve Https URLs despite lack of (valid) certificates.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# WebDownload.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WebDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use WgetDownload;
35
36sub BEGIN {
37    @WebDownload::ISA = ('WgetDownload');
38}
39
40use strict; # every perl program should have this!
41no strict 'refs'; # make an exception so we can use variables as filehandles
42no strict 'subs';
43use gsprintf 'gsprintf';
44
45my $arguments =
46    [ { 'name' => "url",
47    'disp' => "{WebDownload.url_disp}",
48    'desc' => "{WebDownload.url}",
49    'type' => "string",
50    'reqd' => "yes"},
51      { 'name' => "depth",
52    'disp' => "{WebDownload.depth_disp}",
53    'desc' => "{WebDownload.depth}",
54    'type' => "int",
55    'deft' => "0",
56    "range" => "0,",
57    'reqd' => "no"},
58      { 'name' => "below",
59    'disp' => "{WebDownload.below_disp}",
60    'desc' => "{WebDownload.below}",
61    'type' => "flag",
62    'reqd' => "no"},
63      { 'name' => "within",
64    'disp' => "{WebDownload.within_disp}",
65    'desc' => "{WebDownload.within}",
66    'type' => "flag",
67    'reqd' => "no"},
68      { 'name' => "html_only",
69    'disp' => "{WebDownload.html_only_disp}",
70    'desc' => "{WebDownload.html_only}",
71    'type' => "flag",
72    'reqd' => "no"}
73      ];
74
75my $options = { 'name'     => "WebDownload",
76        'desc'     => "{WebDownload.desc}",
77        'abstract' => "no",
78        'inherits' => "yes",
79        'args'     => $arguments };
80
81
82my $self;
83
84sub new
85{
86    my ($class) = shift (@_);
87    my ($getlist,$inputargs,$hashArgOptLists) = @_;
88    push(@$getlist, $class);
89
90    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
91    push(@{$hashArgOptLists->{"OptList"}},$options);
92
93    my $self = new WgetDownload($getlist,$inputargs,$hashArgOptLists);
94   
95    return bless $self, $class;
96}
97
98sub download
99{
100    my ($self) = shift (@_);
101    my ($hashGeneralOptions) = @_;
102
103 
104    # Download options
105    my $strOptions = $self->generateOptionsString();
106    my $strWgetOptions = $self->getWgetOptions();
107     
108    # Setup the command for using wget
109    my $cache_dir = "";
110    if($hashGeneralOptions->{'cache_dir'}) { # don't provide the prefix-dir flag to wget unless the cache_dir is specified
111    if ($ENV{'GSDLOS'} eq "windows") {   
112        $cache_dir = "-P \"".$hashGeneralOptions->{'cache_dir'}."\" ";
113    }
114    else {
115        $cache_dir = "-P ".$hashGeneralOptions->{'cache_dir'};
116    }
117    }
118    #my $cmdWget = "-N -k -x -t 2 -P \"".$hashGeneralOptions->{"cache_dir"}."\" $strWgetOptions $strOptions ".$self->{'url'};
119    my $cmdWget = "-N -k -x --tries=2 $strWgetOptions $strOptions $cache_dir " .$self->{'url'};
120
121    #print STDOUT "\n@@@@ RUNNING WGET CMD: $cmdWget\n\n";
122   
123    # Download the web pages
124    # print "Start download from $self->{'url'}...\n";
125    print STDERR "<<Undefined Maximum>>\n";
126   
127    if ($ENV{'GSDLOS'} eq "windows") {
128    my $strResponse = $self->useWget($cmdWget,1);
129    } else {
130    my $strResponse = $self->useWget($cmdWget,1,$hashGeneralOptions->{"cache_dir"} );
131   
132    }
133   
134    # if ($strResponse ne ""){print "$strResponse\n";}
135     print STDERR "Finish download from $self->{'url'}\n";
136
137    print STDERR "<<Finished>>\n";
138 
139    return 1;
140}
141
142sub generateOptionsString
143{
144    my ($self) = @_;
145    my $strOptions;
146
147    (defined $self) || &error("generateOptionsString","No \$self is defined!!\n");
148    (defined $self->{'depth'})|| &error("generateOptionsString","No depth is defined!!\n");
149   
150   
151    if($self->{'depth'} == 0)
152    {
153    $strOptions .= " ";
154    }
155    elsif($self->{'depth'} > 0)
156    {
157    $strOptions .= "-r -l ".$self->{'depth'}." ";
158    }
159    else
160    {
161    $self->error("setupOptions","Incorrect Depth is defined!!\n");
162    }
163
164    if($self->{'below'})
165    {
166    $strOptions .="-np ";
167    }
168
169     if($self->{'html_only'})
170    {
171    $strOptions .="-A .html,.htm,.shm,.shtml,.asp,.php,.cgi,*?*=* ";
172    }
173    else{
174
175    $strOptions .="-p ";
176    }
177
178    if (!$self->{'within'}){
179    $strOptions .="-H ";
180    }
181
182    return $strOptions;
183 
184}
185
186sub url_information
187{
188    my ($self) = shift (@_);
189
190    my $strOptions = $self->getWgetOptions();
191
192    my $strBaseCMD = $strOptions." --tries=2 -q -O - \"$self->{'url'}\"";
193
194 
195    my $strIdentifyText = $self->useWget($strBaseCMD);
196   
197    if (!defined $strIdentifyText or $strIdentifyText eq ""  ){
198   
199    print STDERR "Server information is unavailable.\n";
200
201    if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'}) { # if proxying set, the settings may be wrong
202        &gsprintf::gsprintf_multiline(STDERR, "{WebDownload.proxied_connect_failed_info}\n", $self->{'proxy_host'}, $self->{'proxy_port'});
203    } else { # else no proxy set, the user may need proxy settings
204        &gsprintf::gsprintf_multiline(STDERR, "{WebDownload.proxyless_connect_failed_info}\n");
205    }
206   
207    # with or without proxying set, getting server info may have failed if the URL was Https
208    # but the site had no valid certificate and no_check_certificate wasn't turned on
209    # suggest to the user to try turning it on
210    &gsprintf::gsprintf_multiline(STDERR, "{WebDownload.connect_failed_info}\n");
211   
212    print STDERR "<<Finished>>\n";
213    return; 
214    }
215
216    while ($strIdentifyText =~ m/^(.*)<title>(.*?)<\/title>(.*)$/si)
217    {
218    $strIdentifyText = $1.$3;
219    print STDERR "Page Title: $2\n";
220    }
221 
222    while ($strIdentifyText =~ m/^(.*)<meta (.*?)>(.*)$/si)
223    {
224    $strIdentifyText = $1.$3;
225    my $strTempString = $2;
226    print STDERR "\n";
227
228    while($strTempString =~ m/(.*?)=[\"|\'](.*?)[\"|\'](.*?)$/si)
229    {
230        # Store the infromation in to variable, since next time when we do
231        # regular expression, we will lost all the $1, $2, $X....
232        $strTempString = $3;
233        my $strMetaName = $1;
234        my $strMetaContain = $2;
235       
236        # Take out the extra space in the beginning of the string.
237        $strMetaName =~ s/^([" "])+//m;
238        $strMetaContain =~ s/^([" "])+//m;
239             
240        print STDERR "$strMetaName: $strMetaContain\n\n";
241           
242    }
243
244    }
245
246    print STDERR "<<Finished>>\n";
247
248}
249
250
251sub error
252{
253    my ($strFunctionName,$strError) = @_;
254    {
255    print "Error occoured in WebDownload.pm\n".
256        "In Function:".$strFunctionName."\n".
257        "Error Message:".$strError."\n";
258    exit(-1);
259    }
260}
261
2621;
263
Note: See TracBrowser for help on using the browser.