root/main/trunk/greenstone2/perllib/downloaders/WebDownload.pm @ 31878

Revision 31878, 7.3 KB (checked in by ak19, 2 years ago)

1. Previous commit message was incorrect: it wasn't that perl wasn't found that resulted in the open3() failure error message, but that something on the PATH wasn't available to it, possible wget itself. 2. Updating unused DownloadJob?.old_callDownload() to have the recently committed changes in callDownload(). 3. Emacs tabbing for recently committed files.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# WebDownload.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WebDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use WgetDownload;
35
36sub BEGIN {
37    @WebDownload::ISA = ('WgetDownload');
38}
39
40use strict; # every perl program should have this!
41no strict 'refs'; # make an exception so we can use variables as filehandles
42no strict 'subs'; # to pass STDERR/STDOUT to functions
43
44use gsprintf 'gsprintf';
45
46my $arguments =
47    [ { 'name' => "url",
48    'disp' => "{WebDownload.url_disp}",
49    'desc' => "{WebDownload.url}",
50    'type' => "string",
51    'reqd' => "yes"},
52      { 'name' => "depth",
53    'disp' => "{WebDownload.depth_disp}",
54    'desc' => "{WebDownload.depth}",
55    'type' => "int",
56    'deft' => "0",
57    "range" => "0,",
58    'reqd' => "no"},
59      { 'name' => "below",
60    'disp' => "{WebDownload.below_disp}",
61    'desc' => "{WebDownload.below}",
62    'type' => "flag",
63    'reqd' => "no"},
64      { 'name' => "within",
65    'disp' => "{WebDownload.within_disp}",
66    'desc' => "{WebDownload.within}",
67    'type' => "flag",
68    'reqd' => "no"},
69      { 'name' => "html_only",
70    'disp' => "{WebDownload.html_only_disp}",
71    'desc' => "{WebDownload.html_only}",
72    'type' => "flag",
73    'reqd' => "no"}
74      ];
75
76my $options = { 'name'     => "WebDownload",
77        'desc'     => "{WebDownload.desc}",
78        'abstract' => "no",
79        'inherits' => "yes",
80        'args'     => $arguments };
81
82
83my $self;
84
85sub new
86{
87    my ($class) = shift (@_);
88    my ($getlist,$inputargs,$hashArgOptLists) = @_;
89    push(@$getlist, $class);
90
91    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92    push(@{$hashArgOptLists->{"OptList"}},$options);
93
94    my $self = new WgetDownload($getlist,$inputargs,$hashArgOptLists);
95   
96    return bless $self, $class;
97}
98
99sub download
100{
101    my ($self) = shift (@_);
102    my ($hashGeneralOptions) = @_;
103
104 
105    # Download options
106    my $strOptions = $self->generateOptionsString();
107    my $strWgetOptions = $self->getWgetOptions();
108     
109    # Setup the command for using wget
110    my $cache_dir = "";
111    if($hashGeneralOptions->{'cache_dir'}) { # don't provide the prefix-dir flag to wget unless the cache_dir is specified
112    if ($ENV{'GSDLOS'} eq "windows") {   
113        $cache_dir = "-P \"".$hashGeneralOptions->{'cache_dir'}."\" ";
114    }
115    else {
116        $cache_dir = "-P ".$hashGeneralOptions->{'cache_dir'};
117    }
118    }
119    #my $cmdWget = "-N -k -x -t 2 -P \"".$hashGeneralOptions->{"cache_dir"}."\" $strWgetOptions $strOptions ".$self->{'url'};
120    my $cmdWget = "-N -k -x --tries=2 $strWgetOptions $strOptions $cache_dir " .$self->{'url'};
121
122    #print STDOUT "\n@@@@ RUNNING WGET CMD: $cmdWget\n\n";
123   
124    # Download the web pages
125    # print "Start download from $self->{'url'}...\n";
126    print STDERR "<<Undefined Maximum>>\n";
127   
128    if ($ENV{'GSDLOS'} eq "windows") {
129    my $strResponse = $self->useWget($cmdWget,1);
130    } else {
131    my $strResponse = $self->useWget($cmdWget,1,$hashGeneralOptions->{"cache_dir"} );
132   
133    }
134   
135    # if ($strResponse ne ""){print "$strResponse\n";}
136     print STDERR "Finish download from $self->{'url'}\n";
137
138    print STDERR "<<Finished>>\n";
139 
140    return 1;
141}
142
143sub generateOptionsString
144{
145    my ($self) = @_;
146    my $strOptions;
147
148    (defined $self) || &error("generateOptionsString","No \$self is defined!!\n");
149    (defined $self->{'depth'})|| &error("generateOptionsString","No depth is defined!!\n");
150   
151   
152    if($self->{'depth'} == 0)
153    {
154    $strOptions .= " ";
155    }
156    elsif($self->{'depth'} > 0)
157    {
158    $strOptions .= "-r -l ".$self->{'depth'}." ";
159    }
160    else
161    {
162    $self->error("setupOptions","Incorrect Depth is defined!!\n");
163    }
164
165    if($self->{'below'})
166    {
167    $strOptions .="-np ";
168    }
169
170     if($self->{'html_only'})
171    {
172    $strOptions .="-A .html,.htm,.shm,.shtml,.asp,.php,.cgi,*?*=* ";
173    }
174    else{
175
176    $strOptions .="-p ";
177    }
178
179    if (!$self->{'within'}){
180    $strOptions .="-H ";
181    }
182
183    return $strOptions;
184 
185}
186
187sub url_information
188{
189    my ($self) = shift (@_);
190
191    my $strOptions = $self->getWgetOptions();
192
193    my $strBaseCMD = $strOptions." --tries=2 -q -O - \"$self->{'url'}\"";
194
195    #&util::print_env(STDERR, "https_proxy", "http_proxy", "HTTPS_PROXY", "HTTP_PROXY", "ftp_proxy", "FTP_PROXY");
196    #&util::print_env(STDERR); 
197   
198    my $strIdentifyText = $self->useWget($strBaseCMD);
199   
200    if (!defined $strIdentifyText or $strIdentifyText eq ""  ){
201   
202    print STDERR "Server information is unavailable.\n";
203
204    #&util::print_env(STDERR, "https_proxy", "http_proxy");
205   
206    if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'}) { # if proxying set, the settings may be wrong
207        &gsprintf::gsprintf_multiline(STDERR, "{WebDownload.proxied_connect_failed_info}\n", $self->{'proxy_host'}, $self->{'proxy_port'});
208    } else { # else no proxy set, the user may need proxy settings
209        &gsprintf::gsprintf_multiline(STDERR, "{WebDownload.proxyless_connect_failed_info}\n");
210    }
211   
212    # with or without proxying set, getting server info may have failed if the URL was https
213    # but the site had no valid certificate and no_check_certificate wasn't turned on
214    # suggest to the user to try turning it on
215    &gsprintf::gsprintf_multiline(STDERR, "{WebDownload.connect_failed_info}\n");
216   
217    print STDERR "<<Finished>>\n";
218    return; 
219    }
220
221    while ($strIdentifyText =~ m/^(.*)<title>(.*?)<\/title>(.*)$/si)
222    {
223    $strIdentifyText = $1.$3;
224    print STDERR "Page Title: $2\n";
225    }
226 
227    while ($strIdentifyText =~ m/^(.*)<meta (.*?)>(.*)$/si)
228    {
229    $strIdentifyText = $1.$3;
230    my $strTempString = $2;
231    print STDERR "\n";
232
233    while($strTempString =~ m/(.*?)=[\"|\'](.*?)[\"|\'](.*?)$/si)
234    {
235        # Store the infromation in to variable, since next time when we do
236        # regular expression, we will lost all the $1, $2, $X....
237        $strTempString = $3;
238        my $strMetaName = $1;
239        my $strMetaContain = $2;
240       
241        # Take out the extra space in the beginning of the string.
242        $strMetaName =~ s/^([" "])+//m;
243        $strMetaContain =~ s/^([" "])+//m;
244             
245        print STDERR "$strMetaName: $strMetaContain\n\n";
246           
247    }
248
249    }
250
251    print STDERR "<<Finished>>\n";
252
253}
254
255
256sub error
257{
258    my ($strFunctionName,$strError) = @_;
259    {
260    print "Error occoured in WebDownload.pm\n".
261        "In Function:".$strFunctionName."\n".
262        "Error Message:".$strError."\n";
263    exit(-1);
264    }
265}
266
2671;
268
Note: See TracBrowser for help on using the browser.