root/main/trunk/greenstone2/perllib/downloaders/WebDownload.pm @ 31856

Revision 31856, 6.8 KB (checked in by ak19, 3 years ago)

If the proxy settings are wrong or set when not needed, pressing the Server Information button would take forever (freeze GLI GUI), and the wget that java launched through perl would also take forever, blocking. The wget will have to be terminated from Task Manager. To overcome issues of network settings misconfigurations, which Dr Bainbridge said are hard to detect, setting the number of tries on pressing the Server Info button to 2. The number of tries for pressing the Download button were already 2, so this just makes the two wget commands issued more similar (but the wget launched by the Download button now uses the --tries=2 rather than the shorthand -t 2 too, so that the code reads better). Setting the number of wget retries launched by the Server Info Dialog also ensures wget is eventually terminated, as happens when both tries fail. Some more informative messages are now displayed if the server is unavaiable, depending on whether proxying is on or not.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# WebDownload.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WebDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use WgetDownload;
35
36sub BEGIN {
37    @WebDownload::ISA = ('WgetDownload');
38}
39
40use strict; # every perl program should have this!
41no strict 'refs'; # make an exception so we can use variables as filehandles
42
43my $arguments =
44    [ { 'name' => "url",
45    'disp' => "{WebDownload.url_disp}",
46    'desc' => "{WebDownload.url}",
47    'type' => "string",
48    'reqd' => "yes"},
49      { 'name' => "depth",
50    'disp' => "{WebDownload.depth_disp}",
51    'desc' => "{WebDownload.depth}",
52    'type' => "int",
53    'deft' => "0",
54    "range" => "0,",
55    'reqd' => "no"},
56      { 'name' => "below",
57    'disp' => "{WebDownload.below_disp}",
58    'desc' => "{WebDownload.below}",
59    'type' => "flag",
60    'reqd' => "no"},
61      { 'name' => "within",
62    'disp' => "{WebDownload.within_disp}",
63    'desc' => "{WebDownload.within}",
64    'type' => "flag",
65    'reqd' => "no"},
66      { 'name' => "html_only",
67    'disp' => "{WebDownload.html_only_disp}",
68    'desc' => "{WebDownload.html_only}",
69    'type' => "flag",
70    'reqd' => "no"}
71      ];
72
73my $options = { 'name'     => "WebDownload",
74        'desc'     => "{WebDownload.desc}",
75        'abstract' => "no",
76        'inherits' => "yes",
77        'args'     => $arguments };
78
79
80my $self;
81
82sub new
83{
84    my ($class) = shift (@_);
85    my ($getlist,$inputargs,$hashArgOptLists) = @_;
86    push(@$getlist, $class);
87
88    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
89    push(@{$hashArgOptLists->{"OptList"}},$options);
90
91    my $self = new WgetDownload($getlist,$inputargs,$hashArgOptLists);
92   
93    return bless $self, $class;
94}
95
96sub download
97{
98    my ($self) = shift (@_);
99    my ($hashGeneralOptions) = @_;
100
101 
102    # Download options
103    my $strOptions = $self->generateOptionsString();
104    my $strWgetOptions = $self->getWgetOptions();
105     
106    # Setup the command for using wget
107    my $cache_dir = "";
108    if($hashGeneralOptions->{'cache_dir'}) { # don't provide the prefix-dir flag to wget unless the cache_dir is specified
109    if ($ENV{'GSDLOS'} eq "windows") {   
110        $cache_dir = "-P \"".$hashGeneralOptions->{'cache_dir'}."\" ";
111    }
112    else {
113        $cache_dir = "-P ".$hashGeneralOptions->{'cache_dir'};
114    }
115    }
116    #my $cmdWget = "-N -k -x -t 2 -P \"".$hashGeneralOptions->{"cache_dir"}."\" $strWgetOptions $strOptions ".$self->{'url'};
117    my $cmdWget = "-N -k -x --tries=2 $strWgetOptions $strOptions $cache_dir " .$self->{'url'};
118
119    #print STDOUT "\n@@@@ RUNNING WGET CMD: $cmdWget\n\n";
120   
121    # Download the web pages
122    # print "Start download from $self->{'url'}...\n";
123    print STDERR "<<Undefined Maximum>>\n";
124   
125    if ($ENV{'GSDLOS'} eq "windows") {
126    my $strResponse = $self->useWget($cmdWget,1);
127    } else {
128    my $strResponse = $self->useWget($cmdWget,1,$hashGeneralOptions->{"cache_dir"} );
129   
130    }
131   
132    # if ($strResponse ne ""){print "$strResponse\n";}
133     print STDERR "Finish download from $self->{'url'}\n";
134
135    print STDERR "<<Finished>>\n";
136 
137    return 1;
138}
139
140sub generateOptionsString
141{
142    my ($self) = @_;
143    my $strOptions;
144
145    (defined $self) || &error("generateOptionsString","No \$self is defined!!\n");
146    (defined $self->{'depth'})|| &error("generateOptionsString","No depth is defined!!\n");
147   
148   
149    if($self->{'depth'} == 0)
150    {
151    $strOptions .= " ";
152    }
153    elsif($self->{'depth'} > 0)
154    {
155    $strOptions .= "-r -l ".$self->{'depth'}." ";
156    }
157    else
158    {
159    $self->error("setupOptions","Incorrect Depth is defined!!\n");
160    }
161
162    if($self->{'below'})
163    {
164    $strOptions .="-np ";
165    }
166
167     if($self->{'html_only'})
168    {
169    $strOptions .="-A .html,.htm,.shm,.shtml,.asp,.php,.cgi,*?*=* ";
170    }
171    else{
172
173    $strOptions .="-p ";
174    }
175
176    if (!$self->{'within'}){
177    $strOptions .="-H ";
178    }
179
180    return $strOptions;
181 
182}
183
184sub url_information
185{
186    my ($self) = shift (@_);
187
188    my $strOptions = $self->getWgetOptions();
189
190    my $strBaseCMD = $strOptions." --tries=2 -q -O - \"$self->{'url'}\"";
191
192 
193    my $strIdentifyText = $self->useWget($strBaseCMD);
194   
195    if (!defined $strIdentifyText or $strIdentifyText eq ""  ){
196        print STDERR "Server information is unavailable.\n";           
197        if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'}) { # if proxying set, the settings may be wrong
198            print STDERR "Current proxy settings are:\n";
199            print STDERR "- host=$self->{'proxy_host'}\n";
200            print STDERR "- port=$self->{'proxy_port'}\n";
201        } else { # else no proxy set, the user may need proxy settings
202            prin STDERR "The external server might not be responding, or you might need to switch on proxy settings.\n";
203        }
204        print STDERR "<<Finished>>\n";
205         return; 
206    }
207
208    while ($strIdentifyText =~ m/^(.*)<title>(.*?)<\/title>(.*)$/si)
209    {
210    $strIdentifyText = $1.$3;
211    print STDERR "Page Title: $2\n";
212    }
213 
214    while ($strIdentifyText =~ m/^(.*)<meta (.*?)>(.*)$/si)
215    {
216    $strIdentifyText = $1.$3;
217    my $strTempString = $2;
218    print STDERR "\n";
219
220    while($strTempString =~ m/(.*?)=[\"|\'](.*?)[\"|\'](.*?)$/si)
221    {
222        # Store the infromation in to variable, since next time when we do
223        # regular expression, we will lost all the $1, $2, $X....
224        $strTempString = $3;
225        my $strMetaName = $1;
226        my $strMetaContain = $2;
227       
228        # Take out the extra space in the beginning of the string.
229        $strMetaName =~ s/^([" "])+//m;
230        $strMetaContain =~ s/^([" "])+//m;
231             
232        print STDERR "$strMetaName: $strMetaContain\n\n";
233           
234    }
235
236    }
237
238    print STDERR "<<Finished>>\n";
239
240}
241
242
243sub error
244{
245    my ($strFunctionName,$strError) = @_;
246    {
247    print "Error occoured in WebDownload.pm\n".
248        "In Function:".$strFunctionName."\n".
249        "Error Message:".$strError."\n";
250    exit(-1);
251    }
252}
253
2541;
255
Note: See TracBrowser for help on using the browser.