root/main/trunk/greenstone2/perllib/downloaders/WebDownload.pm @ 31851

Revision 31851, 6.3 KB (checked in by ak19, 2 years ago)

1. Fixes to get proxying to work on Windows. 2. Fixes to timeout if a page doesn't exist and it takes forever to read. Both for downloading from a URL and getting server info (perl code), and also in Java code, when doing a getRedirectURL(). Generally, a URL is correct and when wget is launched, a cancel operation in the Java GUI successfully causes and interrupt which then terminates wget. However, if the URL doesn't exist, either when getting serer info or when downloading, the wget launched by the perl seems to block or something, and the interrupt is not noticed until the wget is manually terminated through the task manager. Then the interrupt is finally noticed. If pages would indicate they don't exist, then it wouldn't have been a problem. This issue is now circumvented through setting a read-timeout, to stop retrieving pages that don't exist but that take forever to access anyway as they don't indicate that they don't exist. A connect timeout is for if you get proxy details wrong or something like that and it takes forever to connect.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# WebDownload.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package WebDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use WgetDownload;
35
36sub BEGIN {
37    @WebDownload::ISA = ('WgetDownload');
38}
39
40use strict; # every perl program should have this!
41no strict 'refs'; # make an exception so we can use variables as filehandles
42
43my $arguments =
44    [ { 'name' => "url",
45    'disp' => "{WebDownload.url_disp}",
46    'desc' => "{WebDownload.url}",
47    'type' => "string",
48    'reqd' => "yes"},
49      { 'name' => "depth",
50    'disp' => "{WebDownload.depth_disp}",
51    'desc' => "{WebDownload.depth}",
52    'type' => "int",
53    'deft' => "0",
54    "range" => "0,",
55    'reqd' => "no"},
56      { 'name' => "below",
57    'disp' => "{WebDownload.below_disp}",
58    'desc' => "{WebDownload.below}",
59    'type' => "flag",
60    'reqd' => "no"},
61      { 'name' => "within",
62    'disp' => "{WebDownload.within_disp}",
63    'desc' => "{WebDownload.within}",
64    'type' => "flag",
65    'reqd' => "no"},
66      { 'name' => "html_only",
67    'disp' => "{WebDownload.html_only_disp}",
68    'desc' => "{WebDownload.html_only}",
69    'type' => "flag",
70    'reqd' => "no"}
71      ];
72
73my $options = { 'name'     => "WebDownload",
74        'desc'     => "{WebDownload.desc}",
75        'abstract' => "no",
76        'inherits' => "yes",
77        'args'     => $arguments };
78
79
80my $self;
81
82sub new
83{
84    my ($class) = shift (@_);
85    my ($getlist,$inputargs,$hashArgOptLists) = @_;
86    push(@$getlist, $class);
87
88    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
89    push(@{$hashArgOptLists->{"OptList"}},$options);
90
91    my $self = new WgetDownload($getlist,$inputargs,$hashArgOptLists);
92   
93    return bless $self, $class;
94}
95
96sub download
97{
98    my ($self) = shift (@_);
99    my ($hashGeneralOptions) = @_;
100
101 
102    # Download options
103    my $strOptions = $self->generateOptionsString();
104    my $strWgetOptions = $self->getWgetOptions();
105     
106    # Setup the command for using wget
107    my $cache_dir = "";
108    if($hashGeneralOptions->{'cache_dir'}) { # don't provide the prefix-dir flag to wget unless the cache_dir is specified
109    if ($ENV{'GSDLOS'} eq "windows") {   
110        $cache_dir = "-P \"".$hashGeneralOptions->{'cache_dir'}."\" ";
111    }
112    else {
113        $cache_dir = "-P ".$hashGeneralOptions->{'cache_dir'};
114    }
115    }
116    #my $cmdWget = "-N -k -x -t 2 -P \"".$hashGeneralOptions->{"cache_dir"}."\" $strWgetOptions $strOptions ".$self->{'url'};
117    my $cmdWget = "-N -k -x -t 2 --read-timeout=2 --connect-timeout=2 $strWgetOptions $strOptions $cache_dir " .$self->{'url'};   
118
119    #print STDOUT "\n@@@@ RUNNING WGET CMD: $cmdWget\n\n";
120   
121    # Download the web pages
122    # print "Start download from $self->{'url'}...\n";
123    print STDERR "<<Undefined Maximum>>\n";
124   
125    if ($ENV{'GSDLOS'} eq "windows") {
126    my $strResponse = $self->useWget($cmdWget,1);
127    } else {
128    my $strResponse = $self->useWget($cmdWget,1,$hashGeneralOptions->{"cache_dir"} );
129   
130    }
131   
132    # if ($strResponse ne ""){print "$strResponse\n";}
133     print STDERR "Finish download from $self->{'url'}\n";
134
135    print STDERR "<<Finished>>\n";
136 
137    return 1;
138}
139
140sub generateOptionsString
141{
142    my ($self) = @_;
143    my $strOptions;
144
145    (defined $self) || &error("generateOptionsString","No \$self is defined!!\n");
146    (defined $self->{'depth'})|| &error("generateOptionsString","No depth is defined!!\n");
147   
148   
149    if($self->{'depth'} == 0)
150    {
151    $strOptions .= " ";
152    }
153    elsif($self->{'depth'} > 0)
154    {
155    $strOptions .= "-r -l ".$self->{'depth'}." ";
156    }
157    else
158    {
159    $self->error("setupOptions","Incorrect Depth is defined!!\n");
160    }
161
162    if($self->{'below'})
163    {
164    $strOptions .="-np ";
165    }
166
167     if($self->{'html_only'})
168    {
169    $strOptions .="-A .html,.htm,.shm,.shtml,.asp,.php,.cgi,*?*=* ";
170    }
171    else{
172
173    $strOptions .="-p ";
174    }
175
176    if (!$self->{'within'}){
177    $strOptions .="-H ";
178    }
179
180    return $strOptions;
181 
182}
183
184sub url_information
185{
186    my ($self) = shift (@_);
187
188    my $strOptions = $self->getWgetOptions();
189
190    my $strBaseCMD = $strOptions." --timeout=4 --tries=1 -q -O - \"$self->{'url'}\"";
191
192 
193    my $strIdentifyText = $self->useWget($strBaseCMD);
194   
195    if (!defined $strIdentifyText or $strIdentifyText eq ""  ){
196    print STDERR "Server information is unavailable.\n";
197    print STDERR "<<Finished>>\n";
198         return; 
199    }
200
201    while ($strIdentifyText =~ m/^(.*)<title>(.*?)<\/title>(.*)$/si)
202    {
203    $strIdentifyText = $1.$3;
204    print STDERR "Page Title: $2\n";
205    }
206 
207    while ($strIdentifyText =~ m/^(.*)<meta (.*?)>(.*)$/si)
208    {
209    $strIdentifyText = $1.$3;
210    my $strTempString = $2;
211    print STDERR "\n";
212
213    while($strTempString =~ m/(.*?)=[\"|\'](.*?)[\"|\'](.*?)$/si)
214    {
215        # Store the infromation in to variable, since next time when we do
216        # regular expression, we will lost all the $1, $2, $X....
217        $strTempString = $3;
218        my $strMetaName = $1;
219        my $strMetaContain = $2;
220       
221        # Take out the extra space in the beginning of the string.
222        $strMetaName =~ s/^([" "])+//m;
223        $strMetaContain =~ s/^([" "])+//m;
224             
225        print STDERR "$strMetaName: $strMetaContain\n\n";
226           
227    }
228
229    }
230
231    print STDERR "<<Finished>>\n";
232
233}
234
235
236sub error
237{
238    my ($strFunctionName,$strError) = @_;
239    {
240    print "Error occoured in WebDownload.pm\n".
241        "In Function:".$strFunctionName."\n".
242        "Error Message:".$strError."\n";
243    exit(-1);
244    }
245}
246
2471;
248
Note: See TracBrowser for help on using the browser.