root/main/trunk/greenstone2/perllib/downloaders/MediaWikiDownload.pm @ 32779

Revision 32779, 9.4 KB (checked in by ak19, 20 months ago)

1. Bugfix to wget downloading: the call to quotewords() necessarily has its 2nd parameter, keep, set to false/0 in order to remove quotes from cmd args, but this also has the undesirable side-effect of removing single backslashes (while double backslashes get turned into single ones). This caused the download tutorial to fail: the cache dir on Windows, which contained backslashes going in, ended up containing no backslashes at all by the time wget was run. The correct place to fix it is before quotewords() gets called in WgetDownload's 2 useWget() methods. Before quotewords() is called, single backslashes are now protected as double backslashes, so that quotewords() with 2nd param keep=0 gets things back to normal by turning the double backslashes into single ones again. 2. Minor change to MediaWikiDownload to reuse a variable.
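A minimal sketch of the quotewords() behaviour described above, using the standard Text::ParseWords module (the wget command and Windows path are illustrative, not the actual tutorial values):

    use Text::ParseWords qw(quotewords);

    my $cmd = 'wget -P "C:\gsdl\cache" http://example.org/wiki/';

    # With the 2nd param $keep set to 0, quotes are stripped but single
    # backslashes are consumed too: the -P argument comes back as "C:gsdlcache".
    my @args = quotewords('\s+', 0, $cmd);

    # The fix applied in WgetDownload: double every backslash first, so that
    # quotewords() collapses them back to single backslashes.
    (my $protected = $cmd) =~ s/\\/\\\\/g;
    @args = quotewords('\s+', 0, $protected);
    # The -P argument now comes back as "C:\gsdl\cache".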

  • Property svn:executable set to *
###########################################################################
#
# MediaWikiDownload.pm -- downloader for wiki pages
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package MediaWikiDownload;

eval {require bytes};

# suppress the annoying "subroutine redefined" warning that various
# plugins cause under perl 5.6
$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};

use WgetDownload;

sub BEGIN {
    @MediaWikiDownload::ISA = ('WgetDownload');
}

use strict; # every perl program should have this!
no strict 'refs'; # make an exception so we can use variables as filehandles

my $arguments =
    [ { 'name' => "url",
	'disp' => "{WebDownload.url_disp}",
	'desc' => "{WebDownload.url}",
	'type' => "string",
	'reqd' => "yes"},
      { 'name' => "depth",
	'disp' => "{WebDownload.depth_disp}",
	'desc' => "{WebDownload.depth}",
	'type' => "int",
	'deft' => "0",
	'range' => "0,",
	'reqd' => "no"},
      { 'name' => "below",
	'disp' => "{WebDownload.below_disp}",
	'desc' => "{WebDownload.below}",
	'type' => "flag",
	'reqd' => "no"},
      { 'name' => "within",
	'disp' => "{WebDownload.within_disp}",
	'desc' => "{WebDownload.within}",
	'type' => "flag",
	'reqd' => "no"},
      { 'name' => "reject_files",
	'disp' => "{MediaWikiDownload.reject_filetype_disp}",
	'desc' => "{MediaWikiDownload.reject_filetype}",
	'type' => "string",
	'reqd' => "no",
	'deft' => "*action=*,*diff=*,*oldid=*,*printable*,*Recentchangeslinked*,*Userlogin*,*Whatlinkshere*,*redirect*,*Special:*,Talk:*,Image:*,*.ppt,*.pdf,*.zip,*.doc"},
      { 'name' => "exclude_directories",
	'disp' => "{MediaWikiDownload.exclude_directories_disp}",
	'desc' => "{MediaWikiDownload.exclude_directories}",
	'type' => "string",
	'reqd' => "no",
	'deft' => "/wiki/index.php/Special:Recentchangeslinked,/wiki/index.php/Special:Whatlinkshere,/wiki/index.php/Talk:Creating_CD"},
      ];

my $options = { 'name'     => "MediaWikiDownload",
		'desc'     => "{MediaWikiDownload.desc}",
		'abstract' => "no",
		'inherits' => "yes",
		'args'     => $arguments };

my $wget_options = "";

my $self;

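# $wget_options is file-scoped so that the wget option string assembled in
# download() can be reused by check_imported_css() when fetching stylesheets.
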
sub new
{
    my ($class) = shift (@_);
    my ($getlist,$inputargs,$hashArgOptLists) = @_;
    push(@$getlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $self = new WgetDownload($getlist,$inputargs,$hashArgOptLists);

    return bless $self, $class;
}

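# The wget flags assembled in download() below are standard wget options:
#   -N    timestamping: only re-fetch files newer than the local copy
#   -k    convert links in downloaded documents to point at local copies
#   -x    force creation of a directory hierarchy mirroring the URL
#   -t 2  limit each download to 2 retries
#   -P    directory prefix (the collection's cache dir) to download into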
sub download
{
    my ($self) = shift (@_);
    my ($hashGeneralOptions) = @_;

    # Download options
    my $strOptions = $self->generateOptionsString();
    my $strWgetOptions = $self->getWgetOptions();

    # Setup the command for using wget
    $wget_options = "-N -k -x -t 2 -P \"". $hashGeneralOptions->{"cache_dir"}."\" $strWgetOptions $strOptions ";
    my $cmdWget   = $wget_options . $self->{'url'};

    # Download the web pages
    # print "Start download from $self->{'url'}...\n";
    print STDERR "<<Undefined Maximum>>\n";

    my $strResponse = $self->useWget($cmdWget,1);

    # if ($strResponse ne ""){print "$strResponse\n";}

    print STDERR "Finished download from $self->{'url'}\n";

    # check that the CSS files for the downloaded HTML pages were fetched as well
    $self->{'url'} =~ /http:\/\/([^\/]*)\//;
    my $base_url = &FileUtils::filenameConcatenate($hashGeneralOptions->{"cache_dir"}, $1);
    &check_file($base_url, $self);

    print STDERR "<<Finished>>\n";

    return 1;
}

sub check_file
{
    my $dir = shift;
    my ($self) = shift;

    local *DIR;

    opendir DIR, $dir or return;

    my @contents = grep { $_ ne '.' and $_ ne '..' } readdir DIR;

    closedir DIR;

    foreach (@contents) {
	# recurse into subdirectories (depth-first search)
	&check_file("$dir/$_", $self);

	# read in every HTML page and check that the CSS files referenced
	# by its @import statements have been downloaded
	&check_imported_css($self, "$dir/$_");
    }
}

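# wget's recursive retrieval follows links found in HTML, but stylesheets
# pulled in through CSS @import rules are missed (CSS parsing only appeared
# in later wget releases), so check_imported_css() scans each downloaded
# page for @import statements and fetches any missing stylesheets itself.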
sub check_imported_css
{
    my ($self) = shift(@_);
    my $downloaded_file = shift(@_);

    $downloaded_file =~ /(.+)(\/|\\)cache(\/|\\)([^\/\\]+)(\/|\\)/;

    # external website url, used to make up the stylesheet urls
    my $base_website_url = $4;

    # the cache download file directory
    # (a "my $x = ... if $cond;" declaration has undefined behaviour in Perl,
    # so declare first and assign conditionally)
    my $base_dir;
    $base_dir = "$1$2cache$3$4" if defined $base_website_url;

    if($downloaded_file=~/(.+)\.(html|htm)/) {
	my $content = "";
	if(open(INPUT, "<$downloaded_file")){
	    while(my $line = <INPUT>){
		$content .= $line;
	    }
	    close(INPUT);
	}

	my @css_files;
	my @css_files_paths;
	my $css_file_count = 0;
	while($content =~ /<style type="text\/css"(.+)?import "(.+)?"/ig){
	    $css_files[$css_file_count] = $base_website_url . $2 if defined $2;
	    $css_files_paths[$css_file_count] = $base_dir . $2 if defined $2;
	    $css_file_count++;
	}

	for($css_file_count=0; $css_file_count<scalar(@css_files); $css_file_count++) {
	    my $css_file = "http://" . $css_files[$css_file_count];
	    my $css_file_path = $css_files_paths[$css_file_count];

	    # trim any query string (e.g. "?123") appended to the stylesheet URL
	    $css_file =~ s/\?([^\/\.\s]+)$//isg;
	    $css_file_path =~ s/\?([^\/\.\s]+)$//isg;

	    # do nothing if the css file already exists
	    next if(-e $css_file_path);

	    # otherwise use wget to download the css file
	    my $cmdWget = $wget_options . $css_file;
	    my $strResponse = $self->useWget($cmdWget,1);

	    print STDERR "Downloaded associated stylesheet : $css_file\n";
	}
    }
}

sub generateOptionsString
{
    my ($self) = @_;
    my $strOptions = "";

    (defined $self) || &error("generateOptionsString","No \$self is defined!!\n");
    (defined $self->{'depth'})|| &error("generateOptionsString","No depth is defined!!\n");

    # -r for recursive downloading
    # -E to append an 'html' suffix to a file of type 'application/xhtml+xml' or 'text/html'
    $strOptions .= "-r -E ";

    # -X exclude these directories from the download
    # (the option is a string, so test for a non-empty value rather than == 1)
    if(defined $self->{'exclude_directories'} && $self->{'exclude_directories'} ne "") {
	$strOptions .= "-X " . $self->{'exclude_directories'} . " ";
    } else {
	$strOptions .= "-X /wiki/index.php/Special:Recentchangeslinked,/wiki/index.php/Special:Whatlinkshere,/wiki/index.php/Talk:Creating_CD ";
    }

    # -R reject list: reject files with any of these patterns in their names
    if(defined $self->{'reject_files'} && $self->{'reject_files'} ne "") {
	$strOptions .= "-R " . $self->{'reject_files'} . " ";
    } else {
	$strOptions .= "-R *action=*,*diff=*,*oldid=*,*printable*,*Recentchangeslinked*,*Userlogin*,*Whatlinkshere*,*redirect*,*Special:*,Talk:*,Image:* ";
    }

    if($self->{'depth'} == 0){
	$strOptions .= " ";
    } elsif($self->{'depth'} > 0) {
	$strOptions .= "-l ".$self->{'depth'}." "; # already got -r
    } else {
	&error("generateOptionsString","Incorrect depth is defined!!\n");
    }

    if($self->{'below'})  {
	$strOptions .= "-np "; # -np: don't ascend to the parent directory
    }

    # if($self->{'html_only'}) {
    #	$strOptions .="-A .html,.htm,.shm,.shtml,.asp,.php,.cgi,*?*=* ";
    # } else{
    #	$strOptions .="-p ";
    # }

    if (!$self->{'within'}){
	$strOptions .= "-H "; # -H: span hosts (follow links to other hosts)
    }

    return $strOptions;
}

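# url_information() runs wget with -q (quiet) and -O - (write the page to
# stdout) to fetch the page, then scrapes its <title> and <meta> tags and
# prints them to STDERR for display.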
sub url_information
{
    my ($self) = shift (@_);

    my $strOptions = $self->getWgetOptions();

    my $strBaseCMD = $strOptions." -q -O - \"$self->{'url'}\"";

    my $strIdentifyText = $self->useWget($strBaseCMD);

    if (!defined $strIdentifyText or $strIdentifyText eq ""){
	print STDERR "Server information is unavailable.\n";
	print STDERR "<<Finished>>\n";
	return;
    }

    while ($strIdentifyText =~ m/^(.*)<title>(.*?)<\/title>(.*)$/s)  {
	$strIdentifyText = $1.$3;
	print STDERR "Page Title: $2\n";
    }

    while ($strIdentifyText =~ m/^(.*)<meta (.*?)>(.*)$/s)  {
	$strIdentifyText = $1.$3;
	my $strTempString = $2;
	print STDERR "\n";

	while($strTempString =~ m/(.*?)=["'](.*?)["'](.*?)$/s){
	    # Store the information in variables, since the next regular
	    # expression match would clobber $1, $2, etc.
	    $strTempString = $3;
	    my $strMetaName = $1;
	    my $strMetaContain = $2;

	    # Trim any leading whitespace from the strings.
	    $strMetaName =~ s/^\s+//;
	    $strMetaContain =~ s/^\s+//;

	    print STDERR "$strMetaName: $strMetaContain\n\n";
	}
    }

    print STDERR "<<Finished>>\n";
}

sub error
{
    my ($strFunctionName,$strError) = @_;

    print "Error occurred in MediaWikiDownload.pm\n".
	  "In Function: ".$strFunctionName."\n".
	  "Error Message: ".$strError."\n";
    exit(-1);
}

1;