###########################################################################
#
# MediaWikiDownload.pm -- downloader for wiki pages
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package MediaWikiDownload;

eval {require bytes};

# suppress the annoying "subroutine redefined" warning that various
# plugins cause under perl 5.6
$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};

use WgetDownload;
use util; # for util::filename_cat, used when locating the downloaded cache

sub BEGIN {
    @MediaWikiDownload::ISA = ('WgetDownload');
}

use strict;       # every perl program should have this!
no strict 'refs'; # make an exception so we can use variables as filehandles

my $arguments =
    [ { 'name' => "url",
	'disp' => "{WebDownload.url_disp}",
	'desc' => "{WebDownload.url}",
	'type' => "string",
	'reqd' => "yes"},
      { 'name' => "depth",
	'disp' => "{WebDownload.depth_disp}",
	'desc' => "{WebDownload.depth}",
	'type' => "int",
	'deft' => "0",
	"range" => "0,",
	'reqd' => "no"},
      { 'name' => "below",
	'disp' => "{WebDownload.below_disp}",
	'desc' => "{WebDownload.below}",
	'type' => "flag",
	'reqd' => "no"},
      { 'name' => "within",
	'disp' => "{WebDownload.within_disp}",
	'desc' => "{WebDownload.within}",
	'type' => "flag",
	'reqd' => "no"},
      { 'name' => "reject_files",
	'disp' => "{MediaWikiDownload.reject_filetype_disp}",
	'desc' => "{MediaWikiDownload.reject_filetype}",
	'type' => "string",
	'reqd' => "no",
	'deft' => "*action=*,*diff=*,*oldid=*,*printable*,*Recentchangeslinked*,*Userlogin*,*Whatlinkshere*,*redirect*,*Special:*,Talk:*,Image:*,*.ppt,*.pdf,*.zip,*.doc"},
      { 'name' => "exclude_directories",
	'disp' => "{MediaWikiDownload.exclude_directories_disp}",
	'desc' => "{MediaWikiDownload.exclude_directories}",
	'type' => "string",
	'reqd' => "no",
	'deft' => "/wiki/index.php/Special:Recentchangeslinked,/wiki/index.php/Special:Whatlinkshere,/wiki/index.php/Talk:Creating_CD"},
      ];

my $options = { 'name'     => "MediaWikiDownload",
		'desc'     => "{MediaWikiDownload.desc}",
		'abstract' => "no",
		'inherits' => "yes",
		'args'     => $arguments };

my $wget_options = "";
my $self;

sub new {
    my ($class) = shift (@_);
    my ($getlist,$inputargs,$hashArgOptLists) = @_;
    push(@$getlist, $class);

    if(defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}},@{$arguments}); }
    if(defined $options)   { push(@{$hashArgOptLists->{"OptList"}},$options); }

    my $self = (defined $hashArgOptLists) ?
	new WgetDownload($getlist,$inputargs,$hashArgOptLists) :
	new WgetDownload($getlist,$inputargs);

    return bless $self, $class;
}
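
# Illustrative usage sketch (not part of the module).  In practice this
# downloader is driven by Greenstone's download machinery (e.g. via GLI or a
# download script) rather than instantiated by hand, and the argument-parsing
# plumbing shown here is simplified: the parent WgetDownload/BaseDownload
# classes are assumed to parse "-url" into $self->{'url'}.  The wiki URL and
# cache directory below are made-up examples.
#
#   use MediaWikiDownload;
#
#   my $getlist  = [];
#   my $argv     = ["-url", "http://example.org/wiki/index.php/Main_Page"];
#   my $arg_opts = {"ArgList" => [], "OptList" => []};
#
#   my $downloader = new MediaWikiDownload($getlist, $argv, $arg_opts);
#   $downloader->download({"cache_dir" => "/tmp/mediawiki_cache"});
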
sub download
{
    my ($self) = shift (@_);
    my ($hashGeneralOptions) = @_;

    # Download options
    my $strOptions = $self->generateOptionsString();
    my $strWgetOptions = $self->getWgetOptions();

    # Setup the command for using wget
    $wget_options = "-N -k -x -t 2 -P \"". $hashGeneralOptions->{"cache_dir"}."\" $strWgetOptions $strOptions ";
    my $cmdWget = "-N -k -x -t 2 -P \"". $hashGeneralOptions->{"cache_dir"}."\" $strWgetOptions $strOptions " . $self->{'url'};

    # Download the web pages
    # print "Start download from $self->{'url'}...\n";
    print STDERR "<<Undefined Maximum>>\n";

    my $strResponse = $self->useWget($cmdWget,1);

    # if ($strResponse ne ""){print "$strResponse\n";}
    print STDERR "Finished download from $self->{'url'}\n";

    # check that the CSS files referenced by the downloaded HTML pages
    # have been downloaded as well
    $self->{'url'} =~ /http:\/\/([^\/]*)\//;
    my $base_url = &util::filename_cat($hashGeneralOptions->{"cache_dir"}, $1);
    &check_file($base_url, $self);

    print STDERR "<<Finished>>\n";

    return 1;
}

sub check_file
{
    my $dir = shift;
    my ($self) = shift;

    local *DIR;
    opendir DIR, $dir or return;
    my @contents = grep { $_ ne '.' and $_ ne '..' } readdir DIR;
    closedir DIR;

    foreach (@contents) {
	# recurse into subdirectories (depth-first traversal of the cache)
	&check_file("$dir/$_", $self);

	# read in every HTML page and check that the CSS files referenced by
	# its import statements already exist in the cache
	&check_imported_css($self, "$dir/$_");
    }
}

sub check_imported_css
{
    my ($self) = shift(@_);
    my $downloaded_file = shift(@_);

    $downloaded_file =~ /(.+)(\/|\\)cache(\/|\\)([^\/\\]+)(\/|\\)/;

    # external website url, to make up the stylesheet urls
    my $base_website_url = $4;

    # the cache download file directory
    my $base_dir = "$1$2cache$3$4" if defined $base_website_url;

    if($downloaded_file =~ /(.+)\.(html|htm)/) {
	my $content = "";
	if(open(INPUT, "<$downloaded_file")){
	    while(my $line = <INPUT>){
		$content .= $line;
	    }
	    close(INPUT);
	}

	my @css_files;
	my @css_files_paths;
	my $css_file_count = 0;

	while($content =~ /