###########################################################################
#
# MediaWikiPlugin.pm -- html plugin with extra facilities for wiki pages
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# This plugin processes an HTML file from a MediaWiki website that was
# downloaded by the MediaWikiDownload plugin. It trims MediaWiki functional
# sections such as login, discussion and history; only the navigation and
# search sections can be preserved. The search box is modified to search the
# Greenstone collection instead of the website. The plugin can also
# automatically add the table of contents on the website's Main_Page to the
# collection's Home page.

package MediaWikiPlugin;

use HTMLPlugin;
use unicode;
use util;
use FileUtils;

use strict; # every perl program should have this!
no strict 'refs'; # make an exception so we can use variables as filehandles

# Register HTMLPlugin as the parent class. Because this runs as a BEGIN
# block, @ISA is populated while the file is still being compiled.
sub BEGIN {
    @MediaWikiPlugin::ISA = ('HTMLPlugin');
}

# Options specific to this plugin. new() appends this list to the "ArgList"
# entry of the hash it is given.
# The 'desc' values are "{...}" placeholders; presumably they are keys that
# are resolved to display text elsewhere -- TODO confirm.
# NOTE(review): the 'deft' patterns for toc_exp, nav_div_exp and
# searchbox_div_exp each start with "]*)", i.e. a ")" with no "(" before it.
# They look truncated; verify them against the upstream plugin before
# relying on these defaults.
my $arguments = [
    # show the table of contents on collection's home page
    { 'name' => "show_toc",
      'desc' => "{MediaWikiPlugin.show_toc}",
      'type' => "flag",
      'reqd' => "no" },

    # set to delete the table of contents section on each MediaWiki page
    { 'name' => "delete_toc",
      'desc' => "{MediaWikiPlugin.delete_toc}",
      'type' => "flag",
      'reqd' => "no" },

    # regexp to match the table of contents
    { 'name' => "toc_exp",
      'desc' => "{MediaWikiPlugin.toc_exp}",
      'type' => "regexp",
      'reqd' => "no",
      'deft' => "]*)id=(\\\"|')toc(\\\"|')(.|\\n)*?\\n" },

    # set to delete the navigation section
    { 'name' => "delete_nav",
      'desc' => "{MediaWikiPlugin.delete_nav}",
      'type' => "flag",
      'reqd' => "no",
      'deft' => "" },

    # regexp to match the navigation section
    { 'name' => "nav_div_exp",
      'desc' => "{MediaWikiPlugin.nav_div_exp}",
      'type' => "regexp",
      'reqd' => "no",
      'deft' => "]*)id=(\\\"|')p-navigation(\\\"|')(.|\\n)*?<\/div>" },

    # set to delete the searchbox section
    { 'name' => "delete_searchbox",
      'desc' => "{MediaWikiPlugin.delete_searchbox}",
      'type' => "flag",
      'reqd' => "no",
      'deft' => "" },

    # regexp to match the searchbox section
    { 'name' => "searchbox_div_exp",
      'desc' => "{MediaWikiPlugin.searchbox_div_exp}",
      'type' => "regexp",
      'reqd' => "no",
      'deft' => "]*)id=(\\\"|')p-search(\\\"|')(.|\\n)*?<\/div>" },

    # regexp to match title suffix
    # can't use the title_sub option in HTMLPlugin instead
    # because title_sub always matches from the beginning
    { 'name' => "remove_title_suffix_exp",
      'desc' => "{MediaWikiPlugin.remove_title_suffix_exp}",
      'type' => "regexp",
      'reqd' => "no",
      'deft' => "" },
];

# Descriptor for this plugin; new() appends it to the "OptList" entry of the
# hash it is given. 'args' points at the option list defined above.
my $options = {
    'name'     => "MediaWikiPlugin",
    'desc'     => "{MediaWikiPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};

# Constructor.
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - handed to the HTMLPlugin constructor unchanged
#   $hashArgOptLists - hash ref; $arguments is appended to its "ArgList"
#                      array and $options to its "OptList" array
# Returns the object produced by the HTMLPlugin constructor, re-blessed
# into $class.
sub new {
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments});
    push(@{ $hashArgOptLists->{"OptList"} }, $options);

    # Call the parent constructor with explicit class-method syntax. The
    # indirect-object form ("new HTMLPlugin(...)") relies on parser
    # heuristics, which is fragile in a package that defines its own new().
    my $self = HTMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}

sub process { my $self = shift (@_); my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_; my $outhandle = $self->{'outhandle'}; my @head_and_body = split(/(.+)<\/title>/i; my $doctitle = $1 if defined $1; if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) { my @doc_properties = split(//i,$head); my $doc_heading = shift(@doc_properties); my $rest_doc_properties = join(" ", @doc_properties); my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties); my $extracted_metadata = shift (@extracted_metadata); $self->extract_metadata($extracted_metadata, $metadata, $doc_obj); } # set the title here if we haven't found it yet if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) { if (defined $doctitle && $doctitle =~ /\S/) { # remove suffix in title if required my $remove_suffix_exp = $self->{'remove_title_suffix_exp'}; if (defined $remove_suffix_exp && $remove_suffix_exp =~ /\S/){ $doctitle =~ s/$remove_suffix_exp//i; } $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle); } else { $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file); } } # we are only interested in the column-contents div
# remove header section, it may contain header images or additional search boxes my $header_exp = "]*)id=(\"|')container(\"|')([^>]*)>(.|\\n)*]*)id=(\"|')column-content"; if($body_text =~ /$header_exp/){ $body_text =~ s/$header_exp///mg; # remove extra bits my $extra_bits = "Retrieved from(.+)\""; $body_text =~ s/$extra_bits//isg; $body_text =~ s/(]*>]*> <\/o:p><\/span><\/p>)//isg; $body_text =~ s/(]*> <\/o:p><\/p>)//isg; $body_text =~ s///g; $body_text =~ s/( )+/ /sg; # get rid of the [edit] buttons $body_text =~ s/\[]*)>edit<\/a>]//g; # get rid of the last time edit information at the bottom $body_text =~ s/]*?)>(\w+)<\/a> \d\d:\d\d,([\s|\w]*?)\(PST\)//g; # get rid of the (Redirected from ...) $body_text =~ s/\(Redirected from ]*)>(\w|\s)*?<\/a>\)//isg; # escape texts macros $body_text =~ s/_([^\s]*)_/_$1<\/span>_/isg; # may change the links, like Greenstone_Documentation_All.html, then change back $body_text =~ s/]*)_([^>]*)<\/span>_/){ # comment out the body element because we change the body to div $line =~ s/^(\s*)body(\s*)\{(\s*)$/$1\/*body$2*\/{$3/isg; if($line =~ m/^(.+)\{/i || $line =~ m/^(\s)*#/i){ if($line !~ m/wikispecificstyle/i){ $line = "#wikispecificstyle " . $line; } } $css_content .= $line; } close(INPUT); open(OUTPUT, ">$css_file_path"); print OUTPUT $css_content; close(OUTPUT); } # Copy the modified stylesheets to collection's style folder # for future customization my $style_dir = $base_dir; $style_dir =~ s/import$/style/; $css_file =~ m/(.*)\/(.*)$/; $style_dir = &FileUtils::filenameConcatenate($style_dir, $2); if(open(OUTPUT, ">$style_dir")){ print OUTPUT $css_content; close(OUTPUT); } } } # by default, only preserve navigation box and search box # others like toolbox, interaction, languages box, will be removed # extract the larger part -- footer section my $print_footer = "
(.|\n)+"; $body_text =~ /$print_footer/; my $footer = ""; $footer = $& if defined $&; $footer =~ s/<\/body>//isg; # trim the comments first $footer =~ s/