[14662] | 1 | ###########################################################################
|
---|
| 2 | #
|
---|
[15872] | 3 | # MediaWikiPlugin.pm -- html plugin with extra facilities for wiki page
|
---|
[14662] | 4 | #
|
---|
| 5 | # A component of the Greenstone digital library software
|
---|
| 6 | # from the New Zealand Digital Library Project at the
|
---|
| 7 | # University of Waikato, New Zealand.
|
---|
| 8 | #
|
---|
| 9 | # Copyright (C) 1999 New Zealand Digital Library Project
|
---|
| 10 | #
|
---|
| 11 | # This program is free software; you can redistribute it and/or modify
|
---|
| 12 | # it under the terms of the GNU General Public License as published by
|
---|
| 13 | # the Free Software Foundation; either version 2 of the License, or
|
---|
| 14 | # (at your option) any later version.
|
---|
| 15 | #
|
---|
| 16 | # This program is distributed in the hope that it will be useful,
|
---|
| 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 19 | # GNU General Public License for more details.
|
---|
| 20 | #
|
---|
| 21 | # You should have received a copy of the GNU General Public License
|
---|
| 22 | # along with this program; if not, write to the Free Software
|
---|
| 23 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 24 | #
|
---|
| 25 | ###########################################################################
|
---|
| 26 | # This plugin processes an HTML file from a MediaWiki website that was downloaded by
|
---|
| 27 | # MediaWikiDownload. It trims MediaWiki functional sections such as
|
---|
| 28 | # login, discussion, history, etc. Only the navigation and search sections can be preserved.
|
---|
| 29 | # The search box is modified to search the Greenstone collection instead of the website.
|
---|
| 30 | # The plugin can also automatically add the table of contents on the website's Main_Page to the
|
---|
| 31 | # collection's Home page.
|
---|
| 32 |
|
---|
[15872] | 33 | package MediaWikiPlugin;
|
---|
[14662] | 34 |
|
---|
[15872] | 35 | use HTMLPlugin;
|
---|
[14662] | 36 | use unicode;
|
---|
[28560] | 37 | use util;
|
---|
| 38 | use FileUtils;
|
---|
[14662] | 39 |
|
---|
[15887] | 40 | use strict; # every perl program should have this!
|
---|
| 41 | no strict 'refs'; # make an exception so we can use variables as filehandles
|
---|
[14662] | 42 |
|
---|
| 43 |
|
---|
# Establish inheritance at compile time: MediaWikiPlugin is a subclass of
# HTMLPlugin.
BEGIN {
    @MediaWikiPlugin::ISA = qw(HTMLPlugin);
}
|
---|
| 47 |
|
---|
# Plugin-specific options.  Each entry describes one option: its name, the
# key of its description string, its type, whether it is required and
# (optionally) its default value.  The regular-expression defaults are kept
# in q{} so that their backslashes appear exactly as they end up in the
# pattern.
my $arguments = [

    # show the table of contents on the collection's home page
    {
        name => 'show_toc',
        desc => '{MediaWikiPlugin.show_toc}',
        type => 'flag',
        reqd => 'no',
    },

    # delete the table of contents section on each MediaWiki page
    {
        name => 'delete_toc',
        desc => '{MediaWikiPlugin.delete_toc}',
        type => 'flag',
        reqd => 'no',
    },

    # regexp matching the table of contents
    {
        name => 'toc_exp',
        desc => '{MediaWikiPlugin.toc_exp}',
        type => 'regexp',
        reqd => 'no',
        deft => q{<table([^>]*)id=(\"|')toc(\"|')(.|\n)*?</table>\n},
    },

    # delete the navigation section
    {
        name => 'delete_nav',
        desc => '{MediaWikiPlugin.delete_nav}',
        type => 'flag',
        reqd => 'no',
        deft => '',
    },

    # regexp matching the navigation section
    {
        name => 'nav_div_exp',
        desc => '{MediaWikiPlugin.nav_div_exp}',
        type => 'regexp',
        reqd => 'no',
        deft => q{<div([^>]*)id=(\"|')p-navigation(\"|')(.|\n)*?</div>},
    },

    # delete the searchbox section
    {
        name => 'delete_searchbox',
        desc => '{MediaWikiPlugin.delete_searchbox}',
        type => 'flag',
        reqd => 'no',
        deft => '',
    },

    # regexp matching the searchbox section
    {
        name => 'searchbox_div_exp',
        desc => '{MediaWikiPlugin.searchbox_div_exp}',
        type => 'regexp',
        reqd => 'no',
        deft => q{<div([^>]*)id=(\"|')p-search(\"|')(.|\n)*?</div>},
    },

    # regexp matching a title suffix to strip; the title_sub option of
    # HTMLPlugin can't be used for this because title_sub always matches
    # from the beginning of the title
    {
        name => 'remove_title_suffix_exp',
        desc => '{MediaWikiPlugin.remove_title_suffix_exp}',
        type => 'regexp',
        reqd => 'no',
        deft => '',
    },
];
|
---|
| 99 |
|
---|
# Description of this plugin as a whole; 'args' points at the option table
# declared in $arguments.
my $options = {
    name     => 'MediaWikiPlugin',
    desc     => '{MediaWikiPlugin.desc}',
    abstract => 'no',
    inherits => 'yes',
    args     => $arguments,
};
|
---|
| 105 |
|
---|
# Constructor.  Records this class in the plugin list, appends this plugin's
# argument and option descriptions to the shared lists, builds the object
# through HTMLPlugin's constructor and re-blesses it into $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@{$pluginlist}, $class);

    push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments});
    push(@{ $hashArgOptLists->{"OptList"} }, $options);

    my $plugin = HTMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless($plugin, $class);
}
|
---|
| 117 |
|
---|
| 118 |
|
---|
| 119 |
|
---|
# Clean up one downloaded MediaWiki HTML page and pass it on to HTMLPlugin.
#
# Arguments (after $self):
#   $textref  - reference to the page's HTML; rewritten in place
#   $pluginfo, $metadata, $doc_obj, $gli - passed through to SUPER::process;
#               $doc_obj also receives the Title metadata
#   $base_dir - the collection's import directory (forward slashes)
#   $file     - path of the page relative to $base_dir (platform delimiter);
#               its first component is taken as the wiki's host name
#
# In order, the method:
#   1. extracts the title (and, if metadata_fields is set, <o:...> metadata);
#   2. strips header, timeline, edit links and similar MediaWiki chrome;
#   3. fetches (if missing) and prefixes the @import'ed stylesheets with
#      "#wikispecificstyle", copying them to the collection's style folder;
#   4. replaces the footer by the preserved navigation/search sections;
#   5. removes other forms and turns links into absolute http:// links;
#   6. optionally copies Main_Page's table of contents into extra.dm and/or
#      deletes the table of contents from the page;
#   7. wraps the body in <div id="wikispecificstyle"> and calls
#      SUPER::process with the original argument list.
#
# Returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    my @head_and_body = split(/<body/i, $$textref);
    my $head = shift(@head_and_body);
    $head = "" unless defined $head;
    my $body_text = join("<body", @head_and_body);

    # List-context capture: $doctitle stays undef when there is no <title>,
    # instead of picking up $1 from some earlier, unrelated match.
    my ($doctitle) = $head =~ m/<title>(.+)<\/title>/i;

    if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
        my @doc_properties = split(/<xml>/i, $head);
        my $doc_heading = shift(@doc_properties);
        my $rest_doc_properties = join(" ", @doc_properties);

        my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
        my $extracted_metadata = shift (@extracted_metadata);
        $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
    }

    # set the title here if we haven't found it yet
    if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
        if (defined $doctitle && $doctitle =~ /\S/) {
            # remove suffix in title if required
            my $remove_suffix_exp = $self->{'remove_title_suffix_exp'};
            if (defined $remove_suffix_exp && $remove_suffix_exp =~ /\S/) {
                $doctitle =~ s/$remove_suffix_exp//i;
            }
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
        } else {
            $self->title_fallback($doc_obj, $doc_obj->get_top_section(), $file);
        }
    }

    # we are only interested in the column-contents div <div id="column-content">
    # remove header section, it may contain header images or additional search boxes
    my $header_exp = "<div([^>]*)id=(\"|')container(\"|')([^>]*)>(.|\\n)*<div([^>]*)id=(\"|')column-content";
    if ($body_text =~ /$header_exp/) {
        $body_text =~ s/$header_exp/<div$1id='container'$4><div$6id='column-content/isg;
    } else {
        $header_exp = "(.|\\n)*?<div([^>]*)?id=(\"|')column-content";
        if ($body_text =~ /$header_exp/) {
            $body_text =~ s/$header_exp/<div$2id='column-content/i;
        }
    }

    # remove timeline
    $body_text =~ s/<div([^>]*)class=("|')smwtimeline("|')[\s\S]*?<\/div>//mg;

    # remove extra bits
    my $extra_bits = "Retrieved from(.+)</a>\"";
    $body_text =~ s/$extra_bits//isg;

    # NOTE(review): the blank inside <o:p> </o:p> and in the last pattern
    # below is reproduced as it appears in the available listing; confirm
    # that it is not meant to be the &nbsp; entity.
    $body_text =~ s/(<p[^>]*><span[^>]*><o:p> <\/o:p><\/span><\/p>)//isg;
    $body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;
    $body_text =~ s/<!\[if !vml\]>/<![if vml]>/g;
    $body_text =~ s/( )+/ /sg;

    # get rid of the [edit] buttons
    $body_text =~ s/\[<a([^>]*)>edit<\/a>]//g;
    # get rid of the last time edit information at the bottom
    $body_text =~ s/<a href="([^>]*)edit([^>]*)"([^>]*?)>(\w+)<\/a> \d\d:\d\d,([\s|\w]*?)\(PST\)//g;
    # get rid of the (Redirected from ...)
    $body_text =~ s/\(Redirected from <a ([^>]*)>(\w|\s)*?<\/a>\)//isg;

    # escape texts macros
    $body_text =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg;
    # may change the links, like Greenstone_Documentation_All.html, then change back
    $body_text =~ s/<a([^>]*)_<span>([^>]*)<\/span>_/<a$1_$2_/isg;

    # IMPORTANT: $base_dir and $file use different delimiters.
    #   $base_dir uses forward slashes on both windows and linux, e.g.
    #     windows: C:/Program Files/Greenstone2.73/collect/wiki/import
    #     linux:   /research/lh92/greenstone/greenstone2.73/collect/wiki/import
    #   $file uses the platform delimiter, e.g.
    #     windows: greenstone.sourceforge.net\wiki\index.php\Access_Processing_using_DBPlugin.html
    #     linux:   greenstone.sourceforge.net/wiki/index.php/Using_GreenstoneWiki.html
    my $file_delimiter = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) ? "\\" : "/";

    # get the base url for the MediaWiki website: the first component of $file
    my $safe_delimiter = safe_escape_regexp($file_delimiter);
    my @url_dirs = split($safe_delimiter, $file);
    my $url_base = defined $url_dirs[0] ? $url_dirs[0] : "";
    # the host name contains dots, so quote it before using it in a pattern
    my $url_base_re = quotemeta($url_base);

    # Re-check css files associated with MediaWiki pages
    if (defined $base_dir && $base_dir ne "") {
        my @css_files;

        # find all the stylesheets imported with @import statement
        while ($head =~ m"<style type=\"text/css\"(.+)import \"(.+)\""ig) {
            push(@css_files, $2) if defined $2;
        }

        # Set the env for wget once, outside the for loop
        # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
        &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set

        # download the stylesheets if we haven't downloaded them yet
        # add prefix to each style element, comment out the body element
        # and copy the files to collection's style folder
        foreach my $imported_css (@css_files) {
            my $css_file = $imported_css;

            # remove prefix gli/cache directory
            $css_file =~ s/^(.+)gli(\\|\/)cache(\\|\/)//i;

            # change the \ delimiter in $css_file to / for consistency
            $css_file =~ s/\\/\//isg;
            if (index($css_file, $url_base) < 0) {
                $css_file = $url_base . $css_file;
            }

            # trim the ? mark append to the end of a stylesheet
            $css_file =~ s/\?(.+)$//isg;

            my $css_file_path = &FileUtils::filenameConcatenate($base_dir, $css_file);

            # do nothing if we have already downloaded the css files
            if (! -e $css_file_path) {

                # check the stylesheet's directory in the import folder
                # if the directory doesn't exist, create one
                my @dirs = split(/\//, $css_file);
                my $path_check = "$base_dir/";
                for (my $i = 0; $i < (scalar(@dirs)-1); $i++) {
                    $path_check .= $dirs[$i] . "/";
                    mkdir($path_check) if (! -d $path_check );
                }

                # NOTE: wget needs configuration to directly access Internet
                # These files should already downloaded if we used the MediaWikiDownload
                # downloading
                $css_file = "http://$css_file";
                print "\ndownloading : " . $css_file . "\n\n";
                system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path");
                if ($? != 0) {
                    print "[ERROR] Download Failed! Make sure WGet connects to Internet directly \n";
                    print "[ERROR] OR ues the MediaWikiDownload in the GLI DownloadPanel to download from a MediaWiki website\n";
                    unlink("$css_file_path");
                }
            } # done with download

            # add a prefix "#wikispecificstyle" to each element
            # because we want to preserve this website's formats and don't want to mess up with Greenstone formats
            # so we will wrap the web page with a div with id = wikispecificstyle
            my $css_in;
            if (!open($css_in, '<', $css_file_path)) {
                # Nothing was read, so there is nothing to write back or to
                # copy; in particular don't truncate an existing copy in the
                # style folder.
                next;
            }

            my $css_content = "";
            while (my $line = <$css_in>) {
                # comment out the body element because we change the body to div
                $line =~ s/^(\s*)body(\s*)\{(\s*)$/$1\/*body$2*\/{$3/isg;

                if ($line =~ m/^(.+)\{/i || $line =~ m/^(\s)*#/i) {
                    if ($line !~ m/wikispecificstyle/i) {
                        $line = "#wikispecificstyle " . $line;
                    }
                }

                $css_content .= $line;
            }
            close($css_in);

            if (open(my $css_out, '>', $css_file_path)) {
                print $css_out $css_content;
                close($css_out);
            } else {
                print $outhandle "can't open file $css_file_path\n";
            }

            # Copy the modified stylesheets to collection's style folder
            # for future customization
            my $style_dir = $base_dir;
            $style_dir =~ s/import$/style/;
            my ($css_basename) = $css_file =~ m/.*\/(.*)$/;
            $css_basename = $css_file unless defined $css_basename;
            my $style_copy_path = &FileUtils::filenameConcatenate($style_dir, $css_basename);

            if (open(my $style_out, '>', $style_copy_path)) {
                print $style_out $css_content;
                close($style_out);
            }
        }
    }


    # by default, only preserve navigation box and search box
    # others like toolbox, interaction, languages box, will be removed

    # extract the larger part -- footer section
    my $print_footer = "<div class=\"printfooter\">(.|\n)+</body>";
    my $footer = "";
    # $& is only meaningful after a successful match, so test the match
    if ($body_text =~ /$print_footer/) {
        $footer = $&;
    }
    $footer =~ s/<\/body>//isg;

    # trim the comments first
    $footer =~ s/<!--[\s\S]*?--[ \t\n\r]*>//isg;

    # contain sections that are to be preserved
    my $preserve_sections = "";

    # process the navigation section
    my $nav_match_exp = "<div([^>]*)id=(\"|')p-navigation(\"|')(.|\n)*?<\/div>";
    if (defined $self->{'nav_div_exp'} && $self->{'nav_div_exp'} =~ /\S/) {
        $nav_match_exp = $self->{'nav_div_exp'};
    }

    if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} eq "1")) {
        # do nothing
    } else {
        if ($footer =~ m/$nav_match_exp/i) {
            $preserve_sections = $&;
        } else {
            print $outhandle "Can't find the navigation section with : $nav_match_exp\n";
        }
    }

    # process the searchbox section
    my $searchbox_exp = "<div([^>]*)id=(\"|')p-search(\"|')(.|\\n)*?<\/div>";
    if (defined $self->{'searchbox_div_exp'} && $self->{'searchbox_div_exp'} =~ /\S/) {
        $searchbox_exp = $self->{'searchbox_div_exp'};
    }

    my $searchbox_section = "";
    if ($footer =~ m/$searchbox_exp/i) {
        $searchbox_section = $&;
    }

    # make the searchbox form work in Greenstone
    if ($searchbox_section =~ /\S/) {
        # replace action
        $searchbox_section =~ s/action="([^>]*)"/action="_gwcgi_"/isg;

        # remove buttons
        $searchbox_section =~ s/name="search"/name="q"/isg;
        $searchbox_section =~ s/name="go"//isg;
        $searchbox_section =~ s/name="fulltext"//isg;

        # get collection name from $base_dir for c param
        my $collection_name;
        if (defined $base_dir) {
            ($collection_name) = $base_dir =~ m/\/collect\/(.+)\//i;
        }
        $collection_name = "" unless defined $collection_name;

        # add Greenstone search params
        my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
            ."<input type=\"hidden\" name=\"c\" value=\"$collection_name\"/>\n";

        $searchbox_section =~ s/<form([^>]*)>/<form$1>\n$hidden_params/isg;
    } else {
        # report the expression that failed, not the (empty) section
        print $outhandle "Can't find the searchbox section with : $searchbox_exp\n";
    }

    # either delete or replace the searchbox
    if (defined $self->{'delete_searchbox'} && $self->{'delete_searchbox'} eq "1") {
        # do nothing
    } else {
        $preserve_sections .= "\n$searchbox_section\n";
    }

    if ($preserve_sections ne "") {
        $preserve_sections = "<div id=\"column-one\">\n" . $preserve_sections . "\n</div>\n";
    }
    $preserve_sections = "</div></div></div>\n" . $preserve_sections . "\n</body>";

    $body_text =~ s/$print_footer/$preserve_sections/isg;


    # delete other forms in the page
    my @forms;
    while ($body_text =~ m/<form([^>]*)name=("|')([^>"']*)?("|')/isg) {
        next if (defined $3 && $3 eq "searchform");
        push(@forms, $&);
    }
    foreach my $form (@forms) {
        # the opening tag is literal text (it may contain ?, ., + ...)
        $body_text =~ s/\Q$form\E[\s\S]*?<\/form>//;
    }

    # process links.
    # because current WGET 1.10 the -k and -E option doesn't work together
    # need to 'manually' convert the links to relative links
    # Dealing with 3 types of links:
    # -- outgoing links
    #    -- if we have downloaded the target files, link to the internal version (relative link)
    #    -- otherwise, link to the external version (absolute links)
    # -- in-page links (relative link)

    # NOTE: (important)
    # must use the MediaWikiDownload in GLI Download Panel to download files from a MediaWiki website
    # otherwise, the internal links may have problems

    # remove the title attribute of <a> tag
    $body_text =~ s/<a([^>]*)title="(.*?)"/<a$1/isg;

    # Rewrite every href/src that points into the wiki's host directory to
    # an absolute http:// link.  Done in one pass so that the link text is
    # never re-used as a regular expression (links routinely contain ?, .
    # and +).
    $body_text =~ s{(href|src)="[^>\s]*$url_base_re/([^>\s]*)"}{$1="http://$url_base/$2"}ig;

    # tag links to new wiki pages as red
    $body_text =~ s/<a([^>]*)class="new"([^>]*)>/<a$1style="color:red"$2>/gi;

    # tag links to pages external of the MediaWiki website as blue
    $body_text =~ s/<a([^>]*)class='external text'([^>]*)>/<a$1style="color:blue"$2>/gi;


    # process the table-of-contents section
    # if 'show_toc' is set, add Main_Page's toc to the collection's About page, change extra.dm file
    # 1. read _content_ macro from about.dm
    # 2. append the toc, change all links to the Greenstone internal format for relative links
    # 3. write to the extra.dm
    if (defined $self->{'show_toc'} && $self->{'show_toc'} == 1 && $file =~ m/Main_Page\.(html|htm)$/) {

        # extract toc of the Main_Page
        my $mainpage_toc = "";
        my $toc_exp = "<table([^>]*)id=(\"|')toc(\"|')(.|\\n)*</table>\\n";
        if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/) {
            $toc_exp = $self->{'toc_exp'};
        }
        if ($body_text =~ /$toc_exp/) {
            $mainpage_toc = $&;
        }

        if ($mainpage_toc =~ /\S/) {

            # change the in-page links to relative links, for example, change <a href="#section1"> to
            # <a href="_httpquery_&a=extlink&rl=1&href=http://www.mediawikisite.com/Main_Page.html#section1">
            my $file_url_format = $file;
            $file_url_format =~ s/\\/\//isg;
            $file_url_format = "http://" . $file_url_format;

            # encode as URL, otherwise doesn't work on Windows
            $file_url_format =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
            $mainpage_toc =~ s/<a href="([^>"#]*)#([^>"]*)"/<a href="_httpquery_&a=extlink&rl=1&href=$file_url_format#$2"/isg;

            # read the collection's extra.dm
            my $macro_path = $base_dir;
            $macro_path =~ s/import$/macros/;
            my $extradm_file = &FileUtils::filenameConcatenate($macro_path, "extra.dm");

            my $extra_dm = "";
            if (open(my $extra_in, '<', $extradm_file)) {
                while (my $line = <$extra_in>) {
                    $extra_dm .= $line;
                }
                close($extra_in);
            } else {
                print $outhandle "can't open file $extradm_file\n";
            }

            # check whether we have changed the macros
            my @packages = split(/package /, $extra_dm);
            my $about_package = "";
            foreach my $package (@packages) {
                $about_package = "package " . $package if ($package =~ /^about/);
            }

            # the section that gets appended to the About page
            my $toc_section = "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";

            my $update_extra_dm = 0;

            # the toc is HTML, so look for it as literal text, not as a pattern
            if ($about_package =~ /\S/ && $about_package =~ m/_content_(\s*)\{/
                && index($about_package, $mainpage_toc) >= 0) {
                print $outhandle "_content_ macro already changed!!!!\n";
            }
            # if extra.dm doesn't have an "about package"
            elsif ($about_package !~ /\S/) {
                # create the extra _content_ macro for this collection from
                # the _content_ macro of $GSDLHOME/macros/about.dm
                my $stock_content = $self->_stock_about_content_prefix();

                # append the new about package to extra.dm
                $extra_dm .= "\n\npackage about\n_content_$stock_content\n\n";
                $extra_dm .= $toc_section;

                $update_extra_dm = 1;
            }
            # the about package exists, but either doesn't have the _content_ macro or
            # the _content_ macro doesn't contain the toc
            else {
                if ($about_package =~ /(\s*|\n)_content_(\s*)\{/) {
                    # there is a _content_ macro: append a new section div
                    # for the toc to the end of its document div, keeping
                    # the div's attributes ($1) and existing content ($2)
                    if ($about_package =~ /(\s*|\n)_content_(\s*)\{(.|\n)*?}/) {
                        my $content_macro = $&;
                        my $new_content_macro = $content_macro;
                        $new_content_macro =~ s/<div([^>]*)class="document">([\s\S]*)<\/div>/<div$1class="document">$2\n\n<div class="section">\n$mainpage_toc\n<\/div>\n<\/div>/;
                        $extra_dm =~ s/\Q$content_macro\E/$new_content_macro/g;
                    }
                }
                # otherwise, append _content_ macro to the about package
                else {
                    my $stock_content = $self->_stock_about_content_prefix();

                    my $new_about_package = $about_package;
                    $new_about_package .= "\n\n_content_$stock_content\n\n";
                    $new_about_package .= $toc_section;
                    $extra_dm =~ s/\Q$about_package\E/$new_about_package/g;
                }

                # either the case, we need to update the extra.dm
                $update_extra_dm = 1;
            }

            if ($update_extra_dm == 1) {
                # write to the extra.dm file of the collection
                if (open(my $extra_out, '>', $extradm_file)) {
                    print $extra_out $extra_dm;
                    close($extra_out);
                } else {
                    print $outhandle "can't open $extradm_file\n";
                }
            }
        } else {
            print $outhandle "Main_Page doesn't have a table-of-contents section\n";
        }
    }

    # If delete_toc is set, remove toc and tof contents.
    if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)) {
        if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/) {
            if ($body_text =~ /$self->{'toc_exp'}/) {
                $body_text =~ s/$self->{'toc_exp'}//i;
            }
        }
    }

    $$textref = "<body" . $body_text;

    # Wrap the whole page with <div id="wikispecificstyle"></div>
    # keep the style of this website and don't mess up with the Greenstone styles
    $$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is;
    $$textref =~ s/<\/body>/<\/div><\/body>/is;

    # @_ still holds the original argument list (minus $self)
    $self->SUPER::process(@_);

    return 1;
}

# Private helper for process(): return the leading part of the stock
# _content_ macro of about.dm, from its opening "{" up to and including the
# last "</div>" that is followed by an empty line.  Returns "" when the macro
# can't be read or doesn't have that layout.
sub _stock_about_content_prefix {
    my $self = shift (@_);

    my $stock_macro = $self->read_content_from_about_dm();
    return "" unless defined $stock_macro;

    return ($stock_macro =~ m/\{[\s\S]*<\/div>\n\n/) ? $& : "";
}
|
---|
| 590 |
|
---|
| 591 |
|
---|
# Pull <o:FIELD>value</o:FIELD> properties out of a chunk of text and add
# them as metadata to the top section of $doc_obj.
#
#   $textref  - despite its name this is used as a plain string (the text to
#               search), not as a reference; nothing is done if it is undef
#   $metadata - not used here
#   $doc_obj  - must provide get_top_section() and add_utf8_metadata()
#
# $self->{'metadata_fields'} is a comma separated list of field names.  An
# entry of the form "field<name>" stores the value of <o:field> under the
# metadata name "name"; a plain "field" is stored under its own name.  Field
# names are matched case-insensitively and literally.
sub extract_metadata
{
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj) = @_;

    return if (!defined $textref);
    return if (!defined $self->{'metadata_fields'});

    # 'key' is the (lowercase) name of the property to look for, 'value' is
    # the metadata name for greenstone to use
    my %find_fields = ();

    foreach my $field (split /,/, $self->{'metadata_fields'}) {
        my $orig_field;
        if ($field =~ /^(.*?)<(.*?)>$/) {
            # support tag<tagname>: "$2" is the user's preferred gs metadata name
            $orig_field = $1;
            $find_fields{lc($orig_field)} = $2;
        } else {
            # no <tagname> for mapping: the field name is the metadata name
            $orig_field = $field;
            $find_fields{lc($field)} = $field;
        }

        # \Q...\E: the field name is literal text, so a name such as "a.b"
        # must not match <o:aXb>
        next unless ($textref =~ m/<o:\Q$orig_field\E>(.*)<\/o:\Q$orig_field\E>/i);

        my $value = $1;
        chomp($value); # remove trailing \n, if any
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $find_fields{lc($orig_field)}, $value);
    }

    return;
}
|
---|
| 635 |
|
---|
# Return a copy of the given string in which every backslash and every
# forward slash is preceded by a backslash, so that a file delimiter can be
# used as a pattern.
sub safe_escape_regexp
{
    my ($delimiter) = @_;

    # one pass: "\" becomes "\\" and "/" becomes "\/"
    (my $escaped = $delimiter) =~ s{([\\/])}{\\$1}g;

    return $escaped;
}
|
---|
| 647 |
|
---|
# Read $GSDLHOME/macros/about.dm and return its _content_ macro, i.e. the
# text from "_content_ {" up to and including the last
# "</div>\n\n</div>\n}" in the file.
#
# Returns "" when the file can't be opened (a message is printed) or when it
# contains no such macro.  The message goes to $self->{'outhandle'}; when the
# sub is called as a plain function, or the handle is not set, it goes to
# STDERR instead.
sub read_content_from_about_dm
{
    my $self = shift(@_);

    my $about_macro_file = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "macros", "about.dm");
    my $about_page_content = "";
    if (open(my $about_in, '<', $about_macro_file)) {
        while (my $line = <$about_in>) {
            $about_page_content .= $line;
        }
        close($about_in);
    } else {
        my $outhandle = (ref($self) && defined $self->{'outhandle'}) ? $self->{'outhandle'} : \*STDERR;
        print $outhandle "can't open file $about_macro_file\n";
    }

    # extract the _content_ macro; $& is only valid after a successful match
    if ($about_page_content =~ m/_content_ \{[\s\S]*<\/div>\n\n<\/div>\n}/i) {
        return $&;
    }

    return "";
}
|
---|
| 670 |
|
---|
| 671 | 1;
|
---|