Changeset 14251
- Timestamp:
- 2007-07-16T10:22:59+12:00 (17 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/perllib/plugins/MediaWikiPlug.pm
r14108 r14251 24 24 # 25 25 ########################################################################### 26 # This plugin is to process an HTML file where sections are divided by 27 # user-defined headings tags. As it is difficult to predict what user's definition 28 # this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...) 29 # as well as allows to get rid of user-defined Table of Content (TOC)... 30 # format:e.g. level1 (Abstract_title|ChapterTitle|Referencing Heading) level2(SectionHeading)... 26 # This plugin is to process an HTML file from a MediaWiki website which downloaded by 27 # the MediaWikiDownload plug. This plugin will trim MediaWiki functional sections like 28 # login, discussion, history, etc. Only the navigation and search section could be preserved. 29 # Searchbox will be modified to search the Greenstone collection instead of the website. 30 # It also can automatically add the table of contents on the website's Main_Page to the 31 # collection's Home page. 31 32 32 33 package MediaWikiPlug; 33 34 34 35 use HTMLPlug; 35 use ImagePlug; 36 use File::Copy; 36 # use ImagePlug; 37 # use File::Copy; 38 use unicode; 39 37 40 38 41 #use strict; # every perl program should have this! 
… … 40 43 41 44 sub BEGIN { 42 @MediaWikiPlug::ISA = ('HTMLPlug'); 45 @MediaWikiPlug::ISA = ('HTMLPlug'); 43 46 } 44 47 45 48 my $arguments = 46 49 [ 50 # show the table of contents on collection's home page 47 51 { 'name' => "show_toc", 48 52 'desc' => "{MediaWikiPlug.show_toc}", 49 53 'type' => "flag", 50 54 'reqd' => "no"}, 55 # set to delete the table of contents section on each MediaWiki page 56 { 'name' => "delete_toc", 57 'desc' => "{MediaWikiPlug.delete_toc}", 58 'type' => "flag", 59 'reqd' => "no"}, 60 # regexp to match the table of contents 51 61 { 'name' => "toc_exp", 52 62 'desc' => "{MediaWikiPlug.toc_exp}", 53 63 'type' => "regexp", 54 64 'reqd' => "no", 55 'deft' => "" }, 56 { 'name' => "delete_toc", 57 'desc' => "{MediaWikiPlug.delete_toc}", 58 'type' => "flag", 59 'reqd' => "no"}, 65 'deft' => "<table([^>]*)id=(\\\"|')toc(\\\"|')(.|\\n)*</table>\\n" }, 66 # set to delete the navigation section 60 67 { 'name' => "delete_nav", 61 68 'desc' => "{MediaWikiPlug.delete_nav}", 62 69 'type' => "flag", 63 70 'reqd' => "no", 64 'deft' => ""}, 65 { 'name' => "nav_exp", 66 'desc' => "{MediaWikiPlug.nav_exp}", 71 'deft' => ""}, 72 # regexp to match the navigation section 73 { 'name' => "nav_div_exp", 74 'desc' => "{MediaWikiPlug.nav_div_exp}", 67 75 'type' => "regexp", 68 76 'reqd' => "no", 69 'deft' => "" }, 70 { 'name' => "tag_sections", 71 'desc' => "{MediaWikiPlug.tag_sections}", 77 'deft' => "<div([^>]*)id=(\\\"|')p-navigation(\\\"|')(.|\\n)*?<\/div>" }, 78 # set to delete the searchbox section 79 { 'name' => "delete_searchbox", 80 'desc' => "{MediaWikiPlug.delete_searchbox}", 72 81 'type' => "flag", 73 'reqd' => "no"}, 74 { 'name' => "description_tags", 75 'desc' => "{HTMLPlug.description_tags}", 76 'type' => "flag", 77 'reqd' => "no"} 82 'reqd' => "no", 83 'deft' => ""}, 84 # regexp to match the searchbox section 85 { 'name' => "searchbox_div_exp", 86 'desc' => "{MediaWikiPlug.searchbox_div_exp}", 87 'type' => "regexp", 88 'reqd' => "no", 89 'deft' => 
"<div([^>]*)id=(\\\"|')p-search(\\\"|')(.|\\n)*?<\/div>"}, 90 # regexp to match title suffix 91 # can't use the title_sub option in HTMLPlug instead 92 # because title_sub always matches from the begining 93 { 'name' => "remove_title_suffix_exp", 94 'desc' => "{MediaWikiPlug.remove_title_suffix_exp}", 95 'type' => "regexp", 96 'reqd' => "no", 97 'deft' => ""} 78 98 ]; 79 80 99 81 100 my $options = { 'name' => "MediaWikiPlug", … … 85 104 'args' => $arguments }; 86 105 87 88 106 sub new { 89 107 my ($class) = shift (@_); … … 112 130 113 131 $head =~ m/<title>(.+)<\/title>/i; 114 my $doctitle = $1 if defined $1; 132 my $doctitle = $1 if defined $1; 115 133 116 134 if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) { … … 126 144 # set the title here if we haven't found it yet 127 145 if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) { 128 if (defined $doctitle && $doctitle =~ /\S/) { 129 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle); 146 if (defined $doctitle && $doctitle =~ /\S/) { 147 # remove suffix in title if required 148 my $remove_suffix_exp = $self->{'remove_title_suffix_exp'}; 149 if (defined $remove_suffix_exp && $remove_suffix_exp =~ /\S/){ 150 $doctitle =~ s/$remove_suffix_exp//i; 151 } 152 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle); 130 153 } else { 131 154 $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file); 132 155 } 133 } 134 135 if(defined $base_dir && $base_dir ne ""){ 136 # find and download stylesheet 156 } 157 158 # we are only interested in the column-contents div <div id="column-content"> 159 # remove header section, it may contain header images or additional search boxes 160 my $header_exp = "<div([^>]*)id=(\"|')container(\"|')([^>]*)>(.|\\n)*<div([^>]*)id=(\"|')column-content"; 161 $body_text =~ s/$header_exp/<div$1id='container'$4><div$6id='column-content/isg; 162 163 # remove timeline 164 $body_text =~ 
s/<div([^>]*)class=("|')smwtimeline("|')[\s\S]*?<\/div>//mg; 165 166 # remove extra bits 167 my $extra_bits = "Retrieved from(.+)</a>\""; 168 $body_text =~ s/$extra_bits//isg; 169 170 $body_text =~ s/(<p[^>]*><span[^>]*><o:p> <\/o:p><\/span><\/p>)//isg; 171 $body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg; 172 $body_text =~ s/<!\[if !vml\]>/<![if vml]>/g; 173 $body_text =~ s/( )+/ /sg; 174 175 # get rid of the [edit] buttons 176 $body_text =~ s/\[<a([^>]*)>edit<\/a>]//g; 177 # get rid of the last time edit information at the bottom 178 $body_text =~ s/<a href="([^>]*)edit([^>]*)"([^>]*?)>(\w+)<\/a> \d\d:\d\d,([\s|\w]*?)\(PST\)//g; 179 # get rid of the (Redirected from ...) 180 $body_text =~ s/\(Redirected from <a ([^>]*)>(\w|\s)*?<\/a>\)//isg; 181 182 # escape texts macros 183 $body_text =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg; 184 # may change the links, like Greenstone_Documentation_All.html, then change back 185 $body_text =~ s/<a([^>]*)_<span>([^>]*)<\/span>_/<a$1_$2_/isg; 186 187 # define file delimiter for different platforms 188 my $file_delimiter; 189 if ($ENV{'GSDLOS'} =~ /^windows$/i) { 190 $file_delimiter = "\\"; 191 } else { 192 $file_delimiter = "/"; 193 } 194 195 # IMPORTANT: different delimiter for $base_dir and $file 196 # $base_dir use forward slash for both windows and linux 197 # print "\nbase_dir : $base_dir\n\n"; # windows: C:/Program Files/Greenstone2.73/collect/wiki/import 198 # linux: /research/lh92/greenstone/greenstone2.73/collect/wiki/import 199 # $file use different delimiters : forward slash for linux; backward slash for windows 200 # print "\nfile : $file\n\n"; # windows: greenstone.sourceforge.net\wiki\index.php\Access_Processing_using_DBPlug.html 201 # linux: greenstone.sourceforge.net/wiki/index.php/Using_GreenstoneWiki.html 202 203 # get the base url for the MediaWiki website 204 my $safe_delimiter = &safe_escape_regexp($file_delimiter); 205 my @url_dirs=split($safe_delimiter, $file); 206 my $url_base = $url_dirs[0]; 207 208 # 
Re-check css files associated with MediaWiki pages 209 if(defined $base_dir && $base_dir ne ""){ 137 210 my @css_files; 138 211 my $css_file_count = 0; 139 # find all the style sheets imported with import statement 212 213 # find all the stylesheets imported with @import statement 140 214 while($head =~ m"<style type=\"text/css\"(.+)import \"(.+)\""ig){ 141 $css_files[$css_file_count++] = $2 if defined $2; 142 } 215 $css_files[$css_file_count++] = $2 if defined $2; 216 } 217 218 # download the stylesheets if we haven't downloaded them yet 219 # add prefix to each style elmement, comment out the body element 220 # and copy the files to collection's images folder 221 for ($css_file_count = 0; $css_file_count < scalar(@css_files); $css_file_count++) { 222 223 my $css_file = $css_files[$css_file_count]; 224 225 # remove prefix gli/cache directory 226 $css_file =~ s/^(.+)gli(\\|\/)cache(\\|\/)//i; 227 228 # change the \ delimiter in $css_file to / for consistency 229 $css_file =~ s/\\/\//isg; 230 if($css_file !~ /$url_base/) { 231 $css_file = $url_base . $css_file; 232 } 233 234 # trim the ? mark append to the end of a stylesheet 235 $css_file =~ s/\?(.+)$//isg; 236 237 my $css_file_path = &util::filename_cat($base_dir, $css_file); 238 239 # do nothing if we have already downloaded the css files 240 if (! -e $css_file_path) { 241 242 # check the stylesheet's directory in the import folder 243 # if the directory doesn't exist, create one 244 my @dirs = split(/\//i,$css_file); 245 my $path_check = "$base_dir/"; 246 for (my $i = 0; $i < (scalar(@dirs)-1); $i++) { 247 $path_check .= $dirs[$i] . "/"; 248 mkdir($path_check) if (! -d $path_check ); 249 } 250 251 # NOTE: wget needs configuration to directly access Internet 252 # These files should already downloaded if we used the MediaWikiDownload 253 # downloading 254 $css_file = "http://$css_file"; 255 print "\ndownloading : " . $css_file . 
"\n\n"; 256 system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path"); 257 if ($? != 0) { 258 print "[ERROR] Download Failed! Make sure WGet connects to Internet directly \n"; 259 print "[ERROR] OR ues the MediaWikiDownload in the GLI DownloadPanel to download from a MediaWiki website\n"; 260 unlink("$css_file_path"); 261 } 262 } # done with download 263 264 # add a prefix "#wikispecificstyle" to each element 265 # because we want to preserve this website's formats and don't want to mess up with Greenstone formats 266 # so we will wrap the web page with a div with id = wikispecificstyle 267 my $css_content; 268 if(open(INPUT, "<$css_file_path")){ 269 while(my $line = <INPUT>){ 270 # comment out the body element because we change the body to div 271 $line =~ s/^(\s*)body(\s*){(\s*)$/$1\/*body$2*\/{$3/isg; 272 273 if($line =~ m/^(.+)\{/i || $line =~ m/^(\s)*#/i){ 274 $line = "#wikispecificstyle " . $line; 275 } 276 $css_content .= $line; 277 } 278 close(INPUT); 279 open(OUTPUT, ">$css_file_path"); 280 print OUTPUT $css_content; 281 close(OUTPUT); 282 } 283 284 # Copy the modified stylesheets to collection's images folder 285 # for future customization 286 my $images_dir = $base_dir; 287 $images_dir =~ s/import$/images/; 288 $css_file =~ m/(.*)\/(.*)$/; 289 $images_dir = &util::filename_cat($images_dir, $2); 290 291 if(open(OUTPUT, ">$images_dir")){ 292 print OUTPUT $css_content; 293 close(OUTPUT); 294 } 295 } 296 } 297 298 299 # by default, only preserve navigation box and search box 300 # others like toolbox, interaction, languages box, will be removed 301 302 # extract the larger part -- footer section 303 my $print_footer = "<div class=\"printfooter\">(.|\n)+</body>"; 304 $body_text =~ /$print_footer/; 305 my $footer = ""; 306 $footer = $& if defined $&; 307 $footer =~ s/<\/body>//isg; 308 309 # trim the comments first 310 $footer =~ s/<!--[\s\S]*?--[ \t\n\r]*>//isg; 311 312 # contain sections that are to be preserved 313 my 
$preserve_sections = ""; 314 315 # process the navigation section 316 my $nav_match_exp = "<div([^>]*)id=(\"|')p-navigation(\"|')(.|\n)*?<\/div>"; 317 if (defined $self->{'nav_div_exp'}) { 318 $nav_match_exp = $self->{'nav_div_exp'} if ($self->{'nav_div_exp'} =~ /\S/) ; 319 } 320 321 if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} eq "1")) { 322 # do nothing 323 } else { 324 if ($footer =~ m/$nav_match_exp/ig) { 325 $preserve_sections = $& ; 326 } else { 327 print $outhandle "Can't find the navigation section with : $nav_match_exp\n"; 328 } 329 # if($preserve_sections =~/\S/){ 330 # $preserve_sections .= "</div>"; 331 # } 332 } 333 334 # process the searchbox section 335 my $searchbox_exp = "<div([^>]*)id=(\"|')p-search(\"|')(.|\\n)*?<\/div>"; 336 if(defined $self->{'searchbox_div_exp'}) { 337 $searchbox_exp = $self->{'searchbox_div_exp'} if ($self->{'searchbox_div_exp'} =~ /\S/); 338 } 339 340 my $searchbox_section = ""; 341 $footer =~ m/$searchbox_exp/ig; 342 $searchbox_section = $& if defined $&; 343 344 # make the searchbox form work in Greenstone 345 if($searchbox_section =~ /\S/){ 346 # replace action 347 $searchbox_section =~ s/action="([^>]*)"/action="_gwcgi_"/isg; 348 349 # remove buttons 350 $searchbox_section =~ s/name="search"/name="q"/isg; 351 $searchbox_section =~ s/name="go"//isg; 352 $searchbox_section =~ s/name="fulltext"//isg; 353 354 # get collection name from $base_dir for c param 355 $base_dir =~ m/\/collect\/(.+)\//i; 356 my $collection_name = ""; 357 $collection_name = $1 if defined $1; 358 359 # add Greenstone search params 360 my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n" 361 ."<input type=\"hidden\" name=\"c\" value=\"$collection_name\"/>\n"; 362 # ."<input type=\"hidden\" name=\"fqf\" value=\"TX\"/>\n" 363 # ."<input type=\"hidden\" name=\"r\" value=\"1\">\n"; 364 365 $searchbox_section =~ s/<form([^>]*)>/<form$1>\n$hidden_params/isg; 366 367 # $searchbox_section .= "</div>"; 368 } else { 369 print 
$outhandle "Can't find the searchbox section with : $searchbox_section\n"; 370 } 371 372 # either delete or replace the searchbox 373 if(defined $self->{'delete_searchbox'} && $self->{'delete_searchbox'} eq "1") { 374 # do nothing 375 } else { 376 $preserve_sections .= "\n$searchbox_section\n"; 377 } 378 379 380 if($preserve_sections ne ""){ 381 $preserve_sections = "<div id=\"column-one\">\n" . $preserve_sections . "\n</div>\n"; 382 } 383 $preserve_sections = "</div></div></div>\n" . $preserve_sections . "\n</body>"; 384 385 $body_text =~ s/$print_footer/$preserve_sections/isg; 386 387 388 # delete other forms in the page 389 my @forms; 390 my $form_count = 0; 391 while($body_text =~ m/<form([^>]*)name=("|')([^>]*)("|')/isg){ 392 next if($3 eq "q"); 393 $forms[$form_count++] = $&; 394 } 395 foreach my $form (@forms) { 396 $body_text =~ s/$form[\s\S]*?<\/form>//m; 397 } 398 399 400 # process links. 401 # because current WGET 1.10 the -k and -E option doesn't work together 402 # need to 'manually' convert the links to relative links 403 # Dealing with 3 types of links: 404 # -- outgoing links 405 # -- if we have downloaded the target files, link to the internal version (relative link) 406 # -- otherwise, link to the external version (absolute links) 407 # -- in-page links (relative link) 408 409 # NOTE: (important) 410 # must use the MediaWikiDownload in GLI Download Panel to download files from a MediaWiki website 411 # otherwise, the internal links may have problems 412 413 # remove the title attribute of <a> tag 414 $body_text =~ s/<a([^>]*)title="(.*?)"/<a$1/isg; 415 416 # extract all the links 417 my @links; 418 my $link_count = 0; 419 while($body_text =~ m/(href|src)="([^>\s]*)$url_base\/([^>\s]*)"/ig){ 420 $links[$link_count++] = "$1=\"$2$url_base/$3\""; 421 } 422 423 foreach my $cur_link (@links) { 424 # escape greedy match + character 425 $cur_link =~ s/\+/\\+/isg; 426 427 $cur_link =~ m/(.+)"([^>]*)$url_base\/([^>\s]*)"/; 428 my $external_file_path = 
"$1\"http://$url_base/$3\""; 429 430 $body_text =~ s/$cur_link/$external_file_path/i; 431 } 432 433 # tag links to new wiki pages as red 434 $body_text =~ s/<a([^>]*)class="new"([^>]*)>/<a$1style="color:red"$2)>/gi; 435 436 # tag links to pages external of the MediaWiki website as blue 437 $body_text =~ s/<a([^>]*)class='external text'([^>]*)>/<a$1style="color:blue"$2)>/gi; 438 439 440 # process the table-of-contents section 441 # if 'show_toc' is set, add Main_Page's toc to the collection's About page, change extra.dm file 442 # 1. read _content_ macro from about.dm 443 # 2. append the toc, change all links to the Greenstone internal format for relative links 444 # 3. write to the extra.dm 445 # TODO: we assume the _about:content_ hasn't been specified before 446 # so needs to add function to handle when the macro is already in the extra.dm 447 if($self->{'show_toc'}==1 && $file =~ m/Main_Page.(html|htm)$/){ 448 449 # extract toc of the Main_Page 450 my $mainpage_toc = ""; 451 my $toc_exp = "<table([^>]*)id=(\"|')toc(\"|')(.|\\n)*</table>\\n"; 452 if($self->{'toc_exp'} =~ /\S/){ 453 $toc_exp = $self->{'toc_exp'}; 454 } 455 if($body_text =~ /$toc_exp/){ 456 $mainpage_toc = $&; 457 } 458 459 if($mainpage_toc =~ /\S/) { 460 461 # change the in-page links to relative links, for example, change <a href="#section1"> to 462 # <a href="_httpquery_&a=extlink&rl=1&href=http://www.mediawikisite.com/Main_Page.html#section1"> 463 my $file_url_format = $file; 464 $file_url_format =~ s/\\/\//isg; 465 $file_url_format = "http://" . 
$file_url_format; 466 467 # encode as URL, otherwise doesn't work on Windows 468 $file_url_format =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg; 469 $mainpage_toc =~ s/<a href="([^>"#]*)#([^>"]*)"/<a href="_httpquery_&a=extlink&rl=1&href=$file_url_format#$2"/isg; 470 471 472 # read the collection's extra.dm 473 my $macro_path = $base_dir; 474 $macro_path =~ s/import$/macros/; 475 my $extradm_file = &util::filename_cat($macro_path, "extra.dm"); 476 477 my $extra_dm = ""; 478 if(open(INPUT, "<$extradm_file")){ 479 while(my $line = <INPUT>){ 480 $extra_dm .= $line; 481 } 482 } else { 483 print $outhandle "can't open file $extradm_file\n"; 484 } 485 close(INPUT); 486 487 # check whether we have changed the macros 488 my @packages = split("package ", $extra_dm); 489 my $about_package = ""; 490 foreach my $package (@packages) { 491 $about_package = "package " . $package if($package =~ /^about/); 492 } 493 494 my $update_extra_dm = 0; 495 496 if( $about_package =~ /\S/ && $about_package =~ m/_content_(\s*){/ && $about_package =~ m/$mainpage_toc/){ 497 print $outhandle "_content_ macro already changed!!!!\n"; 498 } 499 # if extra.dm doesn't have an "about package" 500 elsif ($about_package !~ /\S/) { 501 # read _content_ macro from $GSDLHOME/macros/about.dm file 502 my $global_about_package = &read_content_from_about_dm(); 503 504 # create the extra _content_ macro for this collection 505 # add the original content of the _content_ macro 506 $global_about_package =~ m/{(.|\n)*<\/div>\n\n/; 507 508 # append the new about package to extra.dm 509 $extra_dm .= "\n\npackage about\n_content_$&\n\n"; 510 $extra_dm .= "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}"; 511 512 $update_extra_dm = 1; 513 } 514 # the about package exists, but either doesn't have the _content_ macro or 515 # the _content_ macro doesn't contain the toc 516 else { 517 # check if there is a content macro 518 my $content_macro_existed = 0; 519 $content_macro_existed = ($about_package =~ 
/(\s*|\n)_content_(\s*){/); 520 521 # if there is one 522 # append a new section div for toc to the end of the document section 523 if($content_macro_existed ==1) { 524 $about_package =~ /(\s*|\n)_content_(\s*){(.|\n)*?}/; 525 my $content_macro = $&; 526 my $new_content_macro = $content_macro; 527 $new_content_macro =~ s/<div[^>]*class="document">(.|\n)*<\/div>/<div$1class="document">$2\n\n<div class="section">\n$mainpage_toc\n<\/div>\n<\/div>/; 528 $extra_dm =~ s/$content_macro/$new_content_macro/mg; 529 } 530 # otherwise, append _content_ macro to the about package 531 else { 532 my $new_about_package = $about_package; 533 $content_macro = &read_content_from_about_dm(); 534 $content_macro =~ m/{(.|\n)*<\/div>\n\n/; 535 536 $new_about_package .= "\n\n_content_$&\n\n"; 537 $new_about_package .= "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}"; 538 $extra_dm =~ s/$about_package/$new_about_package/mg; 539 } 540 541 # either the case, we need to update the extra.dm 542 $update_extra_dm = 1; 543 } 544 545 if($update_extra_dm==1){ 546 # write to the extra.dm file of the collection 547 if (open(OUTPUT, ">$extradm_file")) { 548 print OUTPUT $extra_dm; 549 } else { 550 print "can't open $extradm_file\n"; 551 } 552 close(OUTPUT); 553 } 554 } else { 555 print $outhandle "Main_Page doesn't have a table-of-contents section\n"; 556 } 557 } 143 558 144 # check whether the stylesheet exists 145 # if not, download it and copy to the collection's images folder 146 for($css_file_count = 0; $css_file_count < scalar(@css_files); $css_file_count++){ 147 my $css_file = $css_files[$css_file_count]; 148 $css_file =~ s/^(.+)gli\/cache\///i; 149 150 my $css_file_path = "$base_dir/$css_file"; 151 152 if (-e $css_file_path){ # the file already exists 153 next; 154 } 155 156 # check the css directory and create one if it's not there 157 my @dirs = split(/\//i,$css_file); 158 my $path_check = "$base_dir/"; 159 for(my $i = 0; $i < (scalar(@dirs)-1); $i++){ 160 $path_check .= $dirs[$i] 
. "/"; 161 if(! -d $path_check ){ 162 mkdir($path_check); 163 } 164 } 165 166 # download 167 $css_file = "http://$css_file"; 168 system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path"); 169 if ($? != 0) {unlink("$css_file_path");} 170 171 # change every style element to #wikispecificstyle ... 172 if(open(INPUT, "<$css_file_path")){ 173 my $css_content; 174 while(my $line = <INPUT>){ 175 if($line =~ m/^(.+)\{/i){ 176 $line = "#wikispecificstyle " . $line; 177 } 178 $css_content .= $line; 179 } 180 close(INPUT); 181 open(OUTPUT, ">$css_file_path"); 182 print OUTPUT $css_content; 183 close(OUTPUT); 184 } 185 186 # copy to images folder 187 # do not copy, because collection can only have one specific stylesheet 188 # better to add and modify the style sheets manually 189 # @dirs = split(/\//i,$base_dir); 190 # my $collection_base_dir; 191 # for(my $i = 0; $i < (scalar(@dirs)-1); $i++){ 192 # $collection_base_dir .= $dirs[$i] . "/"; 193 # } 194 # my $images_folder = $collection_base_dir . "images/"; 195 # copy($css_file_path, $images_folder) || die "File cannot be copied."; 559 # If delete_toc is set, remove toc and tof contents. 
560 if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){ 561 if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/){ 562 # print "\nit matches toc_exp!!\n" if $body_text =~ /$self->{'toc_exp'}/; 563 if ($body_text =~ /$self->{'toc_exp'}/) { 564 $body_text =~ s/$self->{'toc_exp'}//i; 565 } 196 566 } 197 } 198 199 # add sections around h2 tag 200 # wrap each section with <div id=\"wikispecificstyle\"></div> to get the wiki styles 201 # add search box with each section 202 if ($self->{'tag_sections'}) { 203 my @sections = ($body_text =~ /<h2>(.+)<\/h2>/gi); 204 for(my $i=1; $i < scalar(@sections); $i++){ 205 my $section_title = $sections[$i]; 206 $section_title =~ s/<([^>]*)>//g; 207 $section_title =~ s/(^\s|\s$)//g; 208 my $section_metadata = "<Section>\n<Description>\n<Metadata name=\"Title\">$section_title</Metadata>\n</Description>\n"; 209 if($i !=1){ 210 $section_metadata = "</Section>\n" . $section_metadata; 211 } 212 $section_metadata = "\n<!--\n" . $section_metadata . "-->\n"; 213 214 $section_metadata .= "<div id=\"wikispecificstyle\">\n<div id=\"content\">\n"; 215 $section_metadata = "</div></div>\n" . $section_metadata if $i !=1; 216 217 $body_text =~ s/<h2>$sections[$i]<\/h2>/$section_metadata<h2>$sections[$i]<\/h2>/i; 218 219 if($i==scalar(@sections)-1) { 220 # $body_text =~ s/<div class=\"printfooter\">/<!--\n<\/Section>\n-->\n<div class=\"printfooter\">/i; 221 $body_text =~ s/<div class=\"printfooter\">/<\/div>\n<\/div>\n<!--\n<\/Section>\n-->\n<div class=\"printfooter\">/i; 222 } 223 } 224 } 225 226 # If delete_nav is enabled, it means to get rid of navigation contents. 
227 # if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} == 1)){ 228 # if (defined $self->{'nav_exp'}&& $self->{'nav_exp'} =~ /\S/){ 229 # print "it matches nav_exp!!\n" if $body_text =~ /$self->{'nav_exp'}/; 230 # $body_text =~ s/$self->{'nav_exp'}//isg; 231 # } 232 #} 233 my $searchbox = ""; 234 if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} == 1)){ 235 my $nav_match_express; 236 if (defined $self->{'nav_exp'}&& $self->{'nav_exp'} =~ /\S/) { 237 $nav_match_express = $self->{'nav_exp'} ; 238 } else { # default setting for mediawiki 239 $nav_match_express = "<div class=\"printfooter\">(.|\n)*secs. -->"; 240 } 241 242 print "it matches nav_exp!!\n" if $body_text =~ /$self->{'nav_exp'}/; 243 244 # $body_text =~ m/<div class=\"printfooter\">(.|\n)*secs. -->/isg; 245 $body_text =~ m/$nav_match_express/isg; 246 my $navigate = $& if defined $&; 247 248 # find the search box and add it to the document page 249 if(defined $navigate && $navigate =~ /\S/){ 250 $navigate =~ m/<div id="p-search" class="portlet">(.|\n)*<\/form>/; 251 $searchbox = $& . "\n<\/div>\n<\/div>"; 252 $searchbox =~ s/action="([^>]*)"/action="\/gsdl\/cgi-bin\/library"/isg; 253 $searchbox =~ s/name="search"/name="q"/isg; 254 $searchbox =~ s/name="go"//isg; 255 $searchbox =~ s/name="fulltext"//isg; 256 my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n" 257 ."<input type=\"hidden\" name=\"c\" value=\"wikitest\"/>\n" 258 ."<input type=\"hidden\" name=\"fqf\" value=\"TX\"/>" 259 ."<input type=\"hidden\" name=\"t\" value=\"1\">"; 260 $searchbox =~ s/<\/form>/$hidden_params<\/form>/isg; 261 $searchbox = "\n</div>\n</div><div id=\"wikispecificstyle\"><div id=\"column-one\">$searchbox</div></div>"; 262 } 263 264 # $body_text =~ s/<div class=\"printfooter\">(.|\n)*secs. 
-->/$searchbox/isg; 265 $body_text =~ s/$nav_match_express/$searchbox/isg; 266 } 267 268 if ($self->{'tag_sections'}) { 269 $body_text =~ s/<!--\n<\/Section>/$searchbox\n<!--\n<\/Section>/ig; 270 } 271 272 # Tidy up extra new lines 273 $body_text =~ s/(<p[^>]*><span[^>]*><o:p> <\/o:p><\/span><\/p>)//isg; 274 $body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg; 275 276 $section_text .= "<!--\n<Section>\n-->\n"; 277 my $body = "<body".$body_text; 278 279 $$textref = $body; 280 281 # get the base dir for convert absolute links to relative links 282 $$textref =~ m"href=\"(.*?)/cache/(.*?)/"i; 283 my $basedir = $2; 284 285 $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g; 286 $$textref =~ s/( )+/ /sg; 287 288 # get rid of the [edit] button 289 $$textref =~ s/\[<a([^>]*)>edit<\/a>]//g; 290 291 # get rid of the last time edit information at the bottom 292 $$textref =~ s/<a href="(.+)edit(.*?)"(.*?)>(\w+)<\/a> \d\d:\d\d,(.*?)(PST)//g; 293 294 # get rid of the (Redirected from ...) 295 $$textref =~ s/(Redirected from <a ([^>]*)>(\w|\s)*<\/a>)//isg; 296 297 # escape macros 298 $$textref =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg; 299 # may change the links, like Greenstone_Documentation_All.html, then change back 300 $$textref =~ s/<a([^>]*)_<span>([^>]*)<\/span>_/<a$1_$2_/isg; 301 302 # convert all the urls to relative url, because current wget 1.10 -k and -E option doesn't work together 303 # get rid of the title attribute of a tag 304 $$textref =~ s/<a([^>]*)title="(.*?)"/<a$1/isg; 305 # find the relative path of current directory 306 if($basedir ne ""){ 307 my @dirs=split("\/", $file); 308 my $dirnum = scalar(@dirs); 309 my $replace = ""; 310 for(my $i=0; $i<$dirnum-2; $i++){ 311 $replace .= "../"; 312 } 313 # test if the linked relative file exists, if not, link to the internet version 314 $$textref =~ s/(href|src)="([^>]*)$basedir\/([^>]*)"/$1="$replace$3"/gi; 315 # my @total_links = ($$textref =~ m/(href|src)="([^>]*)$basedir\/([^>]*)"/gi); 316 # print $outhandle "\nnumber of 
total links: " . scalar(@total_links)."\n"; 317 # for(my $cur_link_no = 0; $cur_link_no < scalar(@total_links); $cur_link_no++){ 318 319 #while($$textref =~ m/(href|src)="([^>]*)$basedir\/([^>]*)"/gi){ 320 #$total_links[$cur_link_no] =~ m/(href|src)="([^>]*)$basedir\/([^>]*)"/i; 321 # my $prefix = $1; 322 # my $link = $&; 323 # my $rel_file_name = $3; 324 # my $rel_link = "$replace$rel_file_name"; 325 # print $outhandle "catched link==> $link\nrelative link==> $rel_link\n"; 326 # if(-e $rel_link){ 327 # $rel_link = "$prefix=\"$rel_link\""; 328 # $$textref =~ s/$link/$rel_link/i; 329 # }else{ 330 # my $ext_link = "$prefix=\"http:\/\/$basedir\/$rel_file_name\""; 331 # print $outhandle "external link==> $ext_link\n"; 332 # $$textref =~ s/$link/$ext_link/i; #s/$link/$prefix="http:\/\/$rel_file_name"/i; 333 # } 334 #} 335 336 337 # tag the link to new wiki pages as red 338 $$textref =~ s/(href|src)="$replace([^>]*)&action=edit([^>]*)"/$1="http:\/\/$basedir\/$2&action=edit$3"/gi; 339 $$textref =~ s/<a([^>]*)class="new"([^>]*)>/<a$1style="color:red"$2)>/gi; 340 341 # tag the link to external pages as blue 342 $$textref =~ s/<a([^>]*)class='external text'([^>]*)>/<a$1style="color:blue"$2)>/gi; 343 344 #print $outhandle $$textref; 345 } 346 347 # if 'show_toc' is set, put the table of content on the Wiki Main_Page to the about page of the collection 348 # 1. read _content_ macro from about.dm 349 # 2. append the toc, change all links to the Greenstone internal format for relative links 350 # 3. 
write to the extra.dm 351 # TODO: currently we suppose the _about:content_ hasn't been specified before 352 # so needs to add function to handle when the macro is already in the extra.dm 353 if($self->{'show_toc'}==1 && $file =~ m/Main_Page.(html|htm)$/){ 354 my $macro_path = $base_dir; 355 $macro_path =~ s/import$/macros/; 356 my $extra_dm; 357 my $extradm_file = "$macro_path/extra.dm"; 358 if(open(INPUT, "<$extradm_file")){ 359 while(my $line = <INPUT>){ 360 $extra_dm .= $line; 361 } 362 close(INPUT); 363 364 if($extra_dm =~ m/package about/ && $extra_dm =~ m/_content_(\s)*{/){ 365 print $outhandle "already changed!!!!\n"; 366 } else { 367 # read _content_ macro from about.dm file 368 my $about_macro = $ENV{'GSDLHOME'} . "/macros/about.dm"; 369 my $about_page_content = ""; 370 if(open(INPUT, "<$about_macro")){ 371 while(my $line=<INPUT>){ 372 $about_page_content .= $line; 373 } 374 }else{ 375 print $outhandle "can't open file $about_macro\n"; 376 } 377 close(INPUT); 378 379 # extract the _content_ macro 380 $about_page_content =~ m/_content_ {(.|\n)*<\/div>\n\n<\/div>\n}/i; 381 $about_page_content = $&; 382 383 # extract toc of the Main_Page 384 my $mainpage_content = ""; 385 if($self->{'toc_exp'} =~ /\S/){ 386 $$textref =~ /$self->{'toc_exp'}/; 387 $mainpage_content = $&; 388 } else { 389 # $mainpage_content =~ s/<!-- start content -->(.|\n)*<!-- end content -->/$1/igs; 390 } 391 # print $outhandle "---------\n$$textref\n--------\n\n"; 392 # print $outhandle "==========\n$mainpage_content\n==========\n\n"; 393 394 # add toc to the _content_ macro 395 $about_page_content =~ m/{(.|\n)*<\/div>\n\n/; 396 $extra_dm .= "package about\n_content_$&\n\n<div class=\"section\">\n$mainpage_content\n</div>\n</div>\n}"; 397 398 # change all links to the internal Greenstone relative link format 399 $extra_dm =~ s/<a href="([^>]*)"/<a href="_httpquery_&a=extlink&rl=1&href=http:\/\/$basedir$1"/isg; 400 $extra_dm =~ s/(\.\.\/)+/\//isg; 401 # print $outhandle "to 
add---------\n$extra_dm\n--------\n"; 402 403 # write to the extra.dm file of the collection 404 open(OUTPUT, ">$extradm_file"); 405 print OUTPUT $extra_dm; 406 close(OUTPUT); 407 } 408 } else { 409 print $outhandle "can't open file $extradm_file\n"; 410 } 411 } 412 413 # If delete_toc is enabled, it means to get rid of toc and tof contents. 414 # get rid of TOC and TOF sections and their title 415 if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){ 416 if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/){ 417 # $body_text =~ s/<p class=(($self->{'toc_exp'})[^>]*)>(.+?)<\/p>//isg; 418 # print "it matches toc_exp!!\n" if $body_text =~ /$self->{'toc_exp'}/; 419 # $body_text =~ s/$self->{'toc_exp'}//i; 420 print "it matches toc_exp!!\n" if $$textref =~ /$self->{'toc_exp'}/; 421 $$textref =~ s/$self->{'toc_exp'}//i; 422 } 423 } 424 425 # To add a layer on top of the wiki page 426 # so as to keep the wiki style inside the wiki page 427 # and keep the Greenstone style at the same time 428 $$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is; 429 $$textref =~ s/<\/body>/<\/div><\/body>/is; 430 431 # tag with sections 432 $$textref =~ s/<body([^>]*)>/$&\n<!--\n<Section>\n<Description>\n<Metadata name=\"Title\">$doctitle<\/Metadata>\n<\/Description>\n-->\n/is; 433 $$textref =~ s/<\/body>/\n<!--\n<\/Section>\n-->\n/is; 434 435 #print $outhandle "\n\n$$textref\n\n"; 436 437 # use description tags 438 if ($self->{'description_tags'}) { 439 my $cursection = $doc_obj->get_top_section(); 440 # remove the html header - note that doing this here means any 441 # sections defined within the header will be lost (so all <Section> 442 # tags must appear within the body of the HTML) 443 my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is); 444 445 $$textref =~ s/^.*?<body[^>]*>//is; 446 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg; 447 448 my $opencom = '(?:<!--|<!(?:—|—|--))'; 449 my $closecom = '(?:-->|(?:—|—|--)>)'; 450 451 my $lt = 
'(?:<|<)'; 452 my $gt = '(?:>|>)'; 453 my $quot = '(?:"|"|”|“)'; 454 455 # my $dont_strip = ''; 456 # if ($self->{'no_strip_metadata_html'}) { 457 # ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g; 458 # } 459 460 my $found_something = 0; 461 my $top = 1; 462 while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) { 463 my $text = $1; 464 my $comment = $2; 465 if (defined $text) { 466 # text before a comment - note that getting to here 467 # doesn't necessarily mean there are Section tags in 468 # the document 469 # print $outhandle "section text:\n$text\n"; 470 $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection); 471 } 472 while ($comment =~ s/$lt(.*?)$gt//s) { 473 my $tag = $1; 474 if ($tag eq "Section") { 475 $found_something = 1; 476 $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top; 477 $top = 0; 478 } elsif ($tag eq "/Section") { 479 $found_something = 1; 480 $cursection = $doc_obj->get_parent_section ($cursection); 481 } elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) { 482 my $metaname = $1; 483 my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0; 484 $comment =~ s/^(.*?)$lt\/Metadata$gt//s; 485 my $metavalue = $1; 486 $metavalue =~ s/^\s+//; 487 $metavalue =~ s/\s+$//; 488 # assume that no metadata value intentionally includes 489 # carriage returns or HTML tags (if they're there they 490 # were probably introduced when converting to HTML from 491 # some other format). 492 # actually some people want to have html tags in their 493 # metadata. 
494 $metavalue =~ s/[\cJ\cM]/ /sg; 495 # $metavalue =~ s/<[^>]+>//sg unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ /^($dont_strip)$/); 496 $metavalue =~ s/\s+/ /sg; 497 # print $outhandle "metaname = $metaname\nmetavalue = $metavalue\n"; 498 if ($accumulate) { 499 $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue); 500 } else { 501 $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue); 502 } 503 } elsif ($tag eq "Description" || $tag eq "/Description") { 504 # do nothing with containing Description tags 505 } else { 506 # simple HTML tag (probably created by the conversion 507 # to HTML from some other format) - we'll ignore it and 508 # hope for the best ;-) 509 } 510 } 511 }# end while 512 513 if ($cursection ne "") { 514 print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n"; 515 } 516 517 $$textref =~ s/^.*?<body[^>]*>//is; 518 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg; 519 if ($$textref =~ /\S/) { 520 if (!$found_something) { 521 if ($self->{'verbosity'} > 2) { 522 print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n"; 523 print $outhandle " will be processed as a single section document\n"; 524 } 525 526 # go ahead and process single-section document 527 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection); 528 529 } else { 530 print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n"; 531 print $outhandle " of the final closing </Section> tag. This text will\n"; 532 print $outhandle " be ignored."; 533 534 my ($text); 535 if (length($$textref) > 30) { 536 $text = substr($$textref, 0, 30) . "..."; 537 } else { 538 $text = $$textref; 539 } 540 $text =~ s/\n/ /isg; 541 print $outhandle " ($text)\n"; 542 } 543 } elsif (!$found_something) { 544 if ($self->{'verbosity'} > 2) { 545 # may get to here if document contained no valid Section 546 # tags but did contain some comments. 
The text will have 547 # been processed already but we should print the warning 548 # as above and extract metadata 549 print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags and\n"; 550 print $outhandle " is blank or empty. Metadata will be assigned if present.\n"; 551 } 552 } 553 } # if $self->{'description_tags'} 554 else { 555 # remove header and footer 556 # if (!$self->{'keep_head'}) { 557 # $$textref =~ s/^.*?<body[^>]*>//is; 558 # $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg; 559 # } 560 561 # single section document 562 # $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection); 563 564 # Important: to get the relative links to work, 565 # 1: use the below statement instead of the above one 566 # 2. cannot have process_section method. 567 # why????? 568 $self->SUPER::process(@_); 569 } 570 return 1; 567 } 568 569 $$textref = "<body" . $body_text; 570 571 # Wrap the whole page with <div id="wikispecificstyle"></div> 572 # keep the style of this website and don't mess up with the Greenstone styles 573 $$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is; 574 $$textref =~ s/<\/body>/<\/div><\/body>/is; 571 575 572 #$self->SUPER::process(@_); 576 $self->SUPER::process(@_); 577 578 return 1; 573 579 } 574 575 576 577 # note that process_section may be called multiple times for a single578 # section (relying on the fact that add_utf8_text appends the text to any579 # that may exist already).580 # sub process_section {581 # my $self = shift (@_);582 # my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;583 584 # trap links585 # if (!$self->{'nolinks'}) {586 # usemap="./#index" not handled correctly => change to "#index"587 # $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/588 #$self->replace_usemap_links($1, $2, $3)/isge;589 590 #$$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/591 
#$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
#}

# trap images

# allow spaces if inside quotes - jrm21
#$$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)([\"\'][^\"\']+[\"\']|[^\s>]+)([^>]*>)/
#$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;

# add text to document object
# turn \ into \\ so that the rest of greenstone doesn't think there
# is an escape code following. (Macro parsing loses them...)
# $$textref =~ s/\\/\\\\/go;

# $doc_obj->add_utf8_text($cursection, $$textref);
#}

# ... unchanged lines are elided by the changeset view at this point; the only
# visible remnant is the closing "}" of a definition that is not shown here.

# Escape a string so that it can be embedded in a slash-delimited regular
# expression.
#
# Argument: the string to escape (first positional argument).
# Returns:  a copy in which every backslash has been doubled and every
#           forward slash is preceded by a backslash.
#
# The same escaping is applied on every platform; an earlier per-OS branch on
# $ENV{'GSDLOS'} was already disabled in the original code.
sub safe_escape_regexp
{
    my ($regexp) = @_;

    # Backslashes must be doubled first, otherwise the backslashes inserted
    # in front of the slashes below would be doubled as well.
    $regexp =~ s{\\}{\\\\}g;
    $regexp =~ s{/}{\\/}g;

    return $regexp;
}

# Read $GSDLHOME/macros/about.dm and return the text of its _content_ macro.
#
# Argument (optional): a hash-based object whose 'outhandle' entry is the
#     filehandle used for diagnostics. NOTE(review): presumably the plugin
#     object when this is invoked as a method -- confirm against the callers,
#     which are not visible from here.
# Returns: the text from "_content_ {" up to the last "</div>\n\n</div>\n}"
#     in the file, or "" when the file cannot be opened or contains no such
#     macro.
#
# Diagnostics go to the caller-supplied handle, else to a package-level
# $outhandle if one happens to be defined, else to STDERR. The original
# printed to an $outhandle that this sub never declared or received.
sub read_content_from_about_dm
{
    my ($self) = @_;

    my $out = \*STDERR;
    if (UNIVERSAL::isa($self, 'HASH') && defined $self->{'outhandle'}) {
        $out = $self->{'outhandle'};
    }
    elsif (defined $outhandle) {
        $out = $outhandle;
    }

    my $about_macro_file = util::filename_cat($ENV{'GSDLHOME'}, "macros", "about.dm");

    my $about_page_content = "";
    if (open(my $input, '<', $about_macro_file)) {
        # Slurp the whole file in one read.
        local $/ = undef;
        my $slurped = <$input>;
        $about_page_content = $slurped if defined $slurped;
        # Only close a handle that was actually opened.
        close($input);
    }
    else {
        print {$out} "can't open file $about_macro_file\n";
    }

    # Extract the _content_ macro. Capture explicitly instead of relying on
    # $&, which would still hold an unrelated earlier match if this one fails.
    # The braces are escaped because a bare "{" in a pattern is rejected or
    # warned about by newer perls.
    if ($about_page_content =~ m/(_content_ \{.*<\/div>\n\n<\/div>\n\})/is) {
        return $1;
    }

    return "";
}

1;
Note:
See TracChangeset
for help on using the changeset viewer.