[14108] | 1 | ###########################################################################
|
---|
| 2 | #
|
---|
| 3 | # MediaWikiPlug.pm -- html plugin with extra facilities for wiki page
|
---|
| 4 | #
|
---|
| 5 | # A component of the Greenstone digital library software
|
---|
| 6 | # from the New Zealand Digital Library Project at the
|
---|
| 7 | # University of Waikato, New Zealand.
|
---|
| 8 | #
|
---|
| 9 | # Copyright (C) 1999 New Zealand Digital Library Project
|
---|
| 10 | #
|
---|
| 11 | # This program is free software; you can redistribute it and/or modify
|
---|
| 12 | # it under the terms of the GNU General Public License as published by
|
---|
| 13 | # the Free Software Foundation; either version 2 of the License, or
|
---|
| 14 | # (at your option) any later version.
|
---|
| 15 | #
|
---|
| 16 | # This program is distributed in the hope that it will be useful,
|
---|
| 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 19 | # GNU General Public License for more details.
|
---|
| 20 | #
|
---|
| 21 | # You should have received a copy of the GNU General Public License
|
---|
| 22 | # along with this program; if not, write to the Free Software
|
---|
| 23 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 24 | #
|
---|
| 25 | ###########################################################################
|
---|
| 26 | # This plugin is to process an HTML file where sections are divided by
|
---|
| 27 | # user-defined headings tags. As it is difficult to predict what user's definition
|
---|
| 28 | # this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
|
---|
| 29 | # as well as allows to get rid of user-defined Table of Content (TOC)...
|
---|
| 30 | # format:e.g. level1 (Abstract_title|ChapterTitle|Referencing Heading) level2(SectionHeading)...
|
---|
| 31 |
|
---|
| 32 | package MediaWikiPlug;
|
---|
| 33 |
|
---|
| 34 | use HTMLPlug;
|
---|
| 35 | use ImagePlug;
|
---|
| 36 | use File::Copy;
|
---|
| 37 |
|
---|
| 38 | #use strict; # every perl program should have this!
|
---|
| 39 | #no strict 'refs'; # make an exception so we can use variables as filehandles
|
---|
| 40 |
|
---|
| 41 | sub BEGIN {
|
---|
| 42 | @MediaWikiPlug::ISA = ('HTMLPlug');
|
---|
| 43 | }
|
---|
| 44 |
|
---|
| 45 | my $arguments =
|
---|
| 46 | [
|
---|
| 47 | { 'name' => "show_toc",
|
---|
| 48 | 'desc' => "{MediaWikiPlug.show_toc}",
|
---|
| 49 | 'type' => "flag",
|
---|
| 50 | 'reqd' => "no"},
|
---|
| 51 | { 'name' => "toc_exp",
|
---|
| 52 | 'desc' => "{MediaWikiPlug.toc_exp}",
|
---|
| 53 | 'type' => "regexp",
|
---|
| 54 | 'reqd' => "no",
|
---|
| 55 | 'deft' => "" },
|
---|
| 56 | { 'name' => "delete_toc",
|
---|
| 57 | 'desc' => "{MediaWikiPlug.delete_toc}",
|
---|
| 58 | 'type' => "flag",
|
---|
| 59 | 'reqd' => "no"},
|
---|
| 60 | { 'name' => "delete_nav",
|
---|
| 61 | 'desc' => "{MediaWikiPlug.delete_nav}",
|
---|
| 62 | 'type' => "flag",
|
---|
| 63 | 'reqd' => "no",
|
---|
| 64 | 'deft' => ""},
|
---|
| 65 | { 'name' => "nav_exp",
|
---|
| 66 | 'desc' => "{MediaWikiPlug.nav_exp}",
|
---|
| 67 | 'type' => "regexp",
|
---|
| 68 | 'reqd' => "no",
|
---|
| 69 | 'deft' => "" },
|
---|
| 70 | { 'name' => "tag_sections",
|
---|
| 71 | 'desc' => "{MediaWikiPlug.tag_sections}",
|
---|
| 72 | 'type' => "flag",
|
---|
| 73 | 'reqd' => "no"},
|
---|
| 74 | { 'name' => "description_tags",
|
---|
| 75 | 'desc' => "{HTMLPlug.description_tags}",
|
---|
| 76 | 'type' => "flag",
|
---|
| 77 | 'reqd' => "no"}
|
---|
| 78 | ];
|
---|
| 79 |
|
---|
| 80 |
|
---|
| 81 | my $options = { 'name' => "MediaWikiPlug",
|
---|
| 82 | 'desc' => "{MediaWikiPlug.desc}",
|
---|
| 83 | 'abstract' => "no",
|
---|
| 84 | 'inherits' => "yes",
|
---|
| 85 | 'args' => $arguments };
|
---|
| 86 |
|
---|
| 87 |
|
---|
| 88 | sub new {
|
---|
| 89 | my ($class) = shift (@_);
|
---|
| 90 | my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
|
---|
| 91 | push(@$pluginlist, $class);
|
---|
| 92 |
|
---|
| 93 | if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
|
---|
| 94 | if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
|
---|
| 95 |
|
---|
| 96 | my $self = new HTMLPlug($pluginlist, $inputargs, $hashArgOptLists);
|
---|
| 97 | return bless $self, $class;
|
---|
| 98 | }
|
---|
| 99 |
|
---|
| 100 |
|
---|
| 101 |
|
---|
| 102 | sub process {
|
---|
| 103 | my $self = shift (@_);
|
---|
| 104 | my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
|
---|
| 105 | my $outhandle = $self->{'outhandle'};
|
---|
| 106 |
|
---|
| 107 | print $outhandle "MediaWikiPlug: processing $file\n" if $self->{'verbosity'} > 1;
|
---|
| 108 |
|
---|
| 109 | my @head_and_body = split(/<body/i,$$textref);
|
---|
| 110 | my $head = shift(@head_and_body);
|
---|
| 111 | my $body_text = join("<body", @head_and_body);
|
---|
| 112 |
|
---|
| 113 | $head =~ m/<title>(.+)<\/title>/i;
|
---|
| 114 | my $doctitle = $1 if defined $1;
|
---|
| 115 |
|
---|
| 116 | if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
|
---|
| 117 | my @doc_properties = split(/<xml>/i,$head);
|
---|
| 118 | my $doc_heading = shift(@doc_properties);
|
---|
| 119 | my $rest_doc_properties = join(" ", @doc_properties);
|
---|
| 120 |
|
---|
| 121 | my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
|
---|
| 122 | my $extracted_metadata = shift (@extracted_metadata);
|
---|
| 123 | $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
|
---|
| 124 | }
|
---|
| 125 |
|
---|
| 126 | # set the title here if we haven't found it yet
|
---|
| 127 | if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
|
---|
| 128 | if (defined $doctitle && $doctitle =~ /\S/) {
|
---|
| 129 | $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
|
---|
| 130 | } else {
|
---|
| 131 | $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
|
---|
| 132 | }
|
---|
| 133 | }
|
---|
| 134 |
|
---|
| 135 | if(defined $base_dir && $base_dir ne ""){
|
---|
| 136 | # find and download stylesheet
|
---|
| 137 | my @css_files;
|
---|
| 138 | my $css_file_count = 0;
|
---|
| 139 | # find all the style sheets imported with import statement
|
---|
| 140 | while($head =~ m"<style type=\"text/css\"(.+)import \"(.+)\""ig){
|
---|
| 141 | $css_files[$css_file_count++] = $2 if defined $2;
|
---|
| 142 | }
|
---|
| 143 |
|
---|
| 144 | # check whether the stylesheet exists
|
---|
| 145 | # if not, download it and copy to the collection's images folder
|
---|
| 146 | for($css_file_count = 0; $css_file_count < scalar(@css_files); $css_file_count++){
|
---|
| 147 | my $css_file = $css_files[$css_file_count];
|
---|
| 148 | $css_file =~ s/^(.+)gli\/cache\///i;
|
---|
| 149 |
|
---|
| 150 | my $css_file_path = "$base_dir/$css_file";
|
---|
| 151 |
|
---|
| 152 | if (-e $css_file_path){ # the file already exists
|
---|
| 153 | next;
|
---|
| 154 | }
|
---|
| 155 |
|
---|
| 156 | # check the css directory and create one if it's not there
|
---|
| 157 | my @dirs = split(/\//i,$css_file);
|
---|
| 158 | my $path_check = "$base_dir/";
|
---|
| 159 | for(my $i = 0; $i < (scalar(@dirs)-1); $i++){
|
---|
| 160 | $path_check .= $dirs[$i] . "/";
|
---|
| 161 | if(! -d $path_check ){
|
---|
| 162 | mkdir($path_check);
|
---|
| 163 | }
|
---|
| 164 | }
|
---|
| 165 |
|
---|
| 166 | # download
|
---|
| 167 | $css_file = "http://$css_file";
|
---|
| 168 | system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path");
|
---|
| 169 | if ($? != 0) {unlink("$css_file_path");}
|
---|
| 170 |
|
---|
| 171 | # change every style element to #wikispecificstyle ...
|
---|
| 172 | if(open(INPUT, "<$css_file_path")){
|
---|
| 173 | my $css_content;
|
---|
| 174 | while(my $line = <INPUT>){
|
---|
| 175 | if($line =~ m/^(.+)\{/i){
|
---|
| 176 | $line = "#wikispecificstyle " . $line;
|
---|
| 177 | }
|
---|
| 178 | $css_content .= $line;
|
---|
| 179 | }
|
---|
| 180 | close(INPUT);
|
---|
| 181 | open(OUTPUT, ">$css_file_path");
|
---|
| 182 | print OUTPUT $css_content;
|
---|
| 183 | close(OUTPUT);
|
---|
| 184 | }
|
---|
| 185 |
|
---|
| 186 | # copy to images folder
|
---|
| 187 | # do not copy, because collection can only have one specific stylesheet
|
---|
| 188 | # better to add and modify the style sheets manually
|
---|
| 189 | # @dirs = split(/\//i,$base_dir);
|
---|
| 190 | # my $collection_base_dir;
|
---|
| 191 | # for(my $i = 0; $i < (scalar(@dirs)-1); $i++){
|
---|
| 192 | # $collection_base_dir .= $dirs[$i] . "/";
|
---|
| 193 | # }
|
---|
| 194 | # my $images_folder = $collection_base_dir . "images/";
|
---|
| 195 | # copy($css_file_path, $images_folder) || die "File cannot be copied.";
|
---|
| 196 | }
|
---|
| 197 | }
|
---|
| 198 |
|
---|
| 199 | # add sections around h2 tag
|
---|
| 200 | # wrap each section with <div id=\"wikispecificstyle\"></div> to get the wiki styles
|
---|
| 201 | # add search box with each section
|
---|
| 202 | if ($self->{'tag_sections'}) {
|
---|
| 203 | my @sections = ($body_text =~ /<h2>(.+)<\/h2>/gi);
|
---|
| 204 | for(my $i=1; $i < scalar(@sections); $i++){
|
---|
| 205 | my $section_title = $sections[$i];
|
---|
| 206 | $section_title =~ s/<([^>]*)>//g;
|
---|
| 207 | $section_title =~ s/(^\s|\s$)//g;
|
---|
| 208 | my $section_metadata = "<Section>\n<Description>\n<Metadata name=\"Title\">$section_title</Metadata>\n</Description>\n";
|
---|
| 209 | if($i !=1){
|
---|
| 210 | $section_metadata = "</Section>\n" . $section_metadata;
|
---|
| 211 | }
|
---|
| 212 | $section_metadata = "\n<!--\n" . $section_metadata . "-->\n";
|
---|
| 213 |
|
---|
| 214 | $section_metadata .= "<div id=\"wikispecificstyle\">\n<div id=\"content\">\n";
|
---|
| 215 | $section_metadata = "</div></div>\n" . $section_metadata if $i !=1;
|
---|
| 216 |
|
---|
| 217 | $body_text =~ s/<h2>$sections[$i]<\/h2>/$section_metadata<h2>$sections[$i]<\/h2>/i;
|
---|
| 218 |
|
---|
| 219 | if($i==scalar(@sections)-1) {
|
---|
| 220 | # $body_text =~ s/<div class=\"printfooter\">/<!--\n<\/Section>\n-->\n<div class=\"printfooter\">/i;
|
---|
| 221 | $body_text =~ s/<div class=\"printfooter\">/<\/div>\n<\/div>\n<!--\n<\/Section>\n-->\n<div class=\"printfooter\">/i;
|
---|
| 222 | }
|
---|
| 223 | }
|
---|
| 224 | }
|
---|
| 225 |
|
---|
| 226 | # If delete_nav is enabled, it means to get rid of navigation contents.
|
---|
| 227 | # if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} == 1)){
|
---|
| 228 | # if (defined $self->{'nav_exp'}&& $self->{'nav_exp'} =~ /\S/){
|
---|
| 229 | # print "it matches nav_exp!!\n" if $body_text =~ /$self->{'nav_exp'}/;
|
---|
| 230 | # $body_text =~ s/$self->{'nav_exp'}//isg;
|
---|
| 231 | # }
|
---|
| 232 | #}
|
---|
| 233 | my $searchbox = "";
|
---|
| 234 | if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} == 1)){
|
---|
| 235 | my $nav_match_express;
|
---|
| 236 | if (defined $self->{'nav_exp'}&& $self->{'nav_exp'} =~ /\S/) {
|
---|
| 237 | $nav_match_express = $self->{'nav_exp'} ;
|
---|
| 238 | } else { # default setting for mediawiki
|
---|
| 239 | $nav_match_express = "<div class=\"printfooter\">(.|\n)*secs. -->";
|
---|
| 240 | }
|
---|
| 241 |
|
---|
| 242 | print "it matches nav_exp!!\n" if $body_text =~ /$self->{'nav_exp'}/;
|
---|
| 243 |
|
---|
| 244 | # $body_text =~ m/<div class=\"printfooter\">(.|\n)*secs. -->/isg;
|
---|
| 245 | $body_text =~ m/$nav_match_express/isg;
|
---|
| 246 | my $navigate = $& if defined $&;
|
---|
| 247 |
|
---|
| 248 | # find the search box and add it to the document page
|
---|
| 249 | if(defined $navigate && $navigate =~ /\S/){
|
---|
| 250 | $navigate =~ m/<div id="p-search" class="portlet">(.|\n)*<\/form>/;
|
---|
| 251 | $searchbox = $& . "\n<\/div>\n<\/div>";
|
---|
| 252 | $searchbox =~ s/action="([^>]*)"/action="\/gsdl\/cgi-bin\/library"/isg;
|
---|
| 253 | $searchbox =~ s/name="search"/name="q"/isg;
|
---|
| 254 | $searchbox =~ s/name="go"//isg;
|
---|
| 255 | $searchbox =~ s/name="fulltext"//isg;
|
---|
| 256 | my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
|
---|
| 257 | ."<input type=\"hidden\" name=\"c\" value=\"wikitest\"/>\n"
|
---|
| 258 | ."<input type=\"hidden\" name=\"fqf\" value=\"TX\"/>"
|
---|
| 259 | ."<input type=\"hidden\" name=\"t\" value=\"1\">";
|
---|
| 260 | $searchbox =~ s/<\/form>/$hidden_params<\/form>/isg;
|
---|
| 261 | $searchbox = "\n</div>\n</div><div id=\"wikispecificstyle\"><div id=\"column-one\">$searchbox</div></div>";
|
---|
| 262 | }
|
---|
| 263 |
|
---|
| 264 | # $body_text =~ s/<div class=\"printfooter\">(.|\n)*secs. -->/$searchbox/isg;
|
---|
| 265 | $body_text =~ s/$nav_match_express/$searchbox/isg;
|
---|
| 266 | }
|
---|
| 267 |
|
---|
| 268 | if ($self->{'tag_sections'}) {
|
---|
| 269 | $body_text =~ s/<!--\n<\/Section>/$searchbox\n<!--\n<\/Section>/ig;
|
---|
| 270 | }
|
---|
| 271 |
|
---|
| 272 | # Tidy up extra new lines
|
---|
| 273 | $body_text =~ s/(<p[^>]*><span[^>]*><o:p> <\/o:p><\/span><\/p>)//isg;
|
---|
| 274 | $body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;
|
---|
| 275 |
|
---|
| 276 | $section_text .= "<!--\n<Section>\n-->\n";
|
---|
| 277 | my $body = "<body".$body_text;
|
---|
| 278 |
|
---|
| 279 | $$textref = $body;
|
---|
| 280 |
|
---|
| 281 | # get the base dir for convert absolute links to relative links
|
---|
| 282 | $$textref =~ m"href=\"(.*?)/cache/(.*?)/"i;
|
---|
| 283 | my $basedir = $2;
|
---|
| 284 |
|
---|
| 285 | $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
|
---|
| 286 | $$textref =~ s/( )+/ /sg;
|
---|
| 287 |
|
---|
| 288 | # get rid of the [edit] button
|
---|
| 289 | $$textref =~ s/\[<a([^>]*)>edit<\/a>]//g;
|
---|
| 290 |
|
---|
| 291 | # get rid of the last time edit information at the bottom
|
---|
| 292 | $$textref =~ s/<a href="(.+)edit(.*?)"(.*?)>(\w+)<\/a> \d\d:\d\d,(.*?)(PST)//g;
|
---|
| 293 |
|
---|
| 294 | # get rid of the (Redirected from ...)
|
---|
| 295 | $$textref =~ s/(Redirected from <a ([^>]*)>(\w|\s)*<\/a>)//isg;
|
---|
| 296 |
|
---|
| 297 | # escape macros
|
---|
| 298 | $$textref =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg;
|
---|
| 299 | # may change the links, like Greenstone_Documentation_All.html, then change back
|
---|
| 300 | $$textref =~ s/<a([^>]*)_<span>([^>]*)<\/span>_/<a$1_$2_/isg;
|
---|
| 301 |
|
---|
| 302 | # convert all the urls to relative url, because current wget 1.10 -k and -E option doesn't work together
|
---|
| 303 | # get rid of the title attribute of a tag
|
---|
| 304 | $$textref =~ s/<a([^>]*)title="(.*?)"/<a$1/isg;
|
---|
| 305 | # find the relative path of current directory
|
---|
| 306 | if($basedir ne ""){
|
---|
| 307 | my @dirs=split("\/", $file);
|
---|
| 308 | my $dirnum = scalar(@dirs);
|
---|
| 309 | my $replace = "";
|
---|
| 310 | for(my $i=0; $i<$dirnum-2; $i++){
|
---|
| 311 | $replace .= "../";
|
---|
| 312 | }
|
---|
| 313 | # test if the linked relative file exists, if not, link to the internet version
|
---|
| 314 | $$textref =~ s/(href|src)="([^>]*)$basedir\/([^>]*)"/$1="$replace$3"/gi;
|
---|
| 315 | # my @total_links = ($$textref =~ m/(href|src)="([^>]*)$basedir\/([^>]*)"/gi);
|
---|
| 316 | # print $outhandle "\nnumber of total links: " . scalar(@total_links)."\n";
|
---|
| 317 | # for(my $cur_link_no = 0; $cur_link_no < scalar(@total_links); $cur_link_no++){
|
---|
| 318 |
|
---|
| 319 | #while($$textref =~ m/(href|src)="([^>]*)$basedir\/([^>]*)"/gi){
|
---|
| 320 | #$total_links[$cur_link_no] =~ m/(href|src)="([^>]*)$basedir\/([^>]*)"/i;
|
---|
| 321 | # my $prefix = $1;
|
---|
| 322 | # my $link = $&;
|
---|
| 323 | # my $rel_file_name = $3;
|
---|
| 324 | # my $rel_link = "$replace$rel_file_name";
|
---|
| 325 | # print $outhandle "catched link==> $link\nrelative link==> $rel_link\n";
|
---|
| 326 | # if(-e $rel_link){
|
---|
| 327 | # $rel_link = "$prefix=\"$rel_link\"";
|
---|
| 328 | # $$textref =~ s/$link/$rel_link/i;
|
---|
| 329 | # }else{
|
---|
| 330 | # my $ext_link = "$prefix=\"http:\/\/$basedir\/$rel_file_name\"";
|
---|
| 331 | # print $outhandle "external link==> $ext_link\n";
|
---|
| 332 | # $$textref =~ s/$link/$ext_link/i; #s/$link/$prefix="http:\/\/$rel_file_name"/i;
|
---|
| 333 | # }
|
---|
| 334 | #}
|
---|
| 335 |
|
---|
| 336 |
|
---|
| 337 | # tag the link to new wiki pages as red
|
---|
| 338 | $$textref =~ s/(href|src)="$replace([^>]*)&action=edit([^>]*)"/$1="http:\/\/$basedir\/$2&action=edit$3"/gi;
|
---|
| 339 | $$textref =~ s/<a([^>]*)class="new"([^>]*)>/<a$1style="color:red"$2)>/gi;
|
---|
| 340 |
|
---|
| 341 | # tag the link to external pages as blue
|
---|
| 342 | $$textref =~ s/<a([^>]*)class='external text'([^>]*)>/<a$1style="color:blue"$2)>/gi;
|
---|
| 343 |
|
---|
| 344 | #print $outhandle $$textref;
|
---|
| 345 | }
|
---|
| 346 |
|
---|
| 347 | # if 'show_toc' is set, put the table of content on the Wiki Main_Page to the about page of the collection
|
---|
| 348 | # 1. read _content_ macro from about.dm
|
---|
| 349 | # 2. append the toc, change all links to the Greenstone internal format for relative links
|
---|
| 350 | # 3. write to the extra.dm
|
---|
| 351 | # TODO: currently we suppose the _about:content_ hasn't been specified before
|
---|
| 352 | # so needs to add function to handle when the macro is already in the extra.dm
|
---|
| 353 | if($self->{'show_toc'}==1 && $file =~ m/Main_Page.(html|htm)$/){
|
---|
| 354 | my $macro_path = $base_dir;
|
---|
| 355 | $macro_path =~ s/import$/macros/;
|
---|
| 356 | my $extra_dm;
|
---|
| 357 | my $extradm_file = "$macro_path/extra.dm";
|
---|
| 358 | if(open(INPUT, "<$extradm_file")){
|
---|
| 359 | while(my $line = <INPUT>){
|
---|
| 360 | $extra_dm .= $line;
|
---|
| 361 | }
|
---|
| 362 | close(INPUT);
|
---|
| 363 |
|
---|
| 364 | if($extra_dm =~ m/package about/ && $extra_dm =~ m/_content_(\s)*{/){
|
---|
| 365 | print $outhandle "already changed!!!!\n";
|
---|
| 366 | } else {
|
---|
| 367 | # read _content_ macro from about.dm file
|
---|
| 368 | my $about_macro = $ENV{'GSDLHOME'} . "/macros/about.dm";
|
---|
| 369 | my $about_page_content = "";
|
---|
| 370 | if(open(INPUT, "<$about_macro")){
|
---|
| 371 | while(my $line=<INPUT>){
|
---|
| 372 | $about_page_content .= $line;
|
---|
| 373 | }
|
---|
| 374 | }else{
|
---|
| 375 | print $outhandle "can't open file $about_macro\n";
|
---|
| 376 | }
|
---|
| 377 | close(INPUT);
|
---|
| 378 |
|
---|
| 379 | # extract the _content_ macro
|
---|
| 380 | $about_page_content =~ m/_content_ {(.|\n)*<\/div>\n\n<\/div>\n}/i;
|
---|
| 381 | $about_page_content = $&;
|
---|
| 382 |
|
---|
| 383 | # extract toc of the Main_Page
|
---|
| 384 | my $mainpage_content = "";
|
---|
| 385 | if($self->{'toc_exp'} =~ /\S/){
|
---|
| 386 | $$textref =~ /$self->{'toc_exp'}/;
|
---|
| 387 | $mainpage_content = $&;
|
---|
| 388 | } else {
|
---|
| 389 | # $mainpage_content =~ s/<!-- start content -->(.|\n)*<!-- end content -->/$1/igs;
|
---|
| 390 | }
|
---|
| 391 | # print $outhandle "---------\n$$textref\n--------\n\n";
|
---|
| 392 | # print $outhandle "==========\n$mainpage_content\n==========\n\n";
|
---|
| 393 |
|
---|
| 394 | # add toc to the _content_ macro
|
---|
| 395 | $about_page_content =~ m/{(.|\n)*<\/div>\n\n/;
|
---|
| 396 | $extra_dm .= "package about\n_content_$&\n\n<div class=\"section\">\n$mainpage_content\n</div>\n</div>\n}";
|
---|
| 397 |
|
---|
| 398 | # change all links to the internal Greenstone relative link format
|
---|
| 399 | $extra_dm =~ s/<a href="([^>]*)"/<a href="_httpquery_&a=extlink&rl=1&href=http:\/\/$basedir$1"/isg;
|
---|
| 400 | $extra_dm =~ s/(\.\.\/)+/\//isg;
|
---|
| 401 | # print $outhandle "to add---------\n$extra_dm\n--------\n";
|
---|
| 402 |
|
---|
| 403 | # write to the extra.dm file of the collection
|
---|
| 404 | open(OUTPUT, ">$extradm_file");
|
---|
| 405 | print OUTPUT $extra_dm;
|
---|
| 406 | close(OUTPUT);
|
---|
| 407 | }
|
---|
| 408 | } else {
|
---|
| 409 | print $outhandle "can't open file $extradm_file\n";
|
---|
| 410 | }
|
---|
| 411 | }
|
---|
| 412 |
|
---|
| 413 | # If delete_toc is enabled, it means to get rid of toc and tof contents.
|
---|
| 414 | # get rid of TOC and TOF sections and their title
|
---|
| 415 | if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
|
---|
| 416 | if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/){
|
---|
| 417 | # $body_text =~ s/<p class=(($self->{'toc_exp'})[^>]*)>(.+?)<\/p>//isg;
|
---|
| 418 | # print "it matches toc_exp!!\n" if $body_text =~ /$self->{'toc_exp'}/;
|
---|
| 419 | # $body_text =~ s/$self->{'toc_exp'}//i;
|
---|
| 420 | print "it matches toc_exp!!\n" if $$textref =~ /$self->{'toc_exp'}/;
|
---|
| 421 | $$textref =~ s/$self->{'toc_exp'}//i;
|
---|
| 422 | }
|
---|
| 423 | }
|
---|
| 424 |
|
---|
| 425 | # To add a layer on top of the wiki page
|
---|
| 426 | # so as to keep the wiki style inside the wiki page
|
---|
| 427 | # and keep the Greenstone style at the same time
|
---|
| 428 | $$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is;
|
---|
| 429 | $$textref =~ s/<\/body>/<\/div><\/body>/is;
|
---|
| 430 |
|
---|
| 431 | # tag with sections
|
---|
| 432 | $$textref =~ s/<body([^>]*)>/$&\n<!--\n<Section>\n<Description>\n<Metadata name=\"Title\">$doctitle<\/Metadata>\n<\/Description>\n-->\n/is;
|
---|
| 433 | $$textref =~ s/<\/body>/\n<!--\n<\/Section>\n-->\n/is;
|
---|
| 434 |
|
---|
| 435 | #print $outhandle "\n\n$$textref\n\n";
|
---|
| 436 |
|
---|
| 437 | # use description tags
|
---|
| 438 | if ($self->{'description_tags'}) {
|
---|
| 439 | my $cursection = $doc_obj->get_top_section();
|
---|
| 440 | # remove the html header - note that doing this here means any
|
---|
| 441 | # sections defined within the header will be lost (so all <Section>
|
---|
| 442 | # tags must appear within the body of the HTML)
|
---|
| 443 | my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
|
---|
| 444 |
|
---|
| 445 | $$textref =~ s/^.*?<body[^>]*>//is;
|
---|
| 446 | $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
|
---|
| 447 |
|
---|
| 448 | my $opencom = '(?:<!--|<!(?:—|—|--))';
|
---|
| 449 | my $closecom = '(?:-->|(?:—|—|--)>)';
|
---|
| 450 |
|
---|
| 451 | my $lt = '(?:<|<)';
|
---|
| 452 | my $gt = '(?:>|>)';
|
---|
| 453 | my $quot = '(?:"|"|”|“)';
|
---|
| 454 |
|
---|
| 455 | # my $dont_strip = '';
|
---|
| 456 | # if ($self->{'no_strip_metadata_html'}) {
|
---|
| 457 | # ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
|
---|
| 458 | # }
|
---|
| 459 |
|
---|
| 460 | my $found_something = 0;
|
---|
| 461 | my $top = 1;
|
---|
| 462 | while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
|
---|
| 463 | my $text = $1;
|
---|
| 464 | my $comment = $2;
|
---|
| 465 | if (defined $text) {
|
---|
| 466 | # text before a comment - note that getting to here
|
---|
| 467 | # doesn't necessarily mean there are Section tags in
|
---|
| 468 | # the document
|
---|
| 469 | # print $outhandle "section text:\n$text\n";
|
---|
| 470 | $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
|
---|
| 471 | }
|
---|
| 472 | while ($comment =~ s/$lt(.*?)$gt//s) {
|
---|
| 473 | my $tag = $1;
|
---|
| 474 | if ($tag eq "Section") {
|
---|
| 475 | $found_something = 1;
|
---|
| 476 | $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
|
---|
| 477 | $top = 0;
|
---|
| 478 | } elsif ($tag eq "/Section") {
|
---|
| 479 | $found_something = 1;
|
---|
| 480 | $cursection = $doc_obj->get_parent_section ($cursection);
|
---|
| 481 | } elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
|
---|
| 482 | my $metaname = $1;
|
---|
| 483 | my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0;
|
---|
| 484 | $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
|
---|
| 485 | my $metavalue = $1;
|
---|
| 486 | $metavalue =~ s/^\s+//;
|
---|
| 487 | $metavalue =~ s/\s+$//;
|
---|
| 488 | # assume that no metadata value intentionally includes
|
---|
| 489 | # carriage returns or HTML tags (if they're there they
|
---|
| 490 | # were probably introduced when converting to HTML from
|
---|
| 491 | # some other format).
|
---|
| 492 | # actually some people want to have html tags in their
|
---|
| 493 | # metadata.
|
---|
| 494 | $metavalue =~ s/[\cJ\cM]/ /sg;
|
---|
| 495 | # $metavalue =~ s/<[^>]+>//sg unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ /^($dont_strip)$/);
|
---|
| 496 | $metavalue =~ s/\s+/ /sg;
|
---|
| 497 | # print $outhandle "metaname = $metaname\nmetavalue = $metavalue\n";
|
---|
| 498 | if ($accumulate) {
|
---|
| 499 | $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
|
---|
| 500 | } else {
|
---|
| 501 | $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
|
---|
| 502 | }
|
---|
| 503 | } elsif ($tag eq "Description" || $tag eq "/Description") {
|
---|
| 504 | # do nothing with containing Description tags
|
---|
| 505 | } else {
|
---|
| 506 | # simple HTML tag (probably created by the conversion
|
---|
| 507 | # to HTML from some other format) - we'll ignore it and
|
---|
| 508 | # hope for the best ;-)
|
---|
| 509 | }
|
---|
| 510 | }
|
---|
| 511 | }# end while
|
---|
| 512 |
|
---|
| 513 | if ($cursection ne "") {
|
---|
| 514 | print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
|
---|
| 515 | }
|
---|
| 516 |
|
---|
| 517 | $$textref =~ s/^.*?<body[^>]*>//is;
|
---|
| 518 | $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
|
---|
| 519 | if ($$textref =~ /\S/) {
|
---|
| 520 | if (!$found_something) {
|
---|
| 521 | if ($self->{'verbosity'} > 2) {
|
---|
| 522 | print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
|
---|
| 523 | print $outhandle " will be processed as a single section document\n";
|
---|
| 524 | }
|
---|
| 525 |
|
---|
| 526 | # go ahead and process single-section document
|
---|
| 527 | $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
|
---|
| 528 |
|
---|
| 529 | } else {
|
---|
| 530 | print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
|
---|
| 531 | print $outhandle " of the final closing </Section> tag. This text will\n";
|
---|
| 532 | print $outhandle " be ignored.";
|
---|
| 533 |
|
---|
| 534 | my ($text);
|
---|
| 535 | if (length($$textref) > 30) {
|
---|
| 536 | $text = substr($$textref, 0, 30) . "...";
|
---|
| 537 | } else {
|
---|
| 538 | $text = $$textref;
|
---|
| 539 | }
|
---|
| 540 | $text =~ s/\n/ /isg;
|
---|
| 541 | print $outhandle " ($text)\n";
|
---|
| 542 | }
|
---|
| 543 | } elsif (!$found_something) {
|
---|
| 544 | if ($self->{'verbosity'} > 2) {
|
---|
| 545 | # may get to here if document contained no valid Section
|
---|
| 546 | # tags but did contain some comments. The text will have
|
---|
| 547 | # been processed already but we should print the warning
|
---|
| 548 | # as above and extract metadata
|
---|
| 549 | print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags and\n";
|
---|
| 550 | print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
|
---|
| 551 | }
|
---|
| 552 | }
|
---|
| 553 | } # if $self->{'description_tags'}
|
---|
| 554 | else {
|
---|
| 555 | # remove header and footer
|
---|
| 556 | # if (!$self->{'keep_head'}) {
|
---|
| 557 | # $$textref =~ s/^.*?<body[^>]*>//is;
|
---|
| 558 | # $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
|
---|
| 559 | # }
|
---|
| 560 |
|
---|
| 561 | # single section document
|
---|
| 562 | # $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
|
---|
| 563 |
|
---|
| 564 | # Important: to get the relative links to work,
|
---|
| 565 | # 1: use the below statement instead of the above one
|
---|
| 566 | # 2. cannot have process_section method.
|
---|
| 567 | # why?????
|
---|
| 568 | $self->SUPER::process(@_);
|
---|
| 569 | }
|
---|
| 570 | return 1;
|
---|
| 571 |
|
---|
| 572 | #$self->SUPER::process(@_);
|
---|
| 573 | }
|
---|
| 574 |
|
---|
| 575 |
|
---|
| 576 |
|
---|
| 577 | # note that process_section may be called multiple times for a single
|
---|
| 578 | # section (relying on the fact that add_utf8_text appends the text to any
|
---|
| 579 | # that may exist already).
|
---|
| 580 | # sub process_section {
|
---|
| 581 | # my $self = shift (@_);
|
---|
| 582 | # my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
|
---|
| 583 |
|
---|
| 584 | # trap links
|
---|
| 585 | # if (!$self->{'nolinks'}) {
|
---|
| 586 | # usemap="./#index" not handled correctly => change to "#index"
|
---|
| 587 | # $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
|
---|
| 588 | #$self->replace_usemap_links($1, $2, $3)/isge;
|
---|
| 589 |
|
---|
| 590 | #$$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
|
---|
| 591 | #$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
|
---|
| 592 | #}
|
---|
| 593 |
|
---|
| 594 | # trap images
|
---|
| 595 |
|
---|
| 596 | # allow spaces if inside quotes - jrm21
|
---|
| 597 | #$$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)([\"\'][^\"\']+[\"\']|[^\s>]+)([^>]*>)/
|
---|
| 598 | #$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
|
---|
| 599 |
|
---|
| 600 | # add text to document object
|
---|
| 601 | # turn \ into \\ so that the rest of greenstone doesn't think there
|
---|
| 602 | # is an escape code following. (Macro parsing loses them...)
|
---|
| 603 | # $$textref =~ s/\\/\\\\/go;
|
---|
| 604 |
|
---|
| 605 | # $doc_obj->add_utf8_text($cursection, $$textref);
|
---|
| 606 | #}
|
---|
| 607 |
|
---|
| 608 |
|
---|
| 609 | sub extract_metadata
|
---|
| 610 | {
|
---|
| 611 | my $self = shift (@_);
|
---|
| 612 | my ($textref, $metadata, $doc_obj) = @_;
|
---|
| 613 | my $outhandle = $self->{'outhandle'};
|
---|
| 614 |
|
---|
| 615 | return if (!defined $textref);
|
---|
| 616 |
|
---|
| 617 | # metadata fields to extract/save. 'key' is the (lowercase) name of the
|
---|
| 618 | # html meta, 'value' is the metadata name for greenstone to use
|
---|
| 619 | my %find_fields = ();
|
---|
| 620 | my ($tag,$value);
|
---|
| 621 |
|
---|
| 622 | my $orig_field = "";
|
---|
| 623 | foreach my $field (split /,/, $self->{'metadata_fields'}) {
|
---|
| 624 | # support tag<tagname>
|
---|
| 625 | if ($field =~ /^(.*?)<(.*?)>$/) {
|
---|
| 626 | # "$2" is the user's preferred gs metadata name
|
---|
| 627 | $find_fields{lc($1)}=$2; # lc = lowercase
|
---|
| 628 | $orig_field = $1;
|
---|
| 629 | } else { # no <tagname> for mapping
|
---|
| 630 | # "$field" is the user's preferred gs metadata name
|
---|
| 631 | $find_fields{lc($field)}=$field; # lc = lowercase
|
---|
| 632 | $orig_field = $field;
|
---|
| 633 | }
|
---|
| 634 |
|
---|
| 635 | if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
|
---|
| 636 | $tag = $orig_field;
|
---|
| 637 | $value = $1;
|
---|
| 638 | if (!defined $value || !defined $tag){
|
---|
| 639 | #print $outhandle "StructuredHTMLPlug: can't find VALUE in \"$tag\"\n";
|
---|
| 640 | next;
|
---|
| 641 | } else {
|
---|
| 642 | # clean up and add
|
---|
| 643 | chomp($value); # remove trailing \n, if any
|
---|
| 644 | $tag = $find_fields{lc($tag)};
|
---|
| 645 | #print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
|
---|
| 646 | # if ($self->{'verbosity'} > 2);
|
---|
| 647 | $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
|
---|
| 648 | }
|
---|
| 649 | }
|
---|
| 650 | }
|
---|
| 651 | }
|
---|
| 652 |
|
---|
| 653 | 1;
|
---|