source: trunk/gsdl/perllib/plugins/MediaWikiPlug.pm@ 14108

Last change on this file since 14108 was 14108, checked in by lh92, 17 years ago

Plugin for processing MediaWiki pages

  • Property svn:keywords set to Author Date Id Revision
File size: 24.3 KB
Line 
1###########################################################################
2#
3# MediaWikiPlug.pm -- html plugin with extra facilities for wiki page
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26# This plugin is to process an HTML file where sections are divided by
27# user-defined headings tags. As it is difficult to predict what user's definition
28# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
29# as well as allows to get rid of user-defined Table of Content (TOC)...
30# format:e.g. level1 (Abstract_title|ChapterTitle|Referencing Heading) level2(SectionHeading)...
31
32package MediaWikiPlug;
33
34use HTMLPlug;
35use ImagePlug;
36use File::Copy;
37
38#use strict; # every perl program should have this!
39#no strict 'refs'; # make an exception so we can use variables as filehandles
40
41sub BEGIN {
42 @MediaWikiPlug::ISA = ('HTMLPlug');
43}
44
45my $arguments =
46 [
47 { 'name' => "show_toc",
48 'desc' => "{MediaWikiPlug.show_toc}",
49 'type' => "flag",
50 'reqd' => "no"},
51 { 'name' => "toc_exp",
52 'desc' => "{MediaWikiPlug.toc_exp}",
53 'type' => "regexp",
54 'reqd' => "no",
55 'deft' => "" },
56 { 'name' => "delete_toc",
57 'desc' => "{MediaWikiPlug.delete_toc}",
58 'type' => "flag",
59 'reqd' => "no"},
60 { 'name' => "delete_nav",
61 'desc' => "{MediaWikiPlug.delete_nav}",
62 'type' => "flag",
63 'reqd' => "no",
64 'deft' => ""},
65 { 'name' => "nav_exp",
66 'desc' => "{MediaWikiPlug.nav_exp}",
67 'type' => "regexp",
68 'reqd' => "no",
69 'deft' => "" },
70 { 'name' => "tag_sections",
71 'desc' => "{MediaWikiPlug.tag_sections}",
72 'type' => "flag",
73 'reqd' => "no"},
74 { 'name' => "description_tags",
75 'desc' => "{HTMLPlug.description_tags}",
76 'type' => "flag",
77 'reqd' => "no"}
78 ];
79
80
81my $options = { 'name' => "MediaWikiPlug",
82 'desc' => "{MediaWikiPlug.desc}",
83 'abstract' => "no",
84 'inherits' => "yes",
85 'args' => $arguments };
86
87
88sub new {
89 my ($class) = shift (@_);
90 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
91 push(@$pluginlist, $class);
92
93 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
94 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
95
96 my $self = new HTMLPlug($pluginlist, $inputargs, $hashArgOptLists);
97 return bless $self, $class;
98}
99
100
101
102sub process {
103 my $self = shift (@_);
104 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
105 my $outhandle = $self->{'outhandle'};
106
107 print $outhandle "MediaWikiPlug: processing $file\n" if $self->{'verbosity'} > 1;
108
109 my @head_and_body = split(/<body/i,$$textref);
110 my $head = shift(@head_and_body);
111 my $body_text = join("<body", @head_and_body);
112
113 $head =~ m/<title>(.+)<\/title>/i;
114 my $doctitle = $1 if defined $1;
115
116 if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
117 my @doc_properties = split(/<xml>/i,$head);
118 my $doc_heading = shift(@doc_properties);
119 my $rest_doc_properties = join(" ", @doc_properties);
120
121 my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
122 my $extracted_metadata = shift (@extracted_metadata);
123 $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
124 }
125
126 # set the title here if we haven't found it yet
127 if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
128 if (defined $doctitle && $doctitle =~ /\S/) {
129 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
130 } else {
131 $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
132 }
133 }
134
135 if(defined $base_dir && $base_dir ne ""){
136 # find and download stylesheet
137 my @css_files;
138 my $css_file_count = 0;
139 # find all the style sheets imported with import statement
140 while($head =~ m"<style type=\"text/css\"(.+)import \"(.+)\""ig){
141 $css_files[$css_file_count++] = $2 if defined $2;
142 }
143
144 # check whether the stylesheet exists
145 # if not, download it and copy to the collection's images folder
146 for($css_file_count = 0; $css_file_count < scalar(@css_files); $css_file_count++){
147 my $css_file = $css_files[$css_file_count];
148 $css_file =~ s/^(.+)gli\/cache\///i;
149
150 my $css_file_path = "$base_dir/$css_file";
151
152 if (-e $css_file_path){ # the file already exists
153 next;
154 }
155
156 # check the css directory and create one if it's not there
157 my @dirs = split(/\//i,$css_file);
158 my $path_check = "$base_dir/";
159 for(my $i = 0; $i < (scalar(@dirs)-1); $i++){
160 $path_check .= $dirs[$i] . "/";
161 if(! -d $path_check ){
162 mkdir($path_check);
163 }
164 }
165
166 # download
167 $css_file = "http://$css_file";
168 system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path");
169 if ($? != 0) {unlink("$css_file_path");}
170
171 # change every style element to #wikispecificstyle ...
172 if(open(INPUT, "<$css_file_path")){
173 my $css_content;
174 while(my $line = <INPUT>){
175 if($line =~ m/^(.+)\{/i){
176 $line = "#wikispecificstyle " . $line;
177 }
178 $css_content .= $line;
179 }
180 close(INPUT);
181 open(OUTPUT, ">$css_file_path");
182 print OUTPUT $css_content;
183 close(OUTPUT);
184 }
185
186 # copy to images folder
187 # do not copy, because collection can only have one specific stylesheet
188 # better to add and modify the style sheets manually
189 # @dirs = split(/\//i,$base_dir);
190 # my $collection_base_dir;
191 # for(my $i = 0; $i < (scalar(@dirs)-1); $i++){
192 # $collection_base_dir .= $dirs[$i] . "/";
193 # }
194 # my $images_folder = $collection_base_dir . "images/";
195 # copy($css_file_path, $images_folder) || die "File cannot be copied.";
196 }
197 }
198
199 # add sections around h2 tag
200 # wrap each section with <div id=\"wikispecificstyle\"></div> to get the wiki styles
201 # add search box with each section
202 if ($self->{'tag_sections'}) {
203 my @sections = ($body_text =~ /<h2>(.+)<\/h2>/gi);
204 for(my $i=1; $i < scalar(@sections); $i++){
205 my $section_title = $sections[$i];
206 $section_title =~ s/<([^>]*)>//g;
207 $section_title =~ s/(^\s|\s$)//g;
208 my $section_metadata = "<Section>\n<Description>\n<Metadata name=\"Title\">$section_title</Metadata>\n</Description>\n";
209 if($i !=1){
210 $section_metadata = "</Section>\n" . $section_metadata;
211 }
212 $section_metadata = "\n<!--\n" . $section_metadata . "-->\n";
213
214 $section_metadata .= "<div id=\"wikispecificstyle\">\n<div id=\"content\">\n";
215 $section_metadata = "</div></div>\n" . $section_metadata if $i !=1;
216
217 $body_text =~ s/<h2>$sections[$i]<\/h2>/$section_metadata<h2>$sections[$i]<\/h2>/i;
218
219 if($i==scalar(@sections)-1) {
220 # $body_text =~ s/<div class=\"printfooter\">/<!--\n<\/Section>\n-->\n<div class=\"printfooter\">/i;
221 $body_text =~ s/<div class=\"printfooter\">/<\/div>\n<\/div>\n<!--\n<\/Section>\n-->\n<div class=\"printfooter\">/i;
222 }
223 }
224 }
225
226 # If delete_nav is enabled, it means to get rid of navigation contents.
227 # if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} == 1)){
228 # if (defined $self->{'nav_exp'}&& $self->{'nav_exp'} =~ /\S/){
229 # print "it matches nav_exp!!\n" if $body_text =~ /$self->{'nav_exp'}/;
230 # $body_text =~ s/$self->{'nav_exp'}//isg;
231 # }
232 #}
233 my $searchbox = "";
234 if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} == 1)){
235 my $nav_match_express;
236 if (defined $self->{'nav_exp'}&& $self->{'nav_exp'} =~ /\S/) {
237 $nav_match_express = $self->{'nav_exp'} ;
238 } else { # default setting for mediawiki
239 $nav_match_express = "<div class=\"printfooter\">(.|\n)*secs. -->";
240 }
241
242 print "it matches nav_exp!!\n" if $body_text =~ /$self->{'nav_exp'}/;
243
244 # $body_text =~ m/<div class=\"printfooter\">(.|\n)*secs. -->/isg;
245 $body_text =~ m/$nav_match_express/isg;
246 my $navigate = $& if defined $&;
247
248 # find the search box and add it to the document page
249 if(defined $navigate && $navigate =~ /\S/){
250 $navigate =~ m/<div id="p-search" class="portlet">(.|\n)*<\/form>/;
251 $searchbox = $& . "\n<\/div>\n<\/div>";
252 $searchbox =~ s/action="([^>]*)"/action="\/gsdl\/cgi-bin\/library"/isg;
253 $searchbox =~ s/name="search"/name="q"/isg;
254 $searchbox =~ s/name="go"//isg;
255 $searchbox =~ s/name="fulltext"//isg;
256 my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
257 ."<input type=\"hidden\" name=\"c\" value=\"wikitest\"/>\n"
258 ."<input type=\"hidden\" name=\"fqf\" value=\"TX\"/>"
259 ."<input type=\"hidden\" name=\"t\" value=\"1\">";
260 $searchbox =~ s/<\/form>/$hidden_params<\/form>/isg;
261 $searchbox = "\n</div>\n</div><div id=\"wikispecificstyle\"><div id=\"column-one\">$searchbox</div></div>";
262 }
263
264 # $body_text =~ s/<div class=\"printfooter\">(.|\n)*secs. -->/$searchbox/isg;
265 $body_text =~ s/$nav_match_express/$searchbox/isg;
266 }
267
268 if ($self->{'tag_sections'}) {
269 $body_text =~ s/<!--\n<\/Section>/$searchbox\n<!--\n<\/Section>/ig;
270 }
271
272 # Tidy up extra new lines
273 $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
274 $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
275
276 $section_text .= "<!--\n<Section>\n-->\n";
277 my $body = "<body".$body_text;
278
279 $$textref = $body;
280
281 # get the base dir for convert absolute links to relative links
282 $$textref =~ m"href=\"(.*?)/cache/(.*?)/"i;
283 my $basedir = $2;
284
285 $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
286 $$textref =~ s/(&nbsp;)+/&nbsp;/sg;
287
288 # get rid of the [edit] button
289 $$textref =~ s/\[<a([^>]*)>edit<\/a>]//g;
290
291 # get rid of the last time edit information at the bottom
292 $$textref =~ s/<a href="(.+)edit(.*?)"(.*?)>(\w+)<\/a> \d\d:\d\d,(.*?)(PST)//g;
293
294 # get rid of the (Redirected from ...)
295 $$textref =~ s/(Redirected from <a ([^>]*)>(\w|\s)*<\/a>)//isg;
296
297 # escape macros
298 $$textref =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg;
299 # may change the links, like Greenstone_Documentation_All.html, then change back
300 $$textref =~ s/<a([^>]*)_<span>([^>]*)<\/span>_/<a$1_$2_/isg;
301
302 # convert all the urls to relative url, because current wget 1.10 -k and -E option doesn't work together
303 # get rid of the title attribute of a tag
304 $$textref =~ s/<a([^>]*)title="(.*?)"/<a$1/isg;
305 # find the relative path of current directory
306 if($basedir ne ""){
307 my @dirs=split("\/", $file);
308 my $dirnum = scalar(@dirs);
309 my $replace = "";
310 for(my $i=0; $i<$dirnum-2; $i++){
311 $replace .= "../";
312 }
313 # test if the linked relative file exists, if not, link to the internet version
314 $$textref =~ s/(href|src)="([^>]*)$basedir\/([^>]*)"/$1="$replace$3"/gi;
315 # my @total_links = ($$textref =~ m/(href|src)="([^>]*)$basedir\/([^>]*)"/gi);
316 # print $outhandle "\nnumber of total links: " . scalar(@total_links)."\n";
317 # for(my $cur_link_no = 0; $cur_link_no < scalar(@total_links); $cur_link_no++){
318
319 #while($$textref =~ m/(href|src)="([^>]*)$basedir\/([^>]*)"/gi){
320 #$total_links[$cur_link_no] =~ m/(href|src)="([^>]*)$basedir\/([^>]*)"/i;
321 # my $prefix = $1;
322 # my $link = $&;
323 # my $rel_file_name = $3;
324 # my $rel_link = "$replace$rel_file_name";
325 # print $outhandle "catched link==> $link\nrelative link==> $rel_link\n";
326 # if(-e $rel_link){
327 # $rel_link = "$prefix=\"$rel_link\"";
328 # $$textref =~ s/$link/$rel_link/i;
329 # }else{
330 # my $ext_link = "$prefix=\"http:\/\/$basedir\/$rel_file_name\"";
331 # print $outhandle "external link==> $ext_link\n";
332 # $$textref =~ s/$link/$ext_link/i; #s/$link/$prefix="http:\/\/$rel_file_name"/i;
333 # }
334 #}
335
336
337 # tag the link to new wiki pages as red
338 $$textref =~ s/(href|src)="$replace([^>]*)&amp;action=edit([^>]*)"/$1="http:\/\/$basedir\/$2&amp;action=edit$3"/gi;
339 $$textref =~ s/<a([^>]*)class="new"([^>]*)>/<a$1style="color:red"$2)>/gi;
340
341 # tag the link to external pages as blue
342 $$textref =~ s/<a([^>]*)class='external text'([^>]*)>/<a$1style="color:blue"$2)>/gi;
343
344 #print $outhandle $$textref;
345 }
346
347 # if 'show_toc' is set, put the table of content on the Wiki Main_Page to the about page of the collection
348 # 1. read _content_ macro from about.dm
349 # 2. append the toc, change all links to the Greenstone internal format for relative links
350 # 3. write to the extra.dm
351 # TODO: currently we suppose the _about:content_ hasn't been specified before
352 # so needs to add function to handle when the macro is already in the extra.dm
353 if($self->{'show_toc'}==1 && $file =~ m/Main_Page.(html|htm)$/){
354 my $macro_path = $base_dir;
355 $macro_path =~ s/import$/macros/;
356 my $extra_dm;
357 my $extradm_file = "$macro_path/extra.dm";
358 if(open(INPUT, "<$extradm_file")){
359 while(my $line = <INPUT>){
360 $extra_dm .= $line;
361 }
362 close(INPUT);
363
364 if($extra_dm =~ m/package about/ && $extra_dm =~ m/_content_(\s)*{/){
365 print $outhandle "already changed!!!!\n";
366 } else {
367 # read _content_ macro from about.dm file
368 my $about_macro = $ENV{'GSDLHOME'} . "/macros/about.dm";
369 my $about_page_content = "";
370 if(open(INPUT, "<$about_macro")){
371 while(my $line=<INPUT>){
372 $about_page_content .= $line;
373 }
374 }else{
375 print $outhandle "can't open file $about_macro\n";
376 }
377 close(INPUT);
378
379 # extract the _content_ macro
380 $about_page_content =~ m/_content_ {(.|\n)*<\/div>\n\n<\/div>\n}/i;
381 $about_page_content = $&;
382
383 # extract toc of the Main_Page
384 my $mainpage_content = "";
385 if($self->{'toc_exp'} =~ /\S/){
386 $$textref =~ /$self->{'toc_exp'}/;
387 $mainpage_content = $&;
388 } else {
389 # $mainpage_content =~ s/<!-- start content -->(.|\n)*<!-- end content -->/$1/igs;
390 }
391 # print $outhandle "---------\n$$textref\n--------\n\n";
392 # print $outhandle "==========\n$mainpage_content\n==========\n\n";
393
394 # add toc to the _content_ macro
395 $about_page_content =~ m/{(.|\n)*<\/div>\n\n/;
396 $extra_dm .= "package about\n_content_$&\n\n<div class=\"section\">\n$mainpage_content\n</div>\n</div>\n}";
397
398 # change all links to the internal Greenstone relative link format
399 $extra_dm =~ s/<a href="([^>]*)"/<a href="_httpquery_&a=extlink&rl=1&href=http:\/\/$basedir$1"/isg;
400 $extra_dm =~ s/(\.\.\/)+/\//isg;
401 # print $outhandle "to add---------\n$extra_dm\n--------\n";
402
403 # write to the extra.dm file of the collection
404 open(OUTPUT, ">$extradm_file");
405 print OUTPUT $extra_dm;
406 close(OUTPUT);
407 }
408 } else {
409 print $outhandle "can't open file $extradm_file\n";
410 }
411 }
412
413 # If delete_toc is enabled, it means to get rid of toc and tof contents.
414 # get rid of TOC and TOF sections and their title
415 if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
416 if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/){
417 # $body_text =~ s/<p class=(($self->{'toc_exp'})[^>]*)>(.+?)<\/p>//isg;
418 # print "it matches toc_exp!!\n" if $body_text =~ /$self->{'toc_exp'}/;
419 # $body_text =~ s/$self->{'toc_exp'}//i;
420 print "it matches toc_exp!!\n" if $$textref =~ /$self->{'toc_exp'}/;
421 $$textref =~ s/$self->{'toc_exp'}//i;
422 }
423 }
424
425 # To add a layer on top of the wiki page
426 # so as to keep the wiki style inside the wiki page
427 # and keep the Greenstone style at the same time
428 $$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is;
429 $$textref =~ s/<\/body>/<\/div><\/body>/is;
430
431 # tag with sections
432 $$textref =~ s/<body([^>]*)>/$&\n<!--\n<Section>\n<Description>\n<Metadata name=\"Title\">$doctitle<\/Metadata>\n<\/Description>\n-->\n/is;
433 $$textref =~ s/<\/body>/\n<!--\n<\/Section>\n-->\n/is;
434
435 #print $outhandle "\n\n$$textref\n\n";
436
437 # use description tags
438 if ($self->{'description_tags'}) {
439 my $cursection = $doc_obj->get_top_section();
440 # remove the html header - note that doing this here means any
441 # sections defined within the header will be lost (so all <Section>
442 # tags must appear within the body of the HTML)
443 my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
444
445 $$textref =~ s/^.*?<body[^>]*>//is;
446 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
447
448 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
449 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
450
451 my $lt = '(?:<|&lt;)';
452 my $gt = '(?:>|&gt;)';
453 my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
454
455 # my $dont_strip = '';
456 # if ($self->{'no_strip_metadata_html'}) {
457 # ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
458 # }
459
460 my $found_something = 0;
461 my $top = 1;
462 while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
463 my $text = $1;
464 my $comment = $2;
465 if (defined $text) {
466 # text before a comment - note that getting to here
467 # doesn't necessarily mean there are Section tags in
468 # the document
469 # print $outhandle "section text:\n$text\n";
470 $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
471 }
472 while ($comment =~ s/$lt(.*?)$gt//s) {
473 my $tag = $1;
474 if ($tag eq "Section") {
475 $found_something = 1;
476 $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
477 $top = 0;
478 } elsif ($tag eq "/Section") {
479 $found_something = 1;
480 $cursection = $doc_obj->get_parent_section ($cursection);
481 } elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
482 my $metaname = $1;
483 my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0;
484 $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
485 my $metavalue = $1;
486 $metavalue =~ s/^\s+//;
487 $metavalue =~ s/\s+$//;
488 # assume that no metadata value intentionally includes
489 # carriage returns or HTML tags (if they're there they
490 # were probably introduced when converting to HTML from
491 # some other format).
492 # actually some people want to have html tags in their
493 # metadata.
494 $metavalue =~ s/[\cJ\cM]/ /sg;
495 # $metavalue =~ s/<[^>]+>//sg unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ /^($dont_strip)$/);
496 $metavalue =~ s/\s+/ /sg;
497 # print $outhandle "metaname = $metaname\nmetavalue = $metavalue\n";
498 if ($accumulate) {
499 $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
500 } else {
501 $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
502 }
503 } elsif ($tag eq "Description" || $tag eq "/Description") {
504 # do nothing with containing Description tags
505 } else {
506 # simple HTML tag (probably created by the conversion
507 # to HTML from some other format) - we'll ignore it and
508 # hope for the best ;-)
509 }
510 }
511 }# end while
512
513 if ($cursection ne "") {
514 print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
515 }
516
517 $$textref =~ s/^.*?<body[^>]*>//is;
518 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
519 if ($$textref =~ /\S/) {
520 if (!$found_something) {
521 if ($self->{'verbosity'} > 2) {
522 print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
523 print $outhandle " will be processed as a single section document\n";
524 }
525
526 # go ahead and process single-section document
527 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
528
529 } else {
530 print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
531 print $outhandle " of the final closing </Section> tag. This text will\n";
532 print $outhandle " be ignored.";
533
534 my ($text);
535 if (length($$textref) > 30) {
536 $text = substr($$textref, 0, 30) . "...";
537 } else {
538 $text = $$textref;
539 }
540 $text =~ s/\n/ /isg;
541 print $outhandle " ($text)\n";
542 }
543 } elsif (!$found_something) {
544 if ($self->{'verbosity'} > 2) {
545 # may get to here if document contained no valid Section
546 # tags but did contain some comments. The text will have
547 # been processed already but we should print the warning
548 # as above and extract metadata
549 print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags and\n";
550 print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
551 }
552 }
553 } # if $self->{'description_tags'}
554 else {
555 # remove header and footer
556 # if (!$self->{'keep_head'}) {
557 # $$textref =~ s/^.*?<body[^>]*>//is;
558 # $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
559 # }
560
561 # single section document
562 # $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
563
564 # Important: to get the relative links to work,
565 # 1: use the below statement instead of the above one
566 # 2. cannot have process_section method.
567 # why?????
568 $self->SUPER::process(@_);
569 }
570 return 1;
571
572 #$self->SUPER::process(@_);
573}
574
575
576
577# note that process_section may be called multiple times for a single
578# section (relying on the fact that add_utf8_text appends the text to any
579# that may exist already).
580# sub process_section {
581# my $self = shift (@_);
582# my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
583
584 # trap links
585 # if (!$self->{'nolinks'}) {
586 # usemap="./#index" not handled correctly => change to "#index"
587 # $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
588 #$self->replace_usemap_links($1, $2, $3)/isge;
589
590 #$$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
591 #$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
592 #}
593
594 # trap images
595
596 # allow spaces if inside quotes - jrm21
597 #$$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)([\"\'][^\"\']+[\"\']|[^\s>]+)([^>]*>)/
598 #$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
599
600 # add text to document object
601 # turn \ into \\ so that the rest of greenstone doesn't think there
602 # is an escape code following. (Macro parsing loses them...)
603# $$textref =~ s/\\/\\\\/go;
604
605# $doc_obj->add_utf8_text($cursection, $$textref);
606#}
607
608
609sub extract_metadata
610{
611 my $self = shift (@_);
612 my ($textref, $metadata, $doc_obj) = @_;
613 my $outhandle = $self->{'outhandle'};
614
615 return if (!defined $textref);
616
617 # metadata fields to extract/save. 'key' is the (lowercase) name of the
618 # html meta, 'value' is the metadata name for greenstone to use
619 my %find_fields = ();
620 my ($tag,$value);
621
622 my $orig_field = "";
623 foreach my $field (split /,/, $self->{'metadata_fields'}) {
624 # support tag<tagname>
625 if ($field =~ /^(.*?)<(.*?)>$/) {
626 # "$2" is the user's preferred gs metadata name
627 $find_fields{lc($1)}=$2; # lc = lowercase
628 $orig_field = $1;
629 } else { # no <tagname> for mapping
630 # "$field" is the user's preferred gs metadata name
631 $find_fields{lc($field)}=$field; # lc = lowercase
632 $orig_field = $field;
633 }
634
635 if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
636 $tag = $orig_field;
637 $value = $1;
638 if (!defined $value || !defined $tag){
639 #print $outhandle "StructuredHTMLPlug: can't find VALUE in \"$tag\"\n";
640 next;
641 } else {
642 # clean up and add
643 chomp($value); # remove trailing \n, if any
644 $tag = $find_fields{lc($tag)};
645 #print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
646 # if ($self->{'verbosity'} > 2);
647 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
648 }
649 }
650 }
651}
652
6531;
Note: See TracBrowser for help on using the repository browser.