source: main/trunk/greenstone2/perllib/plugins/StructuredHTMLPlugin.pm

Last change on this file was 33309, checked in by ak19, 5 years ago

More workarounds for HTML conversion results from Word's windows_scripting. If there were newlines between headings in the original word doc that accidentally had heading formatting, then the windows_scripting conversion creates a skeleton heading containing no actual text but space. The result being that when the doc.xml is generated, there's an empty subsection reflecting that empty heading. So adding further cleanup into StructuredHTMLPlugin to look for and remove these empty headings resulting from common Word user errors.

  • Property svn:keywords set to Author Date Id Revision
File size: 14.7 KB
RevLine 
[10271]1###########################################################################
2#
[15872]3# StructuredHTMLPlugin.pm -- html plugin with extra facilities for teasing out
[10404]4# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
5# HTML document
[10271]6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
[10404]28# This plugin is to process an HTML file where sections are divided by
29# user-defined headings tags. As it is difficult to predict what user's definition
30# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
31# as well as allows to get rid of user-defined Table of Content (TOC)...
32# format:e.g. level1 (Abstract_title|ChapterTitle|Referencing Heading) level2(SectionHeading)...
33
[15872]34package StructuredHTMLPlugin;
[10271]35
[15872]36use HTMLPlugin;
37use ImageConverter; # want the identify method
[24600]38use util;
[10271]39
[15872]40use strict; # every perl program should have this!
41no strict 'refs'; # make an exception so we can use variables as filehandles
[10404]42
[10271]43sub BEGIN {
[15872]44 @StructuredHTMLPlugin::ISA = ('HTMLPlugin');
[10271]45}
46
[11849]47my $arguments =
48 [
49 { 'name' => "level1_header",
[15872]50 'desc' => "{StructuredHTMLPlugin.level1_header}",
[11849]51 'type' => "regexp",
52 'reqd' => "no",
53 'deft' => "" },
54 { 'name' => "level2_header",
[15872]55 'desc' => "{StructuredHTMLPlugin.level2_header}",
[11849]56 'type' => "regexp",
57 'reqd' => "no",
58 'deft' => "" },
59 { 'name' => "level3_header",
[15872]60 'desc' => "{StructuredHTMLPlugin.level3_header}",
[11849]61 'type' => "regexp",
62 'reqd' => "no",
63 'deft' => "" },
64 { 'name' => "title_header",
[15872]65 'desc' => "{StructuredHTMLPlugin.title_header}",
[11849]66 'type' => "regexp",
67 'reqd' => "no",
68 'deft' => "" },
[11884]69 { 'name' => "delete_toc",
[15872]70 'desc' => "{StructuredHTMLPlugin.delete_toc}",
[11884]71 'type' => "flag",
72 'reqd' => "no"},
[11849]73 { 'name' => "toc_header",
[15872]74 'desc' => "{StructuredHTMLPlugin.toc_header}",
[11849]75 'type' => "regexp",
76 'reqd' => "no",
[11884]77 'deft' => "" }
[11849]78 ];
[10271]79
[15872]80my $options = { 'name' => "StructuredHTMLPlugin",
81 'desc' => "{StructuredHTMLPlugin.desc}",
[10271]82 'abstract' => "no",
83 'inherits' => "yes",
84 'args' => $arguments };
85
86sub new {
87 my ($class) = shift (@_);
88 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
89 push(@$pluginlist, $class);
90
[15872]91 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92 push(@{$hashArgOptLists->{"OptList"}},$options);
[10271]93
[15872]94 my $self = new HTMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
[10271]95
96 return bless $self, $class;
97}
98
99
100sub process {
101 my $self = shift (@_);
[10404]102 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[10271]103 my $outhandle = $self->{'outhandle'};
104
105 my @head_and_body = split(/<body/i,$$textref);
106 my $head = shift(@head_and_body);
[33301]107 my $body_text = join("<body", @head_and_body); # won't actually work to prefix "<body" to just the body remaining in @head_and_body array, since only 1 element (the body) remains in @head_and_body
[10600]108 $head =~ m/<title>(.+)<\/title>/i;
109 my $doctitle = $1 if defined $1;
[10723]110 if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
[10426]111 my @doc_properties = split(/<xml>/i,$head);
112 my $doc_heading = shift(@doc_properties);
113 my $rest_doc_properties = join(" ", @doc_properties);
[11380]114
[10426]115 my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
116 my $extracted_metadata = shift (@extracted_metadata);
117 $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
118 }
119
[11893]120 # set the title here if we haven't found it yet
121 if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
122 if (defined $doctitle && $doctitle =~ /\S/) {
123 $doc_obj->add_metadata($doc_obj->get_top_section(), "Title", $doctitle);
124 } else {
125 $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
126 }
127 }
128
129 # If delete_toc is enabled, it means to get rid of toc and tof contents.
[10271]130 # get rid of TOC and TOF sections and their title
[11380]131 if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
[10496]132 if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
133 $body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
134 }
[10443]135 }
[10600]136
[10496]137 if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
[10271]138 $self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
[10600]139 $body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
[10271]140 }
[10443]141
142 if (defined $self->{'level1_header'} && $self->{'level1_header'}=~ /\S/ ){
143 $self->{'level1_header'} =~ s/^\((.*)\)/$1/i;
[10595]144 $body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
[10271]145 }
[10443]146
[10496]147 if (defined $self->{'level2_header'} && $self->{'level2_header'}=~ /\S/){
[10443]148 $self->{'level2_header'} =~ s/^\((.*)\)/$1/i;
[10595]149 $body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
[10271]150 }
151
[10496]152 if (defined $self->{'level3_header'} && $self->{'level3_header'}=~ /\S/ ){
[10443]153 $self->{'level3_header'} =~ s/^\((.*)\)/$1/is;
[10271]154 $body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
155 }
[11893]156
[10271]157 # Tidy up extra new lines
158 $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
159 $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
[33301]160
161 # what was the following line for. effectively unused. do we need it??
[15872]162 #$section_text .= "<!--\n<Section>\n-->\n";
[10600]163 #my $top_section_tag = "<!--\n<Section>\n-->\n";
164 #$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
165 #$body_text =~ s/(<div.*)/$top_section_tag$1/i;
[10271]166 my $body = "<body".$body_text;
[33301]167
[33309]168 # remove empty headings that Word's windows_scripting may insert for multiple new lines around headings
169 # have heading markup, e.g. <h2><o:p>&nbsp;</o:p></h2>
170 $body =~ s@<h[1-6]>(<o:p>)?(&nbsp;)+(</o:p>)?</h[1-6]>@@gis;
171
[10271]172 my $section_text = $head;
[10426]173
[10271]174 # split HTML text on <h1>, <h2> etc tags
175 my @h_split = split(/<h/i,$body);
[10426]176
[10271]177 my $hnum = 0;
178
179 my $sectionh1 = 0;
180 $section_text .= shift(@h_split);
[33301]181
182 # When windows_scripting is on, WordPlugin invokes Word to convert the doc(x) file to HTML which is then
183 # processed by this StructuredHTMLPlugin. However, Word will embed the entire HTML body content inside a <div>
184 # This <div> becomes problematic, since in sectioned documents, the first section would end up starting with a div
185 # but not contain a matching closing div, while the final section will end with an unmatched closing div.
186 # So, as a hack, we remove any opening <div> appearing immediately after the <body> before the first <h>eading.
187 # And we'll set a flag to remember to remove any corresponding closing </div> before the closing </body>.
188 # So, now we look for any unclosed <div> elements in the preamble (pre-Headings) html that is in $section_text
189 my $remove_global_div = 0;
190 if($section_text =~ m/^(.*?)\s*<div[^>]*>\s*$/is) {
191 $section_text = $1;
192 $remove_global_div = 1;
193 print $outhandle "********** Found and removed a global opening <div> at start of html body, will monitor for closing div too.\n"
194 if $self->{'verbosity'} > 2;
195 }
196
[10271]197 my $hc;
198 foreach $hc ( @h_split )
199 {
[33299]200 if ($hc =~ m/^([1-3])\s*.*?>(.*)$/s)
[10271]201 {
[33299]202 my $new_hnum = $1;
203 my $hc_after = $2;
204 my $filtered_hc = undef;
205
206 if ($hc_after =~ m/^(.*?)<\/h$new_hnum>(.*)$/is)
[10271]207 {
[33299]208 my $h_text = $1;
209 $filtered_hc = $2; # This represents the remainder of $hc, after the (e.g.) <h2>xxxx</h2> element
210
211 $hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
212 # boil HTML down to some interesting text
213 $h_text =~ s/^[1-3]>//;
214 $h_text =~ s/<\/?.*?>//sg;
215 $h_text =~ s/\s+/ /sg;
216 $h_text =~ s/^\s$//s;
217 $h_text =~ s/(&nbsp;)+\W*/&nbsp;/sg;
218
219 if ($h_text =~ m/\w+/)
220 {
221 if ($new_hnum > $hnum)
222 {
223 # increase section nesting
224 $hnum++;
225 while ($hnum < $new_hnum)
226 {
227 my $spacing = " " x $hnum;
228 $section_text .= "<!--\n";
229 $section_text .= $spacing."<Section>\n";
230 $section_text .= "-->\n";
231 $hnum++;
232 }
233 }
234 else # ($new_hnum <= $hnum)
235 {
236 # descrease section nesting
237 while ($hnum >= $new_hnum)
238 {
239 my $spacing = " " x $hnum;
240 $section_text .= "<!--\n";
241 $section_text .= $spacing."</Section>\n";
242 $section_text .= "-->\n";
243 $hnum--;
244 }
245 $hnum++;
246 }
247
248 my $spacing = " " x $hnum;
249 $section_text .= "<!--\n";
250 $section_text .= $spacing."<Section>\n";
251 $section_text .= $spacing." <Description>\n";
252 $section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
253 $section_text .= $spacing." </Description>\n";
254 $section_text .= "-->\n";
255
256 #print $outhandle $spacing."$h_text\n"
257 # if $self->{'verbosity'} > 2;
258
259 $sectionh1++ if ($hnum==1);
260 }
[10271]261 }
[33299]262 else {
263### print STDERR "***** hc = <h$hc\n\n";
[10271]264 }
[33299]265
266 # This can probably be replaced by the first statement in the if-statement, because
267 # $filtered_hc should always be defined when it is assigned above (even if it resolves
268 # to be an empty string)
269 if (defined $filtered_hc) {
270 $section_text .= $filtered_hc;
271 }
272 else {
273 $section_text .= "<h$hc";
274 }
[10271]275 }
[33299]276 else
277 {
278 $section_text .= "<h$hc";
279 }
[10271]280 }
[33301]281
282 if($remove_global_div) { # then need to also handle a closing </div> tag for the global div too, and if one is present, remove it
283 $section_text =~ s@\s*</div[^>]*>(\s*</body>\s*</html>\s*)$@$1@is;
284 print $outhandle "********** Removing any matching closing global divider element\n"
285 if $self->{'verbosity'} > 2;
286 }
287
288
289 while ($hnum >= 1)
[10271]290 {
291 my $spacing = " " x $hnum;
292 $section_text .= "<!--\n";
293 $section_text .= $spacing."</Section>\n";
294 $section_text .= "-->\n";
295 $hnum--;
296 }
297
298 $section_text .= "<!--\n</Section>\n-->\n";
[33301]299
[10271]300 $$textref = $section_text;
[10426]301
[11893]302# if ($sectionh1>0)
303# {
304# print $outhandle " Located section headings ..."
305# if $self->{'verbosity'} > 1;
306# }
[10426]307
[10271]308 $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
[10426]309
[10271]310 $$textref =~ s/(&nbsp;)+/&nbsp;/sg;
311
[10426]312 ## $$textref =~ s/<o:p>&nbsp;<\/o:p>//g; # used with VML to space figures?
313
[10271]314 $self->SUPER::process(@_);
[10426]315
[10271]316}
317
318
319sub resize_if_necessary
320{
321 my ($self,$front,$back,$base_dir,$href) = @_;
[10426]322
[10271]323 # dig out width and height of image, if there
324 my $img_attributes = "$front back";
325 my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
326 my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
[10426]327
[10271]328 # derive local filename for image based on its URL
329 my $img_filename = $href;
330 $img_filename =~ s/^[^:]*:\/\///;
331 $img_filename = &util::filename_cat($base_dir, $img_filename);
[10426]332
[10271]333 # Replace %20's in URL with a space if required. Note that the filename
334 # may include the %20 in some situations
335 if ($img_filename =~ /\%20/) {
336 if (!-e $img_filename) {
337 $img_filename =~ s/\%20/ /g;
338 }
339 }
340 if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
341 # get image info on width and height
[10426]342
[10271]343 my $outhandle = $self->{'outhandle'};
344 my $verbosity = $self->{'verbosity'};
345
346 my ($image_type, $actual_width, $actual_height, $image_size)
[15872]347 = &ImageConverter::identify($img_filename, $outhandle, $verbosity);
[10426]348
[10356]349 #print STDERR "**** $actual_width x $actual_height";
350 #print STDERR " (requested: $img_width x $img_height)\n";
[10271]351
352 if (($img_width < $actual_width) || ($img_height < $actual_height)) {
[11893]353 #print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
[10426]354
[10271]355 # derive new image name based on current image
356 my ($tailname, $dirname, $suffix)
[15872]357 = &File::Basename::fileparse($img_filename, "\\.[^\\.]+\$");
[10426]358
[10271]359 my $resized_filename
360 = &util::filename_cat($dirname, $tailname."_resized".$suffix);
[10426]361
[10356]362 #print STDERR "**** suffix = $suffix\n";
[10426]363
[10271]364 # Generate smaller image with convert
[15872]365 my $newsize = "$img_width"."x$img_height";
[10271]366 my $command = "convert -interlace plane -verbose "
[15872]367 ."-geometry $newsize \"$img_filename\" \"$resized_filename\"";
[24600]368 $command = "\"".&util::get_perl_exec()."\" -S gs-magick.pl $command";
[11893]369 #print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
370 #my $result = '';
371 #print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
[10271]372 }
373 }
374 return $href;
375}
376
377
[10426]378sub extract_metadata
379{
380 my $self = shift (@_);
381 my ($textref, $metadata, $doc_obj) = @_;
382 my $outhandle = $self->{'outhandle'};
383
[11380]384 return if (!defined $textref);
385
[21801]386 my $separator = $self->{'metadata_field_separator'};
387 if ($separator eq "") {
388 undef $separator;
389 }
[10426]390 # metadata fields to extract/save. 'key' is the (lowercase) name of the
391 # html meta, 'value' is the metadata name for greenstone to use
392 my %find_fields = ();
393 my ($tag,$value);
[10271]394
[10426]395 my $orig_field = "";
[19993]396 foreach my $field (split /\s*,\s*/, $self->{'metadata_fields'}) {
[10426]397 # support tag<tagname>
[19993]398 if ($field =~ /^(.*?)\s*<(.*?)>$/) {
[10426]399 # "$2" is the user's preferred gs metadata name
400 $find_fields{lc($1)}=$2; # lc = lowercase
401 $orig_field = $1;
402 } else { # no <tagname> for mapping
403 # "$field" is the user's preferred gs metadata name
404 $find_fields{lc($field)}=$field; # lc = lowercase
405 $orig_field = $field;
406 }
[11380]407
[10426]408 if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
409 $tag = $orig_field;
410 $value = $1;
411 if (!defined $value || !defined $tag){
[15872]412 #print $outhandle "StructuredHTMLPlugin: can't find VALUE in \"$tag\"\n";
[10426]413 next;
414 } else {
415 # clean up and add
416 chomp($value); # remove trailing \n, if any
417 $tag = $find_fields{lc($tag)};
[11893]418 #print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
419 # if ($self->{'verbosity'} > 2);
[21801]420 if (defined $separator) {
421 my @values = split($separator, $value);
422 foreach my $v (@values) {
423 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $v) if $v =~ /\S/;
424 }
425 }
426 else {
427 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
428 }
[10426]429 }
430 }
431 }
432}
433
[10271]4341;
Note: See TracBrowser for help on using the repository browser.