source: main/trunk/greenstone2/perllib/plugins/StructuredHTMLPlugin.pm@ 33309

Last change on this file since 33309 was 33309, checked in by ak19, 5 years ago

More workarounds for HTML conversion results from Word's windows_scripting. If there were newlines between headings in the original word doc that accidentally had heading formatting, then the windows_scripting conversion creates a skeleton heading containing no actual text but space. The result being that when the doc.xml is generated, there's an empty subsection reflecting that empty heading. So adding further cleanup into StructuredHTMLPlugin to look for and remove these empty headings resulting from common Word user errors.

  • Property svn:keywords set to Author Date Id Revision
File size: 14.7 KB
Line 
1###########################################################################
2#
3# StructuredHTMLPlugin.pm -- html plugin with extra facilities for teasing out
4# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
5# HTML document
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28# This plugin is to process an HTML file where sections are divided by
29# user-defined headings tags. As it is difficult to predict what user's definition
30# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
31# as well as allows to get rid of user-defined Table of Content (TOC)...
32# format:e.g. level1 (Abstract_title|ChapterTitle|Referencing Heading) level2(SectionHeading)...
33
34package StructuredHTMLPlugin;
35
36use HTMLPlugin;
37use ImageConverter; # want the identify method
38use util;
39
40use strict; # every perl program should have this!
41no strict 'refs'; # make an exception so we can use variables as filehandles
42
43sub BEGIN {
44 @StructuredHTMLPlugin::ISA = ('HTMLPlugin');
45}
46
47my $arguments =
48 [
49 { 'name' => "level1_header",
50 'desc' => "{StructuredHTMLPlugin.level1_header}",
51 'type' => "regexp",
52 'reqd' => "no",
53 'deft' => "" },
54 { 'name' => "level2_header",
55 'desc' => "{StructuredHTMLPlugin.level2_header}",
56 'type' => "regexp",
57 'reqd' => "no",
58 'deft' => "" },
59 { 'name' => "level3_header",
60 'desc' => "{StructuredHTMLPlugin.level3_header}",
61 'type' => "regexp",
62 'reqd' => "no",
63 'deft' => "" },
64 { 'name' => "title_header",
65 'desc' => "{StructuredHTMLPlugin.title_header}",
66 'type' => "regexp",
67 'reqd' => "no",
68 'deft' => "" },
69 { 'name' => "delete_toc",
70 'desc' => "{StructuredHTMLPlugin.delete_toc}",
71 'type' => "flag",
72 'reqd' => "no"},
73 { 'name' => "toc_header",
74 'desc' => "{StructuredHTMLPlugin.toc_header}",
75 'type' => "regexp",
76 'reqd' => "no",
77 'deft' => "" }
78 ];
79
80my $options = { 'name' => "StructuredHTMLPlugin",
81 'desc' => "{StructuredHTMLPlugin.desc}",
82 'abstract' => "no",
83 'inherits' => "yes",
84 'args' => $arguments };
85
86sub new {
87 my ($class) = shift (@_);
88 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
89 push(@$pluginlist, $class);
90
91 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92 push(@{$hashArgOptLists->{"OptList"}},$options);
93
94 my $self = new HTMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
95
96 return bless $self, $class;
97}
98
99
100sub process {
101 my $self = shift (@_);
102 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
103 my $outhandle = $self->{'outhandle'};
104
105 my @head_and_body = split(/<body/i,$$textref);
106 my $head = shift(@head_and_body);
107 my $body_text = join("<body", @head_and_body); # won't actually work to prefix "<body" to just the body remaining in @head_and_body array, since only 1 element (the body) remains in @head_and_body
108 $head =~ m/<title>(.+)<\/title>/i;
109 my $doctitle = $1 if defined $1;
110 if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
111 my @doc_properties = split(/<xml>/i,$head);
112 my $doc_heading = shift(@doc_properties);
113 my $rest_doc_properties = join(" ", @doc_properties);
114
115 my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
116 my $extracted_metadata = shift (@extracted_metadata);
117 $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
118 }
119
120 # set the title here if we haven't found it yet
121 if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
122 if (defined $doctitle && $doctitle =~ /\S/) {
123 $doc_obj->add_metadata($doc_obj->get_top_section(), "Title", $doctitle);
124 } else {
125 $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
126 }
127 }
128
129 # If delete_toc is enabled, it means to get rid of toc and tof contents.
130 # get rid of TOC and TOF sections and their title
131 if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
132 if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
133 $body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
134 }
135 }
136
137 if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
138 $self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
139 $body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
140 }
141
142 if (defined $self->{'level1_header'} && $self->{'level1_header'}=~ /\S/ ){
143 $self->{'level1_header'} =~ s/^\((.*)\)/$1/i;
144 $body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
145 }
146
147 if (defined $self->{'level2_header'} && $self->{'level2_header'}=~ /\S/){
148 $self->{'level2_header'} =~ s/^\((.*)\)/$1/i;
149 $body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
150 }
151
152 if (defined $self->{'level3_header'} && $self->{'level3_header'}=~ /\S/ ){
153 $self->{'level3_header'} =~ s/^\((.*)\)/$1/is;
154 $body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
155 }
156
157 # Tidy up extra new lines
158 $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
159 $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
160
161 # what was the following line for. effectively unused. do we need it??
162 #$section_text .= "<!--\n<Section>\n-->\n";
163 #my $top_section_tag = "<!--\n<Section>\n-->\n";
164 #$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
165 #$body_text =~ s/(<div.*)/$top_section_tag$1/i;
166 my $body = "<body".$body_text;
167
168 # remove empty headings that Word's windows_scripting may insert for multiple new lines around headings
169 # have heading markup, e.g. <h2><o:p>&nbsp;</o:p></h2>
170 $body =~ s@<h[1-6]>(<o:p>)?(&nbsp;)+(</o:p>)?</h[1-6]>@@gis;
171
172 my $section_text = $head;
173
174 # split HTML text on <h1>, <h2> etc tags
175 my @h_split = split(/<h/i,$body);
176
177 my $hnum = 0;
178
179 my $sectionh1 = 0;
180 $section_text .= shift(@h_split);
181
182 # When windows_scripting is on, WordPlugin invokes Word to convert the doc(x) file to HTML which is then
183 # processed by this StructuredHTMLPlugin. However, Word will embed the entire HTML body content inside a <div>
184 # This <div> becomes problematic, since in sectioned documents, the first section would end up starting with a div
185 # but not contain a matching closing div, while the final section will end with an unmatched closing div.
186 # So, as a hack, we remove any opening <div> appearing immediately after the <body> before the first <h>eading.
187 # And we'll set a flag to remember to remove any corresponding closing </div> before the closing </body>.
188 # So, now we look for any unclosed <div> elements in the preamble (pre-Headings) html that is in $section_text
189 my $remove_global_div = 0;
190 if($section_text =~ m/^(.*?)\s*<div[^>]*>\s*$/is) {
191 $section_text = $1;
192 $remove_global_div = 1;
193 print $outhandle "********** Found and removed a global opening <div> at start of html body, will monitor for closing div too.\n"
194 if $self->{'verbosity'} > 2;
195 }
196
197 my $hc;
198 foreach $hc ( @h_split )
199 {
200 if ($hc =~ m/^([1-3])\s*.*?>(.*)$/s)
201 {
202 my $new_hnum = $1;
203 my $hc_after = $2;
204 my $filtered_hc = undef;
205
206 if ($hc_after =~ m/^(.*?)<\/h$new_hnum>(.*)$/is)
207 {
208 my $h_text = $1;
209 $filtered_hc = $2; # This represents the remainder of $hc, after the (e.g.) <h2>xxxx</h2> element
210
211 $hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
212 # boil HTML down to some interesting text
213 $h_text =~ s/^[1-3]>//;
214 $h_text =~ s/<\/?.*?>//sg;
215 $h_text =~ s/\s+/ /sg;
216 $h_text =~ s/^\s$//s;
217 $h_text =~ s/(&nbsp;)+\W*/&nbsp;/sg;
218
219 if ($h_text =~ m/\w+/)
220 {
221 if ($new_hnum > $hnum)
222 {
223 # increase section nesting
224 $hnum++;
225 while ($hnum < $new_hnum)
226 {
227 my $spacing = " " x $hnum;
228 $section_text .= "<!--\n";
229 $section_text .= $spacing."<Section>\n";
230 $section_text .= "-->\n";
231 $hnum++;
232 }
233 }
234 else # ($new_hnum <= $hnum)
235 {
236 # descrease section nesting
237 while ($hnum >= $new_hnum)
238 {
239 my $spacing = " " x $hnum;
240 $section_text .= "<!--\n";
241 $section_text .= $spacing."</Section>\n";
242 $section_text .= "-->\n";
243 $hnum--;
244 }
245 $hnum++;
246 }
247
248 my $spacing = " " x $hnum;
249 $section_text .= "<!--\n";
250 $section_text .= $spacing."<Section>\n";
251 $section_text .= $spacing." <Description>\n";
252 $section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
253 $section_text .= $spacing." </Description>\n";
254 $section_text .= "-->\n";
255
256 #print $outhandle $spacing."$h_text\n"
257 # if $self->{'verbosity'} > 2;
258
259 $sectionh1++ if ($hnum==1);
260 }
261 }
262 else {
263### print STDERR "***** hc = <h$hc\n\n";
264 }
265
266 # This can probably be replaced by the first statement in the if-statement, because
267 # $filtered_hc should always be defined when it is assigned above (even if it resolves
268 # to be an empty string)
269 if (defined $filtered_hc) {
270 $section_text .= $filtered_hc;
271 }
272 else {
273 $section_text .= "<h$hc";
274 }
275 }
276 else
277 {
278 $section_text .= "<h$hc";
279 }
280 }
281
282 if($remove_global_div) { # then need to also handle a closing </div> tag for the global div too, and if one is present, remove it
283 $section_text =~ s@\s*</div[^>]*>(\s*</body>\s*</html>\s*)$@$1@is;
284 print $outhandle "********** Removing any matching closing global divider element\n"
285 if $self->{'verbosity'} > 2;
286 }
287
288
289 while ($hnum >= 1)
290 {
291 my $spacing = " " x $hnum;
292 $section_text .= "<!--\n";
293 $section_text .= $spacing."</Section>\n";
294 $section_text .= "-->\n";
295 $hnum--;
296 }
297
298 $section_text .= "<!--\n</Section>\n-->\n";
299
300 $$textref = $section_text;
301
302# if ($sectionh1>0)
303# {
304# print $outhandle " Located section headings ..."
305# if $self->{'verbosity'} > 1;
306# }
307
308 $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
309
310 $$textref =~ s/(&nbsp;)+/&nbsp;/sg;
311
312 ## $$textref =~ s/<o:p>&nbsp;<\/o:p>//g; # used with VML to space figures?
313
314 $self->SUPER::process(@_);
315
316}
317
318
319sub resize_if_necessary
320{
321 my ($self,$front,$back,$base_dir,$href) = @_;
322
323 # dig out width and height of image, if there
324 my $img_attributes = "$front back";
325 my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
326 my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
327
328 # derive local filename for image based on its URL
329 my $img_filename = $href;
330 $img_filename =~ s/^[^:]*:\/\///;
331 $img_filename = &util::filename_cat($base_dir, $img_filename);
332
333 # Replace %20's in URL with a space if required. Note that the filename
334 # may include the %20 in some situations
335 if ($img_filename =~ /\%20/) {
336 if (!-e $img_filename) {
337 $img_filename =~ s/\%20/ /g;
338 }
339 }
340 if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
341 # get image info on width and height
342
343 my $outhandle = $self->{'outhandle'};
344 my $verbosity = $self->{'verbosity'};
345
346 my ($image_type, $actual_width, $actual_height, $image_size)
347 = &ImageConverter::identify($img_filename, $outhandle, $verbosity);
348
349 #print STDERR "**** $actual_width x $actual_height";
350 #print STDERR " (requested: $img_width x $img_height)\n";
351
352 if (($img_width < $actual_width) || ($img_height < $actual_height)) {
353 #print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
354
355 # derive new image name based on current image
356 my ($tailname, $dirname, $suffix)
357 = &File::Basename::fileparse($img_filename, "\\.[^\\.]+\$");
358
359 my $resized_filename
360 = &util::filename_cat($dirname, $tailname."_resized".$suffix);
361
362 #print STDERR "**** suffix = $suffix\n";
363
364 # Generate smaller image with convert
365 my $newsize = "$img_width"."x$img_height";
366 my $command = "convert -interlace plane -verbose "
367 ."-geometry $newsize \"$img_filename\" \"$resized_filename\"";
368 $command = "\"".&util::get_perl_exec()."\" -S gs-magick.pl $command";
369 #print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
370 #my $result = '';
371 #print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
372 }
373 }
374 return $href;
375}
376
377
378sub extract_metadata
379{
380 my $self = shift (@_);
381 my ($textref, $metadata, $doc_obj) = @_;
382 my $outhandle = $self->{'outhandle'};
383
384 return if (!defined $textref);
385
386 my $separator = $self->{'metadata_field_separator'};
387 if ($separator eq "") {
388 undef $separator;
389 }
390 # metadata fields to extract/save. 'key' is the (lowercase) name of the
391 # html meta, 'value' is the metadata name for greenstone to use
392 my %find_fields = ();
393 my ($tag,$value);
394
395 my $orig_field = "";
396 foreach my $field (split /\s*,\s*/, $self->{'metadata_fields'}) {
397 # support tag<tagname>
398 if ($field =~ /^(.*?)\s*<(.*?)>$/) {
399 # "$2" is the user's preferred gs metadata name
400 $find_fields{lc($1)}=$2; # lc = lowercase
401 $orig_field = $1;
402 } else { # no <tagname> for mapping
403 # "$field" is the user's preferred gs metadata name
404 $find_fields{lc($field)}=$field; # lc = lowercase
405 $orig_field = $field;
406 }
407
408 if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
409 $tag = $orig_field;
410 $value = $1;
411 if (!defined $value || !defined $tag){
412 #print $outhandle "StructuredHTMLPlugin: can't find VALUE in \"$tag\"\n";
413 next;
414 } else {
415 # clean up and add
416 chomp($value); # remove trailing \n, if any
417 $tag = $find_fields{lc($tag)};
418 #print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
419 # if ($self->{'verbosity'} > 2);
420 if (defined $separator) {
421 my @values = split($separator, $value);
422 foreach my $v (@values) {
423 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $v) if $v =~ /\S/;
424 }
425 }
426 else {
427 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
428 }
429 }
430 }
431 }
432}
433
4341;
Note: See TracBrowser for help on using the repository browser.