source: main/trunk/greenstone2/perllib/plugins/StructuredHTMLPlugin.pm@ 33299

Last change on this file since 33299 was 33299, checked in by ak19, 5 years ago
  1. Committing Dr Bainbridge's fix to remove duplicated heading titles in sectioned HTML produced from sectioned Word documents by StructuredHTMLPlugin. Committing this prior to debugging other issues. 2. Minor change to unicode.pm, improved description for function
  • Property svn:keywords set to Author Date Id Revision
File size: 13.0 KB
Line 
1###########################################################################
2#
3# StructuredHTMLPlugin.pm -- html plugin with extra facilities for teasing out
4# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
5# HTML document
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28# This plugin is to process an HTML file where sections are divided by
29# user-defined headings tags. As it is difficult to predict what user's definition
30# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
31# as well as allows to get rid of user-defined Table of Content (TOC)...
32# format:e.g. level1 (Abstract_title|ChapterTitle|Referencing Heading) level2(SectionHeading)...
33
34package StructuredHTMLPlugin;
35
36use HTMLPlugin;
37use ImageConverter; # want the identify method
38use util;
39
40use strict; # every perl program should have this!
41no strict 'refs'; # make an exception so we can use variables as filehandles
42
43sub BEGIN {
44 @StructuredHTMLPlugin::ISA = ('HTMLPlugin');
45}
46
47my $arguments =
48 [
49 { 'name' => "level1_header",
50 'desc' => "{StructuredHTMLPlugin.level1_header}",
51 'type' => "regexp",
52 'reqd' => "no",
53 'deft' => "" },
54 { 'name' => "level2_header",
55 'desc' => "{StructuredHTMLPlugin.level2_header}",
56 'type' => "regexp",
57 'reqd' => "no",
58 'deft' => "" },
59 { 'name' => "level3_header",
60 'desc' => "{StructuredHTMLPlugin.level3_header}",
61 'type' => "regexp",
62 'reqd' => "no",
63 'deft' => "" },
64 { 'name' => "title_header",
65 'desc' => "{StructuredHTMLPlugin.title_header}",
66 'type' => "regexp",
67 'reqd' => "no",
68 'deft' => "" },
69 { 'name' => "delete_toc",
70 'desc' => "{StructuredHTMLPlugin.delete_toc}",
71 'type' => "flag",
72 'reqd' => "no"},
73 { 'name' => "toc_header",
74 'desc' => "{StructuredHTMLPlugin.toc_header}",
75 'type' => "regexp",
76 'reqd' => "no",
77 'deft' => "" }
78 ];
79
80my $options = { 'name' => "StructuredHTMLPlugin",
81 'desc' => "{StructuredHTMLPlugin.desc}",
82 'abstract' => "no",
83 'inherits' => "yes",
84 'args' => $arguments };
85
86sub new {
87 my ($class) = shift (@_);
88 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
89 push(@$pluginlist, $class);
90
91 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92 push(@{$hashArgOptLists->{"OptList"}},$options);
93
94 my $self = new HTMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
95
96 return bless $self, $class;
97}
98
99
100sub process {
101 my $self = shift (@_);
102 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
103 my $outhandle = $self->{'outhandle'};
104
105 my @head_and_body = split(/<body/i,$$textref);
106 my $head = shift(@head_and_body);
107 my $body_text = join("<body", @head_and_body);
108 $head =~ m/<title>(.+)<\/title>/i;
109 my $doctitle = $1 if defined $1;
110 if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
111 my @doc_properties = split(/<xml>/i,$head);
112 my $doc_heading = shift(@doc_properties);
113 my $rest_doc_properties = join(" ", @doc_properties);
114
115 my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
116 my $extracted_metadata = shift (@extracted_metadata);
117 $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
118 }
119
120 # set the title here if we haven't found it yet
121 if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
122 if (defined $doctitle && $doctitle =~ /\S/) {
123 $doc_obj->add_metadata($doc_obj->get_top_section(), "Title", $doctitle);
124 } else {
125 $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
126 }
127 }
128
129 # If delete_toc is enabled, it means to get rid of toc and tof contents.
130 # get rid of TOC and TOF sections and their title
131 if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
132 if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
133 $body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
134 }
135 }
136
137 if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
138 $self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
139 $body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
140 }
141
142 if (defined $self->{'level1_header'} && $self->{'level1_header'}=~ /\S/ ){
143 $self->{'level1_header'} =~ s/^\((.*)\)/$1/i;
144 $body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
145 }
146
147 if (defined $self->{'level2_header'} && $self->{'level2_header'}=~ /\S/){
148 $self->{'level2_header'} =~ s/^\((.*)\)/$1/i;
149 $body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
150 }
151
152 if (defined $self->{'level3_header'} && $self->{'level3_header'}=~ /\S/ ){
153 $self->{'level3_header'} =~ s/^\((.*)\)/$1/is;
154 $body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
155 }
156
157 # Tidy up extra new lines
158 $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
159 $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
160
161 # what was the following line for. effectively unused. do we need it??
162 #$section_text .= "<!--\n<Section>\n-->\n";
163 #my $top_section_tag = "<!--\n<Section>\n-->\n";
164 #$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
165 #$body_text =~ s/(<div.*)/$top_section_tag$1/i;
166 my $body = "<body".$body_text;
167
168 my $section_text = $head;
169
170 # split HTML text on <h1>, <h2> etc tags
171 my @h_split = split(/<h/i,$body);
172
173 my $hnum = 0;
174
175 my $sectionh1 = 0;
176 $section_text .= shift(@h_split);
177
178 my $hc;
179 foreach $hc ( @h_split )
180 {
181 if ($hc =~ m/^([1-3])\s*.*?>(.*)$/s)
182 {
183 my $new_hnum = $1;
184 my $hc_after = $2;
185 my $filtered_hc = undef;
186
187 if ($hc_after =~ m/^(.*?)<\/h$new_hnum>(.*)$/is)
188 {
189 my $h_text = $1;
190 $filtered_hc = $2; # This represents the remainder of $hc, after the (e.g.) <h2>xxxx</h2> element
191
192 $hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
193 # boil HTML down to some interesting text
194 $h_text =~ s/^[1-3]>//;
195 $h_text =~ s/<\/?.*?>//sg;
196 $h_text =~ s/\s+/ /sg;
197 $h_text =~ s/^\s$//s;
198 $h_text =~ s/(&nbsp;)+\W*/&nbsp;/sg;
199
200 if ($h_text =~ m/\w+/)
201 {
202 if ($new_hnum > $hnum)
203 {
204 # increase section nesting
205 $hnum++;
206 while ($hnum < $new_hnum)
207 {
208 my $spacing = " " x $hnum;
209 $section_text .= "<!--\n";
210 $section_text .= $spacing."<Section>\n";
211 $section_text .= "-->\n";
212 $hnum++;
213 }
214 }
215 else # ($new_hnum <= $hnum)
216 {
217 # descrease section nesting
218 while ($hnum >= $new_hnum)
219 {
220 my $spacing = " " x $hnum;
221 $section_text .= "<!--\n";
222 $section_text .= $spacing."</Section>\n";
223 $section_text .= "-->\n";
224 $hnum--;
225 }
226 $hnum++;
227 }
228
229 my $spacing = " " x $hnum;
230 $section_text .= "<!--\n";
231 $section_text .= $spacing."<Section>\n";
232 $section_text .= $spacing." <Description>\n";
233 $section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
234 $section_text .= $spacing." </Description>\n";
235 $section_text .= "-->\n";
236
237 #print $outhandle $spacing."$h_text\n"
238 # if $self->{'verbosity'} > 2;
239
240 $sectionh1++ if ($hnum==1);
241 }
242 }
243 else {
244### print STDERR "***** hc = <h$hc\n\n";
245 }
246
247 # This can probably be replaced by the first statement in the if-statement, because
248 # $filtered_hc should always be defined when it is assigned above (even if it resolves
249 # to be an empty string)
250 if (defined $filtered_hc) {
251 $section_text .= $filtered_hc;
252 }
253 else {
254 $section_text .= "<h$hc";
255 }
256 }
257 else
258 {
259 $section_text .= "<h$hc";
260 }
261 }
262
263 while ($hnum >= 1)
264 {
265 my $spacing = " " x $hnum;
266 $section_text .= "<!--\n";
267 $section_text .= $spacing."</Section>\n";
268 $section_text .= "-->\n";
269 $hnum--;
270 }
271
272 $section_text .= "<!--\n</Section>\n-->\n";
273
274 $$textref = $section_text;
275
276# if ($sectionh1>0)
277# {
278# print $outhandle " Located section headings ..."
279# if $self->{'verbosity'} > 1;
280# }
281
282 $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
283
284 $$textref =~ s/(&nbsp;)+/&nbsp;/sg;
285
286 ## $$textref =~ s/<o:p>&nbsp;<\/o:p>//g; # used with VML to space figures?
287
288 $self->SUPER::process(@_);
289
290}
291
292
293sub resize_if_necessary
294{
295 my ($self,$front,$back,$base_dir,$href) = @_;
296
297 # dig out width and height of image, if there
298 my $img_attributes = "$front back";
299 my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
300 my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
301
302 # derive local filename for image based on its URL
303 my $img_filename = $href;
304 $img_filename =~ s/^[^:]*:\/\///;
305 $img_filename = &util::filename_cat($base_dir, $img_filename);
306
307 # Replace %20's in URL with a space if required. Note that the filename
308 # may include the %20 in some situations
309 if ($img_filename =~ /\%20/) {
310 if (!-e $img_filename) {
311 $img_filename =~ s/\%20/ /g;
312 }
313 }
314 if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
315 # get image info on width and height
316
317 my $outhandle = $self->{'outhandle'};
318 my $verbosity = $self->{'verbosity'};
319
320 my ($image_type, $actual_width, $actual_height, $image_size)
321 = &ImageConverter::identify($img_filename, $outhandle, $verbosity);
322
323 #print STDERR "**** $actual_width x $actual_height";
324 #print STDERR " (requested: $img_width x $img_height)\n";
325
326 if (($img_width < $actual_width) || ($img_height < $actual_height)) {
327 #print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
328
329 # derive new image name based on current image
330 my ($tailname, $dirname, $suffix)
331 = &File::Basename::fileparse($img_filename, "\\.[^\\.]+\$");
332
333 my $resized_filename
334 = &util::filename_cat($dirname, $tailname."_resized".$suffix);
335
336 #print STDERR "**** suffix = $suffix\n";
337
338 # Generate smaller image with convert
339 my $newsize = "$img_width"."x$img_height";
340 my $command = "convert -interlace plane -verbose "
341 ."-geometry $newsize \"$img_filename\" \"$resized_filename\"";
342 $command = "\"".&util::get_perl_exec()."\" -S gs-magick.pl $command";
343 #print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
344 #my $result = '';
345 #print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
346 }
347 }
348 return $href;
349}
350
351
352sub extract_metadata
353{
354 my $self = shift (@_);
355 my ($textref, $metadata, $doc_obj) = @_;
356 my $outhandle = $self->{'outhandle'};
357
358 return if (!defined $textref);
359
360 my $separator = $self->{'metadata_field_separator'};
361 if ($separator eq "") {
362 undef $separator;
363 }
364 # metadata fields to extract/save. 'key' is the (lowercase) name of the
365 # html meta, 'value' is the metadata name for greenstone to use
366 my %find_fields = ();
367 my ($tag,$value);
368
369 my $orig_field = "";
370 foreach my $field (split /\s*,\s*/, $self->{'metadata_fields'}) {
371 # support tag<tagname>
372 if ($field =~ /^(.*?)\s*<(.*?)>$/) {
373 # "$2" is the user's preferred gs metadata name
374 $find_fields{lc($1)}=$2; # lc = lowercase
375 $orig_field = $1;
376 } else { # no <tagname> for mapping
377 # "$field" is the user's preferred gs metadata name
378 $find_fields{lc($field)}=$field; # lc = lowercase
379 $orig_field = $field;
380 }
381
382 if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
383 $tag = $orig_field;
384 $value = $1;
385 if (!defined $value || !defined $tag){
386 #print $outhandle "StructuredHTMLPlugin: can't find VALUE in \"$tag\"\n";
387 next;
388 } else {
389 # clean up and add
390 chomp($value); # remove trailing \n, if any
391 $tag = $find_fields{lc($tag)};
392 #print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
393 # if ($self->{'verbosity'} > 2);
394 if (defined $separator) {
395 my @values = split($separator, $value);
396 foreach my $v (@values) {
397 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $v) if $v =~ /\S/;
398 }
399 }
400 else {
401 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
402 }
403 }
404 }
405 }
406}
407
4081;
Note: See TracBrowser for help on using the repository browser.