source: main/trunk/greenstone2/perllib/plugins/StructuredHTMLPlugin.pm@ 32341

Last change on this file since 32341 was 32332, checked in by kjdon, 6 years ago

removed replace_images function. this inherits from HTMLPlugin, and replace images is called from that. don't know why it had its own version of this in the first place as it didn't look customised at all.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.5 KB
Line 
1###########################################################################
2#
3# StructuredHTMLPlugin.pm -- html plugin with extra facilities for teasing out
4# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
5# HTML document
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28# This plugin is to process an HTML file where sections are divided by
29# user-defined headings tags. As it is difficult to predict what user's definition
30# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
31# as well as allows to get rid of user-defined Table of Content (TOC)...
32# format:e.g. level1 (Abstract_title|ChapterTitle|Referencing Heading) level2(SectionHeading)...
33
34package StructuredHTMLPlugin;
35
36use HTMLPlugin;
37use ImageConverter; # want the identify method
38use util;
39
40use strict; # every perl program should have this!
41no strict 'refs'; # make an exception so we can use variables as filehandles
42
43sub BEGIN {
44 @StructuredHTMLPlugin::ISA = ('HTMLPlugin');
45}
46
47my $arguments =
48 [
49 { 'name' => "level1_header",
50 'desc' => "{StructuredHTMLPlugin.level1_header}",
51 'type' => "regexp",
52 'reqd' => "no",
53 'deft' => "" },
54 { 'name' => "level2_header",
55 'desc' => "{StructuredHTMLPlugin.level2_header}",
56 'type' => "regexp",
57 'reqd' => "no",
58 'deft' => "" },
59 { 'name' => "level3_header",
60 'desc' => "{StructuredHTMLPlugin.level3_header}",
61 'type' => "regexp",
62 'reqd' => "no",
63 'deft' => "" },
64 { 'name' => "title_header",
65 'desc' => "{StructuredHTMLPlugin.title_header}",
66 'type' => "regexp",
67 'reqd' => "no",
68 'deft' => "" },
69 { 'name' => "delete_toc",
70 'desc' => "{StructuredHTMLPlugin.delete_toc}",
71 'type' => "flag",
72 'reqd' => "no"},
73 { 'name' => "toc_header",
74 'desc' => "{StructuredHTMLPlugin.toc_header}",
75 'type' => "regexp",
76 'reqd' => "no",
77 'deft' => "" }
78 ];
79
80my $options = { 'name' => "StructuredHTMLPlugin",
81 'desc' => "{StructuredHTMLPlugin.desc}",
82 'abstract' => "no",
83 'inherits' => "yes",
84 'args' => $arguments };
85
86sub new {
87 my ($class) = shift (@_);
88 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
89 push(@$pluginlist, $class);
90
91 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92 push(@{$hashArgOptLists->{"OptList"}},$options);
93
94 my $self = new HTMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
95
96 return bless $self, $class;
97}
98
99
100sub process {
101 my $self = shift (@_);
102 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
103 my $outhandle = $self->{'outhandle'};
104
105 my @head_and_body = split(/<body/i,$$textref);
106 my $head = shift(@head_and_body);
107 my $body_text = join("<body", @head_and_body);
108 $head =~ m/<title>(.+)<\/title>/i;
109 my $doctitle = $1 if defined $1;
110 if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
111 my @doc_properties = split(/<xml>/i,$head);
112 my $doc_heading = shift(@doc_properties);
113 my $rest_doc_properties = join(" ", @doc_properties);
114
115 my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
116 my $extracted_metadata = shift (@extracted_metadata);
117 $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
118 }
119
120 # set the title here if we haven't found it yet
121 if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
122 if (defined $doctitle && $doctitle =~ /\S/) {
123 $doc_obj->add_metadata($doc_obj->get_top_section(), "Title", $doctitle);
124 } else {
125 $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
126 }
127 }
128
129 # If delete_toc is enabled, it means to get rid of toc and tof contents.
130 # get rid of TOC and TOF sections and their title
131 if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
132 if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
133 $body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
134 }
135 }
136
137 if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
138 $self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
139 $body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
140 }
141
142 if (defined $self->{'level1_header'} && $self->{'level1_header'}=~ /\S/ ){
143 $self->{'level1_header'} =~ s/^\((.*)\)/$1/i;
144 $body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
145 }
146
147 if (defined $self->{'level2_header'} && $self->{'level2_header'}=~ /\S/){
148 $self->{'level2_header'} =~ s/^\((.*)\)/$1/i;
149 $body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
150 }
151
152 if (defined $self->{'level3_header'} && $self->{'level3_header'}=~ /\S/ ){
153 $self->{'level3_header'} =~ s/^\((.*)\)/$1/is;
154 $body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
155 }
156
157 # Tidy up extra new lines
158 $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
159 $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
160
161 # what was the following line for. effectively unused. do we need it??
162 #$section_text .= "<!--\n<Section>\n-->\n";
163 #my $top_section_tag = "<!--\n<Section>\n-->\n";
164 #$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
165 #$body_text =~ s/(<div.*)/$top_section_tag$1/i;
166 my $body = "<body".$body_text;
167
168 my $section_text = $head;
169
170 # split HTML text on <h1>, <h2> etc tags
171 my @h_split = split(/<h/i,$body);
172
173 my $hnum = 0;
174
175 my $sectionh1 = 0;
176 $section_text .= shift(@h_split);
177
178 my $hc;
179 foreach $hc ( @h_split )
180 {
181 if ($hc =~ m/^([1-3])\s*.*?>(.*)$/s)
182 {
183 my $new_hnum = $1;
184 my $hc_after = $2;
185
186 if ($hc_after =~ m/^(.*?)<\/h$new_hnum>/is)
187 {
188 my $h_text = $1;
189 $hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
190 # boil HTML down to some interesting text
191 $h_text =~ s/^[1-3]>//;
192 $h_text =~ s/<\/?.*?>//sg;
193 $h_text =~ s/\s+/ /sg;
194 $h_text =~ s/^\s$//s;
195 $h_text =~ s/(&nbsp;)+\W*/&nbsp;/sg;
196
197 if ($h_text =~ m/\w+/)
198 {
199 if ($new_hnum > $hnum)
200 {
201 # increase section nesting
202 $hnum++;
203 while ($hnum < $new_hnum)
204 {
205 my $spacing = " " x $hnum;
206 $section_text .= "<!--\n";
207 $section_text .= $spacing."<Section>\n";
208 $section_text .= "-->\n";
209 $hnum++;
210 }
211 }
212 else # ($new_hnum <= $hnum)
213 {
214 # descrease section nesting
215 while ($hnum >= $new_hnum)
216 {
217 my $spacing = " " x $hnum;
218 $section_text .= "<!--\n";
219 $section_text .= $spacing."</Section>\n";
220 $section_text .= "-->\n";
221 $hnum--;
222 }
223 $hnum++;
224 }
225
226 my $spacing = " " x $hnum;
227 $section_text .= "<!--\n";
228 $section_text .= $spacing."<Section>\n";
229 $section_text .= $spacing." <Description>\n";
230 $section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
231 $section_text .= $spacing." </Description>\n";
232 $section_text .= "-->\n";
233
234 #print $outhandle $spacing."$h_text\n"
235 # if $self->{'verbosity'} > 2;
236
237 $sectionh1++ if ($hnum==1);
238 }
239 }
240 else {
241### print STDERR "***** hc = <h$hc\n\n";
242 }
243 $section_text .= "<h$hc";
244 }
245 else
246 {
247 $section_text .= "<h$hc";
248 }
249 }
250
251 while ($hnum >= 1)
252 {
253 my $spacing = " " x $hnum;
254 $section_text .= "<!--\n";
255 $section_text .= $spacing."</Section>\n";
256 $section_text .= "-->\n";
257 $hnum--;
258 }
259
260 $section_text .= "<!--\n</Section>\n-->\n";
261
262 $$textref = $section_text;
263
264# if ($sectionh1>0)
265# {
266# print $outhandle " Located section headings ..."
267# if $self->{'verbosity'} > 1;
268# }
269
270 $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
271
272 $$textref =~ s/(&nbsp;)+/&nbsp;/sg;
273
274 ## $$textref =~ s/<o:p>&nbsp;<\/o:p>//g; # used with VML to space figures?
275
276 $self->SUPER::process(@_);
277
278}
279
280
281sub resize_if_necessary
282{
283 my ($self,$front,$back,$base_dir,$href) = @_;
284
285 # dig out width and height of image, if there
286 my $img_attributes = "$front back";
287 my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
288 my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
289
290 # derive local filename for image based on its URL
291 my $img_filename = $href;
292 $img_filename =~ s/^[^:]*:\/\///;
293 $img_filename = &util::filename_cat($base_dir, $img_filename);
294
295 # Replace %20's in URL with a space if required. Note that the filename
296 # may include the %20 in some situations
297 if ($img_filename =~ /\%20/) {
298 if (!-e $img_filename) {
299 $img_filename =~ s/\%20/ /g;
300 }
301 }
302 if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
303 # get image info on width and height
304
305 my $outhandle = $self->{'outhandle'};
306 my $verbosity = $self->{'verbosity'};
307
308 my ($image_type, $actual_width, $actual_height, $image_size)
309 = &ImageConverter::identify($img_filename, $outhandle, $verbosity);
310
311 #print STDERR "**** $actual_width x $actual_height";
312 #print STDERR " (requested: $img_width x $img_height)\n";
313
314 if (($img_width < $actual_width) || ($img_height < $actual_height)) {
315 #print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
316
317 # derive new image name based on current image
318 my ($tailname, $dirname, $suffix)
319 = &File::Basename::fileparse($img_filename, "\\.[^\\.]+\$");
320
321 my $resized_filename
322 = &util::filename_cat($dirname, $tailname."_resized".$suffix);
323
324 #print STDERR "**** suffix = $suffix\n";
325
326 # Generate smaller image with convert
327 my $newsize = "$img_width"."x$img_height";
328 my $command = "convert -interlace plane -verbose "
329 ."-geometry $newsize \"$img_filename\" \"$resized_filename\"";
330 $command = "\"".&util::get_perl_exec()."\" -S gs-magick.pl $command";
331 #print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
332 #my $result = '';
333 #print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
334 }
335 }
336 return $href;
337}
338
339
340sub extract_metadata
341{
342 my $self = shift (@_);
343 my ($textref, $metadata, $doc_obj) = @_;
344 my $outhandle = $self->{'outhandle'};
345
346 return if (!defined $textref);
347
348 my $separator = $self->{'metadata_field_separator'};
349 if ($separator eq "") {
350 undef $separator;
351 }
352 # metadata fields to extract/save. 'key' is the (lowercase) name of the
353 # html meta, 'value' is the metadata name for greenstone to use
354 my %find_fields = ();
355 my ($tag,$value);
356
357 my $orig_field = "";
358 foreach my $field (split /\s*,\s*/, $self->{'metadata_fields'}) {
359 # support tag<tagname>
360 if ($field =~ /^(.*?)\s*<(.*?)>$/) {
361 # "$2" is the user's preferred gs metadata name
362 $find_fields{lc($1)}=$2; # lc = lowercase
363 $orig_field = $1;
364 } else { # no <tagname> for mapping
365 # "$field" is the user's preferred gs metadata name
366 $find_fields{lc($field)}=$field; # lc = lowercase
367 $orig_field = $field;
368 }
369
370 if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
371 $tag = $orig_field;
372 $value = $1;
373 if (!defined $value || !defined $tag){
374 #print $outhandle "StructuredHTMLPlugin: can't find VALUE in \"$tag\"\n";
375 next;
376 } else {
377 # clean up and add
378 chomp($value); # remove trailing \n, if any
379 $tag = $find_fields{lc($tag)};
380 #print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
381 # if ($self->{'verbosity'} > 2);
382 if (defined $separator) {
383 my @values = split($separator, $value);
384 foreach my $v (@values) {
385 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $v) if $v =~ /\S/;
386 }
387 }
388 else {
389 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
390 }
391 }
392 }
393 }
394}
395
3961;
Note: See TracBrowser for help on using the repository browser.