source: main/trunk/greenstone2/perllib/plugins/StructuredHTMLPlugin.pm@ 32589

Last change on this file since 32589 was 32332, checked in by kjdon, 6 years ago

removed replace_images function. this inherits from HTMLPlugin, and replace images is called from that. don't know why it had its own version of this in the first place as it didn't look customised at all.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.5 KB
RevLine 
[10271]1###########################################################################
2#
[15872]3# StructuredHTMLPlugin.pm -- html plugin with extra facilities for teasing out
[10404]4# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
5# HTML document
[10271]6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
[10404]28# This plugin is to process an HTML file where sections are divided by
29# user-defined headings tags. As it is difficult to predict what user's definition
30# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
31# as well as allows to get rid of user-defined Table of Content (TOC)...
32# format:e.g. level1 (Abstract_title|ChapterTitle|Referencing Heading) level2(SectionHeading)...
33
[15872]34package StructuredHTMLPlugin;
[10271]35
[15872]36use HTMLPlugin;
37use ImageConverter; # want the identify method
[24600]38use util;
[10271]39
[15872]40use strict; # every perl program should have this!
41no strict 'refs'; # make an exception so we can use variables as filehandles
[10404]42
[10271]43sub BEGIN {
[15872]44 @StructuredHTMLPlugin::ISA = ('HTMLPlugin');
[10271]45}
46
[11849]47my $arguments =
48 [
49 { 'name' => "level1_header",
[15872]50 'desc' => "{StructuredHTMLPlugin.level1_header}",
[11849]51 'type' => "regexp",
52 'reqd' => "no",
53 'deft' => "" },
54 { 'name' => "level2_header",
[15872]55 'desc' => "{StructuredHTMLPlugin.level2_header}",
[11849]56 'type' => "regexp",
57 'reqd' => "no",
58 'deft' => "" },
59 { 'name' => "level3_header",
[15872]60 'desc' => "{StructuredHTMLPlugin.level3_header}",
[11849]61 'type' => "regexp",
62 'reqd' => "no",
63 'deft' => "" },
64 { 'name' => "title_header",
[15872]65 'desc' => "{StructuredHTMLPlugin.title_header}",
[11849]66 'type' => "regexp",
67 'reqd' => "no",
68 'deft' => "" },
[11884]69 { 'name' => "delete_toc",
[15872]70 'desc' => "{StructuredHTMLPlugin.delete_toc}",
[11884]71 'type' => "flag",
72 'reqd' => "no"},
[11849]73 { 'name' => "toc_header",
[15872]74 'desc' => "{StructuredHTMLPlugin.toc_header}",
[11849]75 'type' => "regexp",
76 'reqd' => "no",
[11884]77 'deft' => "" }
[11849]78 ];
[10271]79
[15872]80my $options = { 'name' => "StructuredHTMLPlugin",
81 'desc' => "{StructuredHTMLPlugin.desc}",
[10271]82 'abstract' => "no",
83 'inherits' => "yes",
84 'args' => $arguments };
85
86sub new {
87 my ($class) = shift (@_);
88 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
89 push(@$pluginlist, $class);
90
[15872]91 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92 push(@{$hashArgOptLists->{"OptList"}},$options);
[10271]93
[15872]94 my $self = new HTMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
[10271]95
96 return bless $self, $class;
97}
98
99
100sub process {
101 my $self = shift (@_);
[10404]102 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[10271]103 my $outhandle = $self->{'outhandle'};
104
105 my @head_and_body = split(/<body/i,$$textref);
106 my $head = shift(@head_and_body);
107 my $body_text = join("<body", @head_and_body);
[10600]108 $head =~ m/<title>(.+)<\/title>/i;
109 my $doctitle = $1 if defined $1;
[10723]110 if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
[10426]111 my @doc_properties = split(/<xml>/i,$head);
112 my $doc_heading = shift(@doc_properties);
113 my $rest_doc_properties = join(" ", @doc_properties);
[11380]114
[10426]115 my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
116 my $extracted_metadata = shift (@extracted_metadata);
117 $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
118 }
119
[11893]120 # set the title here if we haven't found it yet
121 if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
122 if (defined $doctitle && $doctitle =~ /\S/) {
123 $doc_obj->add_metadata($doc_obj->get_top_section(), "Title", $doctitle);
124 } else {
125 $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
126 }
127 }
128
129 # If delete_toc is enabled, it means to get rid of toc and tof contents.
[10271]130 # get rid of TOC and TOF sections and their title
[11380]131 if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
[10496]132 if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
133 $body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
134 }
[10443]135 }
[10600]136
[10496]137 if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
[10271]138 $self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
[10600]139 $body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
[10271]140 }
[10443]141
142 if (defined $self->{'level1_header'} && $self->{'level1_header'}=~ /\S/ ){
143 $self->{'level1_header'} =~ s/^\((.*)\)/$1/i;
[10595]144 $body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
[10271]145 }
[10443]146
[10496]147 if (defined $self->{'level2_header'} && $self->{'level2_header'}=~ /\S/){
[10443]148 $self->{'level2_header'} =~ s/^\((.*)\)/$1/i;
[10595]149 $body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
[10271]150 }
151
[10496]152 if (defined $self->{'level3_header'} && $self->{'level3_header'}=~ /\S/ ){
[10443]153 $self->{'level3_header'} =~ s/^\((.*)\)/$1/is;
[10271]154 $body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
155 }
[11893]156
[10271]157 # Tidy up extra new lines
158 $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
159 $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
[10426]160
[15872]161 # what was the following line for. effectively unused. do we need it??
162 #$section_text .= "<!--\n<Section>\n-->\n";
[10600]163 #my $top_section_tag = "<!--\n<Section>\n-->\n";
164 #$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
165 #$body_text =~ s/(<div.*)/$top_section_tag$1/i;
[10271]166 my $body = "<body".$body_text;
167
168 my $section_text = $head;
[10426]169
[10271]170 # split HTML text on <h1>, <h2> etc tags
171 my @h_split = split(/<h/i,$body);
[10426]172
[10271]173 my $hnum = 0;
174
175 my $sectionh1 = 0;
176 $section_text .= shift(@h_split);
[10426]177
[10271]178 my $hc;
179 foreach $hc ( @h_split )
180 {
181 if ($hc =~ m/^([1-3])\s*.*?>(.*)$/s)
182 {
183 my $new_hnum = $1;
184 my $hc_after = $2;
185
186 if ($hc_after =~ m/^(.*?)<\/h$new_hnum>/is)
187 {
188 my $h_text = $1;
189 $hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
190 # boil HTML down to some interesting text
191 $h_text =~ s/^[1-3]>//;
192 $h_text =~ s/<\/?.*?>//sg;
193 $h_text =~ s/\s+/ /sg;
194 $h_text =~ s/^\s$//s;
195 $h_text =~ s/(&nbsp;)+\W*/&nbsp;/sg;
[10426]196
[10271]197 if ($h_text =~ m/\w+/)
198 {
199 if ($new_hnum > $hnum)
200 {
201 # increase section nesting
202 $hnum++;
203 while ($hnum < $new_hnum)
204 {
205 my $spacing = " " x $hnum;
206 $section_text .= "<!--\n";
207 $section_text .= $spacing."<Section>\n";
208 $section_text .= "-->\n";
209 $hnum++;
210 }
211 }
212 else # ($new_hnum <= $hnum)
213 {
214 # descrease section nesting
215 while ($hnum >= $new_hnum)
216 {
217 my $spacing = " " x $hnum;
218 $section_text .= "<!--\n";
219 $section_text .= $spacing."</Section>\n";
220 $section_text .= "-->\n";
221 $hnum--;
222 }
223 $hnum++;
224 }
225
[10600]226 my $spacing = " " x $hnum;
[10271]227 $section_text .= "<!--\n";
228 $section_text .= $spacing."<Section>\n";
229 $section_text .= $spacing." <Description>\n";
230 $section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
231 $section_text .= $spacing." </Description>\n";
232 $section_text .= "-->\n";
233
[11893]234 #print $outhandle $spacing."$h_text\n"
235 # if $self->{'verbosity'} > 2;
[10426]236
[10271]237 $sectionh1++ if ($hnum==1);
238 }
239 }
240 else {
241### print STDERR "***** hc = <h$hc\n\n";
242 }
243 $section_text .= "<h$hc";
244 }
245 else
246 {
247 $section_text .= "<h$hc";
248 }
249 }
250
251 while ($hnum >= 1)
252 {
253 my $spacing = " " x $hnum;
254 $section_text .= "<!--\n";
255 $section_text .= $spacing."</Section>\n";
256 $section_text .= "-->\n";
257 $hnum--;
258 }
259
260 $section_text .= "<!--\n</Section>\n-->\n";
261
262 $$textref = $section_text;
[10426]263
[11893]264# if ($sectionh1>0)
265# {
266# print $outhandle " Located section headings ..."
267# if $self->{'verbosity'} > 1;
268# }
[10426]269
[10271]270 $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
[10426]271
[10271]272 $$textref =~ s/(&nbsp;)+/&nbsp;/sg;
273
[10426]274 ## $$textref =~ s/<o:p>&nbsp;<\/o:p>//g; # used with VML to space figures?
275
[10271]276 $self->SUPER::process(@_);
[10426]277
[10271]278}
279
280
281sub resize_if_necessary
282{
283 my ($self,$front,$back,$base_dir,$href) = @_;
[10426]284
[10271]285 # dig out width and height of image, if there
286 my $img_attributes = "$front back";
287 my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
288 my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
[10426]289
[10271]290 # derive local filename for image based on its URL
291 my $img_filename = $href;
292 $img_filename =~ s/^[^:]*:\/\///;
293 $img_filename = &util::filename_cat($base_dir, $img_filename);
[10426]294
[10271]295 # Replace %20's in URL with a space if required. Note that the filename
296 # may include the %20 in some situations
297 if ($img_filename =~ /\%20/) {
298 if (!-e $img_filename) {
299 $img_filename =~ s/\%20/ /g;
300 }
301 }
302 if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
303 # get image info on width and height
[10426]304
[10271]305 my $outhandle = $self->{'outhandle'};
306 my $verbosity = $self->{'verbosity'};
307
308 my ($image_type, $actual_width, $actual_height, $image_size)
[15872]309 = &ImageConverter::identify($img_filename, $outhandle, $verbosity);
[10426]310
[10356]311 #print STDERR "**** $actual_width x $actual_height";
312 #print STDERR " (requested: $img_width x $img_height)\n";
[10271]313
314 if (($img_width < $actual_width) || ($img_height < $actual_height)) {
[11893]315 #print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
[10426]316
[10271]317 # derive new image name based on current image
318 my ($tailname, $dirname, $suffix)
[15872]319 = &File::Basename::fileparse($img_filename, "\\.[^\\.]+\$");
[10426]320
[10271]321 my $resized_filename
322 = &util::filename_cat($dirname, $tailname."_resized".$suffix);
[10426]323
[10356]324 #print STDERR "**** suffix = $suffix\n";
[10426]325
[10271]326 # Generate smaller image with convert
[15872]327 my $newsize = "$img_width"."x$img_height";
[10271]328 my $command = "convert -interlace plane -verbose "
[15872]329 ."-geometry $newsize \"$img_filename\" \"$resized_filename\"";
[24600]330 $command = "\"".&util::get_perl_exec()."\" -S gs-magick.pl $command";
[11893]331 #print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
332 #my $result = '';
333 #print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
[10271]334 }
335 }
336 return $href;
337}
338
339
[10426]340sub extract_metadata
341{
342 my $self = shift (@_);
343 my ($textref, $metadata, $doc_obj) = @_;
344 my $outhandle = $self->{'outhandle'};
345
[11380]346 return if (!defined $textref);
347
[21801]348 my $separator = $self->{'metadata_field_separator'};
349 if ($separator eq "") {
350 undef $separator;
351 }
[10426]352 # metadata fields to extract/save. 'key' is the (lowercase) name of the
353 # html meta, 'value' is the metadata name for greenstone to use
354 my %find_fields = ();
355 my ($tag,$value);
[10271]356
[10426]357 my $orig_field = "";
[19993]358 foreach my $field (split /\s*,\s*/, $self->{'metadata_fields'}) {
[10426]359 # support tag<tagname>
[19993]360 if ($field =~ /^(.*?)\s*<(.*?)>$/) {
[10426]361 # "$2" is the user's preferred gs metadata name
362 $find_fields{lc($1)}=$2; # lc = lowercase
363 $orig_field = $1;
364 } else { # no <tagname> for mapping
365 # "$field" is the user's preferred gs metadata name
366 $find_fields{lc($field)}=$field; # lc = lowercase
367 $orig_field = $field;
368 }
[11380]369
[10426]370 if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
371 $tag = $orig_field;
372 $value = $1;
373 if (!defined $value || !defined $tag){
[15872]374 #print $outhandle "StructuredHTMLPlugin: can't find VALUE in \"$tag\"\n";
[10426]375 next;
376 } else {
377 # clean up and add
378 chomp($value); # remove trailing \n, if any
379 $tag = $find_fields{lc($tag)};
[11893]380 #print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
381 # if ($self->{'verbosity'} > 2);
[21801]382 if (defined $separator) {
383 my @values = split($separator, $value);
384 foreach my $v (@values) {
385 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $v) if $v =~ /\S/;
386 }
387 }
388 else {
389 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
390 }
[10426]391 }
392 }
393 }
394}
395
[10271]3961;
Note: See TracBrowser for help on using the repository browser.