source: gsdl/trunk/perllib/plugins/StructuredHTMLPlugin.pm@ 17033

Last change on this file since 17033 was 16104, checked in by kjdon, 16 years ago

tried to make the 'xxxplugin processing file' print statements more consistent. They are now done in read (or read_into_doc_obj) and not process

  • Property svn:keywords set to Author Date Id Revision
File size: 12.7 KB
RevLine 
[10271]1###########################################################################
2#
[15872]3# StructuredHTMLPlugin.pm -- html plugin with extra facilities for teasing out
[10404]4# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
5# HTML document
[10271]6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
[10404]28# This plugin is to process an HTML file where sections are divided by
29# user-defined headings tags. As it is difficult to predict what user's definition
30# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
31# as well as allows to get rid of user-defined Table of Content (TOC)...
32# format:e.g. level1 (Abstract_title|ChapterTitle|Referencing Heading) level2(SectionHeading)...
33
[15872]34package StructuredHTMLPlugin;
[10271]35
[15872]36use HTMLPlugin;
37use ImageConverter; # want the identify method
[10271]38
[15872]39use strict; # every perl program should have this!
40no strict 'refs'; # make an exception so we can use variables as filehandles
[10404]41
[10271]42sub BEGIN {
[15872]43 @StructuredHTMLPlugin::ISA = ('HTMLPlugin');
[10271]44}
45
[11849]46my $arguments =
47 [
48 { 'name' => "level1_header",
[15872]49 'desc' => "{StructuredHTMLPlugin.level1_header}",
[11849]50 'type' => "regexp",
51 'reqd' => "no",
52 'deft' => "" },
53 { 'name' => "level2_header",
[15872]54 'desc' => "{StructuredHTMLPlugin.level2_header}",
[11849]55 'type' => "regexp",
56 'reqd' => "no",
57 'deft' => "" },
58 { 'name' => "level3_header",
[15872]59 'desc' => "{StructuredHTMLPlugin.level3_header}",
[11849]60 'type' => "regexp",
61 'reqd' => "no",
62 'deft' => "" },
63 { 'name' => "title_header",
[15872]64 'desc' => "{StructuredHTMLPlugin.title_header}",
[11849]65 'type' => "regexp",
66 'reqd' => "no",
67 'deft' => "" },
[11884]68 { 'name' => "delete_toc",
[15872]69 'desc' => "{StructuredHTMLPlugin.delete_toc}",
[11884]70 'type' => "flag",
71 'reqd' => "no"},
[11849]72 { 'name' => "toc_header",
[15872]73 'desc' => "{StructuredHTMLPlugin.toc_header}",
[11849]74 'type' => "regexp",
75 'reqd' => "no",
[11884]76 'deft' => "" }
[11849]77 ];
[10271]78
[15872]79my $options = { 'name' => "StructuredHTMLPlugin",
80 'desc' => "{StructuredHTMLPlugin.desc}",
[10271]81 'abstract' => "no",
82 'inherits' => "yes",
83 'args' => $arguments };
84
85sub new {
86 my ($class) = shift (@_);
87 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
88 push(@$pluginlist, $class);
89
[15872]90 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
91 push(@{$hashArgOptLists->{"OptList"}},$options);
[10271]92
[15872]93 my $self = new HTMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
[10271]94
95 return bless $self, $class;
96}
97
98
99sub process {
100 my $self = shift (@_);
[10404]101 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[10271]102 my $outhandle = $self->{'outhandle'};
103
104 my @head_and_body = split(/<body/i,$$textref);
105 my $head = shift(@head_and_body);
106 my $body_text = join("<body", @head_and_body);
[10600]107 $head =~ m/<title>(.+)<\/title>/i;
108 my $doctitle = $1 if defined $1;
[10723]109 if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
[10426]110 my @doc_properties = split(/<xml>/i,$head);
111 my $doc_heading = shift(@doc_properties);
112 my $rest_doc_properties = join(" ", @doc_properties);
[11380]113
[10426]114 my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
115 my $extracted_metadata = shift (@extracted_metadata);
116 $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
117 }
118
[11893]119 # set the title here if we haven't found it yet
120 if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
121 if (defined $doctitle && $doctitle =~ /\S/) {
122 $doc_obj->add_metadata($doc_obj->get_top_section(), "Title", $doctitle);
123 } else {
124 $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
125 }
126 }
127
128 # If delete_toc is enabled, it means to get rid of toc and tof contents.
[10271]129 # get rid of TOC and TOF sections and their title
[11380]130 if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
[10496]131 if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
132 $body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
133 }
[10443]134 }
[10600]135
[10496]136 if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
[10271]137 $self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
[10600]138 $body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
[10271]139 }
[10443]140
141 if (defined $self->{'level1_header'} && $self->{'level1_header'}=~ /\S/ ){
142 $self->{'level1_header'} =~ s/^\((.*)\)/$1/i;
[10595]143 $body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
[10271]144 }
[10443]145
[10496]146 if (defined $self->{'level2_header'} && $self->{'level2_header'}=~ /\S/){
[10443]147 $self->{'level2_header'} =~ s/^\((.*)\)/$1/i;
[10595]148 $body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
[10271]149 }
150
[10496]151 if (defined $self->{'level3_header'} && $self->{'level3_header'}=~ /\S/ ){
[10443]152 $self->{'level3_header'} =~ s/^\((.*)\)/$1/is;
[10271]153 $body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
154 }
[11893]155
[10271]156 # Tidy up extra new lines
157 $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
158 $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
[10426]159
[15872]160 # what was the following line for. effectively unused. do we need it??
161 #$section_text .= "<!--\n<Section>\n-->\n";
[10600]162 #my $top_section_tag = "<!--\n<Section>\n-->\n";
163 #$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
164 #$body_text =~ s/(<div.*)/$top_section_tag$1/i;
[10271]165 my $body = "<body".$body_text;
166
167 my $section_text = $head;
[10426]168
[10271]169 # split HTML text on <h1>, <h2> etc tags
170 my @h_split = split(/<h/i,$body);
[10426]171
[10271]172 my $hnum = 0;
173
174 my $sectionh1 = 0;
175 $section_text .= shift(@h_split);
[10426]176
[10271]177 my $hc;
178 foreach $hc ( @h_split )
179 {
180 if ($hc =~ m/^([1-3])\s*.*?>(.*)$/s)
181 {
182 my $new_hnum = $1;
183 my $hc_after = $2;
184
185 if ($hc_after =~ m/^(.*?)<\/h$new_hnum>/is)
186 {
187 my $h_text = $1;
188 $hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
189 # boil HTML down to some interesting text
190 $h_text =~ s/^[1-3]>//;
191 $h_text =~ s/<\/?.*?>//sg;
192 $h_text =~ s/\s+/ /sg;
193 $h_text =~ s/^\s$//s;
194 $h_text =~ s/(&nbsp;)+\W*/&nbsp;/sg;
[10426]195
[10271]196 if ($h_text =~ m/\w+/)
197 {
198 if ($new_hnum > $hnum)
199 {
200 # increase section nesting
201 $hnum++;
202 while ($hnum < $new_hnum)
203 {
204 my $spacing = " " x $hnum;
205 $section_text .= "<!--\n";
206 $section_text .= $spacing."<Section>\n";
207 $section_text .= "-->\n";
208 $hnum++;
209 }
210 }
211 else # ($new_hnum <= $hnum)
212 {
213 # descrease section nesting
214 while ($hnum >= $new_hnum)
215 {
216 my $spacing = " " x $hnum;
217 $section_text .= "<!--\n";
218 $section_text .= $spacing."</Section>\n";
219 $section_text .= "-->\n";
220 $hnum--;
221 }
222 $hnum++;
223 }
224
[10600]225 my $spacing = " " x $hnum;
[10271]226 $section_text .= "<!--\n";
227 $section_text .= $spacing."<Section>\n";
228 $section_text .= $spacing." <Description>\n";
229 $section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
230 $section_text .= $spacing." </Description>\n";
231 $section_text .= "-->\n";
232
[11893]233 #print $outhandle $spacing."$h_text\n"
234 # if $self->{'verbosity'} > 2;
[10426]235
[10271]236 $sectionh1++ if ($hnum==1);
237 }
238 }
239 else {
240### print STDERR "***** hc = <h$hc\n\n";
241 }
242 $section_text .= "<h$hc";
243 }
244 else
245 {
246 $section_text .= "<h$hc";
247 }
248 }
249
250 while ($hnum >= 1)
251 {
252 my $spacing = " " x $hnum;
253 $section_text .= "<!--\n";
254 $section_text .= $spacing."</Section>\n";
255 $section_text .= "-->\n";
256 $hnum--;
257 }
258
259 $section_text .= "<!--\n</Section>\n-->\n";
260
261 $$textref = $section_text;
[10426]262
[11893]263# if ($sectionh1>0)
264# {
265# print $outhandle " Located section headings ..."
266# if $self->{'verbosity'} > 1;
267# }
[10426]268
[10271]269 $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
[10426]270
[10271]271 $$textref =~ s/(&nbsp;)+/&nbsp;/sg;
272
[10426]273 ## $$textref =~ s/<o:p>&nbsp;<\/o:p>//g; # used with VML to space figures?
274
[10271]275 $self->SUPER::process(@_);
[10426]276
[10271]277}
278
279
280sub resize_if_necessary
281{
282 my ($self,$front,$back,$base_dir,$href) = @_;
[10426]283
[10271]284 # dig out width and height of image, if there
285 my $img_attributes = "$front back";
286 my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
287 my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
[10426]288
[10271]289 # derive local filename for image based on its URL
290 my $img_filename = $href;
291 $img_filename =~ s/^[^:]*:\/\///;
292 $img_filename = &util::filename_cat($base_dir, $img_filename);
[10426]293
[10271]294 # Replace %20's in URL with a space if required. Note that the filename
295 # may include the %20 in some situations
296 if ($img_filename =~ /\%20/) {
297 if (!-e $img_filename) {
298 $img_filename =~ s/\%20/ /g;
299 }
300 }
301 if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
302 # get image info on width and height
[10426]303
[10271]304 my $outhandle = $self->{'outhandle'};
305 my $verbosity = $self->{'verbosity'};
306
307 my ($image_type, $actual_width, $actual_height, $image_size)
[15872]308 = &ImageConverter::identify($img_filename, $outhandle, $verbosity);
[10426]309
[10356]310 #print STDERR "**** $actual_width x $actual_height";
311 #print STDERR " (requested: $img_width x $img_height)\n";
[10271]312
313 if (($img_width < $actual_width) || ($img_height < $actual_height)) {
[11893]314 #print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
[10426]315
[10271]316 # derive new image name based on current image
317 my ($tailname, $dirname, $suffix)
[15872]318 = &File::Basename::fileparse($img_filename, "\\.[^\\.]+\$");
[10426]319
[10271]320 my $resized_filename
321 = &util::filename_cat($dirname, $tailname."_resized".$suffix);
[10426]322
[10356]323 #print STDERR "**** suffix = $suffix\n";
[10426]324
[10271]325 # Generate smaller image with convert
[15872]326 my $newsize = "$img_width"."x$img_height";
[10271]327 my $command = "convert -interlace plane -verbose "
[15872]328 ."-geometry $newsize \"$img_filename\" \"$resized_filename\"";
[11893]329 #print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
330 #my $result = '';
331 #print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
[10271]332 }
333 }
334 return $href;
335}
336
337sub replace_images {
338 my $self = shift (@_);
339 my ($front, $link, $back, $base_dir,
340 $file, $doc_obj, $section) = @_;
341 # remove quotes from link at start and end if necessary
342 if ($link=~/^\"/) {
343 $link=~s/^\"//;$link=~s/\"$//;
344 $front.='"';
345 $back="\"$back";
346 }
[10426]347
[10271]348 $link =~ s/\n/ /g;
[10426]349
[10271]350 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
[10426]351
[10271]352## $href = $self->resize_if_necessary($front,$back,$base_dir,$href);
[10426]353
[10271]354 my $middle = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
[10426]355
[10271]356 return $front . $middle . $back;
357}
358
[10426]359sub extract_metadata
360{
361 my $self = shift (@_);
362 my ($textref, $metadata, $doc_obj) = @_;
363 my $outhandle = $self->{'outhandle'};
364
[11380]365 return if (!defined $textref);
366
[10426]367 # metadata fields to extract/save. 'key' is the (lowercase) name of the
368 # html meta, 'value' is the metadata name for greenstone to use
369 my %find_fields = ();
370 my ($tag,$value);
[10271]371
[10426]372 my $orig_field = "";
[10723]373 foreach my $field (split /,/, $self->{'metadata_fields'}) {
[10426]374 # support tag<tagname>
375 if ($field =~ /^(.*?)<(.*?)>$/) {
376 # "$2" is the user's preferred gs metadata name
377 $find_fields{lc($1)}=$2; # lc = lowercase
378 $orig_field = $1;
379 } else { # no <tagname> for mapping
380 # "$field" is the user's preferred gs metadata name
381 $find_fields{lc($field)}=$field; # lc = lowercase
382 $orig_field = $field;
383 }
[11380]384
[10426]385 if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
386 $tag = $orig_field;
387 $value = $1;
388 if (!defined $value || !defined $tag){
[15872]389 #print $outhandle "StructuredHTMLPlugin: can't find VALUE in \"$tag\"\n";
[10426]390 next;
391 } else {
392 # clean up and add
393 chomp($value); # remove trailing \n, if any
394 $tag = $find_fields{lc($tag)};
[11893]395 #print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
396 # if ($self->{'verbosity'} > 2);
[10426]397 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
398 }
399 }
400 }
401}
402
[10271]4031;
Note: See TracBrowser for help on using the repository browser.