source: trunk/gsdl/perllib/plugins/StructuredHTMLPlug.pm@ 12169

Last change on this file since 12169 was 12169, checked in by mdewsnip, 18 years ago

Tidied up that horrible long line in the new() function of every plugin.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.7 KB
RevLine 
[10271]1###########################################################################
2#
3# StructuredHTMLPlug.pm -- html plugin with extra facilities for teasing out
[10404]4# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
5# HTML document
[10271]6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
[10404]28# This plugin is to process an HTML file where sections are divided by
29# user-defined headings tags. As it is difficult to predict what user's definition
30# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
31# as well as allows to get rid of user-defined Table of Content (TOC)...
32# format:e.g. level1 (Abstract_title|ChapterTitle|Referencing Heading) level2(SectionHeading)...
33
[10271]34package StructuredHTMLPlug;
35
36use HTMLPlug;
37use ImagePlug;
38
[10404]39#use strict; # every perl program should have this!
40#no strict 'refs'; # make an exception so we can use variables as filehandles
41
[10271]42sub BEGIN {
[10404]43 @StructuredHTMLPlug::ISA = ('HTMLPlug');
[10271]44}
45
[11849]46my $arguments =
47 [
48 { 'name' => "level1_header",
49 'desc' => "{StructuredHTMLPlug.level1_header}",
50 'type' => "regexp",
51 'reqd' => "no",
52 'deft' => "" },
53 { 'name' => "level2_header",
54 'desc' => "{StructuredHTMLPlug.level2_header}",
55 'type' => "regexp",
56 'reqd' => "no",
57 'deft' => "" },
58 { 'name' => "level3_header",
59 'desc' => "{StructuredHTMLPlug.level3_header}",
60 'type' => "regexp",
61 'reqd' => "no",
62 'deft' => "" },
63 { 'name' => "title_header",
64 'desc' => "{StructuredHTMLPlug.title_header}",
65 'type' => "regexp",
66 'reqd' => "no",
67 'deft' => "" },
[11884]68 { 'name' => "delete_toc",
69 'desc' => "{StructuredHTMLPlug.delete_toc}",
70 'type' => "flag",
71 'reqd' => "no"},
[11849]72 { 'name' => "toc_header",
73 'desc' => "{StructuredHTMLPlug.toc_header}",
74 'type' => "regexp",
75 'reqd' => "no",
[11884]76 'deft' => "" }
[11849]77 ];
[10271]78
79my $options = { 'name' => "StructuredHTMLPlug",
80 'desc' => "{StructuredHTMLPlug.desc}",
81 'abstract' => "no",
82 'inherits' => "yes",
83 'args' => $arguments };
84
85sub new {
86 my ($class) = shift (@_);
87 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
88 push(@$pluginlist, $class);
89
90 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
91 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
92
[12169]93 my $self = new HTMLPlug($pluginlist, $inputargs, $hashArgOptLists);
[10271]94
95 return bless $self, $class;
96}
97
98
99sub process {
100 my $self = shift (@_);
[10404]101 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[10271]102 my $outhandle = $self->{'outhandle'};
103
104 print $outhandle "StructuredHTMLPlug: processing $file\n"
105 if $self->{'verbosity'} > 1;
[10600]106
[10271]107 my @head_and_body = split(/<body/i,$$textref);
108 my $head = shift(@head_and_body);
109 my $body_text = join("<body", @head_and_body);
[10600]110 $head =~ m/<title>(.+)<\/title>/i;
111 my $doctitle = $1 if defined $1;
[10723]112 if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
[10426]113 my @doc_properties = split(/<xml>/i,$head);
114 my $doc_heading = shift(@doc_properties);
115 my $rest_doc_properties = join(" ", @doc_properties);
[11380]116
[10426]117 my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
118 my $extracted_metadata = shift (@extracted_metadata);
119 $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
120 }
121
[11893]122 # set the title here if we haven't found it yet
123 if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
124 if (defined $doctitle && $doctitle =~ /\S/) {
125 $doc_obj->add_metadata($doc_obj->get_top_section(), "Title", $doctitle);
126 } else {
127 $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
128 }
129 }
130
131 # If delete_toc is enabled, it means to get rid of toc and tof contents.
[10271]132 # get rid of TOC and TOF sections and their title
[11380]133 if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
[10496]134 if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
135 $body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
136 }
[10443]137 }
[10600]138
[10496]139 if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
[10271]140 $self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
[10600]141 $body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
[10271]142 }
[10443]143
144 if (defined $self->{'level1_header'} && $self->{'level1_header'}=~ /\S/ ){
145 $self->{'level1_header'} =~ s/^\((.*)\)/$1/i;
[10595]146 $body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
[10271]147 }
[10443]148
[10496]149 if (defined $self->{'level2_header'} && $self->{'level2_header'}=~ /\S/){
[10443]150 $self->{'level2_header'} =~ s/^\((.*)\)/$1/i;
[10595]151 $body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
[10271]152 }
153
[10496]154 if (defined $self->{'level3_header'} && $self->{'level3_header'}=~ /\S/ ){
[10443]155 $self->{'level3_header'} =~ s/^\((.*)\)/$1/is;
[10271]156 $body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
157 }
[11893]158
[10271]159 # Tidy up extra new lines
160 $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
161 $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
[10426]162
[10600]163 $section_text .= "<!--\n<Section>\n-->\n";
164 #my $top_section_tag = "<!--\n<Section>\n-->\n";
165 #$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
166 #$body_text =~ s/(<div.*)/$top_section_tag$1/i;
[10271]167 my $body = "<body".$body_text;
168
169 my $section_text = $head;
[10426]170
[10271]171 # split HTML text on <h1>, <h2> etc tags
172 my @h_split = split(/<h/i,$body);
[10426]173
[10271]174 my $hnum = 0;
175
176 my $sectionh1 = 0;
177 $section_text .= shift(@h_split);
[10426]178
[10271]179 my $hc;
180 foreach $hc ( @h_split )
181 {
182 if ($hc =~ m/^([1-3])\s*.*?>(.*)$/s)
183 {
184 my $new_hnum = $1;
185 my $hc_after = $2;
186
187 if ($hc_after =~ m/^(.*?)<\/h$new_hnum>/is)
188 {
189 my $h_text = $1;
190 $hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
191 # boil HTML down to some interesting text
192 $h_text =~ s/^[1-3]>//;
193 $h_text =~ s/<\/?.*?>//sg;
194 $h_text =~ s/\s+/ /sg;
195 $h_text =~ s/^\s$//s;
196 $h_text =~ s/(&nbsp;)+\W*/&nbsp;/sg;
[10426]197
[10271]198 if ($h_text =~ m/\w+/)
199 {
200 if ($new_hnum > $hnum)
201 {
202 # increase section nesting
203 $hnum++;
204 while ($hnum < $new_hnum)
205 {
206 my $spacing = " " x $hnum;
207 $section_text .= "<!--\n";
208 $section_text .= $spacing."<Section>\n";
209 $section_text .= "-->\n";
210 $hnum++;
211 }
212 }
213 else # ($new_hnum <= $hnum)
214 {
215 # descrease section nesting
216 while ($hnum >= $new_hnum)
217 {
218 my $spacing = " " x $hnum;
219 $section_text .= "<!--\n";
220 $section_text .= $spacing."</Section>\n";
221 $section_text .= "-->\n";
222 $hnum--;
223 }
224 $hnum++;
225 }
226
[10600]227 my $spacing = " " x $hnum;
[10271]228 $section_text .= "<!--\n";
229 $section_text .= $spacing."<Section>\n";
230 $section_text .= $spacing." <Description>\n";
231 $section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
232 $section_text .= $spacing." </Description>\n";
233 $section_text .= "-->\n";
234
[11893]235 #print $outhandle $spacing."$h_text\n"
236 # if $self->{'verbosity'} > 2;
[10426]237
[10271]238 $sectionh1++ if ($hnum==1);
239 }
240 }
241 else {
242### print STDERR "***** hc = <h$hc\n\n";
243 }
244 $section_text .= "<h$hc";
245 }
246 else
247 {
248 $section_text .= "<h$hc";
249 }
250 }
251
252 while ($hnum >= 1)
253 {
254 my $spacing = " " x $hnum;
255 $section_text .= "<!--\n";
256 $section_text .= $spacing."</Section>\n";
257 $section_text .= "-->\n";
258 $hnum--;
259 }
260
261 $section_text .= "<!--\n</Section>\n-->\n";
262
263 $$textref = $section_text;
[10426]264
[11893]265# if ($sectionh1>0)
266# {
267# print $outhandle " Located section headings ..."
268# if $self->{'verbosity'} > 1;
269# }
[10426]270
[10271]271 $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
[10426]272
[10271]273 $$textref =~ s/(&nbsp;)+/&nbsp;/sg;
274
[10426]275 ## $$textref =~ s/<o:p>&nbsp;<\/o:p>//g; # used with VML to space figures?
276
[10271]277 $self->SUPER::process(@_);
[10426]278
[10271]279}
280
281
282sub resize_if_necessary
283{
284 my ($self,$front,$back,$base_dir,$href) = @_;
[10426]285
[10271]286 # dig out width and height of image, if there
287 my $img_attributes = "$front back";
288 my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
289 my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
[10426]290
[10271]291 # derive local filename for image based on its URL
292 my $img_filename = $href;
293 $img_filename =~ s/^[^:]*:\/\///;
294 $img_filename = &util::filename_cat($base_dir, $img_filename);
[10426]295
[10271]296 # Replace %20's in URL with a space if required. Note that the filename
297 # may include the %20 in some situations
298 if ($img_filename =~ /\%20/) {
299 if (!-e $img_filename) {
300 $img_filename =~ s/\%20/ /g;
301 }
302 }
303 if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
304 # get image info on width and height
[10426]305
[10271]306 my $outhandle = $self->{'outhandle'};
307 my $verbosity = $self->{'verbosity'};
308
309 my ($image_type, $actual_width, $actual_height, $image_size)
310 = &ImagePlug::identify($img_filename, $outhandle, $verbosity);
[10426]311
[10356]312 #print STDERR "**** $actual_width x $actual_height";
313 #print STDERR " (requested: $img_width x $img_height)\n";
[10271]314
315 if (($img_width < $actual_width) || ($img_height < $actual_height)) {
[11893]316 #print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
[10426]317
[10271]318 # derive new image name based on current image
319 my ($tailname, $dirname, $suffix)
320 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
[10426]321
[10271]322 my $resized_filename
323 = &util::filename_cat($dirname, $tailname."_resized".$suffix);
[10426]324
[10356]325 #print STDERR "**** suffix = $suffix\n";
[10426]326
[10271]327 # Generate smaller image with convert
328 my $newsize = "$img_widthx$image_height";
329 my $command = "convert -interlace plane -verbose "
330 ."-geometry $newsize \"img_$filename\" \"$resized_filename\"";
[11893]331 #print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
332 #my $result = '';
333 #print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
[10271]334 }
335 }
336 return $href;
337}
338
339sub replace_images {
340 my $self = shift (@_);
341 my ($front, $link, $back, $base_dir,
342 $file, $doc_obj, $section) = @_;
343 # remove quotes from link at start and end if necessary
344 if ($link=~/^\"/) {
345 $link=~s/^\"//;$link=~s/\"$//;
346 $front.='"';
347 $back="\"$back";
348 }
[10426]349
[10271]350 $link =~ s/\n/ /g;
[10426]351
[10271]352 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
[10426]353
[10271]354## $href = $self->resize_if_necessary($front,$back,$base_dir,$href);
[10426]355
[10271]356 my $middle = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
[10426]357
[10271]358 return $front . $middle . $back;
359}
360
[10426]361sub extract_metadata
362{
363 my $self = shift (@_);
364 my ($textref, $metadata, $doc_obj) = @_;
365 my $outhandle = $self->{'outhandle'};
366
[11380]367 return if (!defined $textref);
368
[10426]369 # metadata fields to extract/save. 'key' is the (lowercase) name of the
370 # html meta, 'value' is the metadata name for greenstone to use
371 my %find_fields = ();
372 my ($tag,$value);
[10271]373
[10426]374 my $orig_field = "";
[10723]375 foreach my $field (split /,/, $self->{'metadata_fields'}) {
[10426]376 # support tag<tagname>
377 if ($field =~ /^(.*?)<(.*?)>$/) {
378 # "$2" is the user's preferred gs metadata name
379 $find_fields{lc($1)}=$2; # lc = lowercase
380 $orig_field = $1;
381 } else { # no <tagname> for mapping
382 # "$field" is the user's preferred gs metadata name
383 $find_fields{lc($field)}=$field; # lc = lowercase
384 $orig_field = $field;
385 }
[11380]386
[10426]387 if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
388 $tag = $orig_field;
389 $value = $1;
390 if (!defined $value || !defined $tag){
[11893]391 #print $outhandle "StructuredHTMLPlug: can't find VALUE in \"$tag\"\n";
[10426]392 next;
393 } else {
394 # clean up and add
395 chomp($value); # remove trailing \n, if any
396 $tag = $find_fields{lc($tag)};
[11893]397 #print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
398 # if ($self->{'verbosity'} > 2);
[10426]399 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
400 }
401 }
402 }
403}
404
[10271]4051;
Note: See TracBrowser for help on using the repository browser.