source: gsdl/trunk/perllib/plugins/StructuredHTMLPlugin.pm@ 16104

Last change on this file since 16104 was 16104, checked in by kjdon, 16 years ago

tried to make the 'xxxplugin processing file' print statements more consistent. They are now done in read (or read_into_doc_obj) and not process

  • Property svn:keywords set to Author Date Id Revision
File size: 12.7 KB
Line 
1###########################################################################
2#
3# StructuredHTMLPlugin.pm -- html plugin with extra facilities for teasing out
4# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
5# HTML document
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28# This plugin is to process an HTML file where sections are divided by
29# user-defined headings tags. As it is difficult to predict what user's definition
30# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
31# as well as allows to get rid of user-defined Table of Content (TOC)...
32# format:e.g. level1 (Abstract_title|ChapterTitle|Referencing Heading) level2(SectionHeading)...
33
34package StructuredHTMLPlugin;
35
36use HTMLPlugin;
37use ImageConverter; # want the identify method
38
39use strict; # every perl program should have this!
40no strict 'refs'; # make an exception so we can use variables as filehandles
41
42sub BEGIN {
43 @StructuredHTMLPlugin::ISA = ('HTMLPlugin');
44}
45
46my $arguments =
47 [
48 { 'name' => "level1_header",
49 'desc' => "{StructuredHTMLPlugin.level1_header}",
50 'type' => "regexp",
51 'reqd' => "no",
52 'deft' => "" },
53 { 'name' => "level2_header",
54 'desc' => "{StructuredHTMLPlugin.level2_header}",
55 'type' => "regexp",
56 'reqd' => "no",
57 'deft' => "" },
58 { 'name' => "level3_header",
59 'desc' => "{StructuredHTMLPlugin.level3_header}",
60 'type' => "regexp",
61 'reqd' => "no",
62 'deft' => "" },
63 { 'name' => "title_header",
64 'desc' => "{StructuredHTMLPlugin.title_header}",
65 'type' => "regexp",
66 'reqd' => "no",
67 'deft' => "" },
68 { 'name' => "delete_toc",
69 'desc' => "{StructuredHTMLPlugin.delete_toc}",
70 'type' => "flag",
71 'reqd' => "no"},
72 { 'name' => "toc_header",
73 'desc' => "{StructuredHTMLPlugin.toc_header}",
74 'type' => "regexp",
75 'reqd' => "no",
76 'deft' => "" }
77 ];
78
79my $options = { 'name' => "StructuredHTMLPlugin",
80 'desc' => "{StructuredHTMLPlugin.desc}",
81 'abstract' => "no",
82 'inherits' => "yes",
83 'args' => $arguments };
84
85sub new {
86 my ($class) = shift (@_);
87 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
88 push(@$pluginlist, $class);
89
90 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
91 push(@{$hashArgOptLists->{"OptList"}},$options);
92
93 my $self = new HTMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
94
95 return bless $self, $class;
96}
97
98
99sub process {
100 my $self = shift (@_);
101 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
102 my $outhandle = $self->{'outhandle'};
103
104 my @head_and_body = split(/<body/i,$$textref);
105 my $head = shift(@head_and_body);
106 my $body_text = join("<body", @head_and_body);
107 $head =~ m/<title>(.+)<\/title>/i;
108 my $doctitle = $1 if defined $1;
109 if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
110 my @doc_properties = split(/<xml>/i,$head);
111 my $doc_heading = shift(@doc_properties);
112 my $rest_doc_properties = join(" ", @doc_properties);
113
114 my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
115 my $extracted_metadata = shift (@extracted_metadata);
116 $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
117 }
118
119 # set the title here if we haven't found it yet
120 if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
121 if (defined $doctitle && $doctitle =~ /\S/) {
122 $doc_obj->add_metadata($doc_obj->get_top_section(), "Title", $doctitle);
123 } else {
124 $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
125 }
126 }
127
128 # If delete_toc is enabled, it means to get rid of toc and tof contents.
129 # get rid of TOC and TOF sections and their title
130 if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
131 if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
132 $body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
133 }
134 }
135
136 if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
137 $self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
138 $body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
139 }
140
141 if (defined $self->{'level1_header'} && $self->{'level1_header'}=~ /\S/ ){
142 $self->{'level1_header'} =~ s/^\((.*)\)/$1/i;
143 $body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
144 }
145
146 if (defined $self->{'level2_header'} && $self->{'level2_header'}=~ /\S/){
147 $self->{'level2_header'} =~ s/^\((.*)\)/$1/i;
148 $body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
149 }
150
151 if (defined $self->{'level3_header'} && $self->{'level3_header'}=~ /\S/ ){
152 $self->{'level3_header'} =~ s/^\((.*)\)/$1/is;
153 $body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
154 }
155
156 # Tidy up extra new lines
157 $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
158 $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
159
160 # what was the following line for. effectively unused. do we need it??
161 #$section_text .= "<!--\n<Section>\n-->\n";
162 #my $top_section_tag = "<!--\n<Section>\n-->\n";
163 #$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
164 #$body_text =~ s/(<div.*)/$top_section_tag$1/i;
165 my $body = "<body".$body_text;
166
167 my $section_text = $head;
168
169 # split HTML text on <h1>, <h2> etc tags
170 my @h_split = split(/<h/i,$body);
171
172 my $hnum = 0;
173
174 my $sectionh1 = 0;
175 $section_text .= shift(@h_split);
176
177 my $hc;
178 foreach $hc ( @h_split )
179 {
180 if ($hc =~ m/^([1-3])\s*.*?>(.*)$/s)
181 {
182 my $new_hnum = $1;
183 my $hc_after = $2;
184
185 if ($hc_after =~ m/^(.*?)<\/h$new_hnum>/is)
186 {
187 my $h_text = $1;
188 $hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
189 # boil HTML down to some interesting text
190 $h_text =~ s/^[1-3]>//;
191 $h_text =~ s/<\/?.*?>//sg;
192 $h_text =~ s/\s+/ /sg;
193 $h_text =~ s/^\s$//s;
194 $h_text =~ s/(&nbsp;)+\W*/&nbsp;/sg;
195
196 if ($h_text =~ m/\w+/)
197 {
198 if ($new_hnum > $hnum)
199 {
200 # increase section nesting
201 $hnum++;
202 while ($hnum < $new_hnum)
203 {
204 my $spacing = " " x $hnum;
205 $section_text .= "<!--\n";
206 $section_text .= $spacing."<Section>\n";
207 $section_text .= "-->\n";
208 $hnum++;
209 }
210 }
211 else # ($new_hnum <= $hnum)
212 {
213 # descrease section nesting
214 while ($hnum >= $new_hnum)
215 {
216 my $spacing = " " x $hnum;
217 $section_text .= "<!--\n";
218 $section_text .= $spacing."</Section>\n";
219 $section_text .= "-->\n";
220 $hnum--;
221 }
222 $hnum++;
223 }
224
225 my $spacing = " " x $hnum;
226 $section_text .= "<!--\n";
227 $section_text .= $spacing."<Section>\n";
228 $section_text .= $spacing." <Description>\n";
229 $section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
230 $section_text .= $spacing." </Description>\n";
231 $section_text .= "-->\n";
232
233 #print $outhandle $spacing."$h_text\n"
234 # if $self->{'verbosity'} > 2;
235
236 $sectionh1++ if ($hnum==1);
237 }
238 }
239 else {
240### print STDERR "***** hc = <h$hc\n\n";
241 }
242 $section_text .= "<h$hc";
243 }
244 else
245 {
246 $section_text .= "<h$hc";
247 }
248 }
249
250 while ($hnum >= 1)
251 {
252 my $spacing = " " x $hnum;
253 $section_text .= "<!--\n";
254 $section_text .= $spacing."</Section>\n";
255 $section_text .= "-->\n";
256 $hnum--;
257 }
258
259 $section_text .= "<!--\n</Section>\n-->\n";
260
261 $$textref = $section_text;
262
263# if ($sectionh1>0)
264# {
265# print $outhandle " Located section headings ..."
266# if $self->{'verbosity'} > 1;
267# }
268
269 $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
270
271 $$textref =~ s/(&nbsp;)+/&nbsp;/sg;
272
273 ## $$textref =~ s/<o:p>&nbsp;<\/o:p>//g; # used with VML to space figures?
274
275 $self->SUPER::process(@_);
276
277}
278
279
280sub resize_if_necessary
281{
282 my ($self,$front,$back,$base_dir,$href) = @_;
283
284 # dig out width and height of image, if there
285 my $img_attributes = "$front back";
286 my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
287 my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
288
289 # derive local filename for image based on its URL
290 my $img_filename = $href;
291 $img_filename =~ s/^[^:]*:\/\///;
292 $img_filename = &util::filename_cat($base_dir, $img_filename);
293
294 # Replace %20's in URL with a space if required. Note that the filename
295 # may include the %20 in some situations
296 if ($img_filename =~ /\%20/) {
297 if (!-e $img_filename) {
298 $img_filename =~ s/\%20/ /g;
299 }
300 }
301 if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
302 # get image info on width and height
303
304 my $outhandle = $self->{'outhandle'};
305 my $verbosity = $self->{'verbosity'};
306
307 my ($image_type, $actual_width, $actual_height, $image_size)
308 = &ImageConverter::identify($img_filename, $outhandle, $verbosity);
309
310 #print STDERR "**** $actual_width x $actual_height";
311 #print STDERR " (requested: $img_width x $img_height)\n";
312
313 if (($img_width < $actual_width) || ($img_height < $actual_height)) {
314 #print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
315
316 # derive new image name based on current image
317 my ($tailname, $dirname, $suffix)
318 = &File::Basename::fileparse($img_filename, "\\.[^\\.]+\$");
319
320 my $resized_filename
321 = &util::filename_cat($dirname, $tailname."_resized".$suffix);
322
323 #print STDERR "**** suffix = $suffix\n";
324
325 # Generate smaller image with convert
326 my $newsize = "$img_width"."x$img_height";
327 my $command = "convert -interlace plane -verbose "
328 ."-geometry $newsize \"$img_filename\" \"$resized_filename\"";
329 #print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
330 #my $result = '';
331 #print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
332 }
333 }
334 return $href;
335}
336
337sub replace_images {
338 my $self = shift (@_);
339 my ($front, $link, $back, $base_dir,
340 $file, $doc_obj, $section) = @_;
341 # remove quotes from link at start and end if necessary
342 if ($link=~/^\"/) {
343 $link=~s/^\"//;$link=~s/\"$//;
344 $front.='"';
345 $back="\"$back";
346 }
347
348 $link =~ s/\n/ /g;
349
350 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
351
352## $href = $self->resize_if_necessary($front,$back,$base_dir,$href);
353
354 my $middle = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
355
356 return $front . $middle . $back;
357}
358
359sub extract_metadata
360{
361 my $self = shift (@_);
362 my ($textref, $metadata, $doc_obj) = @_;
363 my $outhandle = $self->{'outhandle'};
364
365 return if (!defined $textref);
366
367 # metadata fields to extract/save. 'key' is the (lowercase) name of the
368 # html meta, 'value' is the metadata name for greenstone to use
369 my %find_fields = ();
370 my ($tag,$value);
371
372 my $orig_field = "";
373 foreach my $field (split /,/, $self->{'metadata_fields'}) {
374 # support tag<tagname>
375 if ($field =~ /^(.*?)<(.*?)>$/) {
376 # "$2" is the user's preferred gs metadata name
377 $find_fields{lc($1)}=$2; # lc = lowercase
378 $orig_field = $1;
379 } else { # no <tagname> for mapping
380 # "$field" is the user's preferred gs metadata name
381 $find_fields{lc($field)}=$field; # lc = lowercase
382 $orig_field = $field;
383 }
384
385 if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
386 $tag = $orig_field;
387 $value = $1;
388 if (!defined $value || !defined $tag){
389 #print $outhandle "StructuredHTMLPlugin: can't find VALUE in \"$tag\"\n";
390 next;
391 } else {
392 # clean up and add
393 chomp($value); # remove trailing \n, if any
394 $tag = $find_fields{lc($tag)};
395 #print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
396 # if ($self->{'verbosity'} > 2);
397 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
398 }
399 }
400 }
401}
402
4031;
Note: See TracBrowser for help on using the repository browser.