source: gsdl/trunk/perllib/plugins/StructuredHTMLPlugin.pm@ 15872

Last change on this file since 15872 was 15872, checked in by kjdon, 16 years ago

plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Alos, I haven't really tested anything yet...

  • Property svn:keywords set to Author Date Id Revision
File size: 12.8 KB
Line 
1###########################################################################
2#
3# StructuredHTMLPlugin.pm -- html plugin with extra facilities for teasing out
4# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
5# HTML document
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28# This plugin is to process an HTML file where sections are divided by
29# user-defined headings tags. As it is difficult to predict what user's definition
30# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
31# as well as allows to get rid of user-defined Table of Content (TOC)...
32# format:e.g. level1 (Abstract_title|ChapterTitle|Referencing Heading) level2(SectionHeading)...
33
34package StructuredHTMLPlugin;
35
36use HTMLPlugin;
37use ImageConverter; # want the identify method
38
39use strict; # every perl program should have this!
40no strict 'refs'; # make an exception so we can use variables as filehandles
41
42sub BEGIN {
43 @StructuredHTMLPlugin::ISA = ('HTMLPlugin');
44}
45
46my $arguments =
47 [
48 { 'name' => "level1_header",
49 'desc' => "{StructuredHTMLPlugin.level1_header}",
50 'type' => "regexp",
51 'reqd' => "no",
52 'deft' => "" },
53 { 'name' => "level2_header",
54 'desc' => "{StructuredHTMLPlugin.level2_header}",
55 'type' => "regexp",
56 'reqd' => "no",
57 'deft' => "" },
58 { 'name' => "level3_header",
59 'desc' => "{StructuredHTMLPlugin.level3_header}",
60 'type' => "regexp",
61 'reqd' => "no",
62 'deft' => "" },
63 { 'name' => "title_header",
64 'desc' => "{StructuredHTMLPlugin.title_header}",
65 'type' => "regexp",
66 'reqd' => "no",
67 'deft' => "" },
68 { 'name' => "delete_toc",
69 'desc' => "{StructuredHTMLPlugin.delete_toc}",
70 'type' => "flag",
71 'reqd' => "no"},
72 { 'name' => "toc_header",
73 'desc' => "{StructuredHTMLPlugin.toc_header}",
74 'type' => "regexp",
75 'reqd' => "no",
76 'deft' => "" }
77 ];
78
79my $options = { 'name' => "StructuredHTMLPlugin",
80 'desc' => "{StructuredHTMLPlugin.desc}",
81 'abstract' => "no",
82 'inherits' => "yes",
83 'args' => $arguments };
84
85sub new {
86 my ($class) = shift (@_);
87 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
88 push(@$pluginlist, $class);
89
90 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
91 push(@{$hashArgOptLists->{"OptList"}},$options);
92
93 my $self = new HTMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
94
95 return bless $self, $class;
96}
97
98
99sub process {
100 my $self = shift (@_);
101 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
102 my $outhandle = $self->{'outhandle'};
103
104 print $outhandle "StructuredHTMLPlugin: processing $file\n"
105 if $self->{'verbosity'} > 1;
106
107 my @head_and_body = split(/<body/i,$$textref);
108 my $head = shift(@head_and_body);
109 my $body_text = join("<body", @head_and_body);
110 $head =~ m/<title>(.+)<\/title>/i;
111 my $doctitle = $1 if defined $1;
112 if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
113 my @doc_properties = split(/<xml>/i,$head);
114 my $doc_heading = shift(@doc_properties);
115 my $rest_doc_properties = join(" ", @doc_properties);
116
117 my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
118 my $extracted_metadata = shift (@extracted_metadata);
119 $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
120 }
121
122 # set the title here if we haven't found it yet
123 if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
124 if (defined $doctitle && $doctitle =~ /\S/) {
125 $doc_obj->add_metadata($doc_obj->get_top_section(), "Title", $doctitle);
126 } else {
127 $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
128 }
129 }
130
131 # If delete_toc is enabled, it means to get rid of toc and tof contents.
132 # get rid of TOC and TOF sections and their title
133 if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
134 if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
135 $body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
136 }
137 }
138
139 if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
140 $self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
141 $body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
142 }
143
144 if (defined $self->{'level1_header'} && $self->{'level1_header'}=~ /\S/ ){
145 $self->{'level1_header'} =~ s/^\((.*)\)/$1/i;
146 $body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
147 }
148
149 if (defined $self->{'level2_header'} && $self->{'level2_header'}=~ /\S/){
150 $self->{'level2_header'} =~ s/^\((.*)\)/$1/i;
151 $body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
152 }
153
154 if (defined $self->{'level3_header'} && $self->{'level3_header'}=~ /\S/ ){
155 $self->{'level3_header'} =~ s/^\((.*)\)/$1/is;
156 $body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
157 }
158
159 # Tidy up extra new lines
160 $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
161 $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
162
163 # what was the following line for. effectively unused. do we need it??
164 #$section_text .= "<!--\n<Section>\n-->\n";
165 #my $top_section_tag = "<!--\n<Section>\n-->\n";
166 #$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
167 #$body_text =~ s/(<div.*)/$top_section_tag$1/i;
168 my $body = "<body".$body_text;
169
170 my $section_text = $head;
171
172 # split HTML text on <h1>, <h2> etc tags
173 my @h_split = split(/<h/i,$body);
174
175 my $hnum = 0;
176
177 my $sectionh1 = 0;
178 $section_text .= shift(@h_split);
179
180 my $hc;
181 foreach $hc ( @h_split )
182 {
183 if ($hc =~ m/^([1-3])\s*.*?>(.*)$/s)
184 {
185 my $new_hnum = $1;
186 my $hc_after = $2;
187
188 if ($hc_after =~ m/^(.*?)<\/h$new_hnum>/is)
189 {
190 my $h_text = $1;
191 $hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
192 # boil HTML down to some interesting text
193 $h_text =~ s/^[1-3]>//;
194 $h_text =~ s/<\/?.*?>//sg;
195 $h_text =~ s/\s+/ /sg;
196 $h_text =~ s/^\s$//s;
197 $h_text =~ s/(&nbsp;)+\W*/&nbsp;/sg;
198
199 if ($h_text =~ m/\w+/)
200 {
201 if ($new_hnum > $hnum)
202 {
203 # increase section nesting
204 $hnum++;
205 while ($hnum < $new_hnum)
206 {
207 my $spacing = " " x $hnum;
208 $section_text .= "<!--\n";
209 $section_text .= $spacing."<Section>\n";
210 $section_text .= "-->\n";
211 $hnum++;
212 }
213 }
214 else # ($new_hnum <= $hnum)
215 {
216 # descrease section nesting
217 while ($hnum >= $new_hnum)
218 {
219 my $spacing = " " x $hnum;
220 $section_text .= "<!--\n";
221 $section_text .= $spacing."</Section>\n";
222 $section_text .= "-->\n";
223 $hnum--;
224 }
225 $hnum++;
226 }
227
228 my $spacing = " " x $hnum;
229 $section_text .= "<!--\n";
230 $section_text .= $spacing."<Section>\n";
231 $section_text .= $spacing." <Description>\n";
232 $section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
233 $section_text .= $spacing." </Description>\n";
234 $section_text .= "-->\n";
235
236 #print $outhandle $spacing."$h_text\n"
237 # if $self->{'verbosity'} > 2;
238
239 $sectionh1++ if ($hnum==1);
240 }
241 }
242 else {
243### print STDERR "***** hc = <h$hc\n\n";
244 }
245 $section_text .= "<h$hc";
246 }
247 else
248 {
249 $section_text .= "<h$hc";
250 }
251 }
252
253 while ($hnum >= 1)
254 {
255 my $spacing = " " x $hnum;
256 $section_text .= "<!--\n";
257 $section_text .= $spacing."</Section>\n";
258 $section_text .= "-->\n";
259 $hnum--;
260 }
261
262 $section_text .= "<!--\n</Section>\n-->\n";
263
264 $$textref = $section_text;
265
266# if ($sectionh1>0)
267# {
268# print $outhandle " Located section headings ..."
269# if $self->{'verbosity'} > 1;
270# }
271
272 $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
273
274 $$textref =~ s/(&nbsp;)+/&nbsp;/sg;
275
276 ## $$textref =~ s/<o:p>&nbsp;<\/o:p>//g; # used with VML to space figures?
277
278 $self->SUPER::process(@_);
279
280}
281
282
283sub resize_if_necessary
284{
285 my ($self,$front,$back,$base_dir,$href) = @_;
286
287 # dig out width and height of image, if there
288 my $img_attributes = "$front back";
289 my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
290 my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
291
292 # derive local filename for image based on its URL
293 my $img_filename = $href;
294 $img_filename =~ s/^[^:]*:\/\///;
295 $img_filename = &util::filename_cat($base_dir, $img_filename);
296
297 # Replace %20's in URL with a space if required. Note that the filename
298 # may include the %20 in some situations
299 if ($img_filename =~ /\%20/) {
300 if (!-e $img_filename) {
301 $img_filename =~ s/\%20/ /g;
302 }
303 }
304 if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
305 # get image info on width and height
306
307 my $outhandle = $self->{'outhandle'};
308 my $verbosity = $self->{'verbosity'};
309
310 my ($image_type, $actual_width, $actual_height, $image_size)
311 = &ImageConverter::identify($img_filename, $outhandle, $verbosity);
312
313 #print STDERR "**** $actual_width x $actual_height";
314 #print STDERR " (requested: $img_width x $img_height)\n";
315
316 if (($img_width < $actual_width) || ($img_height < $actual_height)) {
317 #print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
318
319 # derive new image name based on current image
320 my ($tailname, $dirname, $suffix)
321 = &File::Basename::fileparse($img_filename, "\\.[^\\.]+\$");
322
323 my $resized_filename
324 = &util::filename_cat($dirname, $tailname."_resized".$suffix);
325
326 #print STDERR "**** suffix = $suffix\n";
327
328 # Generate smaller image with convert
329 my $newsize = "$img_width"."x$img_height";
330 my $command = "convert -interlace plane -verbose "
331 ."-geometry $newsize \"$img_filename\" \"$resized_filename\"";
332 #print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
333 #my $result = '';
334 #print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
335 }
336 }
337 return $href;
338}
339
340sub replace_images {
341 my $self = shift (@_);
342 my ($front, $link, $back, $base_dir,
343 $file, $doc_obj, $section) = @_;
344 # remove quotes from link at start and end if necessary
345 if ($link=~/^\"/) {
346 $link=~s/^\"//;$link=~s/\"$//;
347 $front.='"';
348 $back="\"$back";
349 }
350
351 $link =~ s/\n/ /g;
352
353 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
354
355## $href = $self->resize_if_necessary($front,$back,$base_dir,$href);
356
357 my $middle = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
358
359 return $front . $middle . $back;
360}
361
362sub extract_metadata
363{
364 my $self = shift (@_);
365 my ($textref, $metadata, $doc_obj) = @_;
366 my $outhandle = $self->{'outhandle'};
367
368 return if (!defined $textref);
369
370 # metadata fields to extract/save. 'key' is the (lowercase) name of the
371 # html meta, 'value' is the metadata name for greenstone to use
372 my %find_fields = ();
373 my ($tag,$value);
374
375 my $orig_field = "";
376 foreach my $field (split /,/, $self->{'metadata_fields'}) {
377 # support tag<tagname>
378 if ($field =~ /^(.*?)<(.*?)>$/) {
379 # "$2" is the user's preferred gs metadata name
380 $find_fields{lc($1)}=$2; # lc = lowercase
381 $orig_field = $1;
382 } else { # no <tagname> for mapping
383 # "$field" is the user's preferred gs metadata name
384 $find_fields{lc($field)}=$field; # lc = lowercase
385 $orig_field = $field;
386 }
387
388 if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
389 $tag = $orig_field;
390 $value = $1;
391 if (!defined $value || !defined $tag){
392 #print $outhandle "StructuredHTMLPlugin: can't find VALUE in \"$tag\"\n";
393 next;
394 } else {
395 # clean up and add
396 chomp($value); # remove trailing \n, if any
397 $tag = $find_fields{lc($tag)};
398 #print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
399 # if ($self->{'verbosity'} > 2);
400 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
401 }
402 }
403 }
404}
405
4061;
Note: See TracBrowser for help on using the repository browser.