source: trunk/gsdl/perllib/plugins/StructuredHTMLPlug.pm@ 10356

Last change on this file since 10356 was 10356, checked in by chi, 19 years ago

tidy up the code.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.6 KB
Line 
1###########################################################################
2#
3# StructuredHTMLPlug.pm -- html plugin with extra facilities for teasing out
4# hierarchical structure (such as h1, h2, h3 tags) in an HTML document
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27package StructuredHTMLPlug;
28
29use HTMLPlug;
30use ImagePlug;
31
32sub BEGIN {
33 @ISA = ('HTMLPlug');
34}
35
36my $arguments =
37 [ { 'name' => "check_toc",
38 'desc' => "StructuredHTMLPlug.check_toc'}",
39 'type' => "flag",
40 'reqd' => "no"},
41 { 'name' => "title_header",
42 'desc' => "{StructuredHTMLPlug.title_header}",
43 'type' => "regexp",
44 'reqd' => "no"},
45 { 'name' => "level1_header",
46 'desc' => "{StructuredHTMLPlug.level1_header}",
47 'type' => "regexp",
48 'reqd' => "no"},
49 { 'name' => "level2_header",
50 'desc' => "{StructuredHTMLPlug.level2_header}",
51 'type' => "regexp",
52 'reqd' => "no"},
53 { 'name' => "level3_header",
54 'desc' => "{StructuredHTMLPlug.level3_header}",
55 'type' => "regexp",
56 'reqd' => "no"},
57 { 'name' => "toc_header",
58 'desc' => "{StructuredHTMLPlug.toc_header}",
59 'type' => "regexp",
60 'reqd' => "no"},
61 { 'name' => "tof_header",
62 'desc' => "{StructuredHTMLPlug.tof_header}",
63 'type' => "regexp",
64 'reqd' => "no"}];
65
66my $options = { 'name' => "StructuredHTMLPlug",
67 'desc' => "{StructuredHTMLPlug.desc}",
68 'abstract' => "no",
69 'inherits' => "yes",
70 'args' => $arguments };
71
72sub new {
73 my ($class) = shift (@_);
74 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
75 push(@$pluginlist, $class);
76
77 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
78 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
79
80 my $self = (defined $hashArgOptLists)? new HTMLPlug($pluginlist,$inputargs,$hashArgOptLists): new HTMLPlug($pluginlist,$inputargs);
81
82 return bless $self, $class;
83
84}
85
86sub read {
87 my $self = shift (@_);
88 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $gli) = @_;
89
90 my $filename = $file;
91 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
92
93 if ($filename =~ m/\.html?$/) {
94 my $poss_doc_filename = $filename;
95 $poss_doc_filename =~ s/\.html?$/.doc/;
96
97 if (-e $poss_doc_filename) {
98 # this file has already been processed by Word plugin
99 return 0;
100 }
101 }
102 return $self->SUPER::read(@_);
103}
104
105sub process {
106 my $self = shift (@_);
107 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
108 my $outhandle = $self->{'outhandle'};
109
110 print $outhandle "StructuredHTMLPlug: processing $file\n"
111 if $self->{'verbosity'} > 1;
112
113 my @head_and_body = split(/<body/i,$$textref);
114 my $head = shift(@head_and_body);
115 my $body_text = join("<body", @head_and_body);
116
117 # get rid of TOC and TOF sections and their title
118 # If check_toc is enables, it means to get rid of toc and tof contents.
119 if ($self->{'check_toc'}){
120 #line-height:150%;mso-ansi-language:FR'>Contents<o:p></o:p></span></b></p>
121 # get rid of Table of Contents title and Table of Figures
122 $body_text =~ s/<p[^>]*><b><span[^>]*>(Table of Content.|Content.)<o:p><\/o:p><\/span><\/b><\/p>//isg;
123 $body_text =~ s/<p[^>]*><b><span[^>]*>(Table of Figure.|Figure.)<o:p><\/o:p><\/span><\/b><\/p>//isg;
124 $body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
125 $body_text =~ s/<p class=(($self->{'tof_header'})[^>]*)>(.+?)<\/p>//isg;
126 }
127
128 if ($self->{'title_header'}){
129 $self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
130 $body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><title>$3<\/title><\/p>/isg;
131 }
132 if ($self->{'level1_header'}){
133 $self->{'level1_header'} =~ s/^(\()(.*)(\))/$2/is;
134 $body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
135 }
136 if ($self->{'level2_header'}){
137 $self->{'level2_header'} =~ s/^(\()(.*)(\))/$2/is;
138 $body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
139 }
140
141 if ($self->{'level3_header'}){
142 $self->{'level3_header'} =~ s/^(\()(.*)(\))/$2/is;
143 $body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
144 }
145
146 # Tidy up extra new lines
147 $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
148 $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
149
150 my $body = "<body".$body_text;
151
152 my $section_text = $head;
153 $section_text .= "<!--\n<Section>\n-->\n";
154
155 # split HTML text on <h1>, <h2> etc tags
156 my @h_split = split(/<h/i,$body);
157
158 my $hnum = 0;
159
160 my $sectionh1 = 0;
161 $section_text .= shift(@h_split);
162
163 my $hc;
164 foreach $hc ( @h_split )
165 {
166 if ($hc =~ m/^([1-3])\s*.*?>(.*)$/s)
167 {
168 my $new_hnum = $1;
169 my $hc_after = $2;
170
171 if ($hc_after =~ m/^(.*?)<\/h$new_hnum>/is)
172 {
173 my $h_text = $1;
174 $hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
175 # boil HTML down to some interesting text
176 $h_text =~ s/^[1-3]>//;
177 $h_text =~ s/<\/?.*?>//sg;
178 $h_text =~ s/\s+/ /sg;
179 $h_text =~ s/^\s$//s;
180 $h_text =~ s/(&nbsp;)+\W*/&nbsp;/sg;
181
182 if ($h_text =~ m/\w+/)
183 {
184 if ($new_hnum > $hnum)
185 {
186 # increase section nesting
187 $hnum++;
188 while ($hnum < $new_hnum)
189 {
190 my $spacing = " " x $hnum;
191 $section_text .= "<!--\n";
192 $section_text .= $spacing."<Section>\n";
193 $section_text .= "-->\n";
194 $hnum++;
195 }
196 }
197 else # ($new_hnum <= $hnum)
198 {
199 # descrease section nesting
200 while ($hnum >= $new_hnum)
201 {
202 my $spacing = " " x $hnum;
203 $section_text .= "<!--\n";
204 $section_text .= $spacing."</Section>\n";
205 $section_text .= "-->\n";
206 $hnum--;
207 }
208 $hnum++;
209 }
210
211 my $spacing = " " x $hnum;
212 $section_text .= "<!--\n";
213 $section_text .= $spacing."<Section>\n";
214 $section_text .= $spacing." <Description>\n";
215 $section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
216 $section_text .= $spacing." </Description>\n";
217 $section_text .= "-->\n";
218
219 print $outhandle $spacing."$h_text\n"
220 if $self->{'verbosity'} > 2;
221
222 $sectionh1++ if ($hnum==1);
223 }
224 }
225 else {
226### print STDERR "***** hc = <h$hc\n\n";
227
228 }
229 # $section_text .= "<!-- \n</Section>\n-->\n";
230 #print STDERR "***HC = $hc\n";
231 $section_text .= "<h$hc";
232 }
233 else
234 {
235 $section_text .= "<h$hc";
236 }
237 }
238
239 while ($hnum >= 1)
240 {
241 my $spacing = " " x $hnum;
242 $section_text .= "<!--\n";
243 $section_text .= $spacing."</Section>\n";
244 $section_text .= "-->\n";
245 $hnum--;
246 }
247
248 $section_text .= "<!--\n</Section>\n-->\n";
249
250 $$textref = $section_text;
251
252# should be textref not testref???
253# $$testref =~ s/<h(\d+)>(.*?)<\/h$1>/<Section><Metadata name=\"Title\">$1<\/Metadata></Section><h$1><\/h$1>/gi;
254
255 if ($sectionh1>0)
256 {
257 print $outhandle " Located section headings ..."
258 if $self->{'verbosity'} > 1;
259 }
260 print $outhandle " Passing on the HTMLPlug\n"
261 if $self->{'verbosity'} > 1;
262
263 $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
264
265 $$textref =~ s/(&nbsp;)+/&nbsp;/sg;
266
267## $$textref =~ s/<o:p>&nbsp;<\/o:p>//g; # used with VML to space figures?
268
269 $self->SUPER::process(@_);
270
271 # associate original file with doc object
272 my $cursection = $doc_obj->get_top_section();
273 my $filename = &util::filename_cat($base_dir, $file);
274 if (-e $filename)
275 {
276 print $outhandle " Adding associated Word document\n"
277 if $self->{'verbosity'} > 1;
278
279 $doc_obj->associate_file($filename, "doc.doc", undef, $cursection);
280
281 my $doclink = "<a href=_httpcollection_/index/assoc/[archivedir]/doc.doc>";
282 $doc_obj->add_utf8_metadata ($cursection, "srclink", $doclink);
283 $doc_obj->add_utf8_metadata ($cursection, "srcicon", "_icondoc_");
284 $doc_obj->add_utf8_metadata ($cursection, "/srclink", "</a>");
285
286 my $file_size = -s $filename;
287 if ($file_size>1024)
288 {
289 my $fs_kbytes = sprintf("%d",$file_size/1024);
290 $doc_obj->add_utf8_metadata ($cursection, "filesize", "$fs_kbytes Kb");
291 }
292 else
293 {
294 $doc_obj->add_utf8_metadata ($cursection, "filesize", "$file_size bytes");
295 }
296
297 if ($file_size > 200000)
298 {
299 $doc_obj->add_utf8_metadata ($cursection, "fswarning", "1");
300 }
301 }
302}
303
304
305sub resize_if_necessary
306{
307 my ($self,$front,$back,$base_dir,$href) = @_;
308
309 # dig out width and height of image, if there
310 my $img_attributes = "$front back";
311 my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
312 my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
313
314 # derive local filename for image based on its URL
315 my $img_filename = $href;
316 $img_filename =~ s/^[^:]*:\/\///;
317 $img_filename = &util::filename_cat($base_dir, $img_filename);
318
319 # Replace %20's in URL with a space if required. Note that the filename
320 # may include the %20 in some situations
321 if ($img_filename =~ /\%20/) {
322 if (!-e $img_filename) {
323 $img_filename =~ s/\%20/ /g;
324 }
325 }
326 if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
327 # get image info on width and height
328
329 my $outhandle = $self->{'outhandle'};
330 my $verbosity = $self->{'verbosity'};
331
332 my ($image_type, $actual_width, $actual_height, $image_size)
333 = &ImagePlug::identify($img_filename, $outhandle, $verbosity);
334
335 #print STDERR "**** $actual_width x $actual_height";
336 #print STDERR " (requested: $img_width x $img_height)\n";
337
338 if (($img_width < $actual_width) || ($img_height < $actual_height)) {
339 print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
340
341 # derive new image name based on current image
342 my ($tailname, $dirname, $suffix)
343 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
344
345 my $resized_filename
346 = &util::filename_cat($dirname, $tailname."_resized".$suffix);
347
348 #print STDERR "**** suffix = $suffix\n";
349
350 # Generate smaller image with convert
351 my $newsize = "$img_widthx$image_height";
352 my $command = "convert -interlace plane -verbose "
353 ."-geometry $newsize \"img_$filename\" \"$resized_filename\"";
354 print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
355 my $result = '';
356 print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
357
358 }
359 }
360
361 return $href;
362}
363
364
365
366
367sub replace_images {
368 my $self = shift (@_);
369 my ($front, $link, $back, $base_dir,
370 $file, $doc_obj, $section) = @_;
371 # remove quotes from link at start and end if necessary
372 if ($link=~/^\"/) {
373 $link=~s/^\"//;$link=~s/\"$//;
374 $front.='"';
375 $back="\"$back";
376 }
377
378 $link =~ s/\n/ /g;
379
380 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
381
382## $href = $self->resize_if_necessary($front,$back,$base_dir,$href);
383
384 my $middle = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
385
386 return $front . $middle . $back;
387}
388
389
3901;
Note: See TracBrowser for help on using the repository browser.