source: trunk/gsdl/perllib/plugins/StructuredHTMLPlug.pm@ 10404

Last change on this file since 10404 was 10404, checked in by chi, 19 years ago

remove the plugin arguments to WordPlug.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.4 KB
Line 
1###########################################################################
2#
3# StructuredHTMLPlug.pm -- html plugin with extra facilities for teasing out
4# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
5# HTML document
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28# This plugin is to process an HTML file where sections are divided by
29# user-defined headings tags. As it is difficult to predict what user's definition
30# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
31# as well as allows to get rid of user-defined Table of Content (TOC)...
32# format:e.g. level1 (Abstract_title|ChapterTitle|Referencing Heading) level2(SectionHeading)...
33
34package StructuredHTMLPlug;
35
36use HTMLPlug;
37use ImagePlug;
38
39#use strict; # every perl program should have this!
40#no strict 'refs'; # make an exception so we can use variables as filehandles
41
42sub BEGIN {
43 @StructuredHTMLPlug::ISA = ('HTMLPlug');
44}
45
46my $arguments = [];
47
48my $options = { 'name' => "StructuredHTMLPlug",
49 'desc' => "{StructuredHTMLPlug.desc}",
50 'abstract' => "no",
51 'inherits' => "yes",
52 'args' => $arguments };
53
54sub new {
55 my ($class) = shift (@_);
56 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
57 push(@$pluginlist, $class);
58
59 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
60 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
61
62 my $self = (defined $hashArgOptLists)? new HTMLPlug($pluginlist,$inputargs,$hashArgOptLists): new HTMLPlug($pluginlist,$inputargs);
63
64 return bless $self, $class;
65
66}
67
68sub read {
69 my $self = shift (@_);
70 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $gli) = @_;
71
72 my $filename = $file;
73 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
74
75 if ($filename =~ m/\.html?$/) {
76 my $poss_doc_filename = $filename;
77 $poss_doc_filename =~ s/\.html?$/.doc/;
78
79 if (-e $poss_doc_filename) {
80 # this file has already been processed by Word plugin
81 return 0;
82 }
83 }
84 return $self->SUPER::read(@_);
85}
86
87sub process {
88 my $self = shift (@_);
89 #my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
90 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
91 my $outhandle = $self->{'outhandle'};
92
93 print $outhandle "StructuredHTMLPlug: processing $file\n"
94 if $self->{'verbosity'} > 1;
95
96 my @head_and_body = split(/<body/i,$$textref);
97 my $head = shift(@head_and_body);
98 my $body_text = join("<body", @head_and_body);
99
100 # If checkout_toc is enables, it means to get rid of toc and tof contents.
101 # get rid of TOC and TOF sections and their title
102 if ($self->{'checkout_toc'}){
103 #line-height:150%;mso-ansi-language:FR'>Contents<o:p></o:p></span></b></p>
104 # get rid of Table of Contents title and Table of Figures
105 #$body_text =~ s/<p[^>]*><b><span[^>]*>(Table of Content.|Content.)<o:p><\/o:p><\/span><\/b><\/p>//isg;
106 #$body_text =~ s/<p[^>]*><b><span[^>]*>(Table of Figure.|Figure.)<o:p><\/o:p><\/span><\/b><\/p>//isg;
107 $body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
108 $body_text =~ s/<p class=(($self->{'tof_header'})[^>]*)>(.+?)<\/p>//isg;
109 }
110
111 if ($self->{'title_header'}){
112 $self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
113 $body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><title>$3<\/title><\/p>/isg;
114 }
115 if ($self->{'level1_header'}){
116 $self->{'level1_header'} =~ s/^(\()(.*)(\))/$2/is;
117 $body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
118 }
119 if ($self->{'level2_header'}){
120 $self->{'level2_header'} =~ s/^(\()(.*)(\))/$2/is;
121 $body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
122 }
123
124 if ($self->{'level3_header'}){
125 $self->{'level3_header'} =~ s/^(\()(.*)(\))/$2/is;
126 $body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
127 }
128
129 # Tidy up extra new lines
130 $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
131 $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
132
133 my $body = "<body".$body_text;
134
135 my $section_text = $head;
136 $section_text .= "<!--\n<Section>\n-->\n";
137
138 # split HTML text on <h1>, <h2> etc tags
139 my @h_split = split(/<h/i,$body);
140
141 my $hnum = 0;
142
143 my $sectionh1 = 0;
144 $section_text .= shift(@h_split);
145
146 my $hc;
147 foreach $hc ( @h_split )
148 {
149 if ($hc =~ m/^([1-3])\s*.*?>(.*)$/s)
150 {
151 my $new_hnum = $1;
152 my $hc_after = $2;
153
154 if ($hc_after =~ m/^(.*?)<\/h$new_hnum>/is)
155 {
156 my $h_text = $1;
157 $hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
158 # boil HTML down to some interesting text
159 $h_text =~ s/^[1-3]>//;
160 $h_text =~ s/<\/?.*?>//sg;
161 $h_text =~ s/\s+/ /sg;
162 $h_text =~ s/^\s$//s;
163 $h_text =~ s/(&nbsp;)+\W*/&nbsp;/sg;
164
165 if ($h_text =~ m/\w+/)
166 {
167 if ($new_hnum > $hnum)
168 {
169 # increase section nesting
170 $hnum++;
171 while ($hnum < $new_hnum)
172 {
173 my $spacing = " " x $hnum;
174 $section_text .= "<!--\n";
175 $section_text .= $spacing."<Section>\n";
176 $section_text .= "-->\n";
177 $hnum++;
178 }
179 }
180 else # ($new_hnum <= $hnum)
181 {
182 # descrease section nesting
183 while ($hnum >= $new_hnum)
184 {
185 my $spacing = " " x $hnum;
186 $section_text .= "<!--\n";
187 $section_text .= $spacing."</Section>\n";
188 $section_text .= "-->\n";
189 $hnum--;
190 }
191 $hnum++;
192 }
193
194 my $spacing = " " x $hnum;
195 $section_text .= "<!--\n";
196 $section_text .= $spacing."<Section>\n";
197 $section_text .= $spacing." <Description>\n";
198 $section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
199 $section_text .= $spacing." </Description>\n";
200 $section_text .= "-->\n";
201
202 print $outhandle $spacing."$h_text\n"
203 if $self->{'verbosity'} > 2;
204
205 $sectionh1++ if ($hnum==1);
206 }
207 }
208 else {
209### print STDERR "***** hc = <h$hc\n\n";
210
211 }
212 # $section_text .= "<!-- \n</Section>\n-->\n";
213 #print STDERR "***HC = $hc\n";
214 $section_text .= "<h$hc";
215 }
216 else
217 {
218 $section_text .= "<h$hc";
219 }
220 }
221
222 while ($hnum >= 1)
223 {
224 my $spacing = " " x $hnum;
225 $section_text .= "<!--\n";
226 $section_text .= $spacing."</Section>\n";
227 $section_text .= "-->\n";
228 $hnum--;
229 }
230
231 $section_text .= "<!--\n</Section>\n-->\n";
232
233 $$textref = $section_text;
234
235# should be textref not testref???
236# $$testref =~ s/<h(\d+)>(.*?)<\/h$1>/<Section><Metadata name=\"Title\">$1<\/Metadata></Section><h$1><\/h$1>/gi;
237
238 if ($sectionh1>0)
239 {
240 print $outhandle " Located section headings ..."
241 if $self->{'verbosity'} > 1;
242 }
243 print $outhandle " Passing on the HTMLPlug\n"
244 if $self->{'verbosity'} > 1;
245
246 $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
247
248 $$textref =~ s/(&nbsp;)+/&nbsp;/sg;
249
250## $$textref =~ s/<o:p>&nbsp;<\/o:p>//g; # used with VML to space figures?
251
252 $self->SUPER::process(@_);
253
254 # associate original file with doc object
255 my $cursection = $doc_obj->get_top_section();
256 my $filename = &util::filename_cat($base_dir, $file);
257 if (-e $filename)
258 {
259 print $outhandle " Adding associated Word document\n"
260 if $self->{'verbosity'} > 1;
261
262 $doc_obj->associate_file($filename, "doc.doc", undef, $cursection);
263
264 my $doclink = "<a href=_httpcollection_/index/assoc/[archivedir]/doc.doc>";
265 $doc_obj->add_utf8_metadata ($cursection, "srclink", $doclink);
266 $doc_obj->add_utf8_metadata ($cursection, "srcicon", "_icondoc_");
267 $doc_obj->add_utf8_metadata ($cursection, "/srclink", "</a>");
268
269 my $file_size = -s $filename;
270 if ($file_size>1024)
271 {
272 my $fs_kbytes = sprintf("%d",$file_size/1024);
273 $doc_obj->add_utf8_metadata ($cursection, "filesize", "$fs_kbytes Kb");
274 }
275 else
276 {
277 $doc_obj->add_utf8_metadata ($cursection, "filesize", "$file_size bytes");
278 }
279
280 if ($file_size > 200000)
281 {
282 $doc_obj->add_utf8_metadata ($cursection, "fswarning", "1");
283 }
284 }
285}
286
287
288sub resize_if_necessary
289{
290 my ($self,$front,$back,$base_dir,$href) = @_;
291
292 # dig out width and height of image, if there
293 my $img_attributes = "$front back";
294 my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
295 my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
296
297 # derive local filename for image based on its URL
298 my $img_filename = $href;
299 $img_filename =~ s/^[^:]*:\/\///;
300 $img_filename = &util::filename_cat($base_dir, $img_filename);
301
302 # Replace %20's in URL with a space if required. Note that the filename
303 # may include the %20 in some situations
304 if ($img_filename =~ /\%20/) {
305 if (!-e $img_filename) {
306 $img_filename =~ s/\%20/ /g;
307 }
308 }
309 if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
310 # get image info on width and height
311
312 my $outhandle = $self->{'outhandle'};
313 my $verbosity = $self->{'verbosity'};
314
315 my ($image_type, $actual_width, $actual_height, $image_size)
316 = &ImagePlug::identify($img_filename, $outhandle, $verbosity);
317
318 #print STDERR "**** $actual_width x $actual_height";
319 #print STDERR " (requested: $img_width x $img_height)\n";
320
321 if (($img_width < $actual_width) || ($img_height < $actual_height)) {
322 print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
323
324 # derive new image name based on current image
325 my ($tailname, $dirname, $suffix)
326 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
327
328 my $resized_filename
329 = &util::filename_cat($dirname, $tailname."_resized".$suffix);
330
331 #print STDERR "**** suffix = $suffix\n";
332
333 # Generate smaller image with convert
334 my $newsize = "$img_widthx$image_height";
335 my $command = "convert -interlace plane -verbose "
336 ."-geometry $newsize \"img_$filename\" \"$resized_filename\"";
337 print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
338 my $result = '';
339 print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
340
341 }
342 }
343
344 return $href;
345}
346
347
348
349
350sub replace_images {
351 my $self = shift (@_);
352 my ($front, $link, $back, $base_dir,
353 $file, $doc_obj, $section) = @_;
354 # remove quotes from link at start and end if necessary
355 if ($link=~/^\"/) {
356 $link=~s/^\"//;$link=~s/\"$//;
357 $front.='"';
358 $back="\"$back";
359 }
360
361 $link =~ s/\n/ /g;
362
363 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
364
365## $href = $self->resize_if_necessary($front,$back,$base_dir,$href);
366
367 my $middle = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
368
369 return $front . $middle . $back;
370}
371
372
3731;
Note: See TracBrowser for help on using the repository browser.