source: trunk/gsdl/perllib/plugins/StructuredHTMLPlug.pm@ 10600

Last change on this file since 10600 was 10600, checked in by chi, 19 years ago

modifications for deal with document title (as the first H1 heading) and also store the
metadata title retrieved from document at the top_section.

  • Property svn:keywords set to Author Date Id Revision
File size: 13.7 KB
Line 
1###########################################################################
2#
3# StructuredHTMLPlug.pm -- html plugin with extra facilities for teasing out
4# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
5# HTML document
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28# This plugin is to process an HTML file where sections are divided by
29# user-defined headings tags. As it is difficult to predict what user's definition
30# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
31# as well as allows to get rid of user-defined Table of Content (TOC)...
32# format:e.g. level1 (Abstract_title|ChapterTitle|Referencing Heading) level2(SectionHeading)...
33
34package StructuredHTMLPlug;
35
36use HTMLPlug;
37use ImagePlug;
38
39#use strict; # every perl program should have this!
40#no strict 'refs'; # make an exception so we can use variables as filehandles
41
42sub BEGIN {
43 @StructuredHTMLPlug::ISA = ('HTMLPlug');
44}
45
46my $arguments = [];
47
48my $options = { 'name' => "StructuredHTMLPlug",
49 'desc' => "{StructuredHTMLPlug.desc}",
50 'abstract' => "no",
51 'inherits' => "yes",
52 'args' => $arguments };
53
54sub new {
55 my ($class) = shift (@_);
56 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
57 push(@$pluginlist, $class);
58
59 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
60 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
61
62 my $self = (defined $hashArgOptLists)? new HTMLPlug($pluginlist,$inputargs,$hashArgOptLists): new HTMLPlug($pluginlist,$inputargs);
63
64 return bless $self, $class;
65}
66
67sub read {
68 my $self = shift (@_);
69 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $gli) = @_;
70
71 my $filename = $file;
72 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
73
74 if ($filename =~ m/\.html?$/) {
75 my $poss_doc_filename = $filename;
76 $poss_doc_filename =~ s/\.html?$/.doc/;
77
78 if (-e $poss_doc_filename) {
79 # this file has already been processed by Word plugin
80 return 0;
81 }
82 }
83 return $self->SUPER::read(@_);
84}
85
86sub process {
87 my $self = shift (@_);
88 #my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
89 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
90 my $outhandle = $self->{'outhandle'};
91
92 print $outhandle "StructuredHTMLPlug: processing $file\n"
93 if $self->{'verbosity'} > 1;
94
95 my @head_and_body = split(/<body/i,$$textref);
96 my $head = shift(@head_and_body);
97 my $body_text = join("<body", @head_and_body);
98 $head =~ m/<title>(.+)<\/title>/i;
99 my $doctitle = $1 if defined $1;
100 if (defined $self->{'extracted_word_metadata_fields'} && $self->{'extracted_word_metadata_fields'}=~ /\S/) {
101 my @doc_properties = split(/<xml>/i,$head);
102 my $doc_heading = shift(@doc_properties);
103 my $rest_doc_properties = join(" ", @doc_properties);
104 my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
105 my $extracted_metadata = shift (@extracted_metadata);
106 $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
107 }
108
109 # If delete_toc is enables, it means to get rid of toc and tof contents.
110 # get rid of TOC and TOF sections and their title
111 if ($self->{'delete_toc'} == 1){
112 if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
113 $body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
114 }
115 if (defined $self->{'tof_header'}&& $self->{'tof_header'}=~ /\S/) {
116 $body_text =~ s/<p class=(($self->{'tof_header'})[^>]*)>(.+?)<\/p>//isg;
117 }
118 }
119
120 if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
121 $self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
122 #$body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><title>$3<\/title><\/p>/isg;
123 #$doctitle = $3;
124 $body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
125 #$body_text =~ m/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/isg;
126 #$doctitle = "<h1>".$3."<\/h1>" if defined $3;
127 }
128
129 if (defined $self->{'level1_header'} && $self->{'level1_header'}=~ /\S/ ){
130 $self->{'level1_header'} =~ s/^\((.*)\)/$1/i;
131 $body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
132 }
133
134 if (defined $self->{'level2_header'} && $self->{'level2_header'}=~ /\S/){
135 $self->{'level2_header'} =~ s/^\((.*)\)/$1/i;
136 $body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
137 }
138
139 if (defined $self->{'level3_header'} && $self->{'level3_header'}=~ /\S/ ){
140 $self->{'level3_header'} =~ s/^\((.*)\)/$1/is;
141 $body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
142 }
143 # Tidy up extra new lines
144 $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
145 $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
146
147 $section_text .= "<!--\n<Section>\n-->\n";
148 #my $top_section_tag = "<!--\n<Section>\n-->\n";
149 #$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
150 #$body_text =~ s/(<div.*)/$top_section_tag$1/i;
151 my $body = "<body".$body_text;
152
153 my $section_text = $head;
154
155 # split HTML text on <h1>, <h2> etc tags
156 my @h_split = split(/<h/i,$body);
157
158 my $hnum = 0;
159
160 my $sectionh1 = 0;
161 $section_text .= shift(@h_split);
162
163 my $hc;
164 foreach $hc ( @h_split )
165 {
166 if ($hc =~ m/^([1-3])\s*.*?>(.*)$/s)
167 {
168 my $new_hnum = $1;
169 my $hc_after = $2;
170
171 if ($hc_after =~ m/^(.*?)<\/h$new_hnum>/is)
172 {
173 my $h_text = $1;
174 $hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
175 # boil HTML down to some interesting text
176 $h_text =~ s/^[1-3]>//;
177 $h_text =~ s/<\/?.*?>//sg;
178 $h_text =~ s/\s+/ /sg;
179 $h_text =~ s/^\s$//s;
180 $h_text =~ s/(&nbsp;)+\W*/&nbsp;/sg;
181
182 if ($h_text =~ m/\w+/)
183 {
184 if ($new_hnum > $hnum)
185 {
186 # increase section nesting
187 $hnum++;
188 while ($hnum < $new_hnum)
189 {
190 my $spacing = " " x $hnum;
191 $section_text .= "<!--\n";
192 $section_text .= $spacing."<Section>\n";
193 $section_text .= "-->\n";
194 $hnum++;
195 }
196 }
197 else # ($new_hnum <= $hnum)
198 {
199 # descrease section nesting
200 while ($hnum >= $new_hnum)
201 {
202 my $spacing = " " x $hnum;
203 $section_text .= "<!--\n";
204 $section_text .= $spacing."</Section>\n";
205 $section_text .= "-->\n";
206 $hnum--;
207 }
208 $hnum++;
209 }
210
211 my $spacing = " " x $hnum;
212 $section_text .= "<!--\n";
213 $section_text .= $spacing."<Section>\n";
214 $section_text .= $spacing." <Description>\n";
215 $section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
216 $section_text .= $spacing." </Description>\n";
217 $section_text .= "-->\n";
218
219 print $outhandle $spacing."$h_text\n"
220 if $self->{'verbosity'} > 2;
221
222 $sectionh1++ if ($hnum==1);
223 }
224 }
225 else {
226### print STDERR "***** hc = <h$hc\n\n";
227 }
228 $section_text .= "<h$hc";
229 }
230 else
231 {
232 $section_text .= "<h$hc";
233 }
234 }
235
236 while ($hnum >= 1)
237 {
238 my $spacing = " " x $hnum;
239 $section_text .= "<!--\n";
240 $section_text .= $spacing."</Section>\n";
241 $section_text .= "-->\n";
242 $hnum--;
243 }
244
245 $section_text .= "<!--\n</Section>\n-->\n";
246
247 $$textref = $section_text;
248
249 # should be textref not testref???
250 #$$testref =~ s/<h(\d+)>(.*?)<\/h$1>/<Section><Metadata name=\"Title\">$1<\/Metadata></Section><h$1><\/h$1>/gi;
251
252 if ($sectionh1>0)
253 {
254 print $outhandle " Located section headings ..."
255 if $self->{'verbosity'} > 1;
256 }
257 print $outhandle " Passing on the HTMLPlug\n"
258 if $self->{'verbosity'} > 1;
259
260 $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
261
262 $$textref =~ s/(&nbsp;)+/&nbsp;/sg;
263
264 ## $$textref =~ s/<o:p>&nbsp;<\/o:p>//g; # used with VML to space figures?
265
266 $self->SUPER::process(@_);
267
268 # associate original file with doc object
269 my $cursection = $doc_obj->get_top_section();
270 my $filename = &util::filename_cat($base_dir, $file);
271 if (-e $filename)
272 {
273 print $outhandle " Adding associated Word document\n"
274 if $self->{'verbosity'} > 1;
275
276 $doc_obj->associate_file($filename, "doc.doc", undef, $cursection);
277
278 my $doclink = "<a href=_httpcollection_/index/assoc/[archivedir]/doc.doc>";
279 $doc_obj->add_utf8_metadata ($cursection, "srclink", $doclink);
280 $doc_obj->add_utf8_metadata ($cursection, "srcicon", "_icondoc_");
281 $doc_obj->add_utf8_metadata ($cursection, "/srclink", "</a>");
282 $doc_obj->add_utf8_metadata ($cursection, "Title", $doctitle);
283 my $file_size = -s $filename;
284 if ($file_size>1024)
285 {
286 my $fs_kbytes = sprintf("%d",$file_size/1024);
287 $doc_obj->add_utf8_metadata ($cursection, "filesize", "$fs_kbytes Kb");
288 }
289 else
290 {
291 $doc_obj->add_utf8_metadata ($cursection, "filesize", "$file_size bytes");
292 }
293
294 if ($file_size > 200000)
295 {
296 $doc_obj->add_utf8_metadata ($cursection, "fswarning", "1");
297 }
298 }
299}
300
301
302sub resize_if_necessary
303{
304 my ($self,$front,$back,$base_dir,$href) = @_;
305
306 # dig out width and height of image, if there
307 my $img_attributes = "$front back";
308 my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
309 my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
310
311 # derive local filename for image based on its URL
312 my $img_filename = $href;
313 $img_filename =~ s/^[^:]*:\/\///;
314 $img_filename = &util::filename_cat($base_dir, $img_filename);
315
316 # Replace %20's in URL with a space if required. Note that the filename
317 # may include the %20 in some situations
318 if ($img_filename =~ /\%20/) {
319 if (!-e $img_filename) {
320 $img_filename =~ s/\%20/ /g;
321 }
322 }
323 if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
324 # get image info on width and height
325
326 my $outhandle = $self->{'outhandle'};
327 my $verbosity = $self->{'verbosity'};
328
329 my ($image_type, $actual_width, $actual_height, $image_size)
330 = &ImagePlug::identify($img_filename, $outhandle, $verbosity);
331
332 #print STDERR "**** $actual_width x $actual_height";
333 #print STDERR " (requested: $img_width x $img_height)\n";
334
335 if (($img_width < $actual_width) || ($img_height < $actual_height)) {
336 print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
337
338 # derive new image name based on current image
339 my ($tailname, $dirname, $suffix)
340 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
341
342 my $resized_filename
343 = &util::filename_cat($dirname, $tailname."_resized".$suffix);
344
345 #print STDERR "**** suffix = $suffix\n";
346
347 # Generate smaller image with convert
348 my $newsize = "$img_widthx$image_height";
349 my $command = "convert -interlace plane -verbose "
350 ."-geometry $newsize \"img_$filename\" \"$resized_filename\"";
351 print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
352 my $result = '';
353 print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
354 }
355 }
356 return $href;
357}
358
359sub replace_images {
360 my $self = shift (@_);
361 my ($front, $link, $back, $base_dir,
362 $file, $doc_obj, $section) = @_;
363 # remove quotes from link at start and end if necessary
364 if ($link=~/^\"/) {
365 $link=~s/^\"//;$link=~s/\"$//;
366 $front.='"';
367 $back="\"$back";
368 }
369
370 $link =~ s/\n/ /g;
371
372 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
373
374## $href = $self->resize_if_necessary($front,$back,$base_dir,$href);
375
376 my $middle = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
377
378 return $front . $middle . $back;
379}
380
381sub extract_metadata
382{
383 my $self = shift (@_);
384 my ($textref, $metadata, $doc_obj) = @_;
385 my $outhandle = $self->{'outhandle'};
386
387 # metadata fields to extract/save. 'key' is the (lowercase) name of the
388 # html meta, 'value' is the metadata name for greenstone to use
389 my %find_fields = ();
390 my ($tag,$value);
391
392 my $orig_field = "";
393 foreach my $field (split /,/, $self->{'extracted_word_metadata_fields'}) {
394 # support tag<tagname>
395 if ($field =~ /^(.*?)<(.*?)>$/) {
396 # "$2" is the user's preferred gs metadata name
397 $find_fields{lc($1)}=$2; # lc = lowercase
398 $orig_field = $1;
399 } else { # no <tagname> for mapping
400 # "$field" is the user's preferred gs metadata name
401 $find_fields{lc($field)}=$field; # lc = lowercase
402 $orig_field = $field;
403 }
404 if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
405 $tag = $orig_field;
406 $value = $1;
407 if (!defined $value || !defined $tag){
408 print $outhandle "StructuredHTMLPlug: can't find VALUE in \"$tag\"\n";
409 next;
410 } else {
411 # clean up and add
412 chomp($value); # remove trailing \n, if any
413 $tag = $find_fields{lc($tag)};
414 print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
415 if ($self->{'verbosity'} > 2);
416 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
417 }
418 }
419 }
420}
421
4221;
Note: See TracBrowser for help on using the repository browser.