source: trunk/gsdl/perllib/plugins/StructuredHTMLPlug.pm@ 11090

Last change on this file since 11090 was 10723, checked in by chi, 19 years ago

Change the option of extracted_word_metadata_fields to metadata_fields.

  • Property svn:keywords set to Author Date Id Revision
File size: 13.6 KB
Line 
1###########################################################################
2#
3# StructuredHTMLPlug.pm -- html plugin with extra facilities for teasing out
4# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
5# HTML document
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28# This plugin is to process an HTML file where sections are divided by
29# user-defined headings tags. As it is difficult to predict what user's definition
30# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
31# as well as allows to get rid of user-defined Table of Content (TOC)...
32# format:e.g. level1 (Abstract_title|ChapterTitle|Referencing Heading) level2(SectionHeading)...
33
34package StructuredHTMLPlug;
35
36use HTMLPlug;
37use ImagePlug;
38
39#use strict; # every perl program should have this!
40#no strict 'refs'; # make an exception so we can use variables as filehandles
41
42sub BEGIN {
43 @StructuredHTMLPlug::ISA = ('HTMLPlug');
44}
45
46my $arguments = [];
47
48my $options = { 'name' => "StructuredHTMLPlug",
49 'desc' => "{StructuredHTMLPlug.desc}",
50 'abstract' => "no",
51 'inherits' => "yes",
52 'args' => $arguments };
53
54sub new {
55 my ($class) = shift (@_);
56 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
57 push(@$pluginlist, $class);
58
59 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
60 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
61
62 my $self = (defined $hashArgOptLists)? new HTMLPlug($pluginlist,$inputargs,$hashArgOptLists): new HTMLPlug($pluginlist,$inputargs);
63
64 return bless $self, $class;
65}
66
67sub read {
68 my $self = shift (@_);
69 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $gli) = @_;
70
71 my $filename = $file;
72 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
73
74 if ($filename =~ m/\.html?$/) {
75 my $poss_doc_filename = $filename;
76 $poss_doc_filename =~ s/\.html?$/.doc/;
77
78 if (-e $poss_doc_filename) {
79 # this file has already been processed by Word plugin
80 return 0;
81 }
82 }
83 return $self->SUPER::read(@_);
84}
85
86sub process {
87 my $self = shift (@_);
88 #my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
89 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
90 my $outhandle = $self->{'outhandle'};
91
92 print $outhandle "StructuredHTMLPlug: processing $file\n"
93 if $self->{'verbosity'} > 1;
94
95 my @head_and_body = split(/<body/i,$$textref);
96 my $head = shift(@head_and_body);
97 my $body_text = join("<body", @head_and_body);
98 $head =~ m/<title>(.+)<\/title>/i;
99 my $doctitle = $1 if defined $1;
100 if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
101 my @doc_properties = split(/<xml>/i,$head);
102 my $doc_heading = shift(@doc_properties);
103 my $rest_doc_properties = join(" ", @doc_properties);
104 my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
105 my $extracted_metadata = shift (@extracted_metadata);
106 $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
107 }
108
109 # If delete_toc is enables, it means to get rid of toc and tof contents.
110 # get rid of TOC and TOF sections and their title
111 if ($self->{'delete_toc'} == 1){
112 if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
113 $body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
114 }
115 if (defined $self->{'tof_header'}&& $self->{'tof_header'}=~ /\S/) {
116 $body_text =~ s/<p class=(($self->{'tof_header'})[^>]*)>(.+?)<\/p>//isg;
117 }
118 }
119
120 if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
121 $self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
122 #$body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><title>$3<\/title><\/p>/isg;
123 #$doctitle = $3;
124 $body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
125 #$body_text =~ m/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/isg;
126 #$doctitle = "<h1>".$3."<\/h1>" if defined $3;
127 }
128
129 if (defined $self->{'level1_header'} && $self->{'level1_header'}=~ /\S/ ){
130 $self->{'level1_header'} =~ s/^\((.*)\)/$1/i;
131 $body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
132 }
133
134 if (defined $self->{'level2_header'} && $self->{'level2_header'}=~ /\S/){
135 $self->{'level2_header'} =~ s/^\((.*)\)/$1/i;
136 $body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
137 }
138
139 if (defined $self->{'level3_header'} && $self->{'level3_header'}=~ /\S/ ){
140 $self->{'level3_header'} =~ s/^\((.*)\)/$1/is;
141 $body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
142 }
143 # Tidy up extra new lines
144 $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
145 $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
146
147 $section_text .= "<!--\n<Section>\n-->\n";
148 #my $top_section_tag = "<!--\n<Section>\n-->\n";
149 #$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
150 #$body_text =~ s/(<div.*)/$top_section_tag$1/i;
151 my $body = "<body".$body_text;
152
153 my $section_text = $head;
154
155 # split HTML text on <h1>, <h2> etc tags
156 my @h_split = split(/<h/i,$body);
157
158 my $hnum = 0;
159
160 my $sectionh1 = 0;
161 $section_text .= shift(@h_split);
162
163 my $hc;
164 foreach $hc ( @h_split )
165 {
166 if ($hc =~ m/^([1-3])\s*.*?>(.*)$/s)
167 {
168 my $new_hnum = $1;
169 my $hc_after = $2;
170
171 if ($hc_after =~ m/^(.*?)<\/h$new_hnum>/is)
172 {
173 my $h_text = $1;
174 $hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
175 # boil HTML down to some interesting text
176 $h_text =~ s/^[1-3]>//;
177 $h_text =~ s/<\/?.*?>//sg;
178 $h_text =~ s/\s+/ /sg;
179 $h_text =~ s/^\s$//s;
180 $h_text =~ s/(&nbsp;)+\W*/&nbsp;/sg;
181
182 if ($h_text =~ m/\w+/)
183 {
184 if ($new_hnum > $hnum)
185 {
186 # increase section nesting
187 $hnum++;
188 while ($hnum < $new_hnum)
189 {
190 my $spacing = " " x $hnum;
191 $section_text .= "<!--\n";
192 $section_text .= $spacing."<Section>\n";
193 $section_text .= "-->\n";
194 $hnum++;
195 }
196 }
197 else # ($new_hnum <= $hnum)
198 {
199 # descrease section nesting
200 while ($hnum >= $new_hnum)
201 {
202 my $spacing = " " x $hnum;
203 $section_text .= "<!--\n";
204 $section_text .= $spacing."</Section>\n";
205 $section_text .= "-->\n";
206 $hnum--;
207 }
208 $hnum++;
209 }
210
211 my $spacing = " " x $hnum;
212 $section_text .= "<!--\n";
213 $section_text .= $spacing."<Section>\n";
214 $section_text .= $spacing." <Description>\n";
215 $section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
216 $section_text .= $spacing." </Description>\n";
217 $section_text .= "-->\n";
218
219 print $outhandle $spacing."$h_text\n"
220 if $self->{'verbosity'} > 2;
221
222 $sectionh1++ if ($hnum==1);
223 }
224 }
225 else {
226### print STDERR "***** hc = <h$hc\n\n";
227 }
228 $section_text .= "<h$hc";
229 }
230 else
231 {
232 $section_text .= "<h$hc";
233 }
234 }
235
236 while ($hnum >= 1)
237 {
238 my $spacing = " " x $hnum;
239 $section_text .= "<!--\n";
240 $section_text .= $spacing."</Section>\n";
241 $section_text .= "-->\n";
242 $hnum--;
243 }
244
245 $section_text .= "<!--\n</Section>\n-->\n";
246
247 $$textref = $section_text;
248
249 # should be textref not testref???
250 #$$testref =~ s/<h(\d+)>(.*?)<\/h$1>/<Section><Metadata name=\"Title\">$1<\/Metadata></Section><h$1><\/h$1>/gi;
251
252 if ($sectionh1>0)
253 {
254 print $outhandle " Located section headings ..."
255 if $self->{'verbosity'} > 1;
256 }
257 print $outhandle " Passing on the HTMLPlug\n"
258 if $self->{'verbosity'} > 1;
259
260 $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
261
262 $$textref =~ s/(&nbsp;)+/&nbsp;/sg;
263
264 ## $$textref =~ s/<o:p>&nbsp;<\/o:p>//g; # used with VML to space figures?
265
266 $self->SUPER::process(@_);
267
268 # associate original file with doc object
269 my $cursection = $doc_obj->get_top_section();
270 my $filename = &util::filename_cat($base_dir, $file);
271 if (-e $filename)
272 {
273 print $outhandle " Adding associated Word document\n"
274 if $self->{'verbosity'} > 1;
275
276 $doc_obj->associate_file($filename, "doc.doc", undef, $cursection);
277
278 my $doclink = "<a href=_httpcollection_/index/assoc/[archivedir]/doc.doc>";
279 $doc_obj->add_utf8_metadata ($cursection, "srclink", $doclink);
280 $doc_obj->add_utf8_metadata ($cursection, "srcicon", "_icondoc_");
281 $doc_obj->add_utf8_metadata ($cursection, "/srclink", "</a>");
282 $doc_obj->add_utf8_metadata ($cursection, "Title", $doctitle);
283 my $file_size = -s $filename;
284 if ($file_size>1024)
285 {
286 my $fs_kbytes = sprintf("%d",$file_size/1024);
287 $doc_obj->add_utf8_metadata ($cursection, "filesize", "$fs_kbytes Kb");
288 }
289 else
290 {
291 $doc_obj->add_utf8_metadata ($cursection, "filesize", "$file_size bytes");
292 }
293
294 if ($file_size > 200000)
295 {
296 $doc_obj->add_utf8_metadata ($cursection, "fswarning", "1");
297 }
298 }
299}
300
301
302sub resize_if_necessary
303{
304 my ($self,$front,$back,$base_dir,$href) = @_;
305
306 # dig out width and height of image, if there
307 my $img_attributes = "$front back";
308 my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
309 my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
310
311 # derive local filename for image based on its URL
312 my $img_filename = $href;
313 $img_filename =~ s/^[^:]*:\/\///;
314 $img_filename = &util::filename_cat($base_dir, $img_filename);
315
316 # Replace %20's in URL with a space if required. Note that the filename
317 # may include the %20 in some situations
318 if ($img_filename =~ /\%20/) {
319 if (!-e $img_filename) {
320 $img_filename =~ s/\%20/ /g;
321 }
322 }
323 if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
324 # get image info on width and height
325
326 my $outhandle = $self->{'outhandle'};
327 my $verbosity = $self->{'verbosity'};
328
329 my ($image_type, $actual_width, $actual_height, $image_size)
330 = &ImagePlug::identify($img_filename, $outhandle, $verbosity);
331
332 #print STDERR "**** $actual_width x $actual_height";
333 #print STDERR " (requested: $img_width x $img_height)\n";
334
335 if (($img_width < $actual_width) || ($img_height < $actual_height)) {
336 print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
337
338 # derive new image name based on current image
339 my ($tailname, $dirname, $suffix)
340 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
341
342 my $resized_filename
343 = &util::filename_cat($dirname, $tailname."_resized".$suffix);
344
345 #print STDERR "**** suffix = $suffix\n";
346
347 # Generate smaller image with convert
348 my $newsize = "$img_widthx$image_height";
349 my $command = "convert -interlace plane -verbose "
350 ."-geometry $newsize \"img_$filename\" \"$resized_filename\"";
351 print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
352 my $result = '';
353 print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
354 }
355 }
356 return $href;
357}
358
359sub replace_images {
360 my $self = shift (@_);
361 my ($front, $link, $back, $base_dir,
362 $file, $doc_obj, $section) = @_;
363 # remove quotes from link at start and end if necessary
364 if ($link=~/^\"/) {
365 $link=~s/^\"//;$link=~s/\"$//;
366 $front.='"';
367 $back="\"$back";
368 }
369
370 $link =~ s/\n/ /g;
371
372 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
373
374## $href = $self->resize_if_necessary($front,$back,$base_dir,$href);
375
376 my $middle = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
377
378 return $front . $middle . $back;
379}
380
381sub extract_metadata
382{
383 my $self = shift (@_);
384 my ($textref, $metadata, $doc_obj) = @_;
385 my $outhandle = $self->{'outhandle'};
386
387 # metadata fields to extract/save. 'key' is the (lowercase) name of the
388 # html meta, 'value' is the metadata name for greenstone to use
389 my %find_fields = ();
390 my ($tag,$value);
391
392 my $orig_field = "";
393 foreach my $field (split /,/, $self->{'metadata_fields'}) {
394 # support tag<tagname>
395 if ($field =~ /^(.*?)<(.*?)>$/) {
396 # "$2" is the user's preferred gs metadata name
397 $find_fields{lc($1)}=$2; # lc = lowercase
398 $orig_field = $1;
399 } else { # no <tagname> for mapping
400 # "$field" is the user's preferred gs metadata name
401 $find_fields{lc($field)}=$field; # lc = lowercase
402 $orig_field = $field;
403 }
404 if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
405 $tag = $orig_field;
406 $value = $1;
407 if (!defined $value || !defined $tag){
408 print $outhandle "StructuredHTMLPlug: can't find VALUE in \"$tag\"\n";
409 next;
410 } else {
411 # clean up and add
412 chomp($value); # remove trailing \n, if any
413 $tag = $find_fields{lc($tag)};
414 print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
415 if ($self->{'verbosity'} > 2);
416 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
417 }
418 }
419 }
420}
421
4221;
Note: See TracBrowser for help on using the repository browser.