source: main/trunk/greenstone2/perllib/plugins/StructuredHTMLPlugin.pm@ 28489

Last change on this file since 28489 was 24600, checked in by ak19, 13 years ago

Added gs-magick.pl script which will set the environment for ImageMagick (including LD_LIBRARY_PATH) before launching the requested ImageMagick command and arguments. By setting the Imagemagick environment from this script we ensure that the modified env variables don't create conflicts with libraries needed for normal linux execution. All the Greenstone files in the *binary* that made direct calls to imagemagick now go through this script. The affected files are perl files in bin/script and perllib and Gatherer.java of GLI. (wvware has files that test for imagemagick during compilation stage, which is independent of our changs which are only for users running imagemagick from a GS binary.) The final problems were related to how different perl files made use of the return values and the output of running their imagemagick command: they would query the 127 and/or and/or run the command with backtick operators to get the output printed to STDOUT. By inserting an intermediate gs-magick.pl file, needed to ensure that the exit code stored in 127 would at least be passed on correctly, as is necessary when testing the exit code against non-zero values or greater/less than zero (instead of comparing them with equals/not equal to 0). To get the correct exit code as emitted by imagemagick, calling code needs to shift bits in 127 and converting it to a signed value.

  • Property svn:keywords set to Author Date Id Revision
File size: 13.1 KB
Line 
1###########################################################################
2#
3# StructuredHTMLPlugin.pm -- html plugin with extra facilities for teasing out
4# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
5# HTML document
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28# This plugin is to process an HTML file where sections are divided by
29# user-defined headings tags. As it is difficult to predict what user's definition
30# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
31# as well as allows to get rid of user-defined Table of Content (TOC)...
32# format:e.g. level1 (Abstract_title|ChapterTitle|Referencing Heading) level2(SectionHeading)...
33
34package StructuredHTMLPlugin;
35
36use HTMLPlugin;
37use ImageConverter; # want the identify method
38use util;
39
40use strict; # every perl program should have this!
41no strict 'refs'; # make an exception so we can use variables as filehandles
42
43sub BEGIN {
44 @StructuredHTMLPlugin::ISA = ('HTMLPlugin');
45}
46
47my $arguments =
48 [
49 { 'name' => "level1_header",
50 'desc' => "{StructuredHTMLPlugin.level1_header}",
51 'type' => "regexp",
52 'reqd' => "no",
53 'deft' => "" },
54 { 'name' => "level2_header",
55 'desc' => "{StructuredHTMLPlugin.level2_header}",
56 'type' => "regexp",
57 'reqd' => "no",
58 'deft' => "" },
59 { 'name' => "level3_header",
60 'desc' => "{StructuredHTMLPlugin.level3_header}",
61 'type' => "regexp",
62 'reqd' => "no",
63 'deft' => "" },
64 { 'name' => "title_header",
65 'desc' => "{StructuredHTMLPlugin.title_header}",
66 'type' => "regexp",
67 'reqd' => "no",
68 'deft' => "" },
69 { 'name' => "delete_toc",
70 'desc' => "{StructuredHTMLPlugin.delete_toc}",
71 'type' => "flag",
72 'reqd' => "no"},
73 { 'name' => "toc_header",
74 'desc' => "{StructuredHTMLPlugin.toc_header}",
75 'type' => "regexp",
76 'reqd' => "no",
77 'deft' => "" }
78 ];
79
80my $options = { 'name' => "StructuredHTMLPlugin",
81 'desc' => "{StructuredHTMLPlugin.desc}",
82 'abstract' => "no",
83 'inherits' => "yes",
84 'args' => $arguments };
85
86sub new {
87 my ($class) = shift (@_);
88 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
89 push(@$pluginlist, $class);
90
91 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92 push(@{$hashArgOptLists->{"OptList"}},$options);
93
94 my $self = new HTMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
95
96 return bless $self, $class;
97}
98
99
100sub process {
101 my $self = shift (@_);
102 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
103 my $outhandle = $self->{'outhandle'};
104
105 my @head_and_body = split(/<body/i,$$textref);
106 my $head = shift(@head_and_body);
107 my $body_text = join("<body", @head_and_body);
108 $head =~ m/<title>(.+)<\/title>/i;
109 my $doctitle = $1 if defined $1;
110 if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
111 my @doc_properties = split(/<xml>/i,$head);
112 my $doc_heading = shift(@doc_properties);
113 my $rest_doc_properties = join(" ", @doc_properties);
114
115 my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
116 my $extracted_metadata = shift (@extracted_metadata);
117 $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
118 }
119
120 # set the title here if we haven't found it yet
121 if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
122 if (defined $doctitle && $doctitle =~ /\S/) {
123 $doc_obj->add_metadata($doc_obj->get_top_section(), "Title", $doctitle);
124 } else {
125 $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
126 }
127 }
128
129 # If delete_toc is enabled, it means to get rid of toc and tof contents.
130 # get rid of TOC and TOF sections and their title
131 if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
132 if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
133 $body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
134 }
135 }
136
137 if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
138 $self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
139 $body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
140 }
141
142 if (defined $self->{'level1_header'} && $self->{'level1_header'}=~ /\S/ ){
143 $self->{'level1_header'} =~ s/^\((.*)\)/$1/i;
144 $body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
145 }
146
147 if (defined $self->{'level2_header'} && $self->{'level2_header'}=~ /\S/){
148 $self->{'level2_header'} =~ s/^\((.*)\)/$1/i;
149 $body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
150 }
151
152 if (defined $self->{'level3_header'} && $self->{'level3_header'}=~ /\S/ ){
153 $self->{'level3_header'} =~ s/^\((.*)\)/$1/is;
154 $body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
155 }
156
157 # Tidy up extra new lines
158 $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
159 $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
160
161 # what was the following line for. effectively unused. do we need it??
162 #$section_text .= "<!--\n<Section>\n-->\n";
163 #my $top_section_tag = "<!--\n<Section>\n-->\n";
164 #$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
165 #$body_text =~ s/(<div.*)/$top_section_tag$1/i;
166 my $body = "<body".$body_text;
167
168 my $section_text = $head;
169
170 # split HTML text on <h1>, <h2> etc tags
171 my @h_split = split(/<h/i,$body);
172
173 my $hnum = 0;
174
175 my $sectionh1 = 0;
176 $section_text .= shift(@h_split);
177
178 my $hc;
179 foreach $hc ( @h_split )
180 {
181 if ($hc =~ m/^([1-3])\s*.*?>(.*)$/s)
182 {
183 my $new_hnum = $1;
184 my $hc_after = $2;
185
186 if ($hc_after =~ m/^(.*?)<\/h$new_hnum>/is)
187 {
188 my $h_text = $1;
189 $hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
190 # boil HTML down to some interesting text
191 $h_text =~ s/^[1-3]>//;
192 $h_text =~ s/<\/?.*?>//sg;
193 $h_text =~ s/\s+/ /sg;
194 $h_text =~ s/^\s$//s;
195 $h_text =~ s/(&nbsp;)+\W*/&nbsp;/sg;
196
197 if ($h_text =~ m/\w+/)
198 {
199 if ($new_hnum > $hnum)
200 {
201 # increase section nesting
202 $hnum++;
203 while ($hnum < $new_hnum)
204 {
205 my $spacing = " " x $hnum;
206 $section_text .= "<!--\n";
207 $section_text .= $spacing."<Section>\n";
208 $section_text .= "-->\n";
209 $hnum++;
210 }
211 }
212 else # ($new_hnum <= $hnum)
213 {
214 # descrease section nesting
215 while ($hnum >= $new_hnum)
216 {
217 my $spacing = " " x $hnum;
218 $section_text .= "<!--\n";
219 $section_text .= $spacing."</Section>\n";
220 $section_text .= "-->\n";
221 $hnum--;
222 }
223 $hnum++;
224 }
225
226 my $spacing = " " x $hnum;
227 $section_text .= "<!--\n";
228 $section_text .= $spacing."<Section>\n";
229 $section_text .= $spacing." <Description>\n";
230 $section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
231 $section_text .= $spacing." </Description>\n";
232 $section_text .= "-->\n";
233
234 #print $outhandle $spacing."$h_text\n"
235 # if $self->{'verbosity'} > 2;
236
237 $sectionh1++ if ($hnum==1);
238 }
239 }
240 else {
241### print STDERR "***** hc = <h$hc\n\n";
242 }
243 $section_text .= "<h$hc";
244 }
245 else
246 {
247 $section_text .= "<h$hc";
248 }
249 }
250
251 while ($hnum >= 1)
252 {
253 my $spacing = " " x $hnum;
254 $section_text .= "<!--\n";
255 $section_text .= $spacing."</Section>\n";
256 $section_text .= "-->\n";
257 $hnum--;
258 }
259
260 $section_text .= "<!--\n</Section>\n-->\n";
261
262 $$textref = $section_text;
263
264# if ($sectionh1>0)
265# {
266# print $outhandle " Located section headings ..."
267# if $self->{'verbosity'} > 1;
268# }
269
270 $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
271
272 $$textref =~ s/(&nbsp;)+/&nbsp;/sg;
273
274 ## $$textref =~ s/<o:p>&nbsp;<\/o:p>//g; # used with VML to space figures?
275
276 $self->SUPER::process(@_);
277
278}
279
280
281sub resize_if_necessary
282{
283 my ($self,$front,$back,$base_dir,$href) = @_;
284
285 # dig out width and height of image, if there
286 my $img_attributes = "$front back";
287 my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
288 my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
289
290 # derive local filename for image based on its URL
291 my $img_filename = $href;
292 $img_filename =~ s/^[^:]*:\/\///;
293 $img_filename = &util::filename_cat($base_dir, $img_filename);
294
295 # Replace %20's in URL with a space if required. Note that the filename
296 # may include the %20 in some situations
297 if ($img_filename =~ /\%20/) {
298 if (!-e $img_filename) {
299 $img_filename =~ s/\%20/ /g;
300 }
301 }
302 if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
303 # get image info on width and height
304
305 my $outhandle = $self->{'outhandle'};
306 my $verbosity = $self->{'verbosity'};
307
308 my ($image_type, $actual_width, $actual_height, $image_size)
309 = &ImageConverter::identify($img_filename, $outhandle, $verbosity);
310
311 #print STDERR "**** $actual_width x $actual_height";
312 #print STDERR " (requested: $img_width x $img_height)\n";
313
314 if (($img_width < $actual_width) || ($img_height < $actual_height)) {
315 #print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
316
317 # derive new image name based on current image
318 my ($tailname, $dirname, $suffix)
319 = &File::Basename::fileparse($img_filename, "\\.[^\\.]+\$");
320
321 my $resized_filename
322 = &util::filename_cat($dirname, $tailname."_resized".$suffix);
323
324 #print STDERR "**** suffix = $suffix\n";
325
326 # Generate smaller image with convert
327 my $newsize = "$img_width"."x$img_height";
328 my $command = "convert -interlace plane -verbose "
329 ."-geometry $newsize \"$img_filename\" \"$resized_filename\"";
330 $command = "\"".&util::get_perl_exec()."\" -S gs-magick.pl $command";
331 #print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
332 #my $result = '';
333 #print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
334 }
335 }
336 return $href;
337}
338
339sub replace_images {
340 my $self = shift (@_);
341 my ($front, $link, $back, $base_dir,
342 $file, $doc_obj, $section) = @_;
343 # remove quotes from link at start and end if necessary
344 if ($link=~/^\"/) {
345 $link=~s/^\"//;$link=~s/\"$//;
346 $front.='"';
347 $back="\"$back";
348 }
349
350 $link =~ s/\n/ /g;
351
352 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
353
354## $href = $self->resize_if_necessary($front,$back,$base_dir,$href);
355
356 my $middle = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
357
358 return $front . $middle . $back;
359}
360
361sub extract_metadata
362{
363 my $self = shift (@_);
364 my ($textref, $metadata, $doc_obj) = @_;
365 my $outhandle = $self->{'outhandle'};
366
367 return if (!defined $textref);
368
369 my $separator = $self->{'metadata_field_separator'};
370 if ($separator eq "") {
371 undef $separator;
372 }
373 # metadata fields to extract/save. 'key' is the (lowercase) name of the
374 # html meta, 'value' is the metadata name for greenstone to use
375 my %find_fields = ();
376 my ($tag,$value);
377
378 my $orig_field = "";
379 foreach my $field (split /\s*,\s*/, $self->{'metadata_fields'}) {
380 # support tag<tagname>
381 if ($field =~ /^(.*?)\s*<(.*?)>$/) {
382 # "$2" is the user's preferred gs metadata name
383 $find_fields{lc($1)}=$2; # lc = lowercase
384 $orig_field = $1;
385 } else { # no <tagname> for mapping
386 # "$field" is the user's preferred gs metadata name
387 $find_fields{lc($field)}=$field; # lc = lowercase
388 $orig_field = $field;
389 }
390
391 if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
392 $tag = $orig_field;
393 $value = $1;
394 if (!defined $value || !defined $tag){
395 #print $outhandle "StructuredHTMLPlugin: can't find VALUE in \"$tag\"\n";
396 next;
397 } else {
398 # clean up and add
399 chomp($value); # remove trailing \n, if any
400 $tag = $find_fields{lc($tag)};
401 #print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
402 # if ($self->{'verbosity'} > 2);
403 if (defined $separator) {
404 my @values = split($separator, $value);
405 foreach my $v (@values) {
406 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $v) if $v =~ /\S/;
407 }
408 }
409 else {
410 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
411 }
412 }
413 }
414 }
415}
416
4171;
Note: See TracBrowser for help on using the repository browser.