source: gs2-extensions/parallel-building/trunk/src/perllib/plugins/StructuredHTMLPlugin.pm@ 24626

Last change on this file since 24626 was 24626, checked in by jmt12, 13 years ago

An (almost) complete copy of the perllib directory from a (circa SEP2011) head checkout from Greenstone 2 trunk - in order to try and make merging in this extension a little easier later on (as there have been some major changes to buildcol.pl commited in the main trunk but not in the x64 branch)

File size: 13.1 KB
Line 
1###########################################################################
2#
3# StructuredHTMLPlugin.pm -- html plugin with extra facilities for teasing out
4# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
5# HTML document
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28# This plugin is to process an HTML file where sections are divided by
29# user-defined headings tags. As it is difficult to predict what user's definition
30# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
31# as well as allows to get rid of user-defined Table of Content (TOC)...
32# format:e.g. level1 (Abstract_title|ChapterTitle|Referencing Heading) level2(SectionHeading)...
33
34package StructuredHTMLPlugin;
35
36use HTMLPlugin;
37use ImageConverter; # want the identify method
38use util;
39
40use strict; # every perl program should have this!
41no strict 'refs'; # make an exception so we can use variables as filehandles
42
43sub BEGIN {
44 @StructuredHTMLPlugin::ISA = ('HTMLPlugin');
45}
46
47my $arguments =
48 [
49 { 'name' => "level1_header",
50 'desc' => "{StructuredHTMLPlugin.level1_header}",
51 'type' => "regexp",
52 'reqd' => "no",
53 'deft' => "" },
54 { 'name' => "level2_header",
55 'desc' => "{StructuredHTMLPlugin.level2_header}",
56 'type' => "regexp",
57 'reqd' => "no",
58 'deft' => "" },
59 { 'name' => "level3_header",
60 'desc' => "{StructuredHTMLPlugin.level3_header}",
61 'type' => "regexp",
62 'reqd' => "no",
63 'deft' => "" },
64 { 'name' => "title_header",
65 'desc' => "{StructuredHTMLPlugin.title_header}",
66 'type' => "regexp",
67 'reqd' => "no",
68 'deft' => "" },
69 { 'name' => "delete_toc",
70 'desc' => "{StructuredHTMLPlugin.delete_toc}",
71 'type' => "flag",
72 'reqd' => "no"},
73 { 'name' => "toc_header",
74 'desc' => "{StructuredHTMLPlugin.toc_header}",
75 'type' => "regexp",
76 'reqd' => "no",
77 'deft' => "" }
78 ];
79
80my $options = { 'name' => "StructuredHTMLPlugin",
81 'desc' => "{StructuredHTMLPlugin.desc}",
82 'abstract' => "no",
83 'inherits' => "yes",
84 'args' => $arguments };
85
86sub new {
87 my ($class) = shift (@_);
88 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
89 push(@$pluginlist, $class);
90
91 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92 push(@{$hashArgOptLists->{"OptList"}},$options);
93
94 my $self = new HTMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
95
96 return bless $self, $class;
97}
98
99
100sub process {
101 my $self = shift (@_);
102 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
103 my $outhandle = $self->{'outhandle'};
104
105 my @head_and_body = split(/<body/i,$$textref);
106 my $head = shift(@head_and_body);
107 my $body_text = join("<body", @head_and_body);
108 $head =~ m/<title>(.+)<\/title>/i;
109 my $doctitle = $1 if defined $1;
110 if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
111 my @doc_properties = split(/<xml>/i,$head);
112 my $doc_heading = shift(@doc_properties);
113 my $rest_doc_properties = join(" ", @doc_properties);
114
115 my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
116 my $extracted_metadata = shift (@extracted_metadata);
117 $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
118 }
119
120 # set the title here if we haven't found it yet
121 if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
122 if (defined $doctitle && $doctitle =~ /\S/) {
123 $doc_obj->add_metadata($doc_obj->get_top_section(), "Title", $doctitle);
124 } else {
125 $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
126 }
127 }
128
129 # If delete_toc is enabled, it means to get rid of toc and tof contents.
130 # get rid of TOC and TOF sections and their title
131 if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
132 if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
133 $body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
134 }
135 }
136
137 if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
138 $self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
139 $body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
140 }
141
142 if (defined $self->{'level1_header'} && $self->{'level1_header'}=~ /\S/ ){
143 $self->{'level1_header'} =~ s/^\((.*)\)/$1/i;
144 $body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
145 }
146
147 if (defined $self->{'level2_header'} && $self->{'level2_header'}=~ /\S/){
148 $self->{'level2_header'} =~ s/^\((.*)\)/$1/i;
149 $body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
150 }
151
152 if (defined $self->{'level3_header'} && $self->{'level3_header'}=~ /\S/ ){
153 $self->{'level3_header'} =~ s/^\((.*)\)/$1/is;
154 $body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
155 }
156
157 # Tidy up extra new lines
158 $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
159 $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
160
161 # what was the following line for. effectively unused. do we need it??
162 #$section_text .= "<!--\n<Section>\n-->\n";
163 #my $top_section_tag = "<!--\n<Section>\n-->\n";
164 #$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
165 #$body_text =~ s/(<div.*)/$top_section_tag$1/i;
166 my $body = "<body".$body_text;
167
168 my $section_text = $head;
169
170 # split HTML text on <h1>, <h2> etc tags
171 my @h_split = split(/<h/i,$body);
172
173 my $hnum = 0;
174
175 my $sectionh1 = 0;
176 $section_text .= shift(@h_split);
177
178 my $hc;
179 foreach $hc ( @h_split )
180 {
181 if ($hc =~ m/^([1-3])\s*.*?>(.*)$/s)
182 {
183 my $new_hnum = $1;
184 my $hc_after = $2;
185
186 if ($hc_after =~ m/^(.*?)<\/h$new_hnum>/is)
187 {
188 my $h_text = $1;
189 $hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
190 # boil HTML down to some interesting text
191 $h_text =~ s/^[1-3]>//;
192 $h_text =~ s/<\/?.*?>//sg;
193 $h_text =~ s/\s+/ /sg;
194 $h_text =~ s/^\s$//s;
195 $h_text =~ s/(&nbsp;)+\W*/&nbsp;/sg;
196
197 if ($h_text =~ m/\w+/)
198 {
199 if ($new_hnum > $hnum)
200 {
201 # increase section nesting
202 $hnum++;
203 while ($hnum < $new_hnum)
204 {
205 my $spacing = " " x $hnum;
206 $section_text .= "<!--\n";
207 $section_text .= $spacing."<Section>\n";
208 $section_text .= "-->\n";
209 $hnum++;
210 }
211 }
212 else # ($new_hnum <= $hnum)
213 {
214 # descrease section nesting
215 while ($hnum >= $new_hnum)
216 {
217 my $spacing = " " x $hnum;
218 $section_text .= "<!--\n";
219 $section_text .= $spacing."</Section>\n";
220 $section_text .= "-->\n";
221 $hnum--;
222 }
223 $hnum++;
224 }
225
226 my $spacing = " " x $hnum;
227 $section_text .= "<!--\n";
228 $section_text .= $spacing."<Section>\n";
229 $section_text .= $spacing." <Description>\n";
230 $section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
231 $section_text .= $spacing." </Description>\n";
232 $section_text .= "-->\n";
233
234 #print $outhandle $spacing."$h_text\n"
235 # if $self->{'verbosity'} > 2;
236
237 $sectionh1++ if ($hnum==1);
238 }
239 }
240 else {
241### print STDERR "***** hc = <h$hc\n\n";
242 }
243 $section_text .= "<h$hc";
244 }
245 else
246 {
247 $section_text .= "<h$hc";
248 }
249 }
250
251 while ($hnum >= 1)
252 {
253 my $spacing = " " x $hnum;
254 $section_text .= "<!--\n";
255 $section_text .= $spacing."</Section>\n";
256 $section_text .= "-->\n";
257 $hnum--;
258 }
259
260 $section_text .= "<!--\n</Section>\n-->\n";
261
262 $$textref = $section_text;
263
264# if ($sectionh1>0)
265# {
266# print $outhandle " Located section headings ..."
267# if $self->{'verbosity'} > 1;
268# }
269
270 $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
271
272 $$textref =~ s/(&nbsp;)+/&nbsp;/sg;
273
274 ## $$textref =~ s/<o:p>&nbsp;<\/o:p>//g; # used with VML to space figures?
275
276 $self->SUPER::process(@_);
277
278}
279
280
281sub resize_if_necessary
282{
283 my ($self,$front,$back,$base_dir,$href) = @_;
284
285 # dig out width and height of image, if there
286 my $img_attributes = "$front back";
287 my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
288 my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
289
290 # derive local filename for image based on its URL
291 my $img_filename = $href;
292 $img_filename =~ s/^[^:]*:\/\///;
293 $img_filename = &util::filename_cat($base_dir, $img_filename);
294
295 # Replace %20's in URL with a space if required. Note that the filename
296 # may include the %20 in some situations
297 if ($img_filename =~ /\%20/) {
298 if (!-e $img_filename) {
299 $img_filename =~ s/\%20/ /g;
300 }
301 }
302 if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
303 # get image info on width and height
304
305 my $outhandle = $self->{'outhandle'};
306 my $verbosity = $self->{'verbosity'};
307
308 my ($image_type, $actual_width, $actual_height, $image_size)
309 = &ImageConverter::identify($img_filename, $outhandle, $verbosity);
310
311 #print STDERR "**** $actual_width x $actual_height";
312 #print STDERR " (requested: $img_width x $img_height)\n";
313
314 if (($img_width < $actual_width) || ($img_height < $actual_height)) {
315 #print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
316
317 # derive new image name based on current image
318 my ($tailname, $dirname, $suffix)
319 = &File::Basename::fileparse($img_filename, "\\.[^\\.]+\$");
320
321 my $resized_filename
322 = &util::filename_cat($dirname, $tailname."_resized".$suffix);
323
324 #print STDERR "**** suffix = $suffix\n";
325
326 # Generate smaller image with convert
327 my $newsize = "$img_width"."x$img_height";
328 my $command = "convert -interlace plane -verbose "
329 ."-geometry $newsize \"$img_filename\" \"$resized_filename\"";
330 $command = "\"".&util::get_perl_exec()."\" -S gs-magick.pl $command";
331 #print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
332 #my $result = '';
333 #print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
334 }
335 }
336 return $href;
337}
338
339sub replace_images {
340 my $self = shift (@_);
341 my ($front, $link, $back, $base_dir,
342 $file, $doc_obj, $section) = @_;
343 # remove quotes from link at start and end if necessary
344 if ($link=~/^\"/) {
345 $link=~s/^\"//;$link=~s/\"$//;
346 $front.='"';
347 $back="\"$back";
348 }
349
350 $link =~ s/\n/ /g;
351
352 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
353
354## $href = $self->resize_if_necessary($front,$back,$base_dir,$href);
355
356 my $middle = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
357
358 return $front . $middle . $back;
359}
360
361sub extract_metadata
362{
363 my $self = shift (@_);
364 my ($textref, $metadata, $doc_obj) = @_;
365 my $outhandle = $self->{'outhandle'};
366
367 return if (!defined $textref);
368
369 my $separator = $self->{'metadata_field_separator'};
370 if ($separator eq "") {
371 undef $separator;
372 }
373 # metadata fields to extract/save. 'key' is the (lowercase) name of the
374 # html meta, 'value' is the metadata name for greenstone to use
375 my %find_fields = ();
376 my ($tag,$value);
377
378 my $orig_field = "";
379 foreach my $field (split /\s*,\s*/, $self->{'metadata_fields'}) {
380 # support tag<tagname>
381 if ($field =~ /^(.*?)\s*<(.*?)>$/) {
382 # "$2" is the user's preferred gs metadata name
383 $find_fields{lc($1)}=$2; # lc = lowercase
384 $orig_field = $1;
385 } else { # no <tagname> for mapping
386 # "$field" is the user's preferred gs metadata name
387 $find_fields{lc($field)}=$field; # lc = lowercase
388 $orig_field = $field;
389 }
390
391 if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
392 $tag = $orig_field;
393 $value = $1;
394 if (!defined $value || !defined $tag){
395 #print $outhandle "StructuredHTMLPlugin: can't find VALUE in \"$tag\"\n";
396 next;
397 } else {
398 # clean up and add
399 chomp($value); # remove trailing \n, if any
400 $tag = $find_fields{lc($tag)};
401 #print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
402 # if ($self->{'verbosity'} > 2);
403 if (defined $separator) {
404 my @values = split($separator, $value);
405 foreach my $v (@values) {
406 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $v) if $v =~ /\S/;
407 }
408 }
409 else {
410 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
411 }
412 }
413 }
414 }
415}
416
4171;
Note: See TracBrowser for help on using the repository browser.