Context Navigation

source: main/trunk/greenstone2/perllib/plugins/StructuredHTMLPlugin.pm@ 28489

Last change on this file since 28489 was 24600, checked in by ak19, 13 years ago
Added gs-magick.pl script which will set the environment for ImageMagick (including LD_LIBRARY_PATH) before launching the requested ImageMagick command and arguments. By setting the Imagemagick environment from this script we ensure that the modified env variables don't create conflicts with libraries needed for normal linux execution. All the Greenstone files in the binary that made direct calls to imagemagick now go through this script. The affected files are perl files in bin/script and perllib and Gatherer.java of GLI. (wvware has files that test for imagemagick during compilation stage, which is independent of our changs which are only for users running imagemagick from a GS binary.) The final problems were related to how different perl files made use of the return values and the output of running their imagemagick command: they would query the 127 and/or and/or run the command with backtick operators to get the output printed to STDOUT. By inserting an intermediate gs-magick.pl file, needed to ensure that the exit code stored in 127 would at least be passed on correctly, as is necessary when testing the exit code against non-zero values or greater/less than zero (instead of comparing them with equals/not equal to 0). To get the correct exit code as emitted by imagemagick, calling code needs to shift bits in 127 and converting it to a signed value.
Property svn:keywords set to `Author Date Id Revision`
File size: 13.1 KB

Line
1	###########################################################################
2	#
3	# StructuredHTMLPlugin.pm -- html plugin with extra facilities for teasing out
4	# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
5	# HTML document
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28	# This plugin is to process an HTML file where sections are divided by
29	# user-defined headings tags. As it is difficult to predict what user's definition
30	# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
31	# as well as allows to get rid of user-defined Table of Content (TOC)...
32	# format:e.g. level1 (Abstract_title\|ChapterTitle\|Referencing Heading) level2(SectionHeading)...
33
34	package StructuredHTMLPlugin;
35
36	use HTMLPlugin;
37	use ImageConverter; # want the identify method
38	use util;
39
40	use strict; # every perl program should have this!
41	no strict 'refs'; # make an exception so we can use variables as filehandles
42
43	sub BEGIN {
44	@StructuredHTMLPlugin::ISA = ('HTMLPlugin');
45	}
46
47	my $arguments =
48	[
49	{ 'name' => "level1_header",
50	'desc' => "{StructuredHTMLPlugin.level1_header}",
51	'type' => "regexp",
52	'reqd' => "no",
53	'deft' => "" },
54	{ 'name' => "level2_header",
55	'desc' => "{StructuredHTMLPlugin.level2_header}",
56	'type' => "regexp",
57	'reqd' => "no",
58	'deft' => "" },
59	{ 'name' => "level3_header",
60	'desc' => "{StructuredHTMLPlugin.level3_header}",
61	'type' => "regexp",
62	'reqd' => "no",
63	'deft' => "" },
64	{ 'name' => "title_header",
65	'desc' => "{StructuredHTMLPlugin.title_header}",
66	'type' => "regexp",
67	'reqd' => "no",
68	'deft' => "" },
69	{ 'name' => "delete_toc",
70	'desc' => "{StructuredHTMLPlugin.delete_toc}",
71	'type' => "flag",
72	'reqd' => "no"},
73	{ 'name' => "toc_header",
74	'desc' => "{StructuredHTMLPlugin.toc_header}",
75	'type' => "regexp",
76	'reqd' => "no",
77	'deft' => "" }
78	];
79
80	my $options = { 'name' => "StructuredHTMLPlugin",
81	'desc' => "{StructuredHTMLPlugin.desc}",
82	'abstract' => "no",
83	'inherits' => "yes",
84	'args' => $arguments };
85
86	sub new {
87	my ($class) = shift (@_);
88	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
89	push(@$pluginlist, $class);
90
91	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92	push(@{$hashArgOptLists->{"OptList"}},$options);
93
94	my $self = new HTMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
95
96	return bless $self, $class;
97	}
98
99
100	sub process {
101	my $self = shift (@_);
102	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
103	my $outhandle = $self->{'outhandle'};
104
105	my @head_and_body = split(/<body/i,$$textref);
106	my $head = shift(@head_and_body);
107	my $body_text = join("<body", @head_and_body);
108	$head =~ m/<title>(.+)<\/title>/i;
109	my $doctitle = $1 if defined $1;
110	if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
111	my @doc_properties = split(/<xml>/i,$head);
112	my $doc_heading = shift(@doc_properties);
113	my $rest_doc_properties = join(" ", @doc_properties);
114
115	my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
116	my $extracted_metadata = shift (@extracted_metadata);
117	$self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
118	}
119
120	# set the title here if we haven't found it yet
121	if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
122	if (defined $doctitle && $doctitle =~ /\S/) {
123	$doc_obj->add_metadata($doc_obj->get_top_section(), "Title", $doctitle);
124	} else {
125	$self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
126	}
127	}
128
129	# If delete_toc is enabled, it means to get rid of toc and tof contents.
130	# get rid of TOC and TOF sections and their title
131	if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
132	if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
133	$body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
134	}
135	}
136
137	if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
138	$self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
139	$body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
140	}
141
142	if (defined $self->{'level1_header'} && $self->{'level1_header'}=~ /\S/ ){
143	$self->{'level1_header'} =~ s/^\((.*)\)/$1/i;
144	$body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
145	}
146
147	if (defined $self->{'level2_header'} && $self->{'level2_header'}=~ /\S/){
148	$self->{'level2_header'} =~ s/^\((.*)\)/$1/i;
149	$body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
150	}
151
152	if (defined $self->{'level3_header'} && $self->{'level3_header'}=~ /\S/ ){
153	$self->{'level3_header'} =~ s/^\((.*)\)/$1/is;
154	$body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
155	}
156
157	# Tidy up extra new lines
158	$body_text =~ s/(<p[^>]><span[^>]><o:p> <\/o:p><\/span><\/p>)//isg;
159	$body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;
160
161	# what was the following line for. effectively unused. do we need it??
162	#$section_text .= "<!--\n<Section>\n-->\n";
163	#my $top_section_tag = "<!--\n<Section>\n-->\n";
164	#$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
165	#$body_text =~ s/(<div.*)/$top_section_tag$1/i;
166	my $body = "<body".$body_text;
167
168	my $section_text = $head;
169
170	# split HTML text on <h1>, <h2> etc tags
171	my @h_split = split(/<h/i,$body);
172
173	my $hnum = 0;
174
175	my $sectionh1 = 0;
176	$section_text .= shift(@h_split);
177
178	my $hc;
179	foreach $hc ( @h_split )
180	{
181	if ($hc =~ m/^([1-3])\s.?>(.*)$/s)
182	{
183	my $new_hnum = $1;
184	my $hc_after = $2;
185
186	if ($hc_after =~ m/^(.*?)<\/h$new_hnum>/is)
187	{
188	my $h_text = $1;
189	$hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
190	# boil HTML down to some interesting text
191	$h_text =~ s/^[1-3]>//;
192	$h_text =~ s/<\/?.*?>//sg;
193	$h_text =~ s/\s+/ /sg;
194	$h_text =~ s/^\s$//s;
195	$h_text =~ s/( )+\W*/ /sg;
196
197	if ($h_text =~ m/\w+/)
198	{
199	if ($new_hnum > $hnum)
200	{
201	# increase section nesting
202	$hnum++;
203	while ($hnum < $new_hnum)
204	{
205	my $spacing = " " x $hnum;
206	$section_text .= "<!--\n";
207	$section_text .= $spacing."<Section>\n";
208	$section_text .= "-->\n";
209	$hnum++;
210	}
211	}
212	else # ($new_hnum <= $hnum)
213	{
214	# descrease section nesting
215	while ($hnum >= $new_hnum)
216	{
217	my $spacing = " " x $hnum;
218	$section_text .= "<!--\n";
219	$section_text .= $spacing."</Section>\n";
220	$section_text .= "-->\n";
221	$hnum--;
222	}
223	$hnum++;
224	}
225
226	my $spacing = " " x $hnum;
227	$section_text .= "<!--\n";
228	$section_text .= $spacing."<Section>\n";
229	$section_text .= $spacing." <Description>\n";
230	$section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
231	$section_text .= $spacing." </Description>\n";
232	$section_text .= "-->\n";
233
234	#print $outhandle $spacing."$h_text\n"
235	# if $self->{'verbosity'} > 2;
236
237	$sectionh1++ if ($hnum==1);
238	}
239	}
240	else {
241	### print STDERR "***** hc = <h$hc\n\n";
242	}
243	$section_text .= "<h$hc";
244	}
245	else
246	{
247	$section_text .= "<h$hc";
248	}
249	}
250
251	while ($hnum >= 1)
252	{
253	my $spacing = " " x $hnum;
254	$section_text .= "<!--\n";
255	$section_text .= $spacing."</Section>\n";
256	$section_text .= "-->\n";
257	$hnum--;
258	}
259
260	$section_text .= "<!--\n</Section>\n-->\n";
261
262	$$textref = $section_text;
263
264	# if ($sectionh1>0)
265	# {
266	# print $outhandle " Located section headings ..."
267	# if $self->{'verbosity'} > 1;
268	# }
269
270	$$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
271
272	$$textref =~ s/( )+/ /sg;
273
274	## $$textref =~ s/<o:p> <\/o:p>//g; # used with VML to space figures?
275
276	$self->SUPER::process(@_);
277
278	}
279
280
281	sub resize_if_necessary
282	{
283	my ($self,$front,$back,$base_dir,$href) = @_;
284
285	# dig out width and height of image, if there
286	my $img_attributes = "$front back";
287	my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
288	my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
289
290	# derive local filename for image based on its URL
291	my $img_filename = $href;
292	$img_filename =~ s/^[^:]*:\/\///;
293	$img_filename = &util::filename_cat($base_dir, $img_filename);
294
295	# Replace %20's in URL with a space if required. Note that the filename
296	# may include the %20 in some situations
297	if ($img_filename =~ /\%20/) {
298	if (!-e $img_filename) {
299	$img_filename =~ s/\%20/ /g;
300	}
301	}
302	if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
303	# get image info on width and height
304
305	my $outhandle = $self->{'outhandle'};
306	my $verbosity = $self->{'verbosity'};
307
308	my ($image_type, $actual_width, $actual_height, $image_size)
309	= &ImageConverter::identify($img_filename, $outhandle, $verbosity);
310
311	#print STDERR "**** $actual_width x $actual_height";
312	#print STDERR " (requested: $img_width x $img_height)\n";
313
314	if (($img_width < $actual_width) \|\| ($img_height < $actual_height)) {
315	#print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
316
317	# derive new image name based on current image
318	my ($tailname, $dirname, $suffix)
319	= &File::Basename::fileparse($img_filename, "\\.[^\\.]+\$");
320
321	my $resized_filename
322	= &util::filename_cat($dirname, $tailname."_resized".$suffix);
323
324	#print STDERR "**** suffix = $suffix\n";
325
326	# Generate smaller image with convert
327	my $newsize = "$img_width"."x$img_height";
328	my $command = "convert -interlace plane -verbose "
329	."-geometry $newsize \"$img_filename\" \"$resized_filename\"";
330	$command = "\"".&util::get_perl_exec()."\" -S gs-magick.pl $command";
331	#print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
332	#my $result = '';
333	#print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
334	}
335	}
336	return $href;
337	}
338
339	sub replace_images {
340	my $self = shift (@_);
341	my ($front, $link, $back, $base_dir,
342	$file, $doc_obj, $section) = @_;
343	# remove quotes from link at start and end if necessary
344	if ($link=~/^\"/) {
345	$link=~s/^\"//;$link=~s/\"$//;
346	$front.='"';
347	$back="\"$back";
348	}
349
350	$link =~ s/\n/ /g;
351
352	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
353
354	## $href = $self->resize_if_necessary($front,$back,$base_dir,$href);
355
356	my $middle = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
357
358	return $front . $middle . $back;
359	}
360
361	sub extract_metadata
362	{
363	my $self = shift (@_);
364	my ($textref, $metadata, $doc_obj) = @_;
365	my $outhandle = $self->{'outhandle'};
366
367	return if (!defined $textref);
368
369	my $separator = $self->{'metadata_field_separator'};
370	if ($separator eq "") {
371	undef $separator;
372	}
373	# metadata fields to extract/save. 'key' is the (lowercase) name of the
374	# html meta, 'value' is the metadata name for greenstone to use
375	my %find_fields = ();
376	my ($tag,$value);
377
378	my $orig_field = "";
379	foreach my $field (split /\s,\s/, $self->{'metadata_fields'}) {
380	# support tag<tagname>
381	if ($field =~ /^(.?)\s<(.*?)>$/) {
382	# "$2" is the user's preferred gs metadata name
383	$find_fields{lc($1)}=$2; # lc = lowercase
384	$orig_field = $1;
385	} else { # no <tagname> for mapping
386	# "$field" is the user's preferred gs metadata name
387	$find_fields{lc($field)}=$field; # lc = lowercase
388	$orig_field = $field;
389	}
390
391	if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
392	$tag = $orig_field;
393	$value = $1;
394	if (!defined $value \|\| !defined $tag){
395	#print $outhandle "StructuredHTMLPlugin: can't find VALUE in \"$tag\"\n";
396	next;
397	} else {
398	# clean up and add
399	chomp($value); # remove trailing \n, if any
400	$tag = $find_fields{lc($tag)};
401	#print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
402	# if ($self->{'verbosity'} > 2);
403	if (defined $separator) {
404	my @values = split($separator, $value);
405	foreach my $v (@values) {
406	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $v) if $v =~ /\S/;
407	}
408	}
409	else {
410	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
411	}
412	}
413	}
414	}
415	}
416
417	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: