Context Navigation

source: main/trunk/greenstone2/perllib/plugins/StructuredHTMLPlugin.pm@ 33299

Last change on this file since 33299 was 33299, checked in by ak19, 5 years ago
Committing Dr Bainbridge's fix to remove duplicated heading titles in sectioned HTML produced from sectioned Word documents by StructuredHTMLPlugin. Committing this prior to debugging other issues. 2. Minor change to unicode.pm, improved description for function
Property svn:keywords set to `Author Date Id Revision`
File size: 13.0 KB

Line
1	###########################################################################
2	#
3	# StructuredHTMLPlugin.pm -- html plugin with extra facilities for teasing out
4	# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
5	# HTML document
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28	# This plugin is to process an HTML file where sections are divided by
29	# user-defined headings tags. As it is difficult to predict what user's definition
30	# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
31	# as well as allows to get rid of user-defined Table of Content (TOC)...
32	# format:e.g. level1 (Abstract_title\|ChapterTitle\|Referencing Heading) level2(SectionHeading)...
33
34	package StructuredHTMLPlugin;
35
36	use HTMLPlugin;
37	use ImageConverter; # want the identify method
38	use util;
39
40	use strict; # every perl program should have this!
41	no strict 'refs'; # make an exception so we can use variables as filehandles
42
43	sub BEGIN {
44	@StructuredHTMLPlugin::ISA = ('HTMLPlugin');
45	}
46
47	my $arguments =
48	[
49	{ 'name' => "level1_header",
50	'desc' => "{StructuredHTMLPlugin.level1_header}",
51	'type' => "regexp",
52	'reqd' => "no",
53	'deft' => "" },
54	{ 'name' => "level2_header",
55	'desc' => "{StructuredHTMLPlugin.level2_header}",
56	'type' => "regexp",
57	'reqd' => "no",
58	'deft' => "" },
59	{ 'name' => "level3_header",
60	'desc' => "{StructuredHTMLPlugin.level3_header}",
61	'type' => "regexp",
62	'reqd' => "no",
63	'deft' => "" },
64	{ 'name' => "title_header",
65	'desc' => "{StructuredHTMLPlugin.title_header}",
66	'type' => "regexp",
67	'reqd' => "no",
68	'deft' => "" },
69	{ 'name' => "delete_toc",
70	'desc' => "{StructuredHTMLPlugin.delete_toc}",
71	'type' => "flag",
72	'reqd' => "no"},
73	{ 'name' => "toc_header",
74	'desc' => "{StructuredHTMLPlugin.toc_header}",
75	'type' => "regexp",
76	'reqd' => "no",
77	'deft' => "" }
78	];
79
80	my $options = { 'name' => "StructuredHTMLPlugin",
81	'desc' => "{StructuredHTMLPlugin.desc}",
82	'abstract' => "no",
83	'inherits' => "yes",
84	'args' => $arguments };
85
86	sub new {
87	my ($class) = shift (@_);
88	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
89	push(@$pluginlist, $class);
90
91	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92	push(@{$hashArgOptLists->{"OptList"}},$options);
93
94	my $self = new HTMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
95
96	return bless $self, $class;
97	}
98
99
100	sub process {
101	my $self = shift (@_);
102	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
103	my $outhandle = $self->{'outhandle'};
104
105	my @head_and_body = split(/<body/i,$$textref);
106	my $head = shift(@head_and_body);
107	my $body_text = join("<body", @head_and_body);
108	$head =~ m/<title>(.+)<\/title>/i;
109	my $doctitle = $1 if defined $1;
110	if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
111	my @doc_properties = split(/<xml>/i,$head);
112	my $doc_heading = shift(@doc_properties);
113	my $rest_doc_properties = join(" ", @doc_properties);
114
115	my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
116	my $extracted_metadata = shift (@extracted_metadata);
117	$self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
118	}
119
120	# set the title here if we haven't found it yet
121	if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
122	if (defined $doctitle && $doctitle =~ /\S/) {
123	$doc_obj->add_metadata($doc_obj->get_top_section(), "Title", $doctitle);
124	} else {
125	$self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
126	}
127	}
128
129	# If delete_toc is enabled, it means to get rid of toc and tof contents.
130	# get rid of TOC and TOF sections and their title
131	if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
132	if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
133	$body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
134	}
135	}
136
137	if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
138	$self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
139	$body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
140	}
141
142	if (defined $self->{'level1_header'} && $self->{'level1_header'}=~ /\S/ ){
143	$self->{'level1_header'} =~ s/^\((.*)\)/$1/i;
144	$body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
145	}
146
147	if (defined $self->{'level2_header'} && $self->{'level2_header'}=~ /\S/){
148	$self->{'level2_header'} =~ s/^\((.*)\)/$1/i;
149	$body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
150	}
151
152	if (defined $self->{'level3_header'} && $self->{'level3_header'}=~ /\S/ ){
153	$self->{'level3_header'} =~ s/^\((.*)\)/$1/is;
154	$body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
155	}
156
157	# Tidy up extra new lines
158	$body_text =~ s/(<p[^>]><span[^>]><o:p> <\/o:p><\/span><\/p>)//isg;
159	$body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;
160
161	# what was the following line for. effectively unused. do we need it??
162	#$section_text .= "<!--\n<Section>\n-->\n";
163	#my $top_section_tag = "<!--\n<Section>\n-->\n";
164	#$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
165	#$body_text =~ s/(<div.*)/$top_section_tag$1/i;
166	my $body = "<body".$body_text;
167
168	my $section_text = $head;
169
170	# split HTML text on <h1>, <h2> etc tags
171	my @h_split = split(/<h/i,$body);
172
173	my $hnum = 0;
174
175	my $sectionh1 = 0;
176	$section_text .= shift(@h_split);
177
178	my $hc;
179	foreach $hc ( @h_split )
180	{
181	if ($hc =~ m/^([1-3])\s.?>(.*)$/s)
182	{
183	my $new_hnum = $1;
184	my $hc_after = $2;
185	my $filtered_hc = undef;
186
187	if ($hc_after =~ m/^(.?)<\/h$new_hnum>(.)$/is)
188	{
189	my $h_text = $1;
190	$filtered_hc = $2; # This represents the remainder of $hc, after the (e.g.) <h2>xxxx</h2> element
191
192	$hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
193	# boil HTML down to some interesting text
194	$h_text =~ s/^[1-3]>//;
195	$h_text =~ s/<\/?.*?>//sg;
196	$h_text =~ s/\s+/ /sg;
197	$h_text =~ s/^\s$//s;
198	$h_text =~ s/( )+\W*/ /sg;
199
200	if ($h_text =~ m/\w+/)
201	{
202	if ($new_hnum > $hnum)
203	{
204	# increase section nesting
205	$hnum++;
206	while ($hnum < $new_hnum)
207	{
208	my $spacing = " " x $hnum;
209	$section_text .= "<!--\n";
210	$section_text .= $spacing."<Section>\n";
211	$section_text .= "-->\n";
212	$hnum++;
213	}
214	}
215	else # ($new_hnum <= $hnum)
216	{
217	# descrease section nesting
218	while ($hnum >= $new_hnum)
219	{
220	my $spacing = " " x $hnum;
221	$section_text .= "<!--\n";
222	$section_text .= $spacing."</Section>\n";
223	$section_text .= "-->\n";
224	$hnum--;
225	}
226	$hnum++;
227	}
228
229	my $spacing = " " x $hnum;
230	$section_text .= "<!--\n";
231	$section_text .= $spacing."<Section>\n";
232	$section_text .= $spacing." <Description>\n";
233	$section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
234	$section_text .= $spacing." </Description>\n";
235	$section_text .= "-->\n";
236
237	#print $outhandle $spacing."$h_text\n"
238	# if $self->{'verbosity'} > 2;
239
240	$sectionh1++ if ($hnum==1);
241	}
242	}
243	else {
244	### print STDERR "***** hc = <h$hc\n\n";
245	}
246
247	# This can probably be replaced by the first statement in the if-statement, because
248	# $filtered_hc should always be defined when it is assigned above (even if it resolves
249	# to be an empty string)
250	if (defined $filtered_hc) {
251	$section_text .= $filtered_hc;
252	}
253	else {
254	$section_text .= "<h$hc";
255	}
256	}
257	else
258	{
259	$section_text .= "<h$hc";
260	}
261	}
262
263	while ($hnum >= 1)
264	{
265	my $spacing = " " x $hnum;
266	$section_text .= "<!--\n";
267	$section_text .= $spacing."</Section>\n";
268	$section_text .= "-->\n";
269	$hnum--;
270	}
271
272	$section_text .= "<!--\n</Section>\n-->\n";
273
274	$$textref = $section_text;
275
276	# if ($sectionh1>0)
277	# {
278	# print $outhandle " Located section headings ..."
279	# if $self->{'verbosity'} > 1;
280	# }
281
282	$$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
283
284	$$textref =~ s/( )+/ /sg;
285
286	## $$textref =~ s/<o:p> <\/o:p>//g; # used with VML to space figures?
287
288	$self->SUPER::process(@_);
289
290	}
291
292
293	sub resize_if_necessary
294	{
295	my ($self,$front,$back,$base_dir,$href) = @_;
296
297	# dig out width and height of image, if there
298	my $img_attributes = "$front back";
299	my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
300	my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
301
302	# derive local filename for image based on its URL
303	my $img_filename = $href;
304	$img_filename =~ s/^[^:]*:\/\///;
305	$img_filename = &util::filename_cat($base_dir, $img_filename);
306
307	# Replace %20's in URL with a space if required. Note that the filename
308	# may include the %20 in some situations
309	if ($img_filename =~ /\%20/) {
310	if (!-e $img_filename) {
311	$img_filename =~ s/\%20/ /g;
312	}
313	}
314	if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
315	# get image info on width and height
316
317	my $outhandle = $self->{'outhandle'};
318	my $verbosity = $self->{'verbosity'};
319
320	my ($image_type, $actual_width, $actual_height, $image_size)
321	= &ImageConverter::identify($img_filename, $outhandle, $verbosity);
322
323	#print STDERR "**** $actual_width x $actual_height";
324	#print STDERR " (requested: $img_width x $img_height)\n";
325
326	if (($img_width < $actual_width) \|\| ($img_height < $actual_height)) {
327	#print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
328
329	# derive new image name based on current image
330	my ($tailname, $dirname, $suffix)
331	= &File::Basename::fileparse($img_filename, "\\.[^\\.]+\$");
332
333	my $resized_filename
334	= &util::filename_cat($dirname, $tailname."_resized".$suffix);
335
336	#print STDERR "**** suffix = $suffix\n";
337
338	# Generate smaller image with convert
339	my $newsize = "$img_width"."x$img_height";
340	my $command = "convert -interlace plane -verbose "
341	."-geometry $newsize \"$img_filename\" \"$resized_filename\"";
342	$command = "\"".&util::get_perl_exec()."\" -S gs-magick.pl $command";
343	#print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
344	#my $result = '';
345	#print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
346	}
347	}
348	return $href;
349	}
350
351
352	sub extract_metadata
353	{
354	my $self = shift (@_);
355	my ($textref, $metadata, $doc_obj) = @_;
356	my $outhandle = $self->{'outhandle'};
357
358	return if (!defined $textref);
359
360	my $separator = $self->{'metadata_field_separator'};
361	if ($separator eq "") {
362	undef $separator;
363	}
364	# metadata fields to extract/save. 'key' is the (lowercase) name of the
365	# html meta, 'value' is the metadata name for greenstone to use
366	my %find_fields = ();
367	my ($tag,$value);
368
369	my $orig_field = "";
370	foreach my $field (split /\s,\s/, $self->{'metadata_fields'}) {
371	# support tag<tagname>
372	if ($field =~ /^(.?)\s<(.*?)>$/) {
373	# "$2" is the user's preferred gs metadata name
374	$find_fields{lc($1)}=$2; # lc = lowercase
375	$orig_field = $1;
376	} else { # no <tagname> for mapping
377	# "$field" is the user's preferred gs metadata name
378	$find_fields{lc($field)}=$field; # lc = lowercase
379	$orig_field = $field;
380	}
381
382	if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
383	$tag = $orig_field;
384	$value = $1;
385	if (!defined $value \|\| !defined $tag){
386	#print $outhandle "StructuredHTMLPlugin: can't find VALUE in \"$tag\"\n";
387	next;
388	} else {
389	# clean up and add
390	chomp($value); # remove trailing \n, if any
391	$tag = $find_fields{lc($tag)};
392	#print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
393	# if ($self->{'verbosity'} > 2);
394	if (defined $separator) {
395	my @values = split($separator, $value);
396	foreach my $v (@values) {
397	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $v) if $v =~ /\S/;
398	}
399	}
400	else {
401	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
402	}
403	}
404	}
405	}
406	}
407
408	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: