Context Navigation

source: gsdl/trunk/perllib/plugins/StructuredHTMLPlugin.pm@ 15872

Last change on this file since 15872 was 15872, checked in by kjdon, 16 years ago
plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Alos, I haven't really tested anything yet...
Property svn:keywords set to `Author Date Id Revision`
File size: 12.8 KB

Line
1	###########################################################################
2	#
3	# StructuredHTMLPlugin.pm -- html plugin with extra facilities for teasing out
4	# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
5	# HTML document
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28	# This plugin is to process an HTML file where sections are divided by
29	# user-defined headings tags. As it is difficult to predict what user's definition
30	# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
31	# as well as allows to get rid of user-defined Table of Content (TOC)...
32	# format:e.g. level1 (Abstract_title\|ChapterTitle\|Referencing Heading) level2(SectionHeading)...
33
34	package StructuredHTMLPlugin;
35
36	use HTMLPlugin;
37	use ImageConverter; # want the identify method
38
39	use strict; # every perl program should have this!
40	no strict 'refs'; # make an exception so we can use variables as filehandles
41
42	sub BEGIN {
43	@StructuredHTMLPlugin::ISA = ('HTMLPlugin');
44	}
45
46	my $arguments =
47	[
48	{ 'name' => "level1_header",
49	'desc' => "{StructuredHTMLPlugin.level1_header}",
50	'type' => "regexp",
51	'reqd' => "no",
52	'deft' => "" },
53	{ 'name' => "level2_header",
54	'desc' => "{StructuredHTMLPlugin.level2_header}",
55	'type' => "regexp",
56	'reqd' => "no",
57	'deft' => "" },
58	{ 'name' => "level3_header",
59	'desc' => "{StructuredHTMLPlugin.level3_header}",
60	'type' => "regexp",
61	'reqd' => "no",
62	'deft' => "" },
63	{ 'name' => "title_header",
64	'desc' => "{StructuredHTMLPlugin.title_header}",
65	'type' => "regexp",
66	'reqd' => "no",
67	'deft' => "" },
68	{ 'name' => "delete_toc",
69	'desc' => "{StructuredHTMLPlugin.delete_toc}",
70	'type' => "flag",
71	'reqd' => "no"},
72	{ 'name' => "toc_header",
73	'desc' => "{StructuredHTMLPlugin.toc_header}",
74	'type' => "regexp",
75	'reqd' => "no",
76	'deft' => "" }
77	];
78
79	my $options = { 'name' => "StructuredHTMLPlugin",
80	'desc' => "{StructuredHTMLPlugin.desc}",
81	'abstract' => "no",
82	'inherits' => "yes",
83	'args' => $arguments };
84
85	sub new {
86	my ($class) = shift (@_);
87	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
88	push(@$pluginlist, $class);
89
90	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
91	push(@{$hashArgOptLists->{"OptList"}},$options);
92
93	my $self = new HTMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
94
95	return bless $self, $class;
96	}
97
98
99	sub process {
100	my $self = shift (@_);
101	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
102	my $outhandle = $self->{'outhandle'};
103
104	print $outhandle "StructuredHTMLPlugin: processing $file\n"
105	if $self->{'verbosity'} > 1;
106
107	my @head_and_body = split(/<body/i,$$textref);
108	my $head = shift(@head_and_body);
109	my $body_text = join("<body", @head_and_body);
110	$head =~ m/<title>(.+)<\/title>/i;
111	my $doctitle = $1 if defined $1;
112	if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
113	my @doc_properties = split(/<xml>/i,$head);
114	my $doc_heading = shift(@doc_properties);
115	my $rest_doc_properties = join(" ", @doc_properties);
116
117	my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
118	my $extracted_metadata = shift (@extracted_metadata);
119	$self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
120	}
121
122	# set the title here if we haven't found it yet
123	if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
124	if (defined $doctitle && $doctitle =~ /\S/) {
125	$doc_obj->add_metadata($doc_obj->get_top_section(), "Title", $doctitle);
126	} else {
127	$self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
128	}
129	}
130
131	# If delete_toc is enabled, it means to get rid of toc and tof contents.
132	# get rid of TOC and TOF sections and their title
133	if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
134	if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
135	$body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
136	}
137	}
138
139	if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
140	$self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
141	$body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
142	}
143
144	if (defined $self->{'level1_header'} && $self->{'level1_header'}=~ /\S/ ){
145	$self->{'level1_header'} =~ s/^\((.*)\)/$1/i;
146	$body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
147	}
148
149	if (defined $self->{'level2_header'} && $self->{'level2_header'}=~ /\S/){
150	$self->{'level2_header'} =~ s/^\((.*)\)/$1/i;
151	$body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
152	}
153
154	if (defined $self->{'level3_header'} && $self->{'level3_header'}=~ /\S/ ){
155	$self->{'level3_header'} =~ s/^\((.*)\)/$1/is;
156	$body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
157	}
158
159	# Tidy up extra new lines
160	$body_text =~ s/(<p[^>]><span[^>]><o:p> <\/o:p><\/span><\/p>)//isg;
161	$body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;
162
163	# what was the following line for. effectively unused. do we need it??
164	#$section_text .= "<!--\n<Section>\n-->\n";
165	#my $top_section_tag = "<!--\n<Section>\n-->\n";
166	#$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
167	#$body_text =~ s/(<div.*)/$top_section_tag$1/i;
168	my $body = "<body".$body_text;
169
170	my $section_text = $head;
171
172	# split HTML text on <h1>, <h2> etc tags
173	my @h_split = split(/<h/i,$body);
174
175	my $hnum = 0;
176
177	my $sectionh1 = 0;
178	$section_text .= shift(@h_split);
179
180	my $hc;
181	foreach $hc ( @h_split )
182	{
183	if ($hc =~ m/^([1-3])\s.?>(.*)$/s)
184	{
185	my $new_hnum = $1;
186	my $hc_after = $2;
187
188	if ($hc_after =~ m/^(.*?)<\/h$new_hnum>/is)
189	{
190	my $h_text = $1;
191	$hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
192	# boil HTML down to some interesting text
193	$h_text =~ s/^[1-3]>//;
194	$h_text =~ s/<\/?.*?>//sg;
195	$h_text =~ s/\s+/ /sg;
196	$h_text =~ s/^\s$//s;
197	$h_text =~ s/( )+\W*/ /sg;
198
199	if ($h_text =~ m/\w+/)
200	{
201	if ($new_hnum > $hnum)
202	{
203	# increase section nesting
204	$hnum++;
205	while ($hnum < $new_hnum)
206	{
207	my $spacing = " " x $hnum;
208	$section_text .= "<!--\n";
209	$section_text .= $spacing."<Section>\n";
210	$section_text .= "-->\n";
211	$hnum++;
212	}
213	}
214	else # ($new_hnum <= $hnum)
215	{
216	# descrease section nesting
217	while ($hnum >= $new_hnum)
218	{
219	my $spacing = " " x $hnum;
220	$section_text .= "<!--\n";
221	$section_text .= $spacing."</Section>\n";
222	$section_text .= "-->\n";
223	$hnum--;
224	}
225	$hnum++;
226	}
227
228	my $spacing = " " x $hnum;
229	$section_text .= "<!--\n";
230	$section_text .= $spacing."<Section>\n";
231	$section_text .= $spacing." <Description>\n";
232	$section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
233	$section_text .= $spacing." </Description>\n";
234	$section_text .= "-->\n";
235
236	#print $outhandle $spacing."$h_text\n"
237	# if $self->{'verbosity'} > 2;
238
239	$sectionh1++ if ($hnum==1);
240	}
241	}
242	else {
243	### print STDERR "***** hc = <h$hc\n\n";
244	}
245	$section_text .= "<h$hc";
246	}
247	else
248	{
249	$section_text .= "<h$hc";
250	}
251	}
252
253	while ($hnum >= 1)
254	{
255	my $spacing = " " x $hnum;
256	$section_text .= "<!--\n";
257	$section_text .= $spacing."</Section>\n";
258	$section_text .= "-->\n";
259	$hnum--;
260	}
261
262	$section_text .= "<!--\n</Section>\n-->\n";
263
264	$$textref = $section_text;
265
266	# if ($sectionh1>0)
267	# {
268	# print $outhandle " Located section headings ..."
269	# if $self->{'verbosity'} > 1;
270	# }
271
272	$$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
273
274	$$textref =~ s/( )+/ /sg;
275
276	## $$textref =~ s/<o:p> <\/o:p>//g; # used with VML to space figures?
277
278	$self->SUPER::process(@_);
279
280	}
281
282
283	sub resize_if_necessary
284	{
285	my ($self,$front,$back,$base_dir,$href) = @_;
286
287	# dig out width and height of image, if there
288	my $img_attributes = "$front back";
289	my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
290	my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
291
292	# derive local filename for image based on its URL
293	my $img_filename = $href;
294	$img_filename =~ s/^[^:]*:\/\///;
295	$img_filename = &util::filename_cat($base_dir, $img_filename);
296
297	# Replace %20's in URL with a space if required. Note that the filename
298	# may include the %20 in some situations
299	if ($img_filename =~ /\%20/) {
300	if (!-e $img_filename) {
301	$img_filename =~ s/\%20/ /g;
302	}
303	}
304	if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
305	# get image info on width and height
306
307	my $outhandle = $self->{'outhandle'};
308	my $verbosity = $self->{'verbosity'};
309
310	my ($image_type, $actual_width, $actual_height, $image_size)
311	= &ImageConverter::identify($img_filename, $outhandle, $verbosity);
312
313	#print STDERR "**** $actual_width x $actual_height";
314	#print STDERR " (requested: $img_width x $img_height)\n";
315
316	if (($img_width < $actual_width) \|\| ($img_height < $actual_height)) {
317	#print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
318
319	# derive new image name based on current image
320	my ($tailname, $dirname, $suffix)
321	= &File::Basename::fileparse($img_filename, "\\.[^\\.]+\$");
322
323	my $resized_filename
324	= &util::filename_cat($dirname, $tailname."_resized".$suffix);
325
326	#print STDERR "**** suffix = $suffix\n";
327
328	# Generate smaller image with convert
329	my $newsize = "$img_width"."x$img_height";
330	my $command = "convert -interlace plane -verbose "
331	."-geometry $newsize \"$img_filename\" \"$resized_filename\"";
332	#print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
333	#my $result = '';
334	#print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
335	}
336	}
337	return $href;
338	}
339
340	sub replace_images {
341	my $self = shift (@_);
342	my ($front, $link, $back, $base_dir,
343	$file, $doc_obj, $section) = @_;
344	# remove quotes from link at start and end if necessary
345	if ($link=~/^\"/) {
346	$link=~s/^\"//;$link=~s/\"$//;
347	$front.='"';
348	$back="\"$back";
349	}
350
351	$link =~ s/\n/ /g;
352
353	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
354
355	## $href = $self->resize_if_necessary($front,$back,$base_dir,$href);
356
357	my $middle = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
358
359	return $front . $middle . $back;
360	}
361
362	sub extract_metadata
363	{
364	my $self = shift (@_);
365	my ($textref, $metadata, $doc_obj) = @_;
366	my $outhandle = $self->{'outhandle'};
367
368	return if (!defined $textref);
369
370	# metadata fields to extract/save. 'key' is the (lowercase) name of the
371	# html meta, 'value' is the metadata name for greenstone to use
372	my %find_fields = ();
373	my ($tag,$value);
374
375	my $orig_field = "";
376	foreach my $field (split /,/, $self->{'metadata_fields'}) {
377	# support tag<tagname>
378	if ($field =~ /^(.?)<(.?)>$/) {
379	# "$2" is the user's preferred gs metadata name
380	$find_fields{lc($1)}=$2; # lc = lowercase
381	$orig_field = $1;
382	} else { # no <tagname> for mapping
383	# "$field" is the user's preferred gs metadata name
384	$find_fields{lc($field)}=$field; # lc = lowercase
385	$orig_field = $field;
386	}
387
388	if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
389	$tag = $orig_field;
390	$value = $1;
391	if (!defined $value \|\| !defined $tag){
392	#print $outhandle "StructuredHTMLPlugin: can't find VALUE in \"$tag\"\n";
393	next;
394	} else {
395	# clean up and add
396	chomp($value); # remove trailing \n, if any
397	$tag = $find_fields{lc($tag)};
398	#print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
399	# if ($self->{'verbosity'} > 2);
400	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
401	}
402	}
403	}
404	}
405
406	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: