Context Navigation

source: gsdl/trunk/perllib/plugins/StructuredHTMLPlugin.pm@ 16822

Last change on this file since 16822 was 16104, checked in by kjdon, 16 years ago
tried to make the 'xxxplugin processing file' print statements more consistent. They are now done in read (or read_into_doc_obj) and not process
Property svn:keywords set to `Author Date Id Revision`
File size: 12.7 KB

Line
1	###########################################################################
2	#
3	# StructuredHTMLPlugin.pm -- html plugin with extra facilities for teasing out
4	# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
5	# HTML document
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28	# This plugin is to process an HTML file where sections are divided by
29	# user-defined headings tags. As it is difficult to predict what user's definition
30	# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
31	# as well as allows to get rid of user-defined Table of Content (TOC)...
32	# format:e.g. level1 (Abstract_title\|ChapterTitle\|Referencing Heading) level2(SectionHeading)...
33
34	package StructuredHTMLPlugin;
35
36	use HTMLPlugin;
37	use ImageConverter; # want the identify method
38
39	use strict; # every perl program should have this!
40	no strict 'refs'; # make an exception so we can use variables as filehandles
41
42	sub BEGIN {
43	@StructuredHTMLPlugin::ISA = ('HTMLPlugin');
44	}
45
46	my $arguments =
47	[
48	{ 'name' => "level1_header",
49	'desc' => "{StructuredHTMLPlugin.level1_header}",
50	'type' => "regexp",
51	'reqd' => "no",
52	'deft' => "" },
53	{ 'name' => "level2_header",
54	'desc' => "{StructuredHTMLPlugin.level2_header}",
55	'type' => "regexp",
56	'reqd' => "no",
57	'deft' => "" },
58	{ 'name' => "level3_header",
59	'desc' => "{StructuredHTMLPlugin.level3_header}",
60	'type' => "regexp",
61	'reqd' => "no",
62	'deft' => "" },
63	{ 'name' => "title_header",
64	'desc' => "{StructuredHTMLPlugin.title_header}",
65	'type' => "regexp",
66	'reqd' => "no",
67	'deft' => "" },
68	{ 'name' => "delete_toc",
69	'desc' => "{StructuredHTMLPlugin.delete_toc}",
70	'type' => "flag",
71	'reqd' => "no"},
72	{ 'name' => "toc_header",
73	'desc' => "{StructuredHTMLPlugin.toc_header}",
74	'type' => "regexp",
75	'reqd' => "no",
76	'deft' => "" }
77	];
78
79	my $options = { 'name' => "StructuredHTMLPlugin",
80	'desc' => "{StructuredHTMLPlugin.desc}",
81	'abstract' => "no",
82	'inherits' => "yes",
83	'args' => $arguments };
84
85	sub new {
86	my ($class) = shift (@_);
87	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
88	push(@$pluginlist, $class);
89
90	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
91	push(@{$hashArgOptLists->{"OptList"}},$options);
92
93	my $self = new HTMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
94
95	return bless $self, $class;
96	}
97
98
99	sub process {
100	my $self = shift (@_);
101	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
102	my $outhandle = $self->{'outhandle'};
103
104	my @head_and_body = split(/<body/i,$$textref);
105	my $head = shift(@head_and_body);
106	my $body_text = join("<body", @head_and_body);
107	$head =~ m/<title>(.+)<\/title>/i;
108	my $doctitle = $1 if defined $1;
109	if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
110	my @doc_properties = split(/<xml>/i,$head);
111	my $doc_heading = shift(@doc_properties);
112	my $rest_doc_properties = join(" ", @doc_properties);
113
114	my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
115	my $extracted_metadata = shift (@extracted_metadata);
116	$self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
117	}
118
119	# set the title here if we haven't found it yet
120	if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
121	if (defined $doctitle && $doctitle =~ /\S/) {
122	$doc_obj->add_metadata($doc_obj->get_top_section(), "Title", $doctitle);
123	} else {
124	$self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
125	}
126	}
127
128	# If delete_toc is enabled, it means to get rid of toc and tof contents.
129	# get rid of TOC and TOF sections and their title
130	if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
131	if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
132	$body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
133	}
134	}
135
136	if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
137	$self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
138	$body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
139	}
140
141	if (defined $self->{'level1_header'} && $self->{'level1_header'}=~ /\S/ ){
142	$self->{'level1_header'} =~ s/^\((.*)\)/$1/i;
143	$body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
144	}
145
146	if (defined $self->{'level2_header'} && $self->{'level2_header'}=~ /\S/){
147	$self->{'level2_header'} =~ s/^\((.*)\)/$1/i;
148	$body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
149	}
150
151	if (defined $self->{'level3_header'} && $self->{'level3_header'}=~ /\S/ ){
152	$self->{'level3_header'} =~ s/^\((.*)\)/$1/is;
153	$body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
154	}
155
156	# Tidy up extra new lines
157	$body_text =~ s/(<p[^>]><span[^>]><o:p> <\/o:p><\/span><\/p>)//isg;
158	$body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;
159
160	# what was the following line for. effectively unused. do we need it??
161	#$section_text .= "<!--\n<Section>\n-->\n";
162	#my $top_section_tag = "<!--\n<Section>\n-->\n";
163	#$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
164	#$body_text =~ s/(<div.*)/$top_section_tag$1/i;
165	my $body = "<body".$body_text;
166
167	my $section_text = $head;
168
169	# split HTML text on <h1>, <h2> etc tags
170	my @h_split = split(/<h/i,$body);
171
172	my $hnum = 0;
173
174	my $sectionh1 = 0;
175	$section_text .= shift(@h_split);
176
177	my $hc;
178	foreach $hc ( @h_split )
179	{
180	if ($hc =~ m/^([1-3])\s.?>(.*)$/s)
181	{
182	my $new_hnum = $1;
183	my $hc_after = $2;
184
185	if ($hc_after =~ m/^(.*?)<\/h$new_hnum>/is)
186	{
187	my $h_text = $1;
188	$hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
189	# boil HTML down to some interesting text
190	$h_text =~ s/^[1-3]>//;
191	$h_text =~ s/<\/?.*?>//sg;
192	$h_text =~ s/\s+/ /sg;
193	$h_text =~ s/^\s$//s;
194	$h_text =~ s/( )+\W*/ /sg;
195
196	if ($h_text =~ m/\w+/)
197	{
198	if ($new_hnum > $hnum)
199	{
200	# increase section nesting
201	$hnum++;
202	while ($hnum < $new_hnum)
203	{
204	my $spacing = " " x $hnum;
205	$section_text .= "<!--\n";
206	$section_text .= $spacing."<Section>\n";
207	$section_text .= "-->\n";
208	$hnum++;
209	}
210	}
211	else # ($new_hnum <= $hnum)
212	{
213	# descrease section nesting
214	while ($hnum >= $new_hnum)
215	{
216	my $spacing = " " x $hnum;
217	$section_text .= "<!--\n";
218	$section_text .= $spacing."</Section>\n";
219	$section_text .= "-->\n";
220	$hnum--;
221	}
222	$hnum++;
223	}
224
225	my $spacing = " " x $hnum;
226	$section_text .= "<!--\n";
227	$section_text .= $spacing."<Section>\n";
228	$section_text .= $spacing." <Description>\n";
229	$section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
230	$section_text .= $spacing." </Description>\n";
231	$section_text .= "-->\n";
232
233	#print $outhandle $spacing."$h_text\n"
234	# if $self->{'verbosity'} > 2;
235
236	$sectionh1++ if ($hnum==1);
237	}
238	}
239	else {
240	### print STDERR "***** hc = <h$hc\n\n";
241	}
242	$section_text .= "<h$hc";
243	}
244	else
245	{
246	$section_text .= "<h$hc";
247	}
248	}
249
250	while ($hnum >= 1)
251	{
252	my $spacing = " " x $hnum;
253	$section_text .= "<!--\n";
254	$section_text .= $spacing."</Section>\n";
255	$section_text .= "-->\n";
256	$hnum--;
257	}
258
259	$section_text .= "<!--\n</Section>\n-->\n";
260
261	$$textref = $section_text;
262
263	# if ($sectionh1>0)
264	# {
265	# print $outhandle " Located section headings ..."
266	# if $self->{'verbosity'} > 1;
267	# }
268
269	$$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
270
271	$$textref =~ s/( )+/ /sg;
272
273	## $$textref =~ s/<o:p> <\/o:p>//g; # used with VML to space figures?
274
275	$self->SUPER::process(@_);
276
277	}
278
279
280	sub resize_if_necessary
281	{
282	my ($self,$front,$back,$base_dir,$href) = @_;
283
284	# dig out width and height of image, if there
285	my $img_attributes = "$front back";
286	my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
287	my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
288
289	# derive local filename for image based on its URL
290	my $img_filename = $href;
291	$img_filename =~ s/^[^:]*:\/\///;
292	$img_filename = &util::filename_cat($base_dir, $img_filename);
293
294	# Replace %20's in URL with a space if required. Note that the filename
295	# may include the %20 in some situations
296	if ($img_filename =~ /\%20/) {
297	if (!-e $img_filename) {
298	$img_filename =~ s/\%20/ /g;
299	}
300	}
301	if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
302	# get image info on width and height
303
304	my $outhandle = $self->{'outhandle'};
305	my $verbosity = $self->{'verbosity'};
306
307	my ($image_type, $actual_width, $actual_height, $image_size)
308	= &ImageConverter::identify($img_filename, $outhandle, $verbosity);
309
310	#print STDERR "**** $actual_width x $actual_height";
311	#print STDERR " (requested: $img_width x $img_height)\n";
312
313	if (($img_width < $actual_width) \|\| ($img_height < $actual_height)) {
314	#print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
315
316	# derive new image name based on current image
317	my ($tailname, $dirname, $suffix)
318	= &File::Basename::fileparse($img_filename, "\\.[^\\.]+\$");
319
320	my $resized_filename
321	= &util::filename_cat($dirname, $tailname."_resized".$suffix);
322
323	#print STDERR "**** suffix = $suffix\n";
324
325	# Generate smaller image with convert
326	my $newsize = "$img_width"."x$img_height";
327	my $command = "convert -interlace plane -verbose "
328	."-geometry $newsize \"$img_filename\" \"$resized_filename\"";
329	#print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
330	#my $result = '';
331	#print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
332	}
333	}
334	return $href;
335	}
336
337	sub replace_images {
338	my $self = shift (@_);
339	my ($front, $link, $back, $base_dir,
340	$file, $doc_obj, $section) = @_;
341	# remove quotes from link at start and end if necessary
342	if ($link=~/^\"/) {
343	$link=~s/^\"//;$link=~s/\"$//;
344	$front.='"';
345	$back="\"$back";
346	}
347
348	$link =~ s/\n/ /g;
349
350	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
351
352	## $href = $self->resize_if_necessary($front,$back,$base_dir,$href);
353
354	my $middle = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
355
356	return $front . $middle . $back;
357	}
358
359	sub extract_metadata
360	{
361	my $self = shift (@_);
362	my ($textref, $metadata, $doc_obj) = @_;
363	my $outhandle = $self->{'outhandle'};
364
365	return if (!defined $textref);
366
367	# metadata fields to extract/save. 'key' is the (lowercase) name of the
368	# html meta, 'value' is the metadata name for greenstone to use
369	my %find_fields = ();
370	my ($tag,$value);
371
372	my $orig_field = "";
373	foreach my $field (split /,/, $self->{'metadata_fields'}) {
374	# support tag<tagname>
375	if ($field =~ /^(.?)<(.?)>$/) {
376	# "$2" is the user's preferred gs metadata name
377	$find_fields{lc($1)}=$2; # lc = lowercase
378	$orig_field = $1;
379	} else { # no <tagname> for mapping
380	# "$field" is the user's preferred gs metadata name
381	$find_fields{lc($field)}=$field; # lc = lowercase
382	$orig_field = $field;
383	}
384
385	if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
386	$tag = $orig_field;
387	$value = $1;
388	if (!defined $value \|\| !defined $tag){
389	#print $outhandle "StructuredHTMLPlugin: can't find VALUE in \"$tag\"\n";
390	next;
391	} else {
392	# clean up and add
393	chomp($value); # remove trailing \n, if any
394	$tag = $find_fields{lc($tag)};
395	#print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
396	# if ($self->{'verbosity'} > 2);
397	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
398	}
399	}
400	}
401	}
402
403	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: