Context Navigation

source: main/trunk/greenstone2/perllib/plugins/StructuredHTMLPlugin.pm@ 22705

Last change on this file since 22705 was 21801, checked in by kjdon, 14 years ago
extended HTMLPlugin's metadata_field_separator option to Word and StructuredHTML plugins
Property svn:keywords set to `Author Date Id Revision`
File size: 13.0 KB

Line
1	###########################################################################
2	#
3	# StructuredHTMLPlugin.pm -- html plugin with extra facilities for teasing out
4	# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
5	# HTML document
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28	# This plugin processes an HTML file whose sections are marked by
29	# user-defined heading styles. Because those style names cannot be predicted,
30	# the plugin lets the user name the styles for up to three heading levels (level1, level2, level3)
31	# and can also remove a user-defined Table of Contents (TOC).
32	# Format, e.g.: level1 (Abstract_title|ChapterTitle|Referencing Heading) level2 (SectionHeading)
33
34	package StructuredHTMLPlugin;
35
36	use HTMLPlugin;
37	use ImageConverter; # want the identify method
38
39	use strict; # every perl program should have this!
40	no strict 'refs'; # make an exception so we can use variables as filehandles
41
# Declare HTMLPlugin as the parent class at compile time.
BEGIN {
    @StructuredHTMLPlugin::ISA = qw(HTMLPlugin);
}
45
# Options specific to this plugin; new() appends them to the "ArgList"
# it hands to the HTMLPlugin constructor.  The *_header options are
# regexps (empty by default); delete_toc is a flag with no default.
# The brace-wrapped 'desc' values are presumably keys into a strings
# resource file -- confirm against the option-parsing code.
my $arguments = [
    {
        'name' => "level1_header",
        'desc' => "{StructuredHTMLPlugin.level1_header}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "",
    },
    {
        'name' => "level2_header",
        'desc' => "{StructuredHTMLPlugin.level2_header}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "",
    },
    {
        'name' => "level3_header",
        'desc' => "{StructuredHTMLPlugin.level3_header}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "",
    },
    {
        'name' => "title_header",
        'desc' => "{StructuredHTMLPlugin.title_header}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "",
    },
    {
        'name' => "delete_toc",
        'desc' => "{StructuredHTMLPlugin.delete_toc}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "toc_header",
        'desc' => "{StructuredHTMLPlugin.toc_header}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "",
    },
];

# Plugin description record; new() appends it to the "OptList".
my $options = {
    'name'     => "StructuredHTMLPlugin",
    'desc'     => "{StructuredHTMLPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
84
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to HTMLPlugin->new
#   $hashArgOptLists - hash ref; this plugin's argument definitions are
#                      appended to its "ArgList" entry and its option
#                      record to its "OptList" entry
#
# Returns the object built by HTMLPlugin->new, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call rather than indirect object syntax
    # ("new HTMLPlugin(...)"), which Perl can mis-parse.
    my $self = HTMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
97
98
# Rewrite the HTML in $$textref so that styled paragraphs become headings
# and headings become Greenstone section markers, then hand the result to
# HTMLPlugin's process().
#
# Steps:
#   1. Split the text at "<body" into head and body.
#   2. If 'metadata_fields' is set, pass the first <xml>...</xml> block of
#      the head to extract_metadata().
#   3. Ensure the top section has a Title (from <title>, else
#      title_fallback()).
#   4. If 'delete_toc' is on, delete <p class=TOC_STYLE ...> paragraphs.
#   5. Wrap the contents of <p class=STYLE ...> paragraphs in <h1>/<h2>/<h3>
#      for the title_header and level1..level3_header options.
#   6. For every <h1>..<h3> with non-empty text, emit
#      "<!-- <Section> ... -->" comments carrying the heading text as the
#      section Title, opening and closing sections to follow the nesting.
#
# Arguments: ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj,
# $gli); $$textref is modified in place and the whole list is forwarded
# unchanged to SUPER::process, whose return value is returned.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my @head_and_body = split(/<body/i, $$textref);
    my $head = shift(@head_and_body);
    $head = "" unless defined $head;   # empty input: split returns nothing
    my $body_text = join("<body", @head_and_body);

    # List assignment leaves $doctitle undef when there is no <title>,
    # instead of relying on "my ... if" and a possibly stale $1.
    my ($doctitle) = ($head =~ m/<title>(.+)<\/title>/i);

    if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
        my @doc_properties = split(/<xml>/i, $head);
        shift(@doc_properties);   # discard everything before the first <xml>
        my $rest_doc_properties = join(" ", @doc_properties);

        my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
        my $extracted_metadata = shift(@extracted_metadata);
        $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
    }

    # set the title here if we haven't found it yet
    my $top_section = $doc_obj->get_top_section();
    if (!defined $doc_obj->get_metadata_element($top_section, "Title")) {
        if (defined $doctitle && $doctitle =~ /\S/) {
            $doc_obj->add_metadata($top_section, "Title", $doctitle);
        } else {
            $self->title_fallback($doc_obj, $top_section, $file);
        }
    }

    # If delete_toc is enabled, get rid of the table-of-contents
    # paragraphs (those whose class matches toc_header).
    if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)) {
        if (defined $self->{'toc_header'} && $self->{'toc_header'} =~ /\S/) {
            $body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
        }
    }

    # Promote styled paragraphs to headings.  Each entry is
    # [option name, heading level]; order matches the original code.
    my @heading_styles = (
        [ 'title_header',  1 ],
        [ 'level1_header', 1 ],
        [ 'level2_header', 2 ],
        [ 'level3_header', 3 ],
    );
    foreach my $style (@heading_styles) {
        my ($option, $level) = @$style;
        next unless (defined $self->{$option} && $self->{$option} =~ /\S/);

        # strip one pair of enclosing parentheses from the user's regexp
        $self->{$option} =~ s/^\((.*)\)/$1/is;
        my $class_re = $self->{$option};

        $body_text =~ s/<p class=(($class_re)[^>]*)>(.+?)<\/p>/<p class=$1><h$level>$3<\/h$level><\/p>/isg;
    }

    # Tidy up extra new lines (empty paragraphs)
    $body_text =~ s/(<p[^>]*><span[^>]*><o:p> <\/o:p><\/span><\/p>)//isg;
    $body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;

    my $body = "<body".$body_text;

    my $section_text = $head;

    # split HTML text on <h1>, <h2> etc tags (this also splits on <html>,
    # <hr> ...; those chunks are simply re-joined below)
    my @h_split = split(/<h/i, $body);

    my $hnum = 0;   # current section nesting depth

    $section_text .= shift(@h_split);

    foreach my $hc (@h_split) {
        # A heading chunk starts with the level digit, then either ">" or
        # whitespace-separated attributes and ">".
        if ($hc =~ m/^([1-3])(?:\s[^>]*)?>(.*)$/s) {
            my $new_hnum = $1;
            my $hc_after = $2;

            if ($hc_after =~ m/^(.*?)<\/h$new_hnum>/is) {
                my $h_text = $1;

                # boil HTML down to some interesting text
                $h_text =~ s/^[1-3]>//;
                $h_text =~ s/<\/?.*?>//sg;
                $h_text =~ s/\s+/ /sg;
                $h_text =~ s/^\s$//s;
                $h_text =~ s/( )+\W*/ /sg;

                # headings without any word characters do not start sections
                if ($h_text =~ m/\w+/) {
                    if ($new_hnum > $hnum) {
                        # increase section nesting, opening untitled
                        # sections for any skipped levels
                        $hnum++;
                        while ($hnum < $new_hnum) {
                            my $spacing = " " x $hnum;
                            $section_text .= "<!--\n";
                            $section_text .= $spacing."<Section>\n";
                            $section_text .= "-->\n";
                            $hnum++;
                        }
                    }
                    else { # ($new_hnum <= $hnum)
                        # decrease section nesting: close every open
                        # section at this level or deeper
                        while ($hnum >= $new_hnum) {
                            my $spacing = " " x $hnum;
                            $section_text .= "<!--\n";
                            $section_text .= $spacing."</Section>\n";
                            $section_text .= "-->\n";
                            $hnum--;
                        }
                        $hnum++;
                    }

                    my $spacing = " " x $hnum;
                    $section_text .= "<!--\n";
                    $section_text .= $spacing."<Section>\n";
                    $section_text .= $spacing." <Description>\n";
                    $section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
                    $section_text .= $spacing." </Description>\n";
                    $section_text .= "-->\n";
                }
            }
        }

        # put back the "<h" that split() removed
        $section_text .= "<h$hc";
    }

    # close whatever sections are still open
    while ($hnum >= 1) {
        my $spacing = " " x $hnum;
        $section_text .= "<!--\n";
        $section_text .= $spacing."</Section>\n";
        $section_text .= "-->\n";
        $hnum--;
    }

    # NOTE(review): this final </Section> has no matching <Section> emitted
    # in this sub; presumably it pairs with a top-level section handled by
    # the parent plugin -- confirm.
    $section_text .= "<!--\n</Section>\n-->\n";

    $$textref = $section_text;

    $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;

    $$textref =~ s/( )+/ /sg;

    return $self->SUPER::process(@_);
}
278
279
# Inspect an <img> link and work out whether the image on disk is larger
# than the width/height requested in the tag.
#
# Arguments:
#   $front, $back - the tag text before and after the link; both are
#                   searched for width= and height= attributes
#   $base_dir     - directory the image path is resolved against
#   $href         - the image link
#
# Returns $href unchanged.  The "convert" command line is built but is
# not executed here, so no resized file is actually produced.
sub resize_if_necessary
{
    my ($self, $front, $back, $base_dir, $href) = @_;

    # dig out width and height of image, if there.  Both halves of the tag
    # are searched: the attributes may come after the link.
    my $img_attributes = "$front $back";
    my ($img_width)  = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
    my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);

    # derive local filename for image based on its URL
    my $img_filename = $href;
    $img_filename =~ s/^[^:]*:\/\///;
    $img_filename = util::filename_cat($base_dir, $img_filename);

    # Replace %20's in URL with a space if required. Note that the filename
    # may include the %20 in some situations
    if ($img_filename =~ /\%20/) {
        if (!-e $img_filename) {
            $img_filename =~ s/\%20/ /g;
        }
    }

    if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
        # get image info on width and height
        my $outhandle = $self->{'outhandle'};
        my $verbosity = $self->{'verbosity'};

        my ($image_type, $actual_width, $actual_height, $image_size)
            = ImageConverter::identify($img_filename, $outhandle, $verbosity);

        if (($img_width < $actual_width) || ($img_height < $actual_height)) {
            # derive new image name based on current image
            my ($tailname, $dirname, $suffix)
                = File::Basename::fileparse($img_filename, "\\.[^\\.]+\$");

            my $resized_filename
                = util::filename_cat($dirname, $tailname."_resized".$suffix);

            # Command that would generate the smaller image with convert;
            # it is only assembled, never run.
            my $newsize = "$img_width"."x$img_height";
            my $command = "convert -interlace plane -verbose "
                ."-geometry $newsize \"$img_filename\" \"$resized_filename\"";
        }
    }
    return $href;
}
336
# Rebuild an image reference: normalise the link, resolve it with
# format_link(), register the file with add_file(), and return the tag
# text with add_file()'s result in place of the original link.
#
# Arguments: ($front, $link, $back, $base_dir, $file, $doc_obj, $section)
# where $front/$back are the text around the link.
sub replace_images {
    my ($self, $front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;

    # A link that starts with a double quote has its quotes moved out onto
    # the surrounding text so that the bare link can be processed.
    if ($link =~ s/^\"//) {
        $link =~ s/\"$//;
        $front = $front . '"';
        $back  = '"' . $back;
    }

    # newlines inside the link become spaces
    $link =~ tr/\n/ /;

    my ($href, $hash_part, $rl) = $self->format_link($link, $base_dir, $file);

    ## $href = $self->resize_if_necessary($front,$back,$base_dir,$href);

    my $replacement = $self->add_file($href, $rl, $hash_part, $base_dir, $doc_obj, $section);

    return join('', $front, $replacement, $back);
}
358
# Pull the metadata named in the 'metadata_fields' option out of a block
# of text and attach it to the top section of $doc_obj.
#
# Arguments:
#   $textref  - despite the name, a plain string (not a reference); it is
#               searched for <o:FIELD>value</o:FIELD> elements
#   $metadata - not used here
#   $doc_obj  - document object receiving the metadata via
#               add_utf8_metadata()
#
# 'metadata_fields' is a comma-separated list.  Each entry is either
# "FIELD" (stored under the same name) or "FIELD<Name>" (stored as Name).
# If 'metadata_field_separator' is non-empty, each value is split on it
# and every part containing a non-space character is added separately.
#
# Returns nothing.
sub extract_metadata
{
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj) = @_;

    return if (!defined $textref);

    my $field_list = $self->{'metadata_fields'};
    return if (!defined $field_list);

    # an unset or empty separator means "do not split values"
    my $separator = $self->{'metadata_field_separator'};
    undef $separator if (defined $separator && $separator eq "");

    foreach my $field (split /\s*,\s*/, $field_list) {
        $field =~ s/^\s+//;
        $field =~ s/\s+$//;

        # $source_name is the element to look for, $gs_name the metadata
        # name to store it under
        my ($source_name, $gs_name);
        if ($field =~ /^(.*?)\s*<(.*?)>$/) {
            # support tag<tagname>
            ($source_name, $gs_name) = ($1, $2);
        } else {
            # no <tagname> for mapping
            ($source_name, $gs_name) = ($field, $field);
        }
        next if ($source_name eq "");

        # \Q..\E: the field name is literal text, not a pattern
        next unless ($textref =~ m/<o:\Q$source_name\E>(.*)<\/o:\Q$source_name\E>/i);
        my $value = $1;
        chomp($value); # remove trailing \n, if any

        if (defined $separator) {
            foreach my $v (split($separator, $value)) {
                $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $gs_name, $v) if $v =~ /\S/;
            }
        }
        else {
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $gs_name, $value);
        }
    }

    return;
}
414
415	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: