Context Navigation

source: trunk/gsdl/perllib/plugins/StructuredHTMLPlug.pm@ 12169

Last change on this file since 12169 was 12169, checked in by mdewsnip, 18 years ago
Tidied up that horrible long line in the new() function of every plugin.
Property svn:keywords set to `Author Date Id Revision`
File size: 12.7 KB

Line
1	###########################################################################
2	#
3	# StructuredHTMLPlug.pm -- html plugin with extra facilities for teasing out
4	# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
5	# HTML document
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28	# This plugin processes an HTML file whose sections are marked by
29	# user-defined heading styles. Because the style names a user chooses cannot be predicted,
30	# the plugin lets the user name the styles for up to three heading levels (level1, level2, level3)
31	# and can also remove a user-defined Table of Contents (TOC).
32	# Format, e.g.: level1 (Abstract_title|ChapterTitle|Referencing Heading) level2 (SectionHeading) ...
33
34	package StructuredHTMLPlug;
35
36	use HTMLPlug;
37	use ImagePlug;
38
39	#use strict; # every perl program should have this!
40	#no strict 'refs'; # make an exception so we can use variables as filehandles
41
# Establish the inheritance chain at compile time: this plugin is an HTMLPlug.
BEGIN {
    @StructuredHTMLPlug::ISA = qw(HTMLPlug);
}
45
# Plugin-specific options. Each entry gives the option name, the key of its
# description string, its type, whether it is required and (for the regexp
# options) its default value. The delete_toc flag has no default entry.
my $arguments = [
    {
        'name' => 'level1_header',
        'desc' => '{StructuredHTMLPlug.level1_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
    {
        'name' => 'level2_header',
        'desc' => '{StructuredHTMLPlug.level2_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
    {
        'name' => 'level3_header',
        'desc' => '{StructuredHTMLPlug.level3_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
    {
        'name' => 'title_header',
        'desc' => '{StructuredHTMLPlug.title_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
    {
        'name' => 'delete_toc',
        'desc' => '{StructuredHTMLPlug.delete_toc}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'toc_header',
        'desc' => '{StructuredHTMLPlug.toc_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
];
78
# Top-level description of this plugin: its name, the key of its description
# string, and the argument table declared above. 'inherits' is "yes" and
# 'abstract' is "no".
my $options = {
    'name'     => 'StructuredHTMLPlug',
    'desc'     => '{StructuredHTMLPlug.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
84
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to HTMLPlug's constructor
#   $hashArgOptLists - hash ref; this plugin's argument table is appended to
#                      its "ArgList" entry and its option record to "OptList"
#
# Returns the object built by HTMLPlug's constructor, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) {
        push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push(@{$hashArgOptLists->{"OptList"}}, $options);
    }

    # Use an explicit class-method call: the indirect-object form
    # "new HTMLPlug(...)" is ambiguous in a package that defines its own new().
    my $self = HTMLPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
97
98
# Rewrite the HTML in $$textref so that paragraphs carrying user-named
# classes become <h1>/<h2>/<h3> headings, wrap each heading in Greenstone
# <Section> comment markers, and then pass the result on to the parent
# class's process().
#
# Arguments (after $self):
#   $textref   - reference to the document text; rewritten in place
#   $pluginfo, $base_dir, $gli - not used here, forwarded to the parent
#   $file      - file name, used for the progress message and title fallback
#   $metadata  - forwarded to extract_metadata()
#   $doc_obj   - document object that receives Title and extracted metadata
#
# Options read from $self: verbosity, outhandle, metadata_fields, delete_toc,
# toc_header, title_header, level1_header, level2_header, level3_header.
# The *_header options have a surrounding pair of parentheses stripped and
# the stripped value is stored back into $self.
#
# Returns whatever the parent class's process() returns.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    print $outhandle "StructuredHTMLPlug: processing $file\n"
        if $self->{'verbosity'} > 1;

    # Everything before the first <body is the head; the rest is the body.
    my @head_and_body = split(/<body/i, $$textref);
    my $head = shift(@head_and_body);
    $head = "" unless defined $head;    # empty document
    my $body_text = join("<body", @head_and_body);

    # List assignment leaves $doctitle undefined when there is no <title>,
    # instead of relying on a conditional "my" and a possibly stale $1.
    my ($doctitle) = ($head =~ m/<title>(.+)<\/title>/i);

    if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
        # Take the text between the first <xml> and the following </xml>.
        my @doc_properties = split(/<xml>/i, $head);
        shift(@doc_properties);         # discard what precedes the first <xml>
        my $rest_doc_properties = join(" ", @doc_properties);

        my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
        my $extracted_metadata = shift(@extracted_metadata);
        $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
    }

    # set the title here if we haven't found it yet
    if (!defined $doc_obj->get_metadata_element($doc_obj->get_top_section(), "Title")) {
        if (defined $doctitle && $doctitle =~ /\S/) {
            $doc_obj->add_metadata($doc_obj->get_top_section(), "Title", $doctitle);
        } else {
            $self->title_fallback($doc_obj, $doc_obj->get_top_section(), $file);
        }
    }

    # If delete_toc is enabled, remove every paragraph whose class matches
    # toc_header (the table of contents / table of figures entries).
    if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)) {
        if (defined $self->{'toc_header'} && $self->{'toc_header'} =~ /\S/) {
            $body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
        }
    }

    # Turn paragraphs with the user's heading classes into real headings.
    # The title style and level 1 both map to <h1>.
    foreach my $level (['title_header',  'h1'],
                       ['level1_header', 'h1'],
                       ['level2_header', 'h2'],
                       ['level3_header', 'h3']) {
        my ($option, $htag) = @$level;
        next unless defined $self->{$option} && $self->{$option} =~ /\S/;

        # Drop one pair of enclosing parentheses from the option value.
        $self->{$option} =~ s/^\((.*)\)/$1/is;
        my $class_re = $self->{$option};
        $body_text =~ s/<p class=(($class_re)[^>]*)>(.+?)<\/p>/<p class=$1><$htag>$3<\/$htag><\/p>/isg;
    }

    # Tidy up extra new lines
    $body_text =~ s/(<p[^>]*><span[^>]*><o:p> <\/o:p><\/span><\/p>)//isg;
    $body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;

    my $body = "<body" . $body_text;
    my $section_text = $head;

    # Builds one "<!-- ... -->" marker line for opening or closing a section.
    my $section_marker = sub {
        my ($depth, $tag) = @_;
        return "<!--\n" . (" " x $depth) . "$tag\n" . "-->\n";
    };

    # split HTML text on <h1>, <h2> etc tags
    my @h_split = split(/<h/i, $body);
    my $first_chunk = shift(@h_split);
    $section_text .= $first_chunk if defined $first_chunk;

    my $hnum = 0;       # current section nesting depth
    foreach my $hc (@h_split) {
        # $hc is the text following "<h". Accept "1>", "2 class=...>" etc.;
        # the attribute part is optional so that plain <h1> tags match too.
        if ($hc =~ m/^([1-3])(?:\s[^>]*)?>(.*)$/s) {
            my $new_hnum = $1;
            my $hc_after = $2;

            if ($hc_after =~ m/^(.*?)<\/h$new_hnum>/is) {
                my $h_text = $1;

                # boil HTML down to some interesting text
                $h_text =~ s/^[1-3]>//;
                $h_text =~ s/<\/?.*?>//sg;
                $h_text =~ s/\s+/ /sg;
                $h_text =~ s/^\s$//s;
                $h_text =~ s/( )+\W*/ /sg;

                # Headings without any word characters do not start a section.
                if ($h_text =~ m/\w+/) {
                    if ($new_hnum > $hnum) {
                        # increase section nesting, opening untitled
                        # sections for any skipped levels
                        $hnum++;
                        while ($hnum < $new_hnum) {
                            $section_text .= $section_marker->($hnum, "<Section>");
                            $hnum++;
                        }
                    }
                    else {
                        # decrease section nesting: close everything down to
                        # and including the previous section at this level
                        while ($hnum >= $new_hnum) {
                            $section_text .= $section_marker->($hnum, "</Section>");
                            $hnum--;
                        }
                        $hnum++;
                    }

                    my $spacing = " " x $hnum;
                    $section_text .= "<!--\n";
                    $section_text .= $spacing."<Section>\n";
                    $section_text .= $spacing." <Description>\n";
                    $section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
                    $section_text .= $spacing." </Description>\n";
                    $section_text .= "-->\n";
                }
            }
        }

        # Put back the "<h" that split() consumed, whatever kind of tag it was.
        $section_text .= "<h$hc";
    }

    # Close every section that is still open.
    while ($hnum >= 1) {
        $section_text .= $section_marker->($hnum, "</Section>");
        $hnum--;
    }

    # NOTE(review): this closing marker has no matching opener emitted by this
    # function; it is kept so the generated text is unchanged - confirm intent.
    $section_text .= "<!--\n</Section>\n-->\n";

    $$textref = $section_text;

    $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;

    $$textref =~ s/( )+/ /sg;

    # @_ no longer contains $self (it was shifted off above), so this passes
    # the original argument list on to the parent class.
    return $self->SUPER::process(@_);
}
280
281
# Work out whether an <img> is displayed smaller than the image file really
# is and, if so, prepare the ImageMagick "convert" command that would make a
# resized copy.
#
# Arguments:
#   $front, $back - the tag text before and after the image link; searched
#                   for width= and height= attributes
#   $base_dir     - directory the image path is resolved against
#   $href         - the image link
#
# Returns $href unchanged in every case: the convert command is only built,
# never executed, in this version of the code.
sub resize_if_necessary
{
    my ($self, $front, $back, $base_dir, $href) = @_;

    # dig out width and height of image, if there
    my $img_attributes = "$front $back";
    my ($img_width)  = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
    my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);

    # derive local filename for image based on its URL
    my $img_filename = $href;
    $img_filename =~ s/^[^:]*:\/\///;
    $img_filename = util::filename_cat($base_dir, $img_filename);

    # Replace %20's in URL with a space if required. Note that the filename
    # may include the %20 in some situations
    if ($img_filename =~ /\%20/ && !-e $img_filename) {
        $img_filename =~ s/\%20/ /g;
    }

    # Nothing to compare unless the file exists and both dimensions were given.
    return $href
        unless (-e $img_filename) && defined $img_width && defined $img_height;

    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    # get image info on width and height
    my ($image_type, $actual_width, $actual_height, $image_size)
        = ImagePlug::identify($img_filename, $outhandle, $verbosity);
    return $href unless defined $actual_width && defined $actual_height;

    if (($img_width < $actual_width) || ($img_height < $actual_height)) {
        # derive new image name based on current image
        my ($tailname, $dirname, $suffix)
            = File::Basename::fileparse($img_filename, "\\.[^\\.]+\$");

        my $resized_filename
            = util::filename_cat($dirname, $tailname."_resized".$suffix);

        # Generate smaller image with convert. Braces keep the "x" separator
        # from being read as part of the variable name.
        my $newsize = "${img_width}x${img_height}";
        my $command = "convert -interlace plane -verbose "
            ."-geometry $newsize \"$img_filename\" \"$resized_filename\"";

        # The command is deliberately not run here.
        #print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
    }

    return $href;
}
338
# Rebuild an image reference: normalise the link, register the linked file
# via add_file(), and return the tag text with the new link in the middle.
#
# Arguments (after $self):
#   $front, $back - text before and after the link
#   $link         - the link itself, optionally wrapped in double quotes
#   $base_dir, $file, $doc_obj, $section - passed on to format_link()/add_file()
#
# Returns $front, the value returned by add_file(), and $back concatenated.
sub replace_images {
    my ($self, $front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;

    # A quoted link has its quotes moved onto the surrounding text.
    if ($link =~ /^\"/) {
        $link =~ s/^\"//;
        $link =~ s/\"$//;
        $front = $front . '"';
        $back  = '"' . $back;
    }

    # Line breaks inside the link become spaces.
    $link =~ s/\n/ /g;

    my ($target, $fragment, $link_kind) = $self->format_link($link, $base_dir, $file);

    ## $target = $self->resize_if_necessary($front,$back,$base_dir,$target);

    my $replacement = $self->add_file($target, $link_kind, $fragment, $base_dir, $doc_obj, $section);

    return $front . $replacement . $back;
}
360
# Pull document properties out of a block of <o:Name>value</o:Name> markup
# and add them to the top section of the document.
#
# Arguments (after $self):
#   $textref  - the markup to search (a plain string, despite the name);
#               nothing is done when it is undefined
#   $metadata - not used here
#   $doc_obj  - document object; receives add_utf8_metadata() calls
#
# $self->{'metadata_fields'} is a comma-separated list. Each entry is either
# "Name" (the property is stored under the same name) or "Name<GSName>" (the
# property "Name" is stored as "GSName"). Property names are matched
# case-insensitively and literally.
sub extract_metadata
{
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj) = @_;

    return if (!defined $textref);

    foreach my $field (split /,/, $self->{'metadata_fields'}) {
        # Tolerate spaces around the commas in the option value.
        $field =~ s/^\s+//;
        $field =~ s/\s+$//;

        # $source_name is the property to look for in the markup;
        # $gs_name is the user's preferred gs metadata name.
        my ($source_name, $gs_name);
        if ($field =~ /^(.*?)<(.*?)>$/) {
            # support tag<tagname>
            ($source_name, $gs_name) = ($1, $2);
        } else {
            # no <tagname> for mapping
            ($source_name, $gs_name) = ($field, $field);
        }
        next if $source_name eq "";

        # \Q...\E keeps characters such as "." in a field name from acting
        # as regex metacharacters.
        next unless $textref =~ m/<o:\Q$source_name\E>(.*)<\/o:\Q$source_name\E>/i;

        my $value = $1;
        chomp($value);  # remove trailing \n, if any
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $gs_name, $value);
    }

    return;
}
404
405	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: