Context Navigation

source: trunk/gsdl/perllib/plugins/StructuredHTMLPlug.pm@ 10356

Last change on this file since 10356 was 10356, checked in by chi, 19 years ago
tidy up the code.
Property svn:keywords set to `Author Date Id Revision`
File size: 11.6 KB

Line
1	###########################################################################
2	#
3	# StructuredHTMLPlug.pm -- html plugin with extra facilities for teasing out
4	# hierarchical structure (such as h1, h2, h3 tags) in an HTML document
5	#
6	# A component of the Greenstone digital library software
7	# from the New Zealand Digital Library Project at the
8	# University of Waikato, New Zealand.
9	#
10	# Copyright (C) 1999 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27	package StructuredHTMLPlug;
28
29	use HTMLPlug;
30	use ImagePlug;
31
32	sub BEGIN {
33	@ISA = ('HTMLPlug');
34	}
35
36	my $arguments =
37	[ { 'name' => "check_toc",
38	'desc' => "StructuredHTMLPlug.check_toc'}",
39	'type' => "flag",
40	'reqd' => "no"},
41	{ 'name' => "title_header",
42	'desc' => "{StructuredHTMLPlug.title_header}",
43	'type' => "regexp",
44	'reqd' => "no"},
45	{ 'name' => "level1_header",
46	'desc' => "{StructuredHTMLPlug.level1_header}",
47	'type' => "regexp",
48	'reqd' => "no"},
49	{ 'name' => "level2_header",
50	'desc' => "{StructuredHTMLPlug.level2_header}",
51	'type' => "regexp",
52	'reqd' => "no"},
53	{ 'name' => "level3_header",
54	'desc' => "{StructuredHTMLPlug.level3_header}",
55	'type' => "regexp",
56	'reqd' => "no"},
57	{ 'name' => "toc_header",
58	'desc' => "{StructuredHTMLPlug.toc_header}",
59	'type' => "regexp",
60	'reqd' => "no"},
61	{ 'name' => "tof_header",
62	'desc' => "{StructuredHTMLPlug.tof_header}",
63	'type' => "regexp",
64	'reqd' => "no"}];
65
66	my $options = { 'name' => "StructuredHTMLPlug",
67	'desc' => "{StructuredHTMLPlug.desc}",
68	'abstract' => "no",
69	'inherits' => "yes",
70	'args' => $arguments };
71
72	sub new {
73	my ($class) = shift (@_);
74	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
75	push(@$pluginlist, $class);
76
77	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
78	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
79
80	my $self = (defined $hashArgOptLists)? new HTMLPlug($pluginlist,$inputargs,$hashArgOptLists): new HTMLPlug($pluginlist,$inputargs);
81
82	return bless $self, $class;
83
84	}
85
86	sub read {
87	my $self = shift (@_);
88	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $gli) = @_;
89
90	my $filename = $file;
91	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
92
93	if ($filename =~ m/\.html?$/) {
94	my $poss_doc_filename = $filename;
95	$poss_doc_filename =~ s/\.html?$/.doc/;
96
97	if (-e $poss_doc_filename) {
98	# this file has already been processed by Word plugin
99	return 0;
100	}
101	}
102	return $self->SUPER::read(@_);
103	}
104
105	sub process {
106	my $self = shift (@_);
107	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
108	my $outhandle = $self->{'outhandle'};
109
110	print $outhandle "StructuredHTMLPlug: processing $file\n"
111	if $self->{'verbosity'} > 1;
112
113	my @head_and_body = split(/<body/i,$$textref);
114	my $head = shift(@head_and_body);
115	my $body_text = join("<body", @head_and_body);
116
117	# get rid of TOC and TOF sections and their title
118	# If check_toc is enables, it means to get rid of toc and tof contents.
119	if ($self->{'check_toc'}){
120	#line-height:150%;mso-ansi-language:FR'>Contents<o:p></o:p></span></b></p>
121	# get rid of Table of Contents title and Table of Figures
122	$body_text =~ s/<p[^>]><b><span[^>]>(Table of Content.\|Content.)<o:p><\/o:p><\/span><\/b><\/p>//isg;
123	$body_text =~ s/<p[^>]><b><span[^>]>(Table of Figure.\|Figure.)<o:p><\/o:p><\/span><\/b><\/p>//isg;
124	$body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
125	$body_text =~ s/<p class=(($self->{'tof_header'})[^>]*)>(.+?)<\/p>//isg;
126	}
127
128	if ($self->{'title_header'}){
129	$self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
130	$body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><title>$3<\/title><\/p>/isg;
131	}
132	if ($self->{'level1_header'}){
133	$self->{'level1_header'} =~ s/^(\()(.*)(\))/$2/is;
134	$body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
135	}
136	if ($self->{'level2_header'}){
137	$self->{'level2_header'} =~ s/^(\()(.*)(\))/$2/is;
138	$body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
139	}
140
141	if ($self->{'level3_header'}){
142	$self->{'level3_header'} =~ s/^(\()(.*)(\))/$2/is;
143	$body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
144	}
145
146	# Tidy up extra new lines
147	$body_text =~ s/(<p[^>]><span[^>]><o:p> <\/o:p><\/span><\/p>)//isg;
148	$body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;
149
150	my $body = "<body".$body_text;
151
152	my $section_text = $head;
153	$section_text .= "<!--\n<Section>\n-->\n";
154
155	# split HTML text on <h1>, <h2> etc tags
156	my @h_split = split(/<h/i,$body);
157
158	my $hnum = 0;
159
160	my $sectionh1 = 0;
161	$section_text .= shift(@h_split);
162
163	my $hc;
164	foreach $hc ( @h_split )
165	{
166	if ($hc =~ m/^([1-3])\s.?>(.*)$/s)
167	{
168	my $new_hnum = $1;
169	my $hc_after = $2;
170
171	if ($hc_after =~ m/^(.*?)<\/h$new_hnum>/is)
172	{
173	my $h_text = $1;
174	$hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
175	# boil HTML down to some interesting text
176	$h_text =~ s/^[1-3]>//;
177	$h_text =~ s/<\/?.*?>//sg;
178	$h_text =~ s/\s+/ /sg;
179	$h_text =~ s/^\s$//s;
180	$h_text =~ s/( )+\W*/ /sg;
181
182	if ($h_text =~ m/\w+/)
183	{
184	if ($new_hnum > $hnum)
185	{
186	# increase section nesting
187	$hnum++;
188	while ($hnum < $new_hnum)
189	{
190	my $spacing = " " x $hnum;
191	$section_text .= "<!--\n";
192	$section_text .= $spacing."<Section>\n";
193	$section_text .= "-->\n";
194	$hnum++;
195	}
196	}
197	else # ($new_hnum <= $hnum)
198	{
199	# descrease section nesting
200	while ($hnum >= $new_hnum)
201	{
202	my $spacing = " " x $hnum;
203	$section_text .= "<!--\n";
204	$section_text .= $spacing."</Section>\n";
205	$section_text .= "-->\n";
206	$hnum--;
207	}
208	$hnum++;
209	}
210
211	my $spacing = " " x $hnum;
212	$section_text .= "<!--\n";
213	$section_text .= $spacing."<Section>\n";
214	$section_text .= $spacing." <Description>\n";
215	$section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
216	$section_text .= $spacing." </Description>\n";
217	$section_text .= "-->\n";
218
219	print $outhandle $spacing."$h_text\n"
220	if $self->{'verbosity'} > 2;
221
222	$sectionh1++ if ($hnum==1);
223	}
224	}
225	else {
226	### print STDERR "***** hc = <h$hc\n\n";
227
228	}
229	# $section_text .= "<!-- \n</Section>\n-->\n";
230	#print STDERR "***HC = $hc\n";
231	$section_text .= "<h$hc";
232	}
233	else
234	{
235	$section_text .= "<h$hc";
236	}
237	}
238
239	while ($hnum >= 1)
240	{
241	my $spacing = " " x $hnum;
242	$section_text .= "<!--\n";
243	$section_text .= $spacing."</Section>\n";
244	$section_text .= "-->\n";
245	$hnum--;
246	}
247
248	$section_text .= "<!--\n</Section>\n-->\n";
249
250	$$textref = $section_text;
251
252	# should be textref not testref???
253	# $$testref =~ s/<h(\d+)>(.*?)<\/h$1>/<Section><Metadata name=\"Title\">$1<\/Metadata></Section><h$1><\/h$1>/gi;
254
255	if ($sectionh1>0)
256	{
257	print $outhandle " Located section headings ..."
258	if $self->{'verbosity'} > 1;
259	}
260	print $outhandle " Passing on the HTMLPlug\n"
261	if $self->{'verbosity'} > 1;
262
263	$$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
264
265	$$textref =~ s/( )+/ /sg;
266
267	## $$textref =~ s/<o:p> <\/o:p>//g; # used with VML to space figures?
268
269	$self->SUPER::process(@_);
270
271	# associate original file with doc object
272	my $cursection = $doc_obj->get_top_section();
273	my $filename = &util::filename_cat($base_dir, $file);
274	if (-e $filename)
275	{
276	print $outhandle " Adding associated Word document\n"
277	if $self->{'verbosity'} > 1;
278
279	$doc_obj->associate_file($filename, "doc.doc", undef, $cursection);
280
281	my $doclink = "<a href=_httpcollection_/index/assoc/[archivedir]/doc.doc>";
282	$doc_obj->add_utf8_metadata ($cursection, "srclink", $doclink);
283	$doc_obj->add_utf8_metadata ($cursection, "srcicon", "_icondoc_");
284	$doc_obj->add_utf8_metadata ($cursection, "/srclink", "</a>");
285
286	my $file_size = -s $filename;
287	if ($file_size>1024)
288	{
289	my $fs_kbytes = sprintf("%d",$file_size/1024);
290	$doc_obj->add_utf8_metadata ($cursection, "filesize", "$fs_kbytes Kb");
291	}
292	else
293	{
294	$doc_obj->add_utf8_metadata ($cursection, "filesize", "$file_size bytes");
295	}
296
297	if ($file_size > 200000)
298	{
299	$doc_obj->add_utf8_metadata ($cursection, "fswarning", "1");
300	}
301	}
302	}
303
304
305	sub resize_if_necessary
306	{
307	my ($self,$front,$back,$base_dir,$href) = @_;
308
309	# dig out width and height of image, if there
310	my $img_attributes = "$front back";
311	my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
312	my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
313
314	# derive local filename for image based on its URL
315	my $img_filename = $href;
316	$img_filename =~ s/^[^:]*:\/\///;
317	$img_filename = &util::filename_cat($base_dir, $img_filename);
318
319	# Replace %20's in URL with a space if required. Note that the filename
320	# may include the %20 in some situations
321	if ($img_filename =~ /\%20/) {
322	if (!-e $img_filename) {
323	$img_filename =~ s/\%20/ /g;
324	}
325	}
326	if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
327	# get image info on width and height
328
329	my $outhandle = $self->{'outhandle'};
330	my $verbosity = $self->{'verbosity'};
331
332	my ($image_type, $actual_width, $actual_height, $image_size)
333	= &ImagePlug::identify($img_filename, $outhandle, $verbosity);
334
335	#print STDERR "**** $actual_width x $actual_height";
336	#print STDERR " (requested: $img_width x $img_height)\n";
337
338	if (($img_width < $actual_width) \|\| ($img_height < $actual_height)) {
339	print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
340
341	# derive new image name based on current image
342	my ($tailname, $dirname, $suffix)
343	= &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
344
345	my $resized_filename
346	= &util::filename_cat($dirname, $tailname."_resized".$suffix);
347
348	#print STDERR "**** suffix = $suffix\n";
349
350	# Generate smaller image with convert
351	my $newsize = "$img_widthx$image_height";
352	my $command = "convert -interlace plane -verbose "
353	."-geometry $newsize \"img_$filename\" \"$resized_filename\"";
354	print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
355	my $result = '';
356	print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
357
358	}
359	}
360
361	return $href;
362	}
363
364
365
366
367	sub replace_images {
368	my $self = shift (@_);
369	my ($front, $link, $back, $base_dir,
370	$file, $doc_obj, $section) = @_;
371	# remove quotes from link at start and end if necessary
372	if ($link=~/^\"/) {
373	$link=~s/^\"//;$link=~s/\"$//;
374	$front.='"';
375	$back="\"$back";
376	}
377
378	$link =~ s/\n/ /g;
379
380	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
381
382	## $href = $self->resize_if_necessary($front,$back,$base_dir,$href);
383
384	my $middle = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
385
386	return $front . $middle . $back;
387	}
388
389
390	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: