Context Navigation

source: trunk/gsdl/perllib/plugins/StructuredHTMLPlug.pm@ 11380

Last change on this file since 11380 was 11380, checked in by kjdon, 18 years ago
davids fixes for a couple of warnings
Property svn:keywords set to `Author Date Id Revision`
File size: 13.7 KB

Line
1	###########################################################################
2	#
3	# StructuredHTMLPlug.pm -- html plugin with extra facilities for teasing out
4	# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
5	# HTML document
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28	# This plugin is to process an HTML file where sections are divided by
29	# user-defined headings tags. As it is difficult to predict what user's definition
30	# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
31	# as well as allows to get rid of user-defined Table of Content (TOC)...
32	# format:e.g. level1 (Abstract_title\|ChapterTitle\|Referencing Heading) level2(SectionHeading)...
33
34	package StructuredHTMLPlug;
35
36	use HTMLPlug;
37	use ImagePlug;
38
39	#use strict; # every perl program should have this!
40	#no strict 'refs'; # make an exception so we can use variables as filehandles
41
42	sub BEGIN {
43	@StructuredHTMLPlug::ISA = ('HTMLPlug');
44	}
45
46	my $arguments = [];
47
48	my $options = { 'name' => "StructuredHTMLPlug",
49	'desc' => "{StructuredHTMLPlug.desc}",
50	'abstract' => "no",
51	'inherits' => "yes",
52	'args' => $arguments };
53
54	sub new {
55	my ($class) = shift (@_);
56	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
57	push(@$pluginlist, $class);
58
59	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
60	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
61
62	my $self = (defined $hashArgOptLists)? new HTMLPlug($pluginlist,$inputargs,$hashArgOptLists): new HTMLPlug($pluginlist,$inputargs);
63
64	return bless $self, $class;
65	}
66
67	sub read {
68	my $self = shift (@_);
69	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $gli) = @_;
70
71	my $filename = $file;
72	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
73
74	if ($filename =~ m/\.html?$/) {
75	my $poss_doc_filename = $filename;
76	$poss_doc_filename =~ s/\.html?$/.doc/;
77
78	if (-e $poss_doc_filename) {
79	# this file has already been processed by Word plugin
80	return 0;
81	}
82	}
83	return $self->SUPER::read(@_);
84	}
85
86	sub process {
87	my $self = shift (@_);
88	#my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
89	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
90	my $outhandle = $self->{'outhandle'};
91
92	print $outhandle "StructuredHTMLPlug: processing $file\n"
93	if $self->{'verbosity'} > 1;
94
95	my @head_and_body = split(/<body/i,$$textref);
96	my $head = shift(@head_and_body);
97	my $body_text = join("<body", @head_and_body);
98	$head =~ m/<title>(.+)<\/title>/i;
99	my $doctitle = $1 if defined $1;
100	if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
101	my @doc_properties = split(/<xml>/i,$head);
102	my $doc_heading = shift(@doc_properties);
103	my $rest_doc_properties = join(" ", @doc_properties);
104
105	my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
106	my $extracted_metadata = shift (@extracted_metadata);
107	$self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
108	}
109
110	# If delete_toc is enables, it means to get rid of toc and tof contents.
111	# get rid of TOC and TOF sections and their title
112	if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
113	if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
114	$body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
115	}
116	if (defined $self->{'tof_header'}&& $self->{'tof_header'}=~ /\S/) {
117	$body_text =~ s/<p class=(($self->{'tof_header'})[^>]*)>(.+?)<\/p>//isg;
118	}
119	}
120
121	if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
122	$self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
123	#$body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><title>$3<\/title><\/p>/isg;
124	#$doctitle = $3;
125	$body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
126	#$body_text =~ m/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/isg;
127	#$doctitle = "<h1>".$3."<\/h1>" if defined $3;
128	}
129
130	if (defined $self->{'level1_header'} && $self->{'level1_header'}=~ /\S/ ){
131	$self->{'level1_header'} =~ s/^\((.*)\)/$1/i;
132	$body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
133	}
134
135	if (defined $self->{'level2_header'} && $self->{'level2_header'}=~ /\S/){
136	$self->{'level2_header'} =~ s/^\((.*)\)/$1/i;
137	$body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
138	}
139
140	if (defined $self->{'level3_header'} && $self->{'level3_header'}=~ /\S/ ){
141	$self->{'level3_header'} =~ s/^\((.*)\)/$1/is;
142	$body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
143	}
144	# Tidy up extra new lines
145	$body_text =~ s/(<p[^>]><span[^>]><o:p> <\/o:p><\/span><\/p>)//isg;
146	$body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;
147
148	$section_text .= "<!--\n<Section>\n-->\n";
149	#my $top_section_tag = "<!--\n<Section>\n-->\n";
150	#$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
151	#$body_text =~ s/(<div.*)/$top_section_tag$1/i;
152	my $body = "<body".$body_text;
153
154	my $section_text = $head;
155
156	# split HTML text on <h1>, <h2> etc tags
157	my @h_split = split(/<h/i,$body);
158
159	my $hnum = 0;
160
161	my $sectionh1 = 0;
162	$section_text .= shift(@h_split);
163
164	my $hc;
165	foreach $hc ( @h_split )
166	{
167	if ($hc =~ m/^([1-3])\s.?>(.*)$/s)
168	{
169	my $new_hnum = $1;
170	my $hc_after = $2;
171
172	if ($hc_after =~ m/^(.*?)<\/h$new_hnum>/is)
173	{
174	my $h_text = $1;
175	$hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
176	# boil HTML down to some interesting text
177	$h_text =~ s/^[1-3]>//;
178	$h_text =~ s/<\/?.*?>//sg;
179	$h_text =~ s/\s+/ /sg;
180	$h_text =~ s/^\s$//s;
181	$h_text =~ s/( )+\W*/ /sg;
182
183	if ($h_text =~ m/\w+/)
184	{
185	if ($new_hnum > $hnum)
186	{
187	# increase section nesting
188	$hnum++;
189	while ($hnum < $new_hnum)
190	{
191	my $spacing = " " x $hnum;
192	$section_text .= "<!--\n";
193	$section_text .= $spacing."<Section>\n";
194	$section_text .= "-->\n";
195	$hnum++;
196	}
197	}
198	else # ($new_hnum <= $hnum)
199	{
200	# descrease section nesting
201	while ($hnum >= $new_hnum)
202	{
203	my $spacing = " " x $hnum;
204	$section_text .= "<!--\n";
205	$section_text .= $spacing."</Section>\n";
206	$section_text .= "-->\n";
207	$hnum--;
208	}
209	$hnum++;
210	}
211
212	my $spacing = " " x $hnum;
213	$section_text .= "<!--\n";
214	$section_text .= $spacing."<Section>\n";
215	$section_text .= $spacing." <Description>\n";
216	$section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
217	$section_text .= $spacing." </Description>\n";
218	$section_text .= "-->\n";
219
220	print $outhandle $spacing."$h_text\n"
221	if $self->{'verbosity'} > 2;
222
223	$sectionh1++ if ($hnum==1);
224	}
225	}
226	else {
227	### print STDERR "***** hc = <h$hc\n\n";
228	}
229	$section_text .= "<h$hc";
230	}
231	else
232	{
233	$section_text .= "<h$hc";
234	}
235	}
236
237	while ($hnum >= 1)
238	{
239	my $spacing = " " x $hnum;
240	$section_text .= "<!--\n";
241	$section_text .= $spacing."</Section>\n";
242	$section_text .= "-->\n";
243	$hnum--;
244	}
245
246	$section_text .= "<!--\n</Section>\n-->\n";
247
248	$$textref = $section_text;
249
250	# should be textref not testref???
251	#$$testref =~ s/<h(\d+)>(.*?)<\/h$1>/<Section><Metadata name=\"Title\">$1<\/Metadata></Section><h$1><\/h$1>/gi;
252
253	if ($sectionh1>0)
254	{
255	print $outhandle " Located section headings ..."
256	if $self->{'verbosity'} > 1;
257	}
258	print $outhandle " Passing on the HTMLPlug\n"
259	if $self->{'verbosity'} > 1;
260
261	$$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
262
263	$$textref =~ s/( )+/ /sg;
264
265	## $$textref =~ s/<o:p> <\/o:p>//g; # used with VML to space figures?
266
267	$self->SUPER::process(@_);
268
269	# associate original file with doc object
270	my $cursection = $doc_obj->get_top_section();
271	my $filename = &util::filename_cat($base_dir, $file);
272	if (-e $filename)
273	{
274	print $outhandle " Adding associated Word document\n"
275	if $self->{'verbosity'} > 1;
276
277	$doc_obj->associate_file($filename, "doc.doc", undef, $cursection);
278
279	my $doclink = "<a href=_httpcollection_/index/assoc/[archivedir]/doc.doc>";
280	$doc_obj->add_utf8_metadata ($cursection, "srclink", $doclink);
281	$doc_obj->add_utf8_metadata ($cursection, "srcicon", "_icondoc_");
282	$doc_obj->add_utf8_metadata ($cursection, "/srclink", "</a>");
283	$doc_obj->add_utf8_metadata ($cursection, "Title", $doctitle);
284	my $file_size = -s $filename;
285	if ($file_size>1024)
286	{
287	my $fs_kbytes = sprintf("%d",$file_size/1024);
288	$doc_obj->add_utf8_metadata ($cursection, "filesize", "$fs_kbytes Kb");
289	}
290	else
291	{
292	$doc_obj->add_utf8_metadata ($cursection, "filesize", "$file_size bytes");
293	}
294
295	if ($file_size > 200000)
296	{
297	$doc_obj->add_utf8_metadata ($cursection, "fswarning", "1");
298	}
299	}
300	}
301
302
303	sub resize_if_necessary
304	{
305	my ($self,$front,$back,$base_dir,$href) = @_;
306
307	# dig out width and height of image, if there
308	my $img_attributes = "$front back";
309	my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
310	my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
311
312	# derive local filename for image based on its URL
313	my $img_filename = $href;
314	$img_filename =~ s/^[^:]*:\/\///;
315	$img_filename = &util::filename_cat($base_dir, $img_filename);
316
317	# Replace %20's in URL with a space if required. Note that the filename
318	# may include the %20 in some situations
319	if ($img_filename =~ /\%20/) {
320	if (!-e $img_filename) {
321	$img_filename =~ s/\%20/ /g;
322	}
323	}
324	if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
325	# get image info on width and height
326
327	my $outhandle = $self->{'outhandle'};
328	my $verbosity = $self->{'verbosity'};
329
330	my ($image_type, $actual_width, $actual_height, $image_size)
331	= &ImagePlug::identify($img_filename, $outhandle, $verbosity);
332
333	#print STDERR "**** $actual_width x $actual_height";
334	#print STDERR " (requested: $img_width x $img_height)\n";
335
336	if (($img_width < $actual_width) \|\| ($img_height < $actual_height)) {
337	print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
338
339	# derive new image name based on current image
340	my ($tailname, $dirname, $suffix)
341	= &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
342
343	my $resized_filename
344	= &util::filename_cat($dirname, $tailname."_resized".$suffix);
345
346	#print STDERR "**** suffix = $suffix\n";
347
348	# Generate smaller image with convert
349	my $newsize = "$img_widthx$image_height";
350	my $command = "convert -interlace plane -verbose "
351	."-geometry $newsize \"img_$filename\" \"$resized_filename\"";
352	print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
353	my $result = '';
354	print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
355	}
356	}
357	return $href;
358	}
359
360	sub replace_images {
361	my $self = shift (@_);
362	my ($front, $link, $back, $base_dir,
363	$file, $doc_obj, $section) = @_;
364	# remove quotes from link at start and end if necessary
365	if ($link=~/^\"/) {
366	$link=~s/^\"//;$link=~s/\"$//;
367	$front.='"';
368	$back="\"$back";
369	}
370
371	$link =~ s/\n/ /g;
372
373	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
374
375	## $href = $self->resize_if_necessary($front,$back,$base_dir,$href);
376
377	my $middle = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
378
379	return $front . $middle . $back;
380	}
381
382	sub extract_metadata
383	{
384	my $self = shift (@_);
385	my ($textref, $metadata, $doc_obj) = @_;
386	my $outhandle = $self->{'outhandle'};
387
388	return if (!defined $textref);
389
390	# metadata fields to extract/save. 'key' is the (lowercase) name of the
391	# html meta, 'value' is the metadata name for greenstone to use
392	my %find_fields = ();
393	my ($tag,$value);
394
395	my $orig_field = "";
396	foreach my $field (split /,/, $self->{'metadata_fields'}) {
397	# support tag<tagname>
398	if ($field =~ /^(.?)<(.?)>$/) {
399	# "$2" is the user's preferred gs metadata name
400	$find_fields{lc($1)}=$2; # lc = lowercase
401	$orig_field = $1;
402	} else { # no <tagname> for mapping
403	# "$field" is the user's preferred gs metadata name
404	$find_fields{lc($field)}=$field; # lc = lowercase
405	$orig_field = $field;
406	}
407
408	if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
409	$tag = $orig_field;
410	$value = $1;
411	if (!defined $value \|\| !defined $tag){
412	print $outhandle "StructuredHTMLPlug: can't find VALUE in \"$tag\"\n";
413	next;
414	} else {
415	# clean up and add
416	chomp($value); # remove trailing \n, if any
417	$tag = $find_fields{lc($tag)};
418	print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
419	if ($self->{'verbosity'} > 2);
420	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
421	}
422	}
423	}
424	}
425
426	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: