Context Navigation

source: trunk/gsdl/perllib/plugins/StructuredHTMLPlug.pm@ 10600

Last change on this file since 10600 was 10600, checked in by chi, 19 years ago
modifications for deal with document title (as the first H1 heading) and also store the metadata title retrieved from document at the top_section.
Property svn:keywords set to `Author Date Id Revision`
File size: 13.7 KB

Line
1	###########################################################################
2	#
3	# StructuredHTMLPlug.pm -- html plugin with extra facilities for teasing out
4	# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
5	# HTML document
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28	# This plugin is to process an HTML file where sections are divided by
29	# user-defined headings tags. As it is difficult to predict what user's definition
30	# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
31	# as well as allows to get rid of user-defined Table of Content (TOC)...
32	# format:e.g. level1 (Abstract_title\|ChapterTitle\|Referencing Heading) level2(SectionHeading)...
33
34	package StructuredHTMLPlug;
35
36	use HTMLPlug;
37	use ImagePlug;
38
39	#use strict; # every perl program should have this!
40	#no strict 'refs'; # make an exception so we can use variables as filehandles
41
42	sub BEGIN {
43	@StructuredHTMLPlug::ISA = ('HTMLPlug');
44	}
45
46	my $arguments = [];
47
48	my $options = { 'name' => "StructuredHTMLPlug",
49	'desc' => "{StructuredHTMLPlug.desc}",
50	'abstract' => "no",
51	'inherits' => "yes",
52	'args' => $arguments };
53
54	sub new {
55	my ($class) = shift (@_);
56	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
57	push(@$pluginlist, $class);
58
59	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
60	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
61
62	my $self = (defined $hashArgOptLists)? new HTMLPlug($pluginlist,$inputargs,$hashArgOptLists): new HTMLPlug($pluginlist,$inputargs);
63
64	return bless $self, $class;
65	}
66
67	sub read {
68	my $self = shift (@_);
69	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $gli) = @_;
70
71	my $filename = $file;
72	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
73
74	if ($filename =~ m/\.html?$/) {
75	my $poss_doc_filename = $filename;
76	$poss_doc_filename =~ s/\.html?$/.doc/;
77
78	if (-e $poss_doc_filename) {
79	# this file has already been processed by Word plugin
80	return 0;
81	}
82	}
83	return $self->SUPER::read(@_);
84	}
85
86	sub process {
87	my $self = shift (@_);
88	#my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
89	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
90	my $outhandle = $self->{'outhandle'};
91
92	print $outhandle "StructuredHTMLPlug: processing $file\n"
93	if $self->{'verbosity'} > 1;
94
95	my @head_and_body = split(/<body/i,$$textref);
96	my $head = shift(@head_and_body);
97	my $body_text = join("<body", @head_and_body);
98	$head =~ m/<title>(.+)<\/title>/i;
99	my $doctitle = $1 if defined $1;
100	if (defined $self->{'extracted_word_metadata_fields'} && $self->{'extracted_word_metadata_fields'}=~ /\S/) {
101	my @doc_properties = split(/<xml>/i,$head);
102	my $doc_heading = shift(@doc_properties);
103	my $rest_doc_properties = join(" ", @doc_properties);
104	my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
105	my $extracted_metadata = shift (@extracted_metadata);
106	$self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
107	}
108
109	# If delete_toc is enables, it means to get rid of toc and tof contents.
110	# get rid of TOC and TOF sections and their title
111	if ($self->{'delete_toc'} == 1){
112	if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
113	$body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
114	}
115	if (defined $self->{'tof_header'}&& $self->{'tof_header'}=~ /\S/) {
116	$body_text =~ s/<p class=(($self->{'tof_header'})[^>]*)>(.+?)<\/p>//isg;
117	}
118	}
119
120	if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
121	$self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
122	#$body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><title>$3<\/title><\/p>/isg;
123	#$doctitle = $3;
124	$body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
125	#$body_text =~ m/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/isg;
126	#$doctitle = "<h1>".$3."<\/h1>" if defined $3;
127	}
128
129	if (defined $self->{'level1_header'} && $self->{'level1_header'}=~ /\S/ ){
130	$self->{'level1_header'} =~ s/^\((.*)\)/$1/i;
131	$body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
132	}
133
134	if (defined $self->{'level2_header'} && $self->{'level2_header'}=~ /\S/){
135	$self->{'level2_header'} =~ s/^\((.*)\)/$1/i;
136	$body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
137	}
138
139	if (defined $self->{'level3_header'} && $self->{'level3_header'}=~ /\S/ ){
140	$self->{'level3_header'} =~ s/^\((.*)\)/$1/is;
141	$body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
142	}
143	# Tidy up extra new lines
144	$body_text =~ s/(<p[^>]><span[^>]><o:p> <\/o:p><\/span><\/p>)//isg;
145	$body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;
146
147	$section_text .= "<!--\n<Section>\n-->\n";
148	#my $top_section_tag = "<!--\n<Section>\n-->\n";
149	#$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
150	#$body_text =~ s/(<div.*)/$top_section_tag$1/i;
151	my $body = "<body".$body_text;
152
153	my $section_text = $head;
154
155	# split HTML text on <h1>, <h2> etc tags
156	my @h_split = split(/<h/i,$body);
157
158	my $hnum = 0;
159
160	my $sectionh1 = 0;
161	$section_text .= shift(@h_split);
162
163	my $hc;
164	foreach $hc ( @h_split )
165	{
166	if ($hc =~ m/^([1-3])\s.?>(.*)$/s)
167	{
168	my $new_hnum = $1;
169	my $hc_after = $2;
170
171	if ($hc_after =~ m/^(.*?)<\/h$new_hnum>/is)
172	{
173	my $h_text = $1;
174	$hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
175	# boil HTML down to some interesting text
176	$h_text =~ s/^[1-3]>//;
177	$h_text =~ s/<\/?.*?>//sg;
178	$h_text =~ s/\s+/ /sg;
179	$h_text =~ s/^\s$//s;
180	$h_text =~ s/( )+\W*/ /sg;
181
182	if ($h_text =~ m/\w+/)
183	{
184	if ($new_hnum > $hnum)
185	{
186	# increase section nesting
187	$hnum++;
188	while ($hnum < $new_hnum)
189	{
190	my $spacing = " " x $hnum;
191	$section_text .= "<!--\n";
192	$section_text .= $spacing."<Section>\n";
193	$section_text .= "-->\n";
194	$hnum++;
195	}
196	}
197	else # ($new_hnum <= $hnum)
198	{
199	# descrease section nesting
200	while ($hnum >= $new_hnum)
201	{
202	my $spacing = " " x $hnum;
203	$section_text .= "<!--\n";
204	$section_text .= $spacing."</Section>\n";
205	$section_text .= "-->\n";
206	$hnum--;
207	}
208	$hnum++;
209	}
210
211	my $spacing = " " x $hnum;
212	$section_text .= "<!--\n";
213	$section_text .= $spacing."<Section>\n";
214	$section_text .= $spacing." <Description>\n";
215	$section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
216	$section_text .= $spacing." </Description>\n";
217	$section_text .= "-->\n";
218
219	print $outhandle $spacing."$h_text\n"
220	if $self->{'verbosity'} > 2;
221
222	$sectionh1++ if ($hnum==1);
223	}
224	}
225	else {
226	### print STDERR "***** hc = <h$hc\n\n";
227	}
228	$section_text .= "<h$hc";
229	}
230	else
231	{
232	$section_text .= "<h$hc";
233	}
234	}
235
236	while ($hnum >= 1)
237	{
238	my $spacing = " " x $hnum;
239	$section_text .= "<!--\n";
240	$section_text .= $spacing."</Section>\n";
241	$section_text .= "-->\n";
242	$hnum--;
243	}
244
245	$section_text .= "<!--\n</Section>\n-->\n";
246
247	$$textref = $section_text;
248
249	# should be textref not testref???
250	#$$testref =~ s/<h(\d+)>(.*?)<\/h$1>/<Section><Metadata name=\"Title\">$1<\/Metadata></Section><h$1><\/h$1>/gi;
251
252	if ($sectionh1>0)
253	{
254	print $outhandle " Located section headings ..."
255	if $self->{'verbosity'} > 1;
256	}
257	print $outhandle " Passing on the HTMLPlug\n"
258	if $self->{'verbosity'} > 1;
259
260	$$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
261
262	$$textref =~ s/( )+/ /sg;
263
264	## $$textref =~ s/<o:p> <\/o:p>//g; # used with VML to space figures?
265
266	$self->SUPER::process(@_);
267
268	# associate original file with doc object
269	my $cursection = $doc_obj->get_top_section();
270	my $filename = &util::filename_cat($base_dir, $file);
271	if (-e $filename)
272	{
273	print $outhandle " Adding associated Word document\n"
274	if $self->{'verbosity'} > 1;
275
276	$doc_obj->associate_file($filename, "doc.doc", undef, $cursection);
277
278	my $doclink = "<a href=_httpcollection_/index/assoc/[archivedir]/doc.doc>";
279	$doc_obj->add_utf8_metadata ($cursection, "srclink", $doclink);
280	$doc_obj->add_utf8_metadata ($cursection, "srcicon", "_icondoc_");
281	$doc_obj->add_utf8_metadata ($cursection, "/srclink", "</a>");
282	$doc_obj->add_utf8_metadata ($cursection, "Title", $doctitle);
283	my $file_size = -s $filename;
284	if ($file_size>1024)
285	{
286	my $fs_kbytes = sprintf("%d",$file_size/1024);
287	$doc_obj->add_utf8_metadata ($cursection, "filesize", "$fs_kbytes Kb");
288	}
289	else
290	{
291	$doc_obj->add_utf8_metadata ($cursection, "filesize", "$file_size bytes");
292	}
293
294	if ($file_size > 200000)
295	{
296	$doc_obj->add_utf8_metadata ($cursection, "fswarning", "1");
297	}
298	}
299	}
300
301
302	sub resize_if_necessary
303	{
304	my ($self,$front,$back,$base_dir,$href) = @_;
305
306	# dig out width and height of image, if there
307	my $img_attributes = "$front back";
308	my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
309	my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
310
311	# derive local filename for image based on its URL
312	my $img_filename = $href;
313	$img_filename =~ s/^[^:]*:\/\///;
314	$img_filename = &util::filename_cat($base_dir, $img_filename);
315
316	# Replace %20's in URL with a space if required. Note that the filename
317	# may include the %20 in some situations
318	if ($img_filename =~ /\%20/) {
319	if (!-e $img_filename) {
320	$img_filename =~ s/\%20/ /g;
321	}
322	}
323	if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
324	# get image info on width and height
325
326	my $outhandle = $self->{'outhandle'};
327	my $verbosity = $self->{'verbosity'};
328
329	my ($image_type, $actual_width, $actual_height, $image_size)
330	= &ImagePlug::identify($img_filename, $outhandle, $verbosity);
331
332	#print STDERR "**** $actual_width x $actual_height";
333	#print STDERR " (requested: $img_width x $img_height)\n";
334
335	if (($img_width < $actual_width) \|\| ($img_height < $actual_height)) {
336	print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
337
338	# derive new image name based on current image
339	my ($tailname, $dirname, $suffix)
340	= &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
341
342	my $resized_filename
343	= &util::filename_cat($dirname, $tailname."_resized".$suffix);
344
345	#print STDERR "**** suffix = $suffix\n";
346
347	# Generate smaller image with convert
348	my $newsize = "$img_widthx$image_height";
349	my $command = "convert -interlace plane -verbose "
350	."-geometry $newsize \"img_$filename\" \"$resized_filename\"";
351	print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
352	my $result = '';
353	print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
354	}
355	}
356	return $href;
357	}
358
359	sub replace_images {
360	my $self = shift (@_);
361	my ($front, $link, $back, $base_dir,
362	$file, $doc_obj, $section) = @_;
363	# remove quotes from link at start and end if necessary
364	if ($link=~/^\"/) {
365	$link=~s/^\"//;$link=~s/\"$//;
366	$front.='"';
367	$back="\"$back";
368	}
369
370	$link =~ s/\n/ /g;
371
372	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
373
374	## $href = $self->resize_if_necessary($front,$back,$base_dir,$href);
375
376	my $middle = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
377
378	return $front . $middle . $back;
379	}
380
381	sub extract_metadata
382	{
383	my $self = shift (@_);
384	my ($textref, $metadata, $doc_obj) = @_;
385	my $outhandle = $self->{'outhandle'};
386
387	# metadata fields to extract/save. 'key' is the (lowercase) name of the
388	# html meta, 'value' is the metadata name for greenstone to use
389	my %find_fields = ();
390	my ($tag,$value);
391
392	my $orig_field = "";
393	foreach my $field (split /,/, $self->{'extracted_word_metadata_fields'}) {
394	# support tag<tagname>
395	if ($field =~ /^(.?)<(.?)>$/) {
396	# "$2" is the user's preferred gs metadata name
397	$find_fields{lc($1)}=$2; # lc = lowercase
398	$orig_field = $1;
399	} else { # no <tagname> for mapping
400	# "$field" is the user's preferred gs metadata name
401	$find_fields{lc($field)}=$field; # lc = lowercase
402	$orig_field = $field;
403	}
404	if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
405	$tag = $orig_field;
406	$value = $1;
407	if (!defined $value \|\| !defined $tag){
408	print $outhandle "StructuredHTMLPlug: can't find VALUE in \"$tag\"\n";
409	next;
410	} else {
411	# clean up and add
412	chomp($value); # remove trailing \n, if any
413	$tag = $find_fields{lc($tag)};
414	print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
415	if ($self->{'verbosity'} > 2);
416	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
417	}
418	}
419	}
420	}
421
422	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: