Context Navigation

source: main/trunk/greenstone2/perllib/plugins/StructuredHTMLPlugin.pm@ 32341

Last change on this file since 32341 was 32332, checked in by kjdon, 6 years ago
removed replace_images function. this inherits from HTMLPlugin, and replace images is called from that. don't know why it had its own version of this in the first place as it didn't look customised at all.
Property svn:keywords set to `Author Date Id Revision`
File size: 12.5 KB

Line
1	###########################################################################
2	#
3	# StructuredHTMLPlugin.pm -- html plugin with extra facilities for teasing out
4	# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
5	# HTML document
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28	# This plugin is to process an HTML file where sections are divided by
29	# user-defined headings tags. As it is difficult to predict what user's definition
30	# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
31	# as well as allows to get rid of user-defined Table of Content (TOC)...
32	# format:e.g. level1 (Abstract_title\|ChapterTitle\|Referencing Heading) level2(SectionHeading)...
33
34	package StructuredHTMLPlugin;
35
36	use HTMLPlugin;
37	use ImageConverter; # want the identify method
38	use util;
39
40	use strict; # every perl program should have this!
41	no strict 'refs'; # make an exception so we can use variables as filehandles
42
43	sub BEGIN {
44	@StructuredHTMLPlugin::ISA = ('HTMLPlugin');
45	}
46
47	my $arguments =
48	[
49	{ 'name' => "level1_header",
50	'desc' => "{StructuredHTMLPlugin.level1_header}",
51	'type' => "regexp",
52	'reqd' => "no",
53	'deft' => "" },
54	{ 'name' => "level2_header",
55	'desc' => "{StructuredHTMLPlugin.level2_header}",
56	'type' => "regexp",
57	'reqd' => "no",
58	'deft' => "" },
59	{ 'name' => "level3_header",
60	'desc' => "{StructuredHTMLPlugin.level3_header}",
61	'type' => "regexp",
62	'reqd' => "no",
63	'deft' => "" },
64	{ 'name' => "title_header",
65	'desc' => "{StructuredHTMLPlugin.title_header}",
66	'type' => "regexp",
67	'reqd' => "no",
68	'deft' => "" },
69	{ 'name' => "delete_toc",
70	'desc' => "{StructuredHTMLPlugin.delete_toc}",
71	'type' => "flag",
72	'reqd' => "no"},
73	{ 'name' => "toc_header",
74	'desc' => "{StructuredHTMLPlugin.toc_header}",
75	'type' => "regexp",
76	'reqd' => "no",
77	'deft' => "" }
78	];
79
80	my $options = { 'name' => "StructuredHTMLPlugin",
81	'desc' => "{StructuredHTMLPlugin.desc}",
82	'abstract' => "no",
83	'inherits' => "yes",
84	'args' => $arguments };
85
86	sub new {
87	my ($class) = shift (@_);
88	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
89	push(@$pluginlist, $class);
90
91	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92	push(@{$hashArgOptLists->{"OptList"}},$options);
93
94	my $self = new HTMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
95
96	return bless $self, $class;
97	}
98
99
100	sub process {
101	my $self = shift (@_);
102	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
103	my $outhandle = $self->{'outhandle'};
104
105	my @head_and_body = split(/<body/i,$$textref);
106	my $head = shift(@head_and_body);
107	my $body_text = join("<body", @head_and_body);
108	$head =~ m/<title>(.+)<\/title>/i;
109	my $doctitle = $1 if defined $1;
110	if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
111	my @doc_properties = split(/<xml>/i,$head);
112	my $doc_heading = shift(@doc_properties);
113	my $rest_doc_properties = join(" ", @doc_properties);
114
115	my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
116	my $extracted_metadata = shift (@extracted_metadata);
117	$self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
118	}
119
120	# set the title here if we haven't found it yet
121	if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
122	if (defined $doctitle && $doctitle =~ /\S/) {
123	$doc_obj->add_metadata($doc_obj->get_top_section(), "Title", $doctitle);
124	} else {
125	$self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
126	}
127	}
128
129	# If delete_toc is enabled, it means to get rid of toc and tof contents.
130	# get rid of TOC and TOF sections and their title
131	if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
132	if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
133	$body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
134	}
135	}
136
137	if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
138	$self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
139	$body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
140	}
141
142	if (defined $self->{'level1_header'} && $self->{'level1_header'}=~ /\S/ ){
143	$self->{'level1_header'} =~ s/^\((.*)\)/$1/i;
144	$body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
145	}
146
147	if (defined $self->{'level2_header'} && $self->{'level2_header'}=~ /\S/){
148	$self->{'level2_header'} =~ s/^\((.*)\)/$1/i;
149	$body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
150	}
151
152	if (defined $self->{'level3_header'} && $self->{'level3_header'}=~ /\S/ ){
153	$self->{'level3_header'} =~ s/^\((.*)\)/$1/is;
154	$body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
155	}
156
157	# Tidy up extra new lines
158	$body_text =~ s/(<p[^>]><span[^>]><o:p> <\/o:p><\/span><\/p>)//isg;
159	$body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;
160
161	# what was the following line for. effectively unused. do we need it??
162	#$section_text .= "<!--\n<Section>\n-->\n";
163	#my $top_section_tag = "<!--\n<Section>\n-->\n";
164	#$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
165	#$body_text =~ s/(<div.*)/$top_section_tag$1/i;
166	my $body = "<body".$body_text;
167
168	my $section_text = $head;
169
170	# split HTML text on <h1>, <h2> etc tags
171	my @h_split = split(/<h/i,$body);
172
173	my $hnum = 0;
174
175	my $sectionh1 = 0;
176	$section_text .= shift(@h_split);
177
178	my $hc;
179	foreach $hc ( @h_split )
180	{
181	if ($hc =~ m/^([1-3])\s.?>(.*)$/s)
182	{
183	my $new_hnum = $1;
184	my $hc_after = $2;
185
186	if ($hc_after =~ m/^(.*?)<\/h$new_hnum>/is)
187	{
188	my $h_text = $1;
189	$hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
190	# boil HTML down to some interesting text
191	$h_text =~ s/^[1-3]>//;
192	$h_text =~ s/<\/?.*?>//sg;
193	$h_text =~ s/\s+/ /sg;
194	$h_text =~ s/^\s$//s;
195	$h_text =~ s/( )+\W*/ /sg;
196
197	if ($h_text =~ m/\w+/)
198	{
199	if ($new_hnum > $hnum)
200	{
201	# increase section nesting
202	$hnum++;
203	while ($hnum < $new_hnum)
204	{
205	my $spacing = " " x $hnum;
206	$section_text .= "<!--\n";
207	$section_text .= $spacing."<Section>\n";
208	$section_text .= "-->\n";
209	$hnum++;
210	}
211	}
212	else # ($new_hnum <= $hnum)
213	{
214	# descrease section nesting
215	while ($hnum >= $new_hnum)
216	{
217	my $spacing = " " x $hnum;
218	$section_text .= "<!--\n";
219	$section_text .= $spacing."</Section>\n";
220	$section_text .= "-->\n";
221	$hnum--;
222	}
223	$hnum++;
224	}
225
226	my $spacing = " " x $hnum;
227	$section_text .= "<!--\n";
228	$section_text .= $spacing."<Section>\n";
229	$section_text .= $spacing." <Description>\n";
230	$section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
231	$section_text .= $spacing." </Description>\n";
232	$section_text .= "-->\n";
233
234	#print $outhandle $spacing."$h_text\n"
235	# if $self->{'verbosity'} > 2;
236
237	$sectionh1++ if ($hnum==1);
238	}
239	}
240	else {
241	### print STDERR "***** hc = <h$hc\n\n";
242	}
243	$section_text .= "<h$hc";
244	}
245	else
246	{
247	$section_text .= "<h$hc";
248	}
249	}
250
251	while ($hnum >= 1)
252	{
253	my $spacing = " " x $hnum;
254	$section_text .= "<!--\n";
255	$section_text .= $spacing."</Section>\n";
256	$section_text .= "-->\n";
257	$hnum--;
258	}
259
260	$section_text .= "<!--\n</Section>\n-->\n";
261
262	$$textref = $section_text;
263
264	# if ($sectionh1>0)
265	# {
266	# print $outhandle " Located section headings ..."
267	# if $self->{'verbosity'} > 1;
268	# }
269
270	$$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
271
272	$$textref =~ s/( )+/ /sg;
273
274	## $$textref =~ s/<o:p> <\/o:p>//g; # used with VML to space figures?
275
276	$self->SUPER::process(@_);
277
278	}
279
280
281	sub resize_if_necessary
282	{
283	my ($self,$front,$back,$base_dir,$href) = @_;
284
285	# dig out width and height of image, if there
286	my $img_attributes = "$front back";
287	my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
288	my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
289
290	# derive local filename for image based on its URL
291	my $img_filename = $href;
292	$img_filename =~ s/^[^:]*:\/\///;
293	$img_filename = &util::filename_cat($base_dir, $img_filename);
294
295	# Replace %20's in URL with a space if required. Note that the filename
296	# may include the %20 in some situations
297	if ($img_filename =~ /\%20/) {
298	if (!-e $img_filename) {
299	$img_filename =~ s/\%20/ /g;
300	}
301	}
302	if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
303	# get image info on width and height
304
305	my $outhandle = $self->{'outhandle'};
306	my $verbosity = $self->{'verbosity'};
307
308	my ($image_type, $actual_width, $actual_height, $image_size)
309	= &ImageConverter::identify($img_filename, $outhandle, $verbosity);
310
311	#print STDERR "**** $actual_width x $actual_height";
312	#print STDERR " (requested: $img_width x $img_height)\n";
313
314	if (($img_width < $actual_width) \|\| ($img_height < $actual_height)) {
315	#print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
316
317	# derive new image name based on current image
318	my ($tailname, $dirname, $suffix)
319	= &File::Basename::fileparse($img_filename, "\\.[^\\.]+\$");
320
321	my $resized_filename
322	= &util::filename_cat($dirname, $tailname."_resized".$suffix);
323
324	#print STDERR "**** suffix = $suffix\n";
325
326	# Generate smaller image with convert
327	my $newsize = "$img_width"."x$img_height";
328	my $command = "convert -interlace plane -verbose "
329	."-geometry $newsize \"$img_filename\" \"$resized_filename\"";
330	$command = "\"".&util::get_perl_exec()."\" -S gs-magick.pl $command";
331	#print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
332	#my $result = '';
333	#print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
334	}
335	}
336	return $href;
337	}
338
339
340	sub extract_metadata
341	{
342	my $self = shift (@_);
343	my ($textref, $metadata, $doc_obj) = @_;
344	my $outhandle = $self->{'outhandle'};
345
346	return if (!defined $textref);
347
348	my $separator = $self->{'metadata_field_separator'};
349	if ($separator eq "") {
350	undef $separator;
351	}
352	# metadata fields to extract/save. 'key' is the (lowercase) name of the
353	# html meta, 'value' is the metadata name for greenstone to use
354	my %find_fields = ();
355	my ($tag,$value);
356
357	my $orig_field = "";
358	foreach my $field (split /\s,\s/, $self->{'metadata_fields'}) {
359	# support tag<tagname>
360	if ($field =~ /^(.?)\s<(.*?)>$/) {
361	# "$2" is the user's preferred gs metadata name
362	$find_fields{lc($1)}=$2; # lc = lowercase
363	$orig_field = $1;
364	} else { # no <tagname> for mapping
365	# "$field" is the user's preferred gs metadata name
366	$find_fields{lc($field)}=$field; # lc = lowercase
367	$orig_field = $field;
368	}
369
370	if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
371	$tag = $orig_field;
372	$value = $1;
373	if (!defined $value \|\| !defined $tag){
374	#print $outhandle "StructuredHTMLPlugin: can't find VALUE in \"$tag\"\n";
375	next;
376	} else {
377	# clean up and add
378	chomp($value); # remove trailing \n, if any
379	$tag = $find_fields{lc($tag)};
380	#print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
381	# if ($self->{'verbosity'} > 2);
382	if (defined $separator) {
383	my @values = split($separator, $value);
384	foreach my $v (@values) {
385	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $v) if $v =~ /\S/;
386	}
387	}
388	else {
389	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
390	}
391	}
392	}
393	}
394	}
395
396	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: