Context Navigation

StructuredHTMLPlugin.pm@ 33309

Last change on this file since 33309 was 33309, checked in by ak19, 5 years ago
More workarounds for HTML conversion results from Word's windows_scripting. If there were newlines between headings in the original word doc that accidentally had heading formatting, then the windows_scripting conversion creates a skeleton heading containing no actual text but space. The result being that when the doc.xml is generated, there's an empty subsection reflecting that empty heading. So adding further cleanup into StructuredHTMLPlugin to look for and remove these empty headings resulting from common Word user errors.
Property svn:keywords set to `Author Date Id Revision`
File size: 14.7 KB

Line
1	###########################################################################
2	#
3	# StructuredHTMLPlugin.pm -- html plugin with extra facilities for teasing out
4	# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
5	# HTML document
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28	# This plugin is to process an HTML file where sections are divided by
29	# user-defined headings tags. As it is difficult to predict what user's definition
30	# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
31	# as well as allows to get rid of user-defined Table of Content (TOC)...
32	# format:e.g. level1 (Abstract_title\|ChapterTitle\|Referencing Heading) level2(SectionHeading)...
33
34	package StructuredHTMLPlugin;
35
36	use HTMLPlugin;
37	use ImageConverter; # want the identify method
38	use util;
39
40	use strict; # every perl program should have this!
41	no strict 'refs'; # make an exception so we can use variables as filehandles
42
43	sub BEGIN {
44	@StructuredHTMLPlugin::ISA = ('HTMLPlugin');
45	}
46
47	my $arguments =
48	[
49	{ 'name' => "level1_header",
50	'desc' => "{StructuredHTMLPlugin.level1_header}",
51	'type' => "regexp",
52	'reqd' => "no",
53	'deft' => "" },
54	{ 'name' => "level2_header",
55	'desc' => "{StructuredHTMLPlugin.level2_header}",
56	'type' => "regexp",
57	'reqd' => "no",
58	'deft' => "" },
59	{ 'name' => "level3_header",
60	'desc' => "{StructuredHTMLPlugin.level3_header}",
61	'type' => "regexp",
62	'reqd' => "no",
63	'deft' => "" },
64	{ 'name' => "title_header",
65	'desc' => "{StructuredHTMLPlugin.title_header}",
66	'type' => "regexp",
67	'reqd' => "no",
68	'deft' => "" },
69	{ 'name' => "delete_toc",
70	'desc' => "{StructuredHTMLPlugin.delete_toc}",
71	'type' => "flag",
72	'reqd' => "no"},
73	{ 'name' => "toc_header",
74	'desc' => "{StructuredHTMLPlugin.toc_header}",
75	'type' => "regexp",
76	'reqd' => "no",
77	'deft' => "" }
78	];
79
80	my $options = { 'name' => "StructuredHTMLPlugin",
81	'desc' => "{StructuredHTMLPlugin.desc}",
82	'abstract' => "no",
83	'inherits' => "yes",
84	'args' => $arguments };
85
86	sub new {
87	my ($class) = shift (@_);
88	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
89	push(@$pluginlist, $class);
90
91	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92	push(@{$hashArgOptLists->{"OptList"}},$options);
93
94	my $self = new HTMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
95
96	return bless $self, $class;
97	}
98
99
100	sub process {
101	my $self = shift (@_);
102	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
103	my $outhandle = $self->{'outhandle'};
104
105	my @head_and_body = split(/<body/i,$$textref);
106	my $head = shift(@head_and_body);
107	my $body_text = join("<body", @head_and_body); # won't actually work to prefix "<body" to just the body remaining in @head_and_body array, since only 1 element (the body) remains in @head_and_body
108	$head =~ m/<title>(.+)<\/title>/i;
109	my $doctitle = $1 if defined $1;
110	if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
111	my @doc_properties = split(/<xml>/i,$head);
112	my $doc_heading = shift(@doc_properties);
113	my $rest_doc_properties = join(" ", @doc_properties);
114
115	my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
116	my $extracted_metadata = shift (@extracted_metadata);
117	$self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
118	}
119
120	# set the title here if we haven't found it yet
121	if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
122	if (defined $doctitle && $doctitle =~ /\S/) {
123	$doc_obj->add_metadata($doc_obj->get_top_section(), "Title", $doctitle);
124	} else {
125	$self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
126	}
127	}
128
129	# If delete_toc is enabled, it means to get rid of toc and tof contents.
130	# get rid of TOC and TOF sections and their title
131	if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
132	if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
133	$body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
134	}
135	}
136
137	if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
138	$self->{'title_header'} =~ s/^($)(.*)($)/$2/is;
139	$body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
140	}
141
142	if (defined $self->{'level1_header'} && $self->{'level1_header'}=~ /\S/ ){
143	$self->{'level1_header'} =~ s/^$(.*)$/$1/i;
144	$body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
145	}
146
147	if (defined $self->{'level2_header'} && $self->{'level2_header'}=~ /\S/){
148	$self->{'level2_header'} =~ s/^$(.*)$/$1/i;
149	$body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
150	}
151
152	if (defined $self->{'level3_header'} && $self->{'level3_header'}=~ /\S/ ){
153	$self->{'level3_header'} =~ s/^$(.*)$/$1/is;
154	$body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
155	}
156
157	# Tidy up extra new lines
158	$body_text =~ s/(<p[^>]><span[^>]><o:p> <\/o:p><\/span><\/p>)//isg;
159	$body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;
160
161	# what was the following line for. effectively unused. do we need it??
162	#$section_text .= "<!--\n<Section>\n-->\n";
163	#my $top_section_tag = "<!--\n<Section>\n-->\n";
164	#$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
165	#$body_text =~ s/(<div.*)/$top_section_tag$1/i;
166	my $body = "<body".$body_text;
167
168	# remove empty headings that Word's windows_scripting may insert for multiple new lines around headings
169	# have heading markup, e.g. <h2><o:p> </o:p></h2>
170	$body =~ s@<h[1-6]>(<o:p>)?( )+(</o:p>)?</h[1-6]>@@gis;
171
172	my $section_text = $head;
173
174	# split HTML text on <h1>, <h2> etc tags
175	my @h_split = split(/<h/i,$body);
176
177	my $hnum = 0;
178
179	my $sectionh1 = 0;
180	$section_text .= shift(@h_split);
181
182	# When windows_scripting is on, WordPlugin invokes Word to convert the doc(x) file to HTML which is then
183	# processed by this StructuredHTMLPlugin. However, Word will embed the entire HTML body content inside a <div>
184	# This <div> becomes problematic, since in sectioned documents, the first section would end up starting with a div
185	# but not contain a matching closing div, while the final section will end with an unmatched closing div.
186	# So, as a hack, we remove any opening <div> appearing immediately after the <body> before the first <h>eading.
187	# And we'll set a flag to remember to remove any corresponding closing </div> before the closing </body>.
188	# So, now we look for any unclosed <div> elements in the preamble (pre-Headings) html that is in $section_text
189	my $remove_global_div = 0;
190	if($section_text =~ m/^(.?)\s<div[^>]>\s$/is) {
191	$section_text = $1;
192	$remove_global_div = 1;
193	print $outhandle "********** Found and removed a global opening <div> at start of html body, will monitor for closing div too.\n"
194	if $self->{'verbosity'} > 2;
195	}
196
197	my $hc;
198	foreach $hc ( @h_split )
199	{
200	if ($hc =~ m/^([1-3])\s.?>(.*)$/s)
201	{
202	my $new_hnum = $1;
203	my $hc_after = $2;
204	my $filtered_hc = undef;
205
206	if ($hc_after =~ m/^(.?)<\/h$new_hnum>(.)$/is)
207	{
208	my $h_text = $1;
209	$filtered_hc = $2; # This represents the remainder of $hc, after the (e.g.) <h2>xxxx</h2> element
210
211	$hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
212	# boil HTML down to some interesting text
213	$h_text =~ s/^[1-3]>//;
214	$h_text =~ s/<\/?.*?>//sg;
215	$h_text =~ s/\s+/ /sg;
216	$h_text =~ s/^\s$//s;
217	$h_text =~ s/( )+\W*/ /sg;
218
219	if ($h_text =~ m/\w+/)
220	{
221	if ($new_hnum > $hnum)
222	{
223	# increase section nesting
224	$hnum++;
225	while ($hnum < $new_hnum)
226	{
227	my $spacing = " " x $hnum;
228	$section_text .= "<!--\n";
229	$section_text .= $spacing."<Section>\n";
230	$section_text .= "-->\n";
231	$hnum++;
232	}
233	}
234	else # ($new_hnum <= $hnum)
235	{
236	# descrease section nesting
237	while ($hnum >= $new_hnum)
238	{
239	my $spacing = " " x $hnum;
240	$section_text .= "<!--\n";
241	$section_text .= $spacing."</Section>\n";
242	$section_text .= "-->\n";
243	$hnum--;
244	}
245	$hnum++;
246	}
247
248	my $spacing = " " x $hnum;
249	$section_text .= "<!--\n";
250	$section_text .= $spacing."<Section>\n";
251	$section_text .= $spacing." <Description>\n";
252	$section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
253	$section_text .= $spacing." </Description>\n";
254	$section_text .= "-->\n";
255
256	#print $outhandle $spacing."$h_text\n"
257	# if $self->{'verbosity'} > 2;
258
259	$sectionh1++ if ($hnum==1);
260	}
261	}
262	else {
263	### print STDERR "***** hc = <h$hc\n\n";
264	}
265
266	# This can probably be replaced by the first statement in the if-statement, because
267	# $filtered_hc should always be defined when it is assigned above (even if it resolves
268	# to be an empty string)
269	if (defined $filtered_hc) {
270	$section_text .= $filtered_hc;
271	}
272	else {
273	$section_text .= "<h$hc";
274	}
275	}
276	else
277	{
278	$section_text .= "<h$hc";
279	}
280	}
281
282	if($remove_global_div) { # then need to also handle a closing </div> tag for the global div too, and if one is present, remove it
283	$section_text =~ s@\s</div[^>]>(\s</body>\s</html>\s*)$@$1@is;
284	print $outhandle "********** Removing any matching closing global divider element\n"
285	if $self->{'verbosity'} > 2;
286	}
287
288
289	while ($hnum >= 1)
290	{
291	my $spacing = " " x $hnum;
292	$section_text .= "<!--\n";
293	$section_text .= $spacing."</Section>\n";
294	$section_text .= "-->\n";
295	$hnum--;
296	}
297
298	$section_text .= "<!--\n</Section>\n-->\n";
299
300	$$textref = $section_text;
301
302	# if ($sectionh1>0)
303	# {
304	# print $outhandle " Located section headings ..."
305	# if $self->{'verbosity'} > 1;
306	# }
307
308	$$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
309
310	$$textref =~ s/( )+/ /sg;
311
312	## $$textref =~ s/<o:p> <\/o:p>//g; # used with VML to space figures?
313
314	$self->SUPER::process(@_);
315
316	}
317
318
319	sub resize_if_necessary
320	{
321	my ($self,$front,$back,$base_dir,$href) = @_;
322
323	# dig out width and height of image, if there
324	my $img_attributes = "$front back";
325	my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
326	my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
327
328	# derive local filename for image based on its URL
329	my $img_filename = $href;
330	$img_filename =~ s/^[^:]*:\/\///;
331	$img_filename = &util::filename_cat($base_dir, $img_filename);
332
333	# Replace %20's in URL with a space if required. Note that the filename
334	# may include the %20 in some situations
335	if ($img_filename =~ /\%20/) {
336	if (!-e $img_filename) {
337	$img_filename =~ s/\%20/ /g;
338	}
339	}
340	if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
341	# get image info on width and height
342
343	my $outhandle = $self->{'outhandle'};
344	my $verbosity = $self->{'verbosity'};
345
346	my ($image_type, $actual_width, $actual_height, $image_size)
347	= &ImageConverter::identify($img_filename, $outhandle, $verbosity);
348
349	#print STDERR "**** $actual_width x $actual_height";
350	#print STDERR " (requested: $img_width x $img_height)\n";
351
352	if (($img_width < $actual_width) \|\| ($img_height < $actual_height)) {
353	#print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
354
355	# derive new image name based on current image
356	my ($tailname, $dirname, $suffix)
357	= &File::Basename::fileparse($img_filename, "\\.[^\\.]+\$");
358
359	my $resized_filename
360	= &util::filename_cat($dirname, $tailname."_resized".$suffix);
361
362	#print STDERR "**** suffix = $suffix\n";
363
364	# Generate smaller image with convert
365	my $newsize = "$img_width"."x$img_height";
366	my $command = "convert -interlace plane -verbose "
367	."-geometry $newsize \"$img_filename\" \"$resized_filename\"";
368	$command = "\"".&util::get_perl_exec()."\" -S gs-magick.pl $command";
369	#print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
370	#my $result = '';
371	#print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
372	}
373	}
374	return $href;
375	}
376
377
378	sub extract_metadata
379	{
380	my $self = shift (@_);
381	my ($textref, $metadata, $doc_obj) = @_;
382	my $outhandle = $self->{'outhandle'};
383
384	return if (!defined $textref);
385
386	my $separator = $self->{'metadata_field_separator'};
387	if ($separator eq "") {
388	undef $separator;
389	}
390	# metadata fields to extract/save. 'key' is the (lowercase) name of the
391	# html meta, 'value' is the metadata name for greenstone to use
392	my %find_fields = ();
393	my ($tag,$value);
394
395	my $orig_field = "";
396	foreach my $field (split /\s,\s/, $self->{'metadata_fields'}) {
397	# support tag<tagname>
398	if ($field =~ /^(.?)\s<(.*?)>$/) {
399	# "$2" is the user's preferred gs metadata name
400	$find_fields{lc($1)}=$2; # lc = lowercase
401	$orig_field = $1;
402	} else { # no <tagname> for mapping
403	# "$field" is the user's preferred gs metadata name
404	$find_fields{lc($field)}=$field; # lc = lowercase
405	$orig_field = $field;
406	}
407
408	if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
409	$tag = $orig_field;
410	$value = $1;
411	if (!defined $value \|\| !defined $tag){
412	#print $outhandle "StructuredHTMLPlugin: can't find VALUE in \"$tag\"\n";
413	next;
414	} else {
415	# clean up and add
416	chomp($value); # remove trailing \n, if any
417	$tag = $find_fields{lc($tag)};
418	#print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
419	# if ($self->{'verbosity'} > 2);
420	if (defined $separator) {
421	my @values = split($separator, $value);
422	foreach my $v (@values) {
423	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $v) if $v =~ /\S/;
424	}
425	}
426	else {
427	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
428	}
429	}
430	}
431	}
432	}
433
434	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/plugins/StructuredHTMLPlugin.pm@ 33309

Download in other formats: