Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

StructuredHTMLPlugin.pm

Last change on this file was 33309, checked in by ak19, 5 years ago
More workarounds for HTML conversion results from Word's windows_scripting. If there were newlines between headings in the original word doc that accidentally had heading formatting, then the windows_scripting conversion creates a skeleton heading containing no actual text but space. The result being that when the doc.xml is generated, there's an empty subsection reflecting that empty heading. So adding further cleanup into StructuredHTMLPlugin to look for and remove these empty headings resulting from common Word user errors.
Property svn:keywords set to `Author Date Id Revision`
File size: 14.7 KB

Rev	Line
[10271]	1	###########################################################################
	2	#
[15872]	3	# StructuredHTMLPlugin.pm -- html plugin with extra facilities for teasing out
[10404]	4	# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
	5	# HTML document
[10271]	6	#
	7	# A component of the Greenstone digital library software
	8	# from the New Zealand Digital Library Project at the
	9	# University of Waikato, New Zealand.
	10	#
	11	# Copyright (C) 1999 New Zealand Digital Library Project
	12	#
	13	# This program is free software; you can redistribute it and/or modify
	14	# it under the terms of the GNU General Public License as published by
	15	# the Free Software Foundation; either version 2 of the License, or
	16	# (at your option) any later version.
	17	#
	18	# This program is distributed in the hope that it will be useful,
	19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	21	# GNU General Public License for more details.
	22	#
	23	# You should have received a copy of the GNU General Public License
	24	# along with this program; if not, write to the Free Software
	25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	26	#
	27	###########################################################################
[10404]	28	# This plugin is to process an HTML file where sections are divided by
	29	# user-defined headings tags. As it is difficult to predict what user's definition
	30	# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
	31	# as well as allows to get rid of user-defined Table of Content (TOC)...
	32	# format:e.g. level1 (Abstract_title\|ChapterTitle\|Referencing Heading) level2(SectionHeading)...
	33
[15872]	34	package StructuredHTMLPlugin;
[10271]	35
[15872]	36	use HTMLPlugin;
	37	use ImageConverter; # want the identify method
[24600]	38	use util;
[10271]	39
[15872]	40	use strict; # every perl program should have this!
	41	no strict 'refs'; # make an exception so we can use variables as filehandles
[10404]	42
[10271]	43	sub BEGIN {
[15872]	44	@StructuredHTMLPlugin::ISA = ('HTMLPlugin');
[10271]	45	}
	46
[11849]	47	my $arguments =
	48	[
	49	{ 'name' => "level1_header",
[15872]	50	'desc' => "{StructuredHTMLPlugin.level1_header}",
[11849]	51	'type' => "regexp",
	52	'reqd' => "no",
	53	'deft' => "" },
	54	{ 'name' => "level2_header",
[15872]	55	'desc' => "{StructuredHTMLPlugin.level2_header}",
[11849]	56	'type' => "regexp",
	57	'reqd' => "no",
	58	'deft' => "" },
	59	{ 'name' => "level3_header",
[15872]	60	'desc' => "{StructuredHTMLPlugin.level3_header}",
[11849]	61	'type' => "regexp",
	62	'reqd' => "no",
	63	'deft' => "" },
	64	{ 'name' => "title_header",
[15872]	65	'desc' => "{StructuredHTMLPlugin.title_header}",
[11849]	66	'type' => "regexp",
	67	'reqd' => "no",
	68	'deft' => "" },
[11884]	69	{ 'name' => "delete_toc",
[15872]	70	'desc' => "{StructuredHTMLPlugin.delete_toc}",
[11884]	71	'type' => "flag",
	72	'reqd' => "no"},
[11849]	73	{ 'name' => "toc_header",
[15872]	74	'desc' => "{StructuredHTMLPlugin.toc_header}",
[11849]	75	'type' => "regexp",
	76	'reqd' => "no",
[11884]	77	'deft' => "" }
[11849]	78	];
[10271]	79
[15872]	80	my $options = { 'name' => "StructuredHTMLPlugin",
	81	'desc' => "{StructuredHTMLPlugin.desc}",
[10271]	82	'abstract' => "no",
	83	'inherits' => "yes",
	84	'args' => $arguments };
	85
	86	sub new {
	87	my ($class) = shift (@_);
	88	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	89	push(@$pluginlist, $class);
	90
[15872]	91	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
	92	push(@{$hashArgOptLists->{"OptList"}},$options);
[10271]	93
[15872]	94	my $self = new HTMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
[10271]	95
	96	return bless $self, $class;
	97	}
	98
	99
	100	sub process {
	101	my $self = shift (@_);
[10404]	102	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[10271]	103	my $outhandle = $self->{'outhandle'};
	104
	105	my @head_and_body = split(/<body/i,$$textref);
	106	my $head = shift(@head_and_body);
[33301]	107	my $body_text = join("<body", @head_and_body); # won't actually work to prefix "<body" to just the body remaining in @head_and_body array, since only 1 element (the body) remains in @head_and_body
[10600]	108	$head =~ m/<title>(.+)<\/title>/i;
	109	my $doctitle = $1 if defined $1;
[10723]	110	if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
[10426]	111	my @doc_properties = split(/<xml>/i,$head);
	112	my $doc_heading = shift(@doc_properties);
	113	my $rest_doc_properties = join(" ", @doc_properties);
[11380]	114
[10426]	115	my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
	116	my $extracted_metadata = shift (@extracted_metadata);
	117	$self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
	118	}
	119
[11893]	120	# set the title here if we haven't found it yet
	121	if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
	122	if (defined $doctitle && $doctitle =~ /\S/) {
	123	$doc_obj->add_metadata($doc_obj->get_top_section(), "Title", $doctitle);
	124	} else {
	125	$self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
	126	}
	127	}
	128
	129	# If delete_toc is enabled, it means to get rid of toc and tof contents.
[10271]	130	# get rid of TOC and TOF sections and their title
[11380]	131	if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
[10496]	132	if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
	133	$body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
	134	}
[10443]	135	}
[10600]	136
[10496]	137	if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
[10271]	138	$self->{'title_header'} =~ s/^($)(.*)($)/$2/is;
[10600]	139	$body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
[10271]	140	}
[10443]	141
	142	if (defined $self->{'level1_header'} && $self->{'level1_header'}=~ /\S/ ){
	143	$self->{'level1_header'} =~ s/^$(.*)$/$1/i;
[10595]	144	$body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
[10271]	145	}
[10443]	146
[10496]	147	if (defined $self->{'level2_header'} && $self->{'level2_header'}=~ /\S/){
[10443]	148	$self->{'level2_header'} =~ s/^$(.*)$/$1/i;
[10595]	149	$body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
[10271]	150	}
	151
[10496]	152	if (defined $self->{'level3_header'} && $self->{'level3_header'}=~ /\S/ ){
[10443]	153	$self->{'level3_header'} =~ s/^$(.*)$/$1/is;
[10271]	154	$body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
	155	}
[11893]	156
[10271]	157	# Tidy up extra new lines
	158	$body_text =~ s/(<p[^>]><span[^>]><o:p> <\/o:p><\/span><\/p>)//isg;
	159	$body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;
[33301]	160
	161	# what was the following line for. effectively unused. do we need it??
[15872]	162	#$section_text .= "<!--\n<Section>\n-->\n";
[10600]	163	#my $top_section_tag = "<!--\n<Section>\n-->\n";
	164	#$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
	165	#$body_text =~ s/(<div.*)/$top_section_tag$1/i;
[10271]	166	my $body = "<body".$body_text;
[33301]	167
[33309]	168	# remove empty headings that Word's windows_scripting may insert for multiple new lines around headings
	169	# have heading markup, e.g. <h2><o:p> </o:p></h2>
	170	$body =~ s@<h[1-6]>(<o:p>)?( )+(</o:p>)?</h[1-6]>@@gis;
	171
[10271]	172	my $section_text = $head;
[10426]	173
[10271]	174	# split HTML text on <h1>, <h2> etc tags
	175	my @h_split = split(/<h/i,$body);
[10426]	176
[10271]	177	my $hnum = 0;
	178
	179	my $sectionh1 = 0;
	180	$section_text .= shift(@h_split);
[33301]	181
	182	# When windows_scripting is on, WordPlugin invokes Word to convert the doc(x) file to HTML which is then
	183	# processed by this StructuredHTMLPlugin. However, Word will embed the entire HTML body content inside a <div>
	184	# This <div> becomes problematic, since in sectioned documents, the first section would end up starting with a div
	185	# but not contain a matching closing div, while the final section will end with an unmatched closing div.
	186	# So, as a hack, we remove any opening <div> appearing immediately after the <body> before the first <h>eading.
	187	# And we'll set a flag to remember to remove any corresponding closing </div> before the closing </body>.
	188	# So, now we look for any unclosed <div> elements in the preamble (pre-Headings) html that is in $section_text
	189	my $remove_global_div = 0;
	190	if($section_text =~ m/^(.?)\s<div[^>]>\s$/is) {
	191	$section_text = $1;
	192	$remove_global_div = 1;
	193	print $outhandle "********** Found and removed a global opening <div> at start of html body, will monitor for closing div too.\n"
	194	if $self->{'verbosity'} > 2;
	195	}
	196
[10271]	197	my $hc;
	198	foreach $hc ( @h_split )
	199	{
[33299]	200	if ($hc =~ m/^([1-3])\s.?>(.*)$/s)
[10271]	201	{
[33299]	202	my $new_hnum = $1;
	203	my $hc_after = $2;
	204	my $filtered_hc = undef;
	205
	206	if ($hc_after =~ m/^(.?)<\/h$new_hnum>(.)$/is)
[10271]	207	{
[33299]	208	my $h_text = $1;
	209	$filtered_hc = $2; # This represents the remainder of $hc, after the (e.g.) <h2>xxxx</h2> element
	210
	211	$hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
	212	# boil HTML down to some interesting text
	213	$h_text =~ s/^[1-3]>//;
	214	$h_text =~ s/<\/?.*?>//sg;
	215	$h_text =~ s/\s+/ /sg;
	216	$h_text =~ s/^\s$//s;
	217	$h_text =~ s/( )+\W*/ /sg;
	218
	219	if ($h_text =~ m/\w+/)
	220	{
	221	if ($new_hnum > $hnum)
	222	{
	223	# increase section nesting
	224	$hnum++;
	225	while ($hnum < $new_hnum)
	226	{
	227	my $spacing = " " x $hnum;
	228	$section_text .= "<!--\n";
	229	$section_text .= $spacing."<Section>\n";
	230	$section_text .= "-->\n";
	231	$hnum++;
	232	}
	233	}
	234	else # ($new_hnum <= $hnum)
	235	{
	236	# descrease section nesting
	237	while ($hnum >= $new_hnum)
	238	{
	239	my $spacing = " " x $hnum;
	240	$section_text .= "<!--\n";
	241	$section_text .= $spacing."</Section>\n";
	242	$section_text .= "-->\n";
	243	$hnum--;
	244	}
	245	$hnum++;
	246	}
	247
	248	my $spacing = " " x $hnum;
	249	$section_text .= "<!--\n";
	250	$section_text .= $spacing."<Section>\n";
	251	$section_text .= $spacing." <Description>\n";
	252	$section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
	253	$section_text .= $spacing." </Description>\n";
	254	$section_text .= "-->\n";
	255
	256	#print $outhandle $spacing."$h_text\n"
	257	# if $self->{'verbosity'} > 2;
	258
	259	$sectionh1++ if ($hnum==1);
	260	}
[10271]	261	}
[33299]	262	else {
	263	### print STDERR "***** hc = <h$hc\n\n";
[10271]	264	}
[33299]	265
	266	# This can probably be replaced by the first statement in the if-statement, because
	267	# $filtered_hc should always be defined when it is assigned above (even if it resolves
	268	# to be an empty string)
	269	if (defined $filtered_hc) {
	270	$section_text .= $filtered_hc;
	271	}
	272	else {
	273	$section_text .= "<h$hc";
	274	}
[10271]	275	}
[33299]	276	else
	277	{
	278	$section_text .= "<h$hc";
	279	}
[10271]	280	}
[33301]	281
	282	if($remove_global_div) { # then need to also handle a closing </div> tag for the global div too, and if one is present, remove it
	283	$section_text =~ s@\s</div[^>]>(\s</body>\s</html>\s*)$@$1@is;
	284	print $outhandle "********** Removing any matching closing global divider element\n"
	285	if $self->{'verbosity'} > 2;
	286	}
	287
	288
	289	while ($hnum >= 1)
[10271]	290	{
	291	my $spacing = " " x $hnum;
	292	$section_text .= "<!--\n";
	293	$section_text .= $spacing."</Section>\n";
	294	$section_text .= "-->\n";
	295	$hnum--;
	296	}
	297
	298	$section_text .= "<!--\n</Section>\n-->\n";
[33301]	299
[10271]	300	$$textref = $section_text;
[10426]	301
[11893]	302	# if ($sectionh1>0)
	303	# {
	304	# print $outhandle " Located section headings ..."
	305	# if $self->{'verbosity'} > 1;
	306	# }
[10426]	307
[10271]	308	$$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
[10426]	309
[10271]	310	$$textref =~ s/( )+/ /sg;
	311
[10426]	312	## $$textref =~ s/<o:p> <\/o:p>//g; # used with VML to space figures?
	313
[10271]	314	$self->SUPER::process(@_);
[10426]	315
[10271]	316	}
	317
	318
	319	sub resize_if_necessary
	320	{
	321	my ($self,$front,$back,$base_dir,$href) = @_;
[10426]	322
[10271]	323	# dig out width and height of image, if there
	324	my $img_attributes = "$front back";
	325	my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
	326	my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
[10426]	327
[10271]	328	# derive local filename for image based on its URL
	329	my $img_filename = $href;
	330	$img_filename =~ s/^[^:]*:\/\///;
	331	$img_filename = &util::filename_cat($base_dir, $img_filename);
[10426]	332
[10271]	333	# Replace %20's in URL with a space if required. Note that the filename
	334	# may include the %20 in some situations
	335	if ($img_filename =~ /\%20/) {
	336	if (!-e $img_filename) {
	337	$img_filename =~ s/\%20/ /g;
	338	}
	339	}
	340	if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
	341	# get image info on width and height
[10426]	342
[10271]	343	my $outhandle = $self->{'outhandle'};
	344	my $verbosity = $self->{'verbosity'};
	345
	346	my ($image_type, $actual_width, $actual_height, $image_size)
[15872]	347	= &ImageConverter::identify($img_filename, $outhandle, $verbosity);
[10426]	348
[10356]	349	#print STDERR "**** $actual_width x $actual_height";
	350	#print STDERR " (requested: $img_width x $img_height)\n";
[10271]	351
	352	if (($img_width < $actual_width) \|\| ($img_height < $actual_height)) {
[11893]	353	#print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
[10426]	354
[10271]	355	# derive new image name based on current image
	356	my ($tailname, $dirname, $suffix)
[15872]	357	= &File::Basename::fileparse($img_filename, "\\.[^\\.]+\$");
[10426]	358
[10271]	359	my $resized_filename
	360	= &util::filename_cat($dirname, $tailname."_resized".$suffix);
[10426]	361
[10356]	362	#print STDERR "**** suffix = $suffix\n";
[10426]	363
[10271]	364	# Generate smaller image with convert
[15872]	365	my $newsize = "$img_width"."x$img_height";
[10271]	366	my $command = "convert -interlace plane -verbose "
[15872]	367	."-geometry $newsize \"$img_filename\" \"$resized_filename\"";
[24600]	368	$command = "\"".&util::get_perl_exec()."\" -S gs-magick.pl $command";
[11893]	369	#print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
	370	#my $result = '';
	371	#print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
[10271]	372	}
	373	}
	374	return $href;
	375	}
	376
	377
[10426]	378	sub extract_metadata
	379	{
	380	my $self = shift (@_);
	381	my ($textref, $metadata, $doc_obj) = @_;
	382	my $outhandle = $self->{'outhandle'};
	383
[11380]	384	return if (!defined $textref);
	385
[21801]	386	my $separator = $self->{'metadata_field_separator'};
	387	if ($separator eq "") {
	388	undef $separator;
	389	}
[10426]	390	# metadata fields to extract/save. 'key' is the (lowercase) name of the
	391	# html meta, 'value' is the metadata name for greenstone to use
	392	my %find_fields = ();
	393	my ($tag,$value);
[10271]	394
[10426]	395	my $orig_field = "";
[19993]	396	foreach my $field (split /\s,\s/, $self->{'metadata_fields'}) {
[10426]	397	# support tag<tagname>
[19993]	398	if ($field =~ /^(.?)\s<(.*?)>$/) {
[10426]	399	# "$2" is the user's preferred gs metadata name
	400	$find_fields{lc($1)}=$2; # lc = lowercase
	401	$orig_field = $1;
	402	} else { # no <tagname> for mapping
	403	# "$field" is the user's preferred gs metadata name
	404	$find_fields{lc($field)}=$field; # lc = lowercase
	405	$orig_field = $field;
	406	}
[11380]	407
[10426]	408	if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
	409	$tag = $orig_field;
	410	$value = $1;
	411	if (!defined $value \|\| !defined $tag){
[15872]	412	#print $outhandle "StructuredHTMLPlugin: can't find VALUE in \"$tag\"\n";
[10426]	413	next;
	414	} else {
	415	# clean up and add
	416	chomp($value); # remove trailing \n, if any
	417	$tag = $find_fields{lc($tag)};
[11893]	418	#print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
	419	# if ($self->{'verbosity'} > 2);
[21801]	420	if (defined $separator) {
	421	my @values = split($separator, $value);
	422	foreach my $v (@values) {
	423	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $v) if $v =~ /\S/;
	424	}
	425	}
	426	else {
	427	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
	428	}
[10426]	429	}
	430	}
	431	}
	432	}
	433
[10271]	434	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/plugins/StructuredHTMLPlugin.pm

Download in other formats: