Context Navigation

source: gsdl/trunk/perllib/plugins/StructuredHTMLPlugin.pm@ 19993

Last change on this file since 19993 was 19993, checked in by ak19, 15 years ago
Dr Bainbridge's fix to deal with spaces in metadata_fields option (spaces before angle brackets and around commas that separate a list of metadata fields).
Property svn:keywords set to `Author Date Id Revision`
File size: 12.7 KB

Rev	Line
[10271]	1	###########################################################################
	2	#
[15872]	3	# StructuredHTMLPlugin.pm -- html plugin with extra facilities for teasing out
[10404]	4	# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
	5	# HTML document
[10271]	6	#
	7	# A component of the Greenstone digital library software
	8	# from the New Zealand Digital Library Project at the
	9	# University of Waikato, New Zealand.
	10	#
	11	# Copyright (C) 1999 New Zealand Digital Library Project
	12	#
	13	# This program is free software; you can redistribute it and/or modify
	14	# it under the terms of the GNU General Public License as published by
	15	# the Free Software Foundation; either version 2 of the License, or
	16	# (at your option) any later version.
	17	#
	18	# This program is distributed in the hope that it will be useful,
	19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	21	# GNU General Public License for more details.
	22	#
	23	# You should have received a copy of the GNU General Public License
	24	# along with this program; if not, write to the Free Software
	25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	26	#
	27	###########################################################################
[10404]	28	# This plugin is to process an HTML file where sections are divided by
	29	# user-defined headings tags. As it is difficult to predict what user's definition
	30	# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
	31	# as well as allows to get rid of user-defined Table of Content (TOC)...
	32	# format:e.g. level1 (Abstract_title\|ChapterTitle\|Referencing Heading) level2(SectionHeading)...
	33
[15872]	34	package StructuredHTMLPlugin;
[10271]	35
[15872]	36	use HTMLPlugin;
	37	use ImageConverter; # want the identify method
[10271]	38
[15872]	39	use strict; # every perl program should have this!
	40	no strict 'refs'; # make an exception so we can use variables as filehandles
[10404]	41
[10271]	42	sub BEGIN {
[15872]	43	@StructuredHTMLPlugin::ISA = ('HTMLPlugin');
[10271]	44	}
	45
[11849]	46	my $arguments =
	47	[
	48	{ 'name' => "level1_header",
[15872]	49	'desc' => "{StructuredHTMLPlugin.level1_header}",
[11849]	50	'type' => "regexp",
	51	'reqd' => "no",
	52	'deft' => "" },
	53	{ 'name' => "level2_header",
[15872]	54	'desc' => "{StructuredHTMLPlugin.level2_header}",
[11849]	55	'type' => "regexp",
	56	'reqd' => "no",
	57	'deft' => "" },
	58	{ 'name' => "level3_header",
[15872]	59	'desc' => "{StructuredHTMLPlugin.level3_header}",
[11849]	60	'type' => "regexp",
	61	'reqd' => "no",
	62	'deft' => "" },
	63	{ 'name' => "title_header",
[15872]	64	'desc' => "{StructuredHTMLPlugin.title_header}",
[11849]	65	'type' => "regexp",
	66	'reqd' => "no",
	67	'deft' => "" },
[11884]	68	{ 'name' => "delete_toc",
[15872]	69	'desc' => "{StructuredHTMLPlugin.delete_toc}",
[11884]	70	'type' => "flag",
	71	'reqd' => "no"},
[11849]	72	{ 'name' => "toc_header",
[15872]	73	'desc' => "{StructuredHTMLPlugin.toc_header}",
[11849]	74	'type' => "regexp",
	75	'reqd' => "no",
[11884]	76	'deft' => "" }
[11849]	77	];
[10271]	78
[15872]	79	my $options = { 'name' => "StructuredHTMLPlugin",
	80	'desc' => "{StructuredHTMLPlugin.desc}",
[10271]	81	'abstract' => "no",
	82	'inherits' => "yes",
	83	'args' => $arguments };
	84
	85	sub new {
	86	my ($class) = shift (@_);
	87	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	88	push(@$pluginlist, $class);
	89
[15872]	90	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
	91	push(@{$hashArgOptLists->{"OptList"}},$options);
[10271]	92
[15872]	93	my $self = new HTMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
[10271]	94
	95	return bless $self, $class;
	96	}
	97
	98
	99	sub process {
	100	my $self = shift (@_);
[10404]	101	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[10271]	102	my $outhandle = $self->{'outhandle'};
	103
	104	my @head_and_body = split(/<body/i,$$textref);
	105	my $head = shift(@head_and_body);
	106	my $body_text = join("<body", @head_and_body);
[10600]	107	$head =~ m/<title>(.+)<\/title>/i;
	108	my $doctitle = $1 if defined $1;
[10723]	109	if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
[10426]	110	my @doc_properties = split(/<xml>/i,$head);
	111	my $doc_heading = shift(@doc_properties);
	112	my $rest_doc_properties = join(" ", @doc_properties);
[11380]	113
[10426]	114	my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
	115	my $extracted_metadata = shift (@extracted_metadata);
	116	$self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
	117	}
	118
[11893]	119	# set the title here if we haven't found it yet
	120	if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
	121	if (defined $doctitle && $doctitle =~ /\S/) {
	122	$doc_obj->add_metadata($doc_obj->get_top_section(), "Title", $doctitle);
	123	} else {
	124	$self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
	125	}
	126	}
	127
	128	# If delete_toc is enabled, it means to get rid of toc and tof contents.
[10271]	129	# get rid of TOC and TOF sections and their title
[11380]	130	if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
[10496]	131	if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
	132	$body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
	133	}
[10443]	134	}
[10600]	135
[10496]	136	if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
[10271]	137	$self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
[10600]	138	$body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
[10271]	139	}
[10443]	140
	141	if (defined $self->{'level1_header'} && $self->{'level1_header'}=~ /\S/ ){
	142	$self->{'level1_header'} =~ s/^\((.*)\)/$1/i;
[10595]	143	$body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
[10271]	144	}
[10443]	145
[10496]	146	if (defined $self->{'level2_header'} && $self->{'level2_header'}=~ /\S/){
[10443]	147	$self->{'level2_header'} =~ s/^\((.*)\)/$1/i;
[10595]	148	$body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
[10271]	149	}
	150
[10496]	151	if (defined $self->{'level3_header'} && $self->{'level3_header'}=~ /\S/ ){
[10443]	152	$self->{'level3_header'} =~ s/^\((.*)\)/$1/is;
[10271]	153	$body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
	154	}
[11893]	155
[10271]	156	# Tidy up extra new lines
	157	$body_text =~ s/(<p[^>]><span[^>]><o:p> <\/o:p><\/span><\/p>)//isg;
	158	$body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;
[10426]	159
[15872]	160	# what was the following line for. effectively unused. do we need it??
	161	#$section_text .= "<!--\n<Section>\n-->\n";
[10600]	162	#my $top_section_tag = "<!--\n<Section>\n-->\n";
	163	#$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
	164	#$body_text =~ s/(<div.*)/$top_section_tag$1/i;
[10271]	165	my $body = "<body".$body_text;
	166
	167	my $section_text = $head;
[10426]	168
[10271]	169	# split HTML text on <h1>, <h2> etc tags
	170	my @h_split = split(/<h/i,$body);
[10426]	171
[10271]	172	my $hnum = 0;
	173
	174	my $sectionh1 = 0;
	175	$section_text .= shift(@h_split);
[10426]	176
[10271]	177	my $hc;
	178	foreach $hc ( @h_split )
	179	{
	180	if ($hc =~ m/^([1-3])\s.?>(.*)$/s)
	181	{
	182	my $new_hnum = $1;
	183	my $hc_after = $2;
	184
	185	if ($hc_after =~ m/^(.*?)<\/h$new_hnum>/is)
	186	{
	187	my $h_text = $1;
	188	$hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
	189	# boil HTML down to some interesting text
	190	$h_text =~ s/^[1-3]>//;
	191	$h_text =~ s/<\/?.*?>//sg;
	192	$h_text =~ s/\s+/ /sg;
	193	$h_text =~ s/^\s$//s;
	194	$h_text =~ s/( )+\W*/ /sg;
[10426]	195
[10271]	196	if ($h_text =~ m/\w+/)
	197	{
	198	if ($new_hnum > $hnum)
	199	{
	200	# increase section nesting
	201	$hnum++;
	202	while ($hnum < $new_hnum)
	203	{
	204	my $spacing = " " x $hnum;
	205	$section_text .= "<!--\n";
	206	$section_text .= $spacing."<Section>\n";
	207	$section_text .= "-->\n";
	208	$hnum++;
	209	}
	210	}
	211	else # ($new_hnum <= $hnum)
	212	{
	213	# descrease section nesting
	214	while ($hnum >= $new_hnum)
	215	{
	216	my $spacing = " " x $hnum;
	217	$section_text .= "<!--\n";
	218	$section_text .= $spacing."</Section>\n";
	219	$section_text .= "-->\n";
	220	$hnum--;
	221	}
	222	$hnum++;
	223	}
	224
[10600]	225	my $spacing = " " x $hnum;
[10271]	226	$section_text .= "<!--\n";
	227	$section_text .= $spacing."<Section>\n";
	228	$section_text .= $spacing." <Description>\n";
	229	$section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
	230	$section_text .= $spacing." </Description>\n";
	231	$section_text .= "-->\n";
	232
[11893]	233	#print $outhandle $spacing."$h_text\n"
	234	# if $self->{'verbosity'} > 2;
[10426]	235
[10271]	236	$sectionh1++ if ($hnum==1);
	237	}
	238	}
	239	else {
	240	### print STDERR "***** hc = <h$hc\n\n";
	241	}
	242	$section_text .= "<h$hc";
	243	}
	244	else
	245	{
	246	$section_text .= "<h$hc";
	247	}
	248	}
	249
	250	while ($hnum >= 1)
	251	{
	252	my $spacing = " " x $hnum;
	253	$section_text .= "<!--\n";
	254	$section_text .= $spacing."</Section>\n";
	255	$section_text .= "-->\n";
	256	$hnum--;
	257	}
	258
	259	$section_text .= "<!--\n</Section>\n-->\n";
	260
	261	$$textref = $section_text;
[10426]	262
[11893]	263	# if ($sectionh1>0)
	264	# {
	265	# print $outhandle " Located section headings ..."
	266	# if $self->{'verbosity'} > 1;
	267	# }
[10426]	268
[10271]	269	$$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
[10426]	270
[10271]	271	$$textref =~ s/( )+/ /sg;
	272
[10426]	273	## $$textref =~ s/<o:p> <\/o:p>//g; # used with VML to space figures?
	274
[10271]	275	$self->SUPER::process(@_);
[10426]	276
[10271]	277	}
	278
	279
	280	sub resize_if_necessary
	281	{
	282	my ($self,$front,$back,$base_dir,$href) = @_;
[10426]	283
[10271]	284	# dig out width and height of image, if there
	285	my $img_attributes = "$front back";
	286	my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
	287	my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
[10426]	288
[10271]	289	# derive local filename for image based on its URL
	290	my $img_filename = $href;
	291	$img_filename =~ s/^[^:]*:\/\///;
	292	$img_filename = &util::filename_cat($base_dir, $img_filename);
[10426]	293
[10271]	294	# Replace %20's in URL with a space if required. Note that the filename
	295	# may include the %20 in some situations
	296	if ($img_filename =~ /\%20/) {
	297	if (!-e $img_filename) {
	298	$img_filename =~ s/\%20/ /g;
	299	}
	300	}
	301	if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
	302	# get image info on width and height
[10426]	303
[10271]	304	my $outhandle = $self->{'outhandle'};
	305	my $verbosity = $self->{'verbosity'};
	306
	307	my ($image_type, $actual_width, $actual_height, $image_size)
[15872]	308	= &ImageConverter::identify($img_filename, $outhandle, $verbosity);
[10426]	309
[10356]	310	#print STDERR "**** $actual_width x $actual_height";
	311	#print STDERR " (requested: $img_width x $img_height)\n";
[10271]	312
	313	if (($img_width < $actual_width) \|\| ($img_height < $actual_height)) {
[11893]	314	#print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
[10426]	315
[10271]	316	# derive new image name based on current image
	317	my ($tailname, $dirname, $suffix)
[15872]	318	= &File::Basename::fileparse($img_filename, "\\.[^\\.]+\$");
[10426]	319
[10271]	320	my $resized_filename
	321	= &util::filename_cat($dirname, $tailname."_resized".$suffix);
[10426]	322
[10356]	323	#print STDERR "**** suffix = $suffix\n";
[10426]	324
[10271]	325	# Generate smaller image with convert
[15872]	326	my $newsize = "$img_width"."x$img_height";
[10271]	327	my $command = "convert -interlace plane -verbose "
[15872]	328	."-geometry $newsize \"$img_filename\" \"$resized_filename\"";
[11893]	329	#print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
	330	#my $result = '';
	331	#print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
[10271]	332	}
	333	}
	334	return $href;
	335	}
	336
	337	sub replace_images {
	338	my $self = shift (@_);
	339	my ($front, $link, $back, $base_dir,
	340	$file, $doc_obj, $section) = @_;
	341	# remove quotes from link at start and end if necessary
	342	if ($link=~/^\"/) {
	343	$link=~s/^\"//;$link=~s/\"$//;
	344	$front.='"';
	345	$back="\"$back";
	346	}
[10426]	347
[10271]	348	$link =~ s/\n/ /g;
[10426]	349
[10271]	350	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
[10426]	351
[10271]	352	## $href = $self->resize_if_necessary($front,$back,$base_dir,$href);
[10426]	353
[10271]	354	my $middle = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
[10426]	355
[10271]	356	return $front . $middle . $back;
	357	}
	358
[10426]	359	sub extract_metadata
	360	{
	361	my $self = shift (@_);
	362	my ($textref, $metadata, $doc_obj) = @_;
	363	my $outhandle = $self->{'outhandle'};
	364
[11380]	365	return if (!defined $textref);
	366
[10426]	367	# metadata fields to extract/save. 'key' is the (lowercase) name of the
	368	# html meta, 'value' is the metadata name for greenstone to use
	369	my %find_fields = ();
	370	my ($tag,$value);
[10271]	371
[10426]	372	my $orig_field = "";
[19993]	373	foreach my $field (split /\s,\s/, $self->{'metadata_fields'}) {
[10426]	374	# support tag<tagname>
[19993]	375	if ($field =~ /^(.?)\s<(.*?)>$/) {
[10426]	376	# "$2" is the user's preferred gs metadata name
	377	$find_fields{lc($1)}=$2; # lc = lowercase
	378	$orig_field = $1;
	379	} else { # no <tagname> for mapping
	380	# "$field" is the user's preferred gs metadata name
	381	$find_fields{lc($field)}=$field; # lc = lowercase
	382	$orig_field = $field;
	383	}
[11380]	384
[10426]	385	if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
	386	$tag = $orig_field;
	387	$value = $1;
	388	if (!defined $value \|\| !defined $tag){
[15872]	389	#print $outhandle "StructuredHTMLPlugin: can't find VALUE in \"$tag\"\n";
[10426]	390	next;
	391	} else {
	392	# clean up and add
	393	chomp($value); # remove trailing \n, if any
	394	$tag = $find_fields{lc($tag)};
[11893]	395	#print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
	396	# if ($self->{'verbosity'} > 2);
[10426]	397	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
	398	}
	399	}
	400	}
	401	}
	402
[10271]	403	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: