Context Navigation

source: gsdl/trunk/perllib/plugins/StructuredHTMLPlugin.pm@ 17033

Last change on this file since 17033 was 16104, checked in by kjdon, 16 years ago
tried to make the 'xxxplugin processing file' print statements more consistent. They are now done in read (or read_into_doc_obj) and not process
Property svn:keywords set to `Author Date Id Revision`
File size: 12.7 KB

Rev	Line
[10271]	1	###########################################################################
	2	#
[15872]	3	# StructuredHTMLPlugin.pm -- html plugin with extra facilities for teasing out
[10404]	4	# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
	5	# HTML document
[10271]	6	#
	7	# A component of the Greenstone digital library software
	8	# from the New Zealand Digital Library Project at the
	9	# University of Waikato, New Zealand.
	10	#
	11	# Copyright (C) 1999 New Zealand Digital Library Project
	12	#
	13	# This program is free software; you can redistribute it and/or modify
	14	# it under the terms of the GNU General Public License as published by
	15	# the Free Software Foundation; either version 2 of the License, or
	16	# (at your option) any later version.
	17	#
	18	# This program is distributed in the hope that it will be useful,
	19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	21	# GNU General Public License for more details.
	22	#
	23	# You should have received a copy of the GNU General Public License
	24	# along with this program; if not, write to the Free Software
	25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	26	#
	27	###########################################################################
[10404]	28	# This plugin is to process an HTML file where sections are divided by
	29	# user-defined headings tags. As it is difficult to predict what user's definition
	30	# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
	31	# as well as allows to get rid of user-defined Table of Content (TOC)...
	32	# format:e.g. level1 (Abstract_title\|ChapterTitle\|Referencing Heading) level2(SectionHeading)...
	33
[15872]	34	package StructuredHTMLPlugin;
[10271]	35
[15872]	36	use HTMLPlugin;
	37	use ImageConverter; # want the identify method
[10271]	38
[15872]	39	use strict; # every perl program should have this!
	40	no strict 'refs'; # make an exception so we can use variables as filehandles
[10404]	41
[10271]	42	sub BEGIN {
[15872]	43	@StructuredHTMLPlugin::ISA = ('HTMLPlugin');
[10271]	44	}
	45
[11849]	46	my $arguments =
	47	[
	48	{ 'name' => "level1_header",
[15872]	49	'desc' => "{StructuredHTMLPlugin.level1_header}",
[11849]	50	'type' => "regexp",
	51	'reqd' => "no",
	52	'deft' => "" },
	53	{ 'name' => "level2_header",
[15872]	54	'desc' => "{StructuredHTMLPlugin.level2_header}",
[11849]	55	'type' => "regexp",
	56	'reqd' => "no",
	57	'deft' => "" },
	58	{ 'name' => "level3_header",
[15872]	59	'desc' => "{StructuredHTMLPlugin.level3_header}",
[11849]	60	'type' => "regexp",
	61	'reqd' => "no",
	62	'deft' => "" },
	63	{ 'name' => "title_header",
[15872]	64	'desc' => "{StructuredHTMLPlugin.title_header}",
[11849]	65	'type' => "regexp",
	66	'reqd' => "no",
	67	'deft' => "" },
[11884]	68	{ 'name' => "delete_toc",
[15872]	69	'desc' => "{StructuredHTMLPlugin.delete_toc}",
[11884]	70	'type' => "flag",
	71	'reqd' => "no"},
[11849]	72	{ 'name' => "toc_header",
[15872]	73	'desc' => "{StructuredHTMLPlugin.toc_header}",
[11849]	74	'type' => "regexp",
	75	'reqd' => "no",
[11884]	76	'deft' => "" }
[11849]	77	];
[10271]	78
[15872]	79	my $options = { 'name' => "StructuredHTMLPlugin",
	80	'desc' => "{StructuredHTMLPlugin.desc}",
[10271]	81	'abstract' => "no",
	82	'inherits' => "yes",
	83	'args' => $arguments };
	84
	85	sub new {
	86	my ($class) = shift (@_);
	87	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	88	push(@$pluginlist, $class);
	89
[15872]	90	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
	91	push(@{$hashArgOptLists->{"OptList"}},$options);
[10271]	92
[15872]	93	my $self = new HTMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
[10271]	94
	95	return bless $self, $class;
	96	}
	97
	98
	99	sub process {
	100	my $self = shift (@_);
[10404]	101	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[10271]	102	my $outhandle = $self->{'outhandle'};
	103
	104	my @head_and_body = split(/<body/i,$$textref);
	105	my $head = shift(@head_and_body);
	106	my $body_text = join("<body", @head_and_body);
[10600]	107	$head =~ m/<title>(.+)<\/title>/i;
	108	my $doctitle = $1 if defined $1;
[10723]	109	if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
[10426]	110	my @doc_properties = split(/<xml>/i,$head);
	111	my $doc_heading = shift(@doc_properties);
	112	my $rest_doc_properties = join(" ", @doc_properties);
[11380]	113
[10426]	114	my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
	115	my $extracted_metadata = shift (@extracted_metadata);
	116	$self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
	117	}
	118
[11893]	119	# set the title here if we haven't found it yet
	120	if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
	121	if (defined $doctitle && $doctitle =~ /\S/) {
	122	$doc_obj->add_metadata($doc_obj->get_top_section(), "Title", $doctitle);
	123	} else {
	124	$self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
	125	}
	126	}
	127
	128	# If delete_toc is enabled, it means to get rid of toc and tof contents.
[10271]	129	# get rid of TOC and TOF sections and their title
[11380]	130	if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
[10496]	131	if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
	132	$body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
	133	}
[10443]	134	}
[10600]	135
[10496]	136	if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
[10271]	137	$self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
[10600]	138	$body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
[10271]	139	}
[10443]	140
	141	if (defined $self->{'level1_header'} && $self->{'level1_header'}=~ /\S/ ){
	142	$self->{'level1_header'} =~ s/^\((.*)\)/$1/i;
[10595]	143	$body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
[10271]	144	}
[10443]	145
[10496]	146	if (defined $self->{'level2_header'} && $self->{'level2_header'}=~ /\S/){
[10443]	147	$self->{'level2_header'} =~ s/^\((.*)\)/$1/i;
[10595]	148	$body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
[10271]	149	}
	150
[10496]	151	if (defined $self->{'level3_header'} && $self->{'level3_header'}=~ /\S/ ){
[10443]	152	$self->{'level3_header'} =~ s/^\((.*)\)/$1/is;
[10271]	153	$body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
	154	}
[11893]	155
[10271]	156	# Tidy up extra new lines
	157	$body_text =~ s/(<p[^>]><span[^>]><o:p> <\/o:p><\/span><\/p>)//isg;
	158	$body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;
[10426]	159
[15872]	160	# what was the following line for. effectively unused. do we need it??
	161	#$section_text .= "<!--\n<Section>\n-->\n";
[10600]	162	#my $top_section_tag = "<!--\n<Section>\n-->\n";
	163	#$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
	164	#$body_text =~ s/(<div.*)/$top_section_tag$1/i;
[10271]	165	my $body = "<body".$body_text;
	166
	167	my $section_text = $head;
[10426]	168
[10271]	169	# split HTML text on <h1>, <h2> etc tags
	170	my @h_split = split(/<h/i,$body);
[10426]	171
[10271]	172	my $hnum = 0;
	173
	174	my $sectionh1 = 0;
	175	$section_text .= shift(@h_split);
[10426]	176
[10271]	177	my $hc;
	178	foreach $hc ( @h_split )
	179	{
	180	if ($hc =~ m/^([1-3])\s.?>(.*)$/s)
	181	{
	182	my $new_hnum = $1;
	183	my $hc_after = $2;
	184
	185	if ($hc_after =~ m/^(.*?)<\/h$new_hnum>/is)
	186	{
	187	my $h_text = $1;
	188	$hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
	189	# boil HTML down to some interesting text
	190	$h_text =~ s/^[1-3]>//;
	191	$h_text =~ s/<\/?.*?>//sg;
	192	$h_text =~ s/\s+/ /sg;
	193	$h_text =~ s/^\s$//s;
	194	$h_text =~ s/( )+\W*/ /sg;
[10426]	195
[10271]	196	if ($h_text =~ m/\w+/)
	197	{
	198	if ($new_hnum > $hnum)
	199	{
	200	# increase section nesting
	201	$hnum++;
	202	while ($hnum < $new_hnum)
	203	{
	204	my $spacing = " " x $hnum;
	205	$section_text .= "<!--\n";
	206	$section_text .= $spacing."<Section>\n";
	207	$section_text .= "-->\n";
	208	$hnum++;
	209	}
	210	}
	211	else # ($new_hnum <= $hnum)
	212	{
	213	# descrease section nesting
	214	while ($hnum >= $new_hnum)
	215	{
	216	my $spacing = " " x $hnum;
	217	$section_text .= "<!--\n";
	218	$section_text .= $spacing."</Section>\n";
	219	$section_text .= "-->\n";
	220	$hnum--;
	221	}
	222	$hnum++;
	223	}
	224
[10600]	225	my $spacing = " " x $hnum;
[10271]	226	$section_text .= "<!--\n";
	227	$section_text .= $spacing."<Section>\n";
	228	$section_text .= $spacing." <Description>\n";
	229	$section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
	230	$section_text .= $spacing." </Description>\n";
	231	$section_text .= "-->\n";
	232
[11893]	233	#print $outhandle $spacing."$h_text\n"
	234	# if $self->{'verbosity'} > 2;
[10426]	235
[10271]	236	$sectionh1++ if ($hnum==1);
	237	}
	238	}
	239	else {
	240	### print STDERR "***** hc = <h$hc\n\n";
	241	}
	242	$section_text .= "<h$hc";
	243	}
	244	else
	245	{
	246	$section_text .= "<h$hc";
	247	}
	248	}
	249
	250	while ($hnum >= 1)
	251	{
	252	my $spacing = " " x $hnum;
	253	$section_text .= "<!--\n";
	254	$section_text .= $spacing."</Section>\n";
	255	$section_text .= "-->\n";
	256	$hnum--;
	257	}
	258
	259	$section_text .= "<!--\n</Section>\n-->\n";
	260
	261	$$textref = $section_text;
[10426]	262
[11893]	263	# if ($sectionh1>0)
	264	# {
	265	# print $outhandle " Located section headings ..."
	266	# if $self->{'verbosity'} > 1;
	267	# }
[10426]	268
[10271]	269	$$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
[10426]	270
[10271]	271	$$textref =~ s/( )+/ /sg;
	272
[10426]	273	## $$textref =~ s/<o:p> <\/o:p>//g; # used with VML to space figures?
	274
[10271]	275	$self->SUPER::process(@_);
[10426]	276
[10271]	277	}
	278
	279
	280	sub resize_if_necessary
	281	{
	282	my ($self,$front,$back,$base_dir,$href) = @_;
[10426]	283
[10271]	284	# dig out width and height of image, if there
	285	my $img_attributes = "$front back";
	286	my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
	287	my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
[10426]	288
[10271]	289	# derive local filename for image based on its URL
	290	my $img_filename = $href;
	291	$img_filename =~ s/^[^:]*:\/\///;
	292	$img_filename = &util::filename_cat($base_dir, $img_filename);
[10426]	293
[10271]	294	# Replace %20's in URL with a space if required. Note that the filename
	295	# may include the %20 in some situations
	296	if ($img_filename =~ /\%20/) {
	297	if (!-e $img_filename) {
	298	$img_filename =~ s/\%20/ /g;
	299	}
	300	}
	301	if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
	302	# get image info on width and height
[10426]	303
[10271]	304	my $outhandle = $self->{'outhandle'};
	305	my $verbosity = $self->{'verbosity'};
	306
	307	my ($image_type, $actual_width, $actual_height, $image_size)
[15872]	308	= &ImageConverter::identify($img_filename, $outhandle, $verbosity);
[10426]	309
[10356]	310	#print STDERR "**** $actual_width x $actual_height";
	311	#print STDERR " (requested: $img_width x $img_height)\n";
[10271]	312
	313	if (($img_width < $actual_width) \|\| ($img_height < $actual_height)) {
[11893]	314	#print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
[10426]	315
[10271]	316	# derive new image name based on current image
	317	my ($tailname, $dirname, $suffix)
[15872]	318	= &File::Basename::fileparse($img_filename, "\\.[^\\.]+\$");
[10426]	319
[10271]	320	my $resized_filename
	321	= &util::filename_cat($dirname, $tailname."_resized".$suffix);
[10426]	322
[10356]	323	#print STDERR "**** suffix = $suffix\n";
[10426]	324
[10271]	325	# Generate smaller image with convert
[15872]	326	my $newsize = "$img_width"."x$img_height";
[10271]	327	my $command = "convert -interlace plane -verbose "
[15872]	328	."-geometry $newsize \"$img_filename\" \"$resized_filename\"";
[11893]	329	#print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
	330	#my $result = '';
	331	#print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
[10271]	332	}
	333	}
	334	return $href;
	335	}
	336
	337	sub replace_images {
	338	my $self = shift (@_);
	339	my ($front, $link, $back, $base_dir,
	340	$file, $doc_obj, $section) = @_;
	341	# remove quotes from link at start and end if necessary
	342	if ($link=~/^\"/) {
	343	$link=~s/^\"//;$link=~s/\"$//;
	344	$front.='"';
	345	$back="\"$back";
	346	}
[10426]	347
[10271]	348	$link =~ s/\n/ /g;
[10426]	349
[10271]	350	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
[10426]	351
[10271]	352	## $href = $self->resize_if_necessary($front,$back,$base_dir,$href);
[10426]	353
[10271]	354	my $middle = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
[10426]	355
[10271]	356	return $front . $middle . $back;
	357	}
	358
[10426]	359	sub extract_metadata
	360	{
	361	my $self = shift (@_);
	362	my ($textref, $metadata, $doc_obj) = @_;
	363	my $outhandle = $self->{'outhandle'};
	364
[11380]	365	return if (!defined $textref);
	366
[10426]	367	# metadata fields to extract/save. 'key' is the (lowercase) name of the
	368	# html meta, 'value' is the metadata name for greenstone to use
	369	my %find_fields = ();
	370	my ($tag,$value);
[10271]	371
[10426]	372	my $orig_field = "";
[10723]	373	foreach my $field (split /,/, $self->{'metadata_fields'}) {
[10426]	374	# support tag<tagname>
	375	if ($field =~ /^(.?)<(.?)>$/) {
	376	# "$2" is the user's preferred gs metadata name
	377	$find_fields{lc($1)}=$2; # lc = lowercase
	378	$orig_field = $1;
	379	} else { # no <tagname> for mapping
	380	# "$field" is the user's preferred gs metadata name
	381	$find_fields{lc($field)}=$field; # lc = lowercase
	382	$orig_field = $field;
	383	}
[11380]	384
[10426]	385	if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
	386	$tag = $orig_field;
	387	$value = $1;
	388	if (!defined $value \|\| !defined $tag){
[15872]	389	#print $outhandle "StructuredHTMLPlugin: can't find VALUE in \"$tag\"\n";
[10426]	390	next;
	391	} else {
	392	# clean up and add
	393	chomp($value); # remove trailing \n, if any
	394	$tag = $find_fields{lc($tag)};
[11893]	395	#print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
	396	# if ($self->{'verbosity'} > 2);
[10426]	397	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
	398	}
	399	}
	400	}
	401	}
	402
[10271]	403	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: