Context Navigation

source: trunk/gsdl/perllib/plugins/StructuredHTMLPlug.pm@ 12169

Last change on this file since 12169 was 12169, checked in by mdewsnip, 18 years ago
Tidied up that horrible long line in the new() function of every plugin.
Property svn:keywords set to `Author Date Id Revision`
File size: 12.7 KB

Rev	Line
[10271]	1	###########################################################################
	2	#
	3	# StructuredHTMLPlug.pm -- html plugin with extra facilities for teasing out
[10404]	4	# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
	5	# HTML document
[10271]	6	#
	7	# A component of the Greenstone digital library software
	8	# from the New Zealand Digital Library Project at the
	9	# University of Waikato, New Zealand.
	10	#
	11	# Copyright (C) 1999 New Zealand Digital Library Project
	12	#
	13	# This program is free software; you can redistribute it and/or modify
	14	# it under the terms of the GNU General Public License as published by
	15	# the Free Software Foundation; either version 2 of the License, or
	16	# (at your option) any later version.
	17	#
	18	# This program is distributed in the hope that it will be useful,
	19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	21	# GNU General Public License for more details.
	22	#
	23	# You should have received a copy of the GNU General Public License
	24	# along with this program; if not, write to the Free Software
	25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	26	#
	27	###########################################################################
[10404]	28	# This plugin is to process an HTML file where sections are divided by
	29	# user-defined headings tags. As it is difficult to predict what user's definition
	30	# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
	31	# as well as allows to get rid of user-defined Table of Content (TOC)...
	32	# format:e.g. level1 (Abstract_title\|ChapterTitle\|Referencing Heading) level2(SectionHeading)...
	33
[10271]	34	package StructuredHTMLPlug;
	35
	36	use HTMLPlug;
	37	use ImagePlug;
	38
[10404]	39	#use strict; # every perl program should have this!
	40	#no strict 'refs'; # make an exception so we can use variables as filehandles
	41
[10271]	42	sub BEGIN {
[10404]	43	@StructuredHTMLPlug::ISA = ('HTMLPlug');
[10271]	44	}
	45
[11849]	46	my $arguments =
	47	[
	48	{ 'name' => "level1_header",
	49	'desc' => "{StructuredHTMLPlug.level1_header}",
	50	'type' => "regexp",
	51	'reqd' => "no",
	52	'deft' => "" },
	53	{ 'name' => "level2_header",
	54	'desc' => "{StructuredHTMLPlug.level2_header}",
	55	'type' => "regexp",
	56	'reqd' => "no",
	57	'deft' => "" },
	58	{ 'name' => "level3_header",
	59	'desc' => "{StructuredHTMLPlug.level3_header}",
	60	'type' => "regexp",
	61	'reqd' => "no",
	62	'deft' => "" },
	63	{ 'name' => "title_header",
	64	'desc' => "{StructuredHTMLPlug.title_header}",
	65	'type' => "regexp",
	66	'reqd' => "no",
	67	'deft' => "" },
[11884]	68	{ 'name' => "delete_toc",
	69	'desc' => "{StructuredHTMLPlug.delete_toc}",
	70	'type' => "flag",
	71	'reqd' => "no"},
[11849]	72	{ 'name' => "toc_header",
	73	'desc' => "{StructuredHTMLPlug.toc_header}",
	74	'type' => "regexp",
	75	'reqd' => "no",
[11884]	76	'deft' => "" }
[11849]	77	];
[10271]	78
	79	my $options = { 'name' => "StructuredHTMLPlug",
	80	'desc' => "{StructuredHTMLPlug.desc}",
	81	'abstract' => "no",
	82	'inherits' => "yes",
	83	'args' => $arguments };
	84
	85	sub new {
	86	my ($class) = shift (@_);
	87	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	88	push(@$pluginlist, $class);
	89
	90	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
	91	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
	92
[12169]	93	my $self = new HTMLPlug($pluginlist, $inputargs, $hashArgOptLists);
[10271]	94
	95	return bless $self, $class;
	96	}
	97
	98
	99	sub process {
	100	my $self = shift (@_);
[10404]	101	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[10271]	102	my $outhandle = $self->{'outhandle'};
	103
	104	print $outhandle "StructuredHTMLPlug: processing $file\n"
	105	if $self->{'verbosity'} > 1;
[10600]	106
[10271]	107	my @head_and_body = split(/<body/i,$$textref);
	108	my $head = shift(@head_and_body);
	109	my $body_text = join("<body", @head_and_body);
[10600]	110	$head =~ m/<title>(.+)<\/title>/i;
	111	my $doctitle = $1 if defined $1;
[10723]	112	if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
[10426]	113	my @doc_properties = split(/<xml>/i,$head);
	114	my $doc_heading = shift(@doc_properties);
	115	my $rest_doc_properties = join(" ", @doc_properties);
[11380]	116
[10426]	117	my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
	118	my $extracted_metadata = shift (@extracted_metadata);
	119	$self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
	120	}
	121
[11893]	122	# set the title here if we haven't found it yet
	123	if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
	124	if (defined $doctitle && $doctitle =~ /\S/) {
	125	$doc_obj->add_metadata($doc_obj->get_top_section(), "Title", $doctitle);
	126	} else {
	127	$self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
	128	}
	129	}
	130
	131	# If delete_toc is enabled, it means to get rid of toc and tof contents.
[10271]	132	# get rid of TOC and TOF sections and their title
[11380]	133	if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
[10496]	134	if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
	135	$body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
	136	}
[10443]	137	}
[10600]	138
[10496]	139	if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
[10271]	140	$self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
[10600]	141	$body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
[10271]	142	}
[10443]	143
	144	if (defined $self->{'level1_header'} && $self->{'level1_header'}=~ /\S/ ){
	145	$self->{'level1_header'} =~ s/^\((.*)\)/$1/i;
[10595]	146	$body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
[10271]	147	}
[10443]	148
[10496]	149	if (defined $self->{'level2_header'} && $self->{'level2_header'}=~ /\S/){
[10443]	150	$self->{'level2_header'} =~ s/^\((.*)\)/$1/i;
[10595]	151	$body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
[10271]	152	}
	153
[10496]	154	if (defined $self->{'level3_header'} && $self->{'level3_header'}=~ /\S/ ){
[10443]	155	$self->{'level3_header'} =~ s/^\((.*)\)/$1/is;
[10271]	156	$body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
	157	}
[11893]	158
[10271]	159	# Tidy up extra new lines
	160	$body_text =~ s/(<p[^>]><span[^>]><o:p> <\/o:p><\/span><\/p>)//isg;
	161	$body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;
[10426]	162
[10600]	163	$section_text .= "<!--\n<Section>\n-->\n";
	164	#my $top_section_tag = "<!--\n<Section>\n-->\n";
	165	#$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
	166	#$body_text =~ s/(<div.*)/$top_section_tag$1/i;
[10271]	167	my $body = "<body".$body_text;
	168
	169	my $section_text = $head;
[10426]	170
[10271]	171	# split HTML text on <h1>, <h2> etc tags
	172	my @h_split = split(/<h/i,$body);
[10426]	173
[10271]	174	my $hnum = 0;
	175
	176	my $sectionh1 = 0;
	177	$section_text .= shift(@h_split);
[10426]	178
[10271]	179	my $hc;
	180	foreach $hc ( @h_split )
	181	{
	182	if ($hc =~ m/^([1-3])\s.?>(.*)$/s)
	183	{
	184	my $new_hnum = $1;
	185	my $hc_after = $2;
	186
	187	if ($hc_after =~ m/^(.*?)<\/h$new_hnum>/is)
	188	{
	189	my $h_text = $1;
	190	$hc =~ s/^(\&nbsp\;)+/\&nbsp\;/g;
	191	# boil HTML down to some interesting text
	192	$h_text =~ s/^[1-3]>//;
	193	$h_text =~ s/<\/?.*?>//sg;
	194	$h_text =~ s/\s+/ /sg;
	195	$h_text =~ s/^\s$//s;
	196	$h_text =~ s/( )+\W*/ /sg;
[10426]	197
[10271]	198	if ($h_text =~ m/\w+/)
	199	{
	200	if ($new_hnum > $hnum)
	201	{
	202	# increase section nesting
	203	$hnum++;
	204	while ($hnum < $new_hnum)
	205	{
	206	my $spacing = " " x $hnum;
	207	$section_text .= "<!--\n";
	208	$section_text .= $spacing."<Section>\n";
	209	$section_text .= "-->\n";
	210	$hnum++;
	211	}
	212	}
	213	else # ($new_hnum <= $hnum)
	214	{
	215	# descrease section nesting
	216	while ($hnum >= $new_hnum)
	217	{
	218	my $spacing = " " x $hnum;
	219	$section_text .= "<!--\n";
	220	$section_text .= $spacing."</Section>\n";
	221	$section_text .= "-->\n";
	222	$hnum--;
	223	}
	224	$hnum++;
	225	}
	226
[10600]	227	my $spacing = " " x $hnum;
[10271]	228	$section_text .= "<!--\n";
	229	$section_text .= $spacing."<Section>\n";
	230	$section_text .= $spacing." <Description>\n";
	231	$section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
	232	$section_text .= $spacing." </Description>\n";
	233	$section_text .= "-->\n";
	234
[11893]	235	#print $outhandle $spacing."$h_text\n"
	236	# if $self->{'verbosity'} > 2;
[10426]	237
[10271]	238	$sectionh1++ if ($hnum==1);
	239	}
	240	}
	241	else {
	242	### print STDERR "***** hc = <h$hc\n\n";
	243	}
	244	$section_text .= "<h$hc";
	245	}
	246	else
	247	{
	248	$section_text .= "<h$hc";
	249	}
	250	}
	251
	252	while ($hnum >= 1)
	253	{
	254	my $spacing = " " x $hnum;
	255	$section_text .= "<!--\n";
	256	$section_text .= $spacing."</Section>\n";
	257	$section_text .= "-->\n";
	258	$hnum--;
	259	}
	260
	261	$section_text .= "<!--\n</Section>\n-->\n";
	262
	263	$$textref = $section_text;
[10426]	264
[11893]	265	# if ($sectionh1>0)
	266	# {
	267	# print $outhandle " Located section headings ..."
	268	# if $self->{'verbosity'} > 1;
	269	# }
[10426]	270
[10271]	271	$$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
[10426]	272
[10271]	273	$$textref =~ s/( )+/ /sg;
	274
[10426]	275	## $$textref =~ s/<o:p> <\/o:p>//g; # used with VML to space figures?
	276
[10271]	277	$self->SUPER::process(@_);
[10426]	278
[10271]	279	}
	280
	281
	282	sub resize_if_necessary
	283	{
	284	my ($self,$front,$back,$base_dir,$href) = @_;
[10426]	285
[10271]	286	# dig out width and height of image, if there
	287	my $img_attributes = "$front back";
	288	my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
	289	my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
[10426]	290
[10271]	291	# derive local filename for image based on its URL
	292	my $img_filename = $href;
	293	$img_filename =~ s/^[^:]*:\/\///;
	294	$img_filename = &util::filename_cat($base_dir, $img_filename);
[10426]	295
[10271]	296	# Replace %20's in URL with a space if required. Note that the filename
	297	# may include the %20 in some situations
	298	if ($img_filename =~ /\%20/) {
	299	if (!-e $img_filename) {
	300	$img_filename =~ s/\%20/ /g;
	301	}
	302	}
	303	if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
	304	# get image info on width and height
[10426]	305
[10271]	306	my $outhandle = $self->{'outhandle'};
	307	my $verbosity = $self->{'verbosity'};
	308
	309	my ($image_type, $actual_width, $actual_height, $image_size)
	310	= &ImagePlug::identify($img_filename, $outhandle, $verbosity);
[10426]	311
[10356]	312	#print STDERR "**** $actual_width x $actual_height";
	313	#print STDERR " (requested: $img_width x $img_height)\n";
[10271]	314
	315	if (($img_width < $actual_width) \|\| ($img_height < $actual_height)) {
[11893]	316	#print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
[10426]	317
[10271]	318	# derive new image name based on current image
	319	my ($tailname, $dirname, $suffix)
	320	= &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
[10426]	321
[10271]	322	my $resized_filename
	323	= &util::filename_cat($dirname, $tailname."_resized".$suffix);
[10426]	324
[10356]	325	#print STDERR "**** suffix = $suffix\n";
[10426]	326
[10271]	327	# Generate smaller image with convert
	328	my $newsize = "$img_widthx$image_height";
	329	my $command = "convert -interlace plane -verbose "
	330	."-geometry $newsize \"img_$filename\" \"$resized_filename\"";
[11893]	331	#print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
	332	#my $result = '';
	333	#print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
[10271]	334	}
	335	}
	336	return $href;
	337	}
	338
	339	sub replace_images {
	340	my $self = shift (@_);
	341	my ($front, $link, $back, $base_dir,
	342	$file, $doc_obj, $section) = @_;
	343	# remove quotes from link at start and end if necessary
	344	if ($link=~/^\"/) {
	345	$link=~s/^\"//;$link=~s/\"$//;
	346	$front.='"';
	347	$back="\"$back";
	348	}
[10426]	349
[10271]	350	$link =~ s/\n/ /g;
[10426]	351
[10271]	352	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
[10426]	353
[10271]	354	## $href = $self->resize_if_necessary($front,$back,$base_dir,$href);
[10426]	355
[10271]	356	my $middle = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
[10426]	357
[10271]	358	return $front . $middle . $back;
	359	}
	360
[10426]	361	sub extract_metadata
	362	{
	363	my $self = shift (@_);
	364	my ($textref, $metadata, $doc_obj) = @_;
	365	my $outhandle = $self->{'outhandle'};
	366
[11380]	367	return if (!defined $textref);
	368
[10426]	369	# metadata fields to extract/save. 'key' is the (lowercase) name of the
	370	# html meta, 'value' is the metadata name for greenstone to use
	371	my %find_fields = ();
	372	my ($tag,$value);
[10271]	373
[10426]	374	my $orig_field = "";
[10723]	375	foreach my $field (split /,/, $self->{'metadata_fields'}) {
[10426]	376	# support tag<tagname>
	377	if ($field =~ /^(.?)<(.?)>$/) {
	378	# "$2" is the user's preferred gs metadata name
	379	$find_fields{lc($1)}=$2; # lc = lowercase
	380	$orig_field = $1;
	381	} else { # no <tagname> for mapping
	382	# "$field" is the user's preferred gs metadata name
	383	$find_fields{lc($field)}=$field; # lc = lowercase
	384	$orig_field = $field;
	385	}
[11380]	386
[10426]	387	if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
	388	$tag = $orig_field;
	389	$value = $1;
	390	if (!defined $value \|\| !defined $tag){
[11893]	391	#print $outhandle "StructuredHTMLPlug: can't find VALUE in \"$tag\"\n";
[10426]	392	next;
	393	} else {
	394	# clean up and add
	395	chomp($value); # remove trailing \n, if any
	396	$tag = $find_fields{lc($tag)};
[11893]	397	#print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
	398	# if ($self->{'verbosity'} > 2);
[10426]	399	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
	400	}
	401	}
	402	}
	403	}
	404
[10271]	405	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: