Context Navigation

source: trunk/gsdl/perllib/plugins/StructuredHTMLPlug.pm@ 11090

Last change on this file since 11090 was 10723, checked in by chi, 19 years ago
Change the option of extracted_word_metadata_fields to metadata_fields.
Property svn:keywords set to `Author Date Id Revision`
File size: 13.6 KB

Rev	Line
[10271]	1	###########################################################################
	2	#
	3	# StructuredHTMLPlug.pm -- html plugin with extra facilities for teasing out
[10404]	4	# hierarchical structure (such as h1, h2, h3, or user-defined tags) in an
	5	# HTML document
[10271]	6	#
	7	# A component of the Greenstone digital library software
	8	# from the New Zealand Digital Library Project at the
	9	# University of Waikato, New Zealand.
	10	#
	11	# Copyright (C) 1999 New Zealand Digital Library Project
	12	#
	13	# This program is free software; you can redistribute it and/or modify
	14	# it under the terms of the GNU General Public License as published by
	15	# the Free Software Foundation; either version 2 of the License, or
	16	# (at your option) any later version.
	17	#
	18	# This program is distributed in the hope that it will be useful,
	19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	21	# GNU General Public License for more details.
	22	#
	23	# You should have received a copy of the GNU General Public License
	24	# along with this program; if not, write to the Free Software
	25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	26	#
	27	###########################################################################
[10404]	28	# This plugin processes an HTML file whose sections are delimited by
	29	# user-defined heading tags. Because users' heading definitions cannot be predicted,
	30	# the plugin detects user-defined titles at up to three levels (level1, level2, level3)
	31	# and can also remove a user-defined Table of Contents (TOC).
	32	# Format, e.g.: level1 (Abstract_title|ChapterTitle|Referencing Heading) level2 (SectionHeading) ...
	33
[10271]	34	package StructuredHTMLPlug;
	35
	36	use HTMLPlug;
	37	use ImagePlug;
	38
[10404]	39	#use strict; # every perl program should have this!
	40	#no strict 'refs'; # make an exception so we can use variables as filehandles
	41
# Establish the inheritance chain at compile time: this plugin is a
# specialisation of HTMLPlug.
BEGIN {
    @StructuredHTMLPlug::ISA = qw(HTMLPlug);
}
	45
# This plugin declares no arguments of its own; the list is still handed to
# the constructor machinery in new() via the shared ArgList.
my $arguments = [];

# Plugin description record pushed onto the shared OptList in new().
my $options = {
    'name'     => "StructuredHTMLPlug",
    'desc'     => "{StructuredHTMLPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
	53
	# Constructor.
	#
	# Parameters (after the class name):
	#   $pluginlist      - array ref; this class name is appended to it
	#   $inputargs       - passed through untouched to the HTMLPlug constructor
	#   $hashArgOptLists - hash ref with "ArgList" / "OptList" array refs that
	#                      collect the argument and option records of every
	#                      class in the inheritance chain
	#
	# Returns the object built by HTMLPlug's constructor, re-blessed into $class.
	sub new {
	    my $class = shift;
	    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

	    push @{$pluginlist}, $class;

	    # Register this plugin's argument and option records.  Dereferencing
	    # $hashArgOptLists here autovivifies it when the caller passed undef.
	    if (defined $arguments) {
	        push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
	    }
	    if (defined $options) {
	        push @{ $hashArgOptLists->{"OptList"} }, $options;
	    }

	    my $self;
	    if (defined $hashArgOptLists) {
	        $self = HTMLPlug->new($pluginlist, $inputargs, $hashArgOptLists);
	    }
	    else {
	        $self = HTMLPlug->new($pluginlist, $inputargs);
	    }

	    return bless $self, $class;
	}
	66
	# Entry point called for each candidate file.
	#
	# An .htm/.html file that has a sibling with the same base name and a .doc
	# extension is skipped (returns 0): the original comment says such a file
	# has already been handled by the Word plugin.  Everything else is
	# delegated to the parent class' read() with the unchanged argument list.
	sub read {
	    my $self = shift;
	    my (undef, $base_dir, $file) = @_;

	    # Only prepend the base directory when it actually contains something.
	    my $path = ($base_dir =~ /\w/)
	        ? &util::filename_cat ($base_dir, $file)
	        : $file;

	    if ($path =~ m/\.html?$/) {
	        (my $word_twin = $path) =~ s/\.html?$/.doc/;
	        return 0 if -e $word_twin;
	    }

	    return $self->SUPER::read(@_);
	}
	85
	# process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli)
	#
	# Rewrites the HTML referenced by $textref so that user-designated paragraph
	# classes become <h1>/<h2>/<h3> headings, wraps every heading in
	# Greenstone "<!-- <Section> ... -->" comment markup, and then hands the
	# result to the parent class' process().  Afterwards the source file is
	# attached to the document object together with link, size and title
	# metadata.
	#
	# Options read from $self:
	#   metadata_fields             - if non-blank, the first <xml>...</xml> block
	#                                 of the head is passed to extract_metadata()
	#   delete_toc                  - when 1, paragraphs whose class matches
	#                                 toc_header / tof_header are removed
	#   title_header, level1_header - classes promoted to <h1>
	#   level2_header               - classes promoted to <h2>
	#   level3_header               - classes promoted to <h3>
	#   (the *_header values are regular-expression alternations, optionally
	#    wrapped in parentheses, which are stripped and stored back)
	#
	# Returns the value returned by SUPER::process().
	sub process {
	    my $self = shift (@_);
	    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
	    my $outhandle = $self->{'outhandle'};
	    my $verbosity = $self->{'verbosity'};

	    print $outhandle "StructuredHTMLPlug: processing $file\n"
	        if $verbosity > 1;

	    # Everything before the first <body is the head; the rest is re-joined.
	    my ($head, @body_parts) = split(/<body/i, $$textref);
	    $head = '' unless defined $head;
	    my $body_text = join("<body", @body_parts);

	    # Capture the title through list assignment so that a failed match
	    # leaves $doctitle undefined instead of picking up a stale $1.
	    my ($doctitle) = $head =~ m/<title>(.+?)<\/title>/is;

	    if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
	        # Isolate the text between the first <xml> and the following </xml>.
	        my @doc_properties = split(/<xml>/i, $head);
	        shift(@doc_properties);
	        my $rest_doc_properties = join(" ", @doc_properties);
	        my ($xml_block) = split(/<\/xml>/i, $rest_doc_properties);
	        $self->extract_metadata($xml_block, $metadata, $doc_obj)
	            if defined $xml_block;
	    }

	    # If delete_toc is enabled, remove the table-of-contents and
	    # table-of-figures paragraphs (including their titles).
	    if (defined $self->{'delete_toc'} && $self->{'delete_toc'} == 1) {
	        foreach my $option ('toc_header', 'tof_header') {
	            my $classes = $self->{$option};
	            next unless defined $classes && $classes =~ /\S/;
	            $body_text =~ s/<p class=(($classes)[^>]*)>(.+?)<\/p>//isg;
	        }
	    }

	    # Promote paragraphs with the configured classes to real headings.
	    my @heading_options = (
	        [ 'title_header',  1 ],
	        [ 'level1_header', 1 ],
	        [ 'level2_header', 2 ],
	        [ 'level3_header', 3 ],
	    );
	    foreach my $spec (@heading_options) {
	        my ($option, $level) = @$spec;
	        my $classes = $self->{$option};
	        next unless defined $classes && $classes =~ /\S/;

	        # Strip the surrounding parentheses of "(A|B|C)" and remember the
	        # cleaned-up value, as the original code did.
	        $classes =~ s/^\((.*)\)/$1/s;
	        $self->{$option} = $classes;

	        $body_text =~ s/<p class=(($classes)[^>]*)>(.+?)<\/p>/<p class=$1><h$level>$3<\/h$level><\/p>/isg;
	    }

	    # Tidy up extra new lines (empty paragraphs)
	    $body_text =~ s/(<p[^>]*><span[^>]*><o:p> <\/o:p><\/span><\/p>)//isg;
	    $body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;

	    my $body = "<body".$body_text;

	    my $section_text = $head;

	    # split HTML text on <h1>, <h2> etc tags
	    my @h_split = split(/<h/i, $body);

	    my $hnum = 0;        # current section nesting depth
	    my $sectionh1 = 0;   # number of top-level sections found

	    $section_text .= shift(@h_split);

	    foreach my $hc (@h_split) {
	        # Accept both "<h1>" and "<h1 attr=...>"; other "<h..." tags
	        # (e.g. <hr>) fall through and are copied unchanged.
	        if ($hc =~ m/^([1-3])(?:\s[^>]*)?>(.*)$/s) {
	            my $new_hnum = $1;
	            my $hc_after = $2;

	            if ($hc_after =~ m/^(.*?)<\/h$new_hnum>/is) {
	                my $h_text = $1;

	                # boil HTML down to some interesting text
	                $h_text =~ s/^[1-3]>//;
	                $h_text =~ s/<\/?.*?>//sg;
	                $h_text =~ s/\s+/ /sg;
	                $h_text =~ s/^\s$//s;
	                $h_text =~ s/( )+\W*/ /sg;

	                if ($h_text =~ m/\w+/) {
	                    if ($new_hnum > $hnum) {
	                        # increase section nesting, opening untitled
	                        # sections for any skipped levels
	                        $hnum++;
	                        while ($hnum < $new_hnum) {
	                            my $spacing = " " x $hnum;
	                            $section_text .= "<!--\n";
	                            $section_text .= $spacing."<Section>\n";
	                            $section_text .= "-->\n";
	                            $hnum++;
	                        }
	                    }
	                    else {
	                        # decrease section nesting ($new_hnum <= $hnum)
	                        while ($hnum >= $new_hnum) {
	                            my $spacing = " " x $hnum;
	                            $section_text .= "<!--\n";
	                            $section_text .= $spacing."</Section>\n";
	                            $section_text .= "-->\n";
	                            $hnum--;
	                        }
	                        $hnum++;
	                    }

	                    my $spacing = " " x $hnum;
	                    $section_text .= "<!--\n";
	                    $section_text .= $spacing."<Section>\n";
	                    $section_text .= $spacing." <Description>\n";
	                    $section_text .= $spacing." <Metadata name=\"Title\">$h_text</Metadata>";
	                    $section_text .= $spacing." </Description>\n";
	                    $section_text .= "-->\n";

	                    print $outhandle $spacing."$h_text\n"
	                        if $verbosity > 2;

	                    $sectionh1++ if ($hnum==1);
	                }
	            }
	        }

	        # Whether or not it was a recognised heading, the fragment itself
	        # is always copied through (split() removed the leading "<h").
	        $section_text .= "<h$hc";
	    }

	    # Close every section that is still open.
	    while ($hnum >= 1) {
	        my $spacing = " " x $hnum;
	        $section_text .= "<!--\n";
	        $section_text .= $spacing."</Section>\n";
	        $section_text .= "-->\n";
	        $hnum--;
	    }

	    $section_text .= "<!--\n</Section>\n-->\n";

	    $$textref = $section_text;

	    if ($sectionh1 > 0) {
	        print $outhandle " Located section headings ..."
	            if $verbosity > 1;
	    }
	    print $outhandle " Passing on the HTMLPlug\n"
	        if $verbosity > 1;

	    $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;

	    $$textref =~ s/( )+/ /sg;

	    # @_ still holds the original argument list (minus $self), and
	    # $textref now points at the rewritten text.
	    my $status = $self->SUPER::process(@_);

	    # associate original file with doc object
	    my $cursection = $doc_obj->get_top_section();
	    my $filename = &util::filename_cat($base_dir, $file);
	    if (-e $filename) {
	        print $outhandle " Adding associated Word document\n"
	            if $verbosity > 1;

	        $doc_obj->associate_file($filename, "doc.doc", undef, $cursection);

	        my $doclink = "<a href=_httpcollection_/index/assoc/[archivedir]/doc.doc>";
	        $doc_obj->add_utf8_metadata ($cursection, "srclink", $doclink);
	        $doc_obj->add_utf8_metadata ($cursection, "srcicon", "_icondoc_");
	        $doc_obj->add_utf8_metadata ($cursection, "/srclink", "</a>");

	        # Only record a title when the head actually contained one.
	        $doc_obj->add_utf8_metadata ($cursection, "Title", $doctitle)
	            if defined $doctitle;

	        my $file_size = -s $filename;
	        if ($file_size > 1024) {
	            my $fs_kbytes = sprintf("%d", $file_size/1024);
	            $doc_obj->add_utf8_metadata ($cursection, "filesize", "$fs_kbytes Kb");
	        }
	        else {
	            $doc_obj->add_utf8_metadata ($cursection, "filesize", "$file_size bytes");
	        }

	        if ($file_size > 200000) {
	            $doc_obj->add_utf8_metadata ($cursection, "fswarning", "1");
	        }
	    }

	    return $status;
	}
	300
	301
	# resize_if_necessary($front, $back, $base_dir, $href)
	#
	# Inspects an <img> tag whose attribute text is split into $front and $back
	# around the link.  When the tag requests a width/height smaller than the
	# actual image on disk, the ImageMagick "convert" command line that would
	# produce a resized copy ("<name>_resized<suffix>") is built and logged.
	#
	# NOTE: the command is only logged, never executed, and $href is always
	# returned unchanged -- this mirrors the original behaviour.
	sub resize_if_necessary
	{
	    my ($self, $front, $back, $base_dir, $href) = @_;

	    # dig out width and height of image, if there
	    my $img_attributes = "$front $back";
	    my ($img_width)  = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
	    my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);

	    # derive local filename for image based on its URL
	    my $img_filename = $href;
	    $img_filename =~ s/^[^:]*:\/\///;
	    $img_filename = &util::filename_cat($base_dir, $img_filename);

	    # Replace %20's in URL with a space if required. Note that the filename
	    # may include the %20 in some situations, so only decode when the
	    # literal name does not exist.
	    if ($img_filename =~ /\%20/ && !-e $img_filename) {
	        $img_filename =~ s/\%20/ /g;
	    }

	    return $href
	        unless (-e $img_filename) && (defined $img_width) && (defined $img_height);

	    my $outhandle = $self->{'outhandle'};
	    my $verbosity = $self->{'verbosity'};

	    # get image info on width and height
	    my ($image_type, $actual_width, $actual_height, $image_size)
	        = &ImagePlug::identify($img_filename, $outhandle, $verbosity);

	    # Nothing to compare against if the image could not be identified.
	    return $href
	        unless (defined $actual_width) && (defined $actual_height);

	    if (($img_width < $actual_width) || ($img_height < $actual_height)) {
	        print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);

	        # derive new image name based on current image
	        require File::Basename;
	        my ($tailname, $dirname, $suffix)
	            = File::Basename::fileparse($img_filename, "\\.[^\\.]+\$");

	        my $resized_filename
	            = &util::filename_cat($dirname, $tailname."_resized".$suffix);

	        # Generate smaller image with convert
	        my $newsize = "${img_width}x${img_height}";
	        my $command = "convert -interlace plane -verbose "
	            ."-geometry $newsize \"$img_filename\" \"$resized_filename\"";
	        print $outhandle "ImageResize: $command\n" if ($verbosity > 2);
	        my $result = '';
	        print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
	    }

	    return $href;
	}
	358
	# replace_images($front, $link, $back, $base_dir, $file, $doc_obj, $section)
	#
	# Rebuilds one image reference.  $front and $back are the pieces of the tag
	# on either side of $link.  The link is normalised via format_link(), the
	# target is registered through add_file(), and the tag is reassembled
	# around whatever add_file() returns.
	sub replace_images {
	    my ($self, $front, $link, $back, $base_dir,
	        $file, $doc_obj, $section) = @_;

	    # A quoted link has its quotes moved onto the surrounding fragments so
	    # that the bare link can be processed.
	    if ($link =~ /^\"/) {
	        for ($link) {
	            s/^\"//;
	            s/\"$//;
	        }
	        $front = $front . '"';
	        $back  = '"' . $back;
	    }

	    # Links broken across lines: turn each newline into a blank.
	    $link =~ tr/\n/ /;

	    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);

	    ## $href = $self->resize_if_necessary($front,$back,$base_dir,$href);

	    my $replacement = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);

	    return join('', $front, $replacement, $back);
	}
	380
# extract_metadata($text, $metadata, $doc_obj)
#
# Pulls metadata values out of $text, where they are expected in elements of
# the form <o:Field>value</o:Field>, and adds them to the top section of
# $doc_obj.
#
# The fields to look for come from $self->{'metadata_fields'}, a
# comma-separated list.  Each entry is either
#   Field            - extract <o:Field> and store it as "Field", or
#   Field<GSName>    - extract <o:Field> and store it as "GSName".
# Element names are matched case-insensitively; only the first occurrence of
# each field is used.
#
# $text is a plain string (despite the historical parameter name "textref" it
# was never dereferenced).  $metadata is accepted for interface compatibility
# and is not used.  Nothing meaningful is returned.
sub extract_metadata
{
    my $self = shift (@_);
    my ($text, $metadata, $doc_obj) = @_;
    my $outhandle = $self->{'outhandle'};

    # Nothing to scan, or nothing requested.
    return unless defined $text && defined $self->{'metadata_fields'};

    foreach my $field (split /,/, $self->{'metadata_fields'}) {
        # tolerate "a, b" style lists
        $field =~ s/^\s+//;
        $field =~ s/\s+$//;
        next unless length $field;

        # $source_name is the element to look for, $gs_name the metadata
        # name to store it under (identical unless "name<gsname>" was given).
        my ($source_name, $gs_name) = ($field, $field);
        if ($field =~ /^(.*?)<(.*?)>$/) {
            ($source_name, $gs_name) = ($1, $2);
        }
        next unless length $source_name && length $gs_name;

        # \Q...\E keeps regex metacharacters in the field name literal.
        next unless $text =~ m/<o:\Q$source_name\E>(.*?)<\/o:\Q$source_name\E>/is;

        my $value = $1;
        chomp($value); # remove trailing \n, if any

        print $outhandle " extracted \"$gs_name\" metadata \"$value\"\n"
            if ($self->{'verbosity'} > 2);
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $gs_name, $value);
    }

    return;
}
	421
[10271]	422	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: