Context Navigation

source: trunk/gsdl/perllib/plugins/HBPlug.pm@ 9823

Last change on this file since 9823 was 8166, checked in by mdewsnip, 20 years ago
Added FileSize metadata in most plugins.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 10.8 KB

Rev	Line
[537]	1	###########################################################################
	2	#
	3	# HBPlug.pm --
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
[1244]	26	# plugin which processes an HTML book directory
[4]	27
[1244]	28	# This plugin is used by the Humanity Library collections and does not handle
	29	# input encodings other than ascii or extended ascii
	30
	31	# this code is kind of ugly and could no doubt be made to run faster, by leaving
	32	# it in this state I hope to encourage people to make their collections use
	33	# HBSPlug instead ;-)
	34
	35	# Use HBSPlug if creating a new collection and marking up files like the
	36	# Humanity Library collections. HBSPlug accepts all input encodings but
	37	# expects the marked up files to be cleaner than those used by the
	38	# Humanity Library collections
	39
[3540]	40	# 12/05/02 Added usage datastructure - John Thompson
	41
[4]	42	package HBPlug;
	43
[1010]	44	use ghtml;
[4]	45	use BasPlug;
[3542]	46	use unicode;
[286]	47	use util;
[288]	48	use doc;
[4]	49
[288]	50
[4]	51	sub BEGIN {
	52	@ISA = ('BasPlug');
	53	}
	54
[6408]	55	my $arguments =
	56	[ { 'name' => "process_exp",
	57	'desc' => "{BasPlug.process_exp}",
	58	'type' => "regexp",
	59	'reqd' => "no",
	60	'deft' => &get_default_process_exp() }
	61	];
	62
[3540]	63	my $options = { 'name' => "HBPlug",
[5680]	64	'desc' => "{HBPlug.desc}",
[6408]	65	'abstract' => "no",
	66	'inherits' => "yes",
	67	'args' => $arguments };
[3540]	68
[4]	69	sub new {
	70	my ($class) = @_;
[1244]	71	my $self = new BasPlug ("HBPlug", @_);
[5924]	72	$self->{'plugin_type'} = "HBPlug";
[3540]	73	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
	74	my $option_list = $self->{'option_list'};
	75	push( @{$option_list}, $options );
[4]	76
	77	return bless $self, $class;
	78	}
	79
[1244]	80	sub init {
[4]	81	my $self = shift (@_);
[1424]	82	my ($verbosity, $outhandle) = @_;
[4]	83
[1424]	84	$self->BasPlug::init($verbosity, $outhandle);
[4744]	85	$self->{'input_encoding'} = "iso_8859_1";
[1244]	86
	87	# this plugin only handles ascii encodings
	88	if ($self->{'input_encoding'} !~ /^(iso_8859_1\|ascii)$/) {
	89	die "ERROR: HBPlug can handle only iso_8859_1 or ascii encodings.\n" .
	90	$self->{'input_encoding'} . " is not an acceptable input_encoding value\n";
[3540]	91	}
[4]	92	}
	93
[1244]	94	# this is included only to prevent warnings being printed out
	95	# from BasPlug::init. The process_exp is not used by this plugin
	96	sub get_default_process_exp {
	97	my $self = shift (@_);
	98
	99	return "This plugin does not use a process_exp\n";
	100	}
	101
	102
[4]	103	sub HB_read_html_file {
	104	my $self = shift (@_);
[318]	105	my ($htmlfile, $text) = @_;
[4]	106
	107	# load in the file
	108	if (!open (FILE, $htmlfile)) {
[1424]	109	my $outhandle = $self->{'outhandle'};
	110	print $outhandle "ERROR - could not open $htmlfile\n";
[318]	111	return;
[4]	112	}
	113
[318]	114	my $foundbody = 0;
[1244]	115	$self->HB_gettext (\$foundbody, $text, "FILE");
[318]	116	close FILE;
	117
	118	# just in case there was no <body> tag
	119	if (!$foundbody) {
	120	$foundbody = 1;
	121	open (FILE, $htmlfile) \|\| return;
[1244]	122	$self->HB_gettext (\$foundbody, $text, "FILE");
[318]	123	close FILE;
	124	}
[3542]	125	# text is in utf8
[318]	126	}
	127
[3542]	128	# converts the text to utf8, as ghtml does that for é etc.
[318]	129	sub HB_gettext {
	130	my $self = shift (@_);
	131	my ($foundbody, $text, $handle) = @_;
[1424]	132	my $outhandle = $self->{'outhandle'};
[318]	133
[4]	134	my $line = "";
[318]	135	while (defined ($line = <$handle>)) {
[4]	136	# look for body tag
[318]	137	if (!$$foundbody) {
[4]	138	if ($line =~ s/^.<body[^>]>//i) {
[318]	139	$$foundbody = 1;
[4]	140	} else {
	141	next;
	142	}
	143	}
[318]	144
[4]	145	# check for symbol fonts
	146	if ($line =~ /<font [^>]?face\s=\s*\"?(\w+)\"?/i) {
	147	my $font = $1;
[1424]	148	print $outhandle "HBPlug::HB_gettext - warning removed font $font\n"
[4]	149	if ($font !~ /^arial$/i);
	150	}
	151
	152	$line =~ s/<\/p>//ig; # remove </p> tags
	153	$line =~ s/<\/?(body\|html\|font)\b[^>]*>//ig; # remove any unwanted tags
	154
[318]	155	$$text .= $line;
[4]	156	}
[3542]	157	#
	158	if ($self->{'input_encoding'} eq "iso_8859_1") {
	159	# convert to utf-8
	160	$$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
	161	}
	162	# convert any alphanumeric character entities to their utf-8
	163	# equivalent for indexing purposes
	164	&ghtml::convertcharentities ($$text);
	165
[318]	166	$$text =~ s/\s+/ /g; # remove \n's
[4]	167	}
	168
	169	sub HB_clean_section {
	170	my $self = shift (@_);
	171	my ($section) = @_;
	172
	173	# remove tags without a starting tag from the section
	174	my ($tag, $tagstart);
	175	while ($section =~ /<\/([^>]{1,10})>/) {
	176	$tag = $1;
	177	$tagstart = index($section, "<$tag");
	178	last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
	179	$section =~ s/<\/$tag>//;
	180	}
	181
	182	# remove extra paragraph tags
	183	while ($section =~ s/<p\b[^>]>\s<p\b/<p/ig) {}
	184
	185	# remove extra stuff at the end of the section
	186	while ($section =~ s/(<u>\|<i>\|<b>\|<p\b[^>]*>\| \|\s)$//i) {}
	187
	188	# add a newline at the beginning of each paragraph
	189	$section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
	190
	191	# add a newline every 80 characters at a word boundary
	192	# Note: this regular expression puts a line feed before
	193	# the last word in each section, even when it is not
	194	# needed.
	195	$section =~ s/(.{1,80})\s/$1\n/g;
	196
[317]	197	# fix up the image links
	198	$section =~ s/<img[^>]?src=\"?([^\">]+)\"?[^>]>/
[1020]	199	<center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
[317]	200	$section =~ s/<<I>>\s*([^\.]+\.(png\|jpg\|gif))/
[1020]	201	<center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
[4]	202
	203	return $section;
	204	}
	205
	206
	207	sub shorten {
	208	my $self = shift (@_);
	209	my ($text) = @_;
	210
	211	return "\"$text\"" if (length($text) < 100);
	212
	213	return "\"" . substr ($text, 0, 50) . "\" ... \"" .
	214	substr ($text, length($text)-50) . "\"";
	215	}
	216
[1244]	217	# if input_encoding is ascii we can call add_utf8_metadata
	218	# directly but if it's iso_8859_1 (the default) we need to call
	219	# add_metadata so that the ascii2utf8 conversion is done first
	220	# this should speed things up a little if processing an ascii only
	221	# document with input_encoding set to ascii
	222	sub HB_add_metadata {
	223	my $self = shift (@_);
	224	my ($doc_obj, $cursection, $field, $value) = @_;
[4]	225
[3542]	226	# All text should now be in utf-8
	227	# if ($self->{'input_encoding'} eq "ascii") {
	228	$doc_obj->add_utf8_metadata ($cursection, $field, $value);
	229	# } else {
	230	# $doc_obj->add_metadata ($cursection, $field, $value);
	231	# }
[1244]	232	}
	233
[317]	234	# return number of files processed, undef if can't process
[4]	235	# Note that $base_dir might be "" and that $file might
	236	# include directories
	237	sub read {
	238	my $self = shift (@_);
[6332]	239	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $gli) = @_;
[1424]	240	my $outhandle = $self->{'outhandle'};
[4]	241
[288]	242	# get the html filename and see if this is an HTML Book...
	243	my $jobnumber = $file;
[4]	244	if ($file =~ /[\\\/]/) {
	245	($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
	246	}
[317]	247	return undef unless defined $jobnumber;
[286]	248	my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
[317]	249	return undef unless -e $htmlfile;
[4]	250
[6332]	251	print STDERR "<Processing n='$file' p='HBPlug'>\n" if ($gli);
[1424]	252	print $outhandle "HBPlug: processing $file\n";
[4]	253
	254	# read in the file and do basic html cleaning (removing header etc)
[318]	255	my $html = "";
	256	$self->HB_read_html_file ($htmlfile, \$html);
[3542]	257	# html is in utf8
[4]	258
	259	# create a new document
	260	my $doc_obj = new doc ($file, "indexed_doc");
[2327]	261	$doc_obj->set_OIDtype ($processor->{'OIDtype'});
[4]	262
	263	# copy the book cover if it exists
[286]	264	my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg");
[796]	265	$doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg");
[7508]	266	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
[8166]	267	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileFormat", "HB");
	268	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $htmlfile));
[4]	269
[317]	270	my $cursection = $doc_obj->get_top_section();
[8121]	271
[317]	272	# add metadata for top level of document
[1244]	273	foreach my $field (keys(%$metadata)) {
[317]	274	# $metadata->{$field} may be an array reference
	275	if (ref ($metadata->{$field}) eq "ARRAY") {
	276	map {
[1244]	277	$self->HB_add_metadata ($doc_obj, $cursection, $field, $_);
[317]	278	} @{$metadata->{$field}};
	279	} else {
[1244]	280	$self->HB_add_metadata ($doc_obj, $cursection, $field, $metadata->{$field});
[317]	281	}
	282	}
	283
[4]	284	# process the file one section at a time
	285	my $curtoclevel = 1;
	286	my $firstsection = 1;
	287	while (length ($html) > 0) {
[339]	288	if ($html =~ s/^.?(?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s))<<TOC(\d+)>>\s(.*?)<p\b/<p/i) {
[4]	289	my $toclevel = $3;
	290	my $title = $4;
	291	my $sectiontext = "";
[321]	292	if ($html =~ s/^(.?)((?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s)*)<<TOC\d+>>)/$2/i) {
[4]	293	$sectiontext = $1;
	294	} else {
	295	$sectiontext = $html;
	296	$html = "";
	297	}
	298
	299	# remove tags and extra spaces from the title
	300	$title =~ s/<\/?[^>]+>//g;
	301	$title =~ s/^\s+\|\s+$//g;
	302
	303	# close any sections below the current level and
	304	# create a new section (special case for the firstsection)
	305	while (($curtoclevel > $toclevel) \|\|
	306	(!$firstsection && $curtoclevel == $toclevel)) {
	307	$cursection = $doc_obj->get_parent_section ($cursection);
	308	$curtoclevel--;
	309	}
	310	if ($curtoclevel+1 < $toclevel) {
[1424]	311	print $outhandle "WARNING - jump in toc levels in $htmlfile " .
[4]	312	"from $curtoclevel to $toclevel\n";
	313	}
	314	while ($curtoclevel < $toclevel) {
	315	$curtoclevel++;
	316	$cursection =
	317	$doc_obj->insert_section($doc_obj->get_end_child($cursection));
	318	}
	319
	320	# add the metadata to this section
[1244]	321	$self->HB_add_metadata ($doc_obj, $cursection, "Title", $title);
[286]	322
[4]	323	# clean up the section html
	324	$sectiontext = $self->HB_clean_section($sectiontext);
	325
	326	# associate any files
[286]	327	map { $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $1), $1)
[1020]	328	if /_httpdocimg_\/([^\"]+)\"/; 0; }
	329	split (/(_httpdocimg_\/[^\"]+\")/, $sectiontext);
[4]	330
	331	# add the text for this section
[3542]	332	# All read text should now be in utf-8
	333	# if ($self->{'input_encoding'} eq "ascii") {
	334	$doc_obj->add_utf8_text ($cursection, $sectiontext);
	335	# } else {
	336	# $doc_obj->add_text ($cursection, $sectiontext);
	337	# }
[4]	338	} else {
[1424]	339	print $outhandle "WARNING - leftover text\n" , $self->shorten($html),
[4]	340	"\nin $htmlfile\n";
	341	last;
	342	}
[317]	343	$firstsection = 0;
[4]	344	}
	345
	346	# add a OID
	347	$doc_obj->set_OID ();
	348
	349	# process the document
[288]	350	$processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm"));
[4]	351
	352	return 1; # processed the file
	353	}
	354
	355
	356	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: