Context Navigation

source: gsdl/trunk/perllib/plugins/HBPlugin.pm@ 18327

Last change on this file since 18327 was 18327, checked in by ak19, 15 years ago
Extra parameter to new doc(): the renaming method to be used on the file (base64 or URL encoding).
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 10.0 KB

Rev	Line
[537]	1	###########################################################################
	2	#
[15872]	3	# HBPlugin.pm --
[537]	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
[1244]	26	# plugin which processes an HTML book directory
[4]	27
[1244]	28	# This plugin is used by the Humanity Library collections and does not handle
	29	# input encodings other than ascii or extended ascii
	30
	31	# this code is kind of ugly and could no doubt be made to run faster, by leaving
	32	# it in this state I hope to encourage people to make their collections use
	33	# HBSPlug instead ;-)
	34
	35	# Use HBSPlug if creating a new collection and marking up files like the
	36	# Humanity Library collections. HBSPlug accepts all input encodings but
	37	# expects the marked up files to be cleaner than those used by the
	38	# Humanity Library collections
	39
[15872]	40	package HBPlugin;
[4]	41
[1010]	42	use ghtml;
[15872]	43	use BasePlugin;
[3542]	44	use unicode;
[286]	45	use util;
[288]	46	use doc;
[4]	47
[10254]	48	use strict;
	49	no strict 'refs'; # allow filehandles to be variables and viceversa
[288]	50
[4]	51	sub BEGIN {
[15872]	52	@HBPlugin::ISA = ('BasePlugin');
[4]	53	}
[15872]	54	my $encoding_list =
	55	[ { 'name' => "ascii",
[16019]	56	'desc' => "{BasePlugin.encoding.ascii}" },
[15872]	57	{ 'name' => "iso_8859_1",
[16019]	58	'desc' => "{HBPlugin.encoding.iso_8859_1}" } ];
[15872]	59
[6408]	60	my $arguments =
	61	[ { 'name' => "process_exp",
[15872]	62	'desc' => "{BasePlugin.process_exp}",
[6408]	63	'type' => "regexp",
	64	'reqd' => "no",
[15872]	65	'deft' => &get_default_process_exp() },
	66	{ 'name' => "input_encoding",
	67	'desc' => "{ReadTextFile.input_encoding}",
	68	'type' => "enum",
	69	'deft' => "iso_8859_1",
	70	'list' => $encoding_list,
	71	'reqd' => "no" }
[6408]	72	];
	73
[15872]	74	my $options = { 'name' => "HBPlugin",
	75	'desc' => "{HBPlugin.desc}",
[6408]	76	'abstract' => "no",
	77	'inherits' => "yes",
	78	'args' => $arguments };
[3540]	79
[4]	80	sub new {
[10218]	81	my ($class) = shift (@_);
	82	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	83	push(@$pluginlist, $class);
[4]	84
[15872]	85	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
	86	push(@{$hashArgOptLists->{"OptList"}},$options);
[10218]	87
[15872]	88	my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
[10218]	89
[4]	90	return bless $self, $class;
	91	}
	92
[1244]	93	# this is included only to prevent warnings being printed out
[15872]	94	# from BasePlugin::init. The process_exp is not used by this plugin
[1244]	95	sub get_default_process_exp {
	96	my $self = shift (@_);
	97
	98	return "This plugin does not use a process_exp\n";
	99	}
	100
	101
[4]	102	sub HB_read_html_file {
	103	my $self = shift (@_);
[318]	104	my ($htmlfile, $text) = @_;
[4]	105
	106	# load in the file
	107	if (!open (FILE, $htmlfile)) {
[1424]	108	my $outhandle = $self->{'outhandle'};
	109	print $outhandle "ERROR - could not open $htmlfile\n";
[318]	110	return;
[4]	111	}
	112
[318]	113	my $foundbody = 0;
[1244]	114	$self->HB_gettext (\$foundbody, $text, "FILE");
[318]	115	close FILE;
	116
	117	# just in case there was no <body> tag
	118	if (!$foundbody) {
	119	$foundbody = 1;
	120	open (FILE, $htmlfile) \|\| return;
[1244]	121	$self->HB_gettext (\$foundbody, $text, "FILE");
[318]	122	close FILE;
	123	}
[3542]	124	# text is in utf8
[318]	125	}
	126
[3542]	127	# converts the text to utf8, as ghtml does that for é etc.
[318]	128	sub HB_gettext {
	129	my $self = shift (@_);
	130	my ($foundbody, $text, $handle) = @_;
[1424]	131	my $outhandle = $self->{'outhandle'};
[318]	132
[4]	133	my $line = "";
[318]	134	while (defined ($line = <$handle>)) {
[4]	135	# look for body tag
[318]	136	if (!$$foundbody) {
[4]	137	if ($line =~ s/^.<body[^>]>//i) {
[318]	138	$$foundbody = 1;
[4]	139	} else {
	140	next;
	141	}
	142	}
[318]	143
[4]	144	# check for symbol fonts
	145	if ($line =~ /<font [^>]?face\s=\s*\"?(\w+)\"?/i) {
	146	my $font = $1;
[15872]	147	print $outhandle "HBPlugin::HB_gettext - warning removed font $font\n"
[4]	148	if ($font !~ /^arial$/i);
	149	}
	150
	151	$line =~ s/<\/p>//ig; # remove </p> tags
	152	$line =~ s/<\/?(body\|html\|font)\b[^>]*>//ig; # remove any unwanted tags
	153
[318]	154	$$text .= $line;
[4]	155	}
[3542]	156	#
	157	if ($self->{'input_encoding'} eq "iso_8859_1") {
	158	# convert to utf-8
	159	$$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
	160	}
	161	# convert any alphanumeric character entities to their utf-8
	162	# equivalent for indexing purposes
	163	&ghtml::convertcharentities ($$text);
	164
[318]	165	$$text =~ s/\s+/ /g; # remove \n's
[4]	166	}
	167
	168	sub HB_clean_section {
	169	my $self = shift (@_);
	170	my ($section) = @_;
	171
	172	# remove tags without a starting tag from the section
	173	my ($tag, $tagstart);
	174	while ($section =~ /<\/([^>]{1,10})>/) {
	175	$tag = $1;
	176	$tagstart = index($section, "<$tag");
	177	last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
	178	$section =~ s/<\/$tag>//;
	179	}
	180
	181	# remove extra paragraph tags
	182	while ($section =~ s/<p\b[^>]>\s<p\b/<p/ig) {}
	183
	184	# remove extra stuff at the end of the section
	185	while ($section =~ s/(<u>\|<i>\|<b>\|<p\b[^>]*>\| \|\s)$//i) {}
	186
	187	# add a newline at the beginning of each paragraph
	188	$section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
	189
	190	# add a newline every 80 characters at a word boundary
	191	# Note: this regular expression puts a line feed before
	192	# the last word in each section, even when it is not
	193	# needed.
	194	$section =~ s/(.{1,80})\s/$1\n/g;
	195
[317]	196	# fix up the image links
	197	$section =~ s/<img[^>]?src=\"?([^\">]+)\"?[^>]>/
[1020]	198	<center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
[317]	199	$section =~ s/<<I>>\s*([^\.]+\.(png\|jpg\|gif))/
[1020]	200	<center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
[4]	201
	202	return $section;
	203	}
	204
	205
	206	sub shorten {
	207	my $self = shift (@_);
	208	my ($text) = @_;
	209
	210	return "\"$text\"" if (length($text) < 100);
	211
	212	return "\"" . substr ($text, 0, 50) . "\" ... \"" .
	213	substr ($text, length($text)-50) . "\"";
	214	}
	215
[317]	216	# return number of files processed, undef if can't process
[4]	217	# Note that $base_dir might be "" and that $file might
	218	# include directories
	219	sub read {
	220	my $self = shift (@_);
[16392]	221	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[1424]	222	my $outhandle = $self->{'outhandle'};
[4]	223
[288]	224	# get the html filename and see if this is an HTML Book...
	225	my $jobnumber = $file;
[4]	226	if ($file =~ /[\\\/]/) {
	227	($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
	228	}
[317]	229	return undef unless defined $jobnumber;
[286]	230	my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
[317]	231	return undef unless -e $htmlfile;
[4]	232
[15872]	233	print STDERR "<Processing n='$file' p='HBPlugin'>\n" if ($gli);
	234	print $outhandle "HBPlugin: processing $file\n";
[4]	235
	236	# read in the file and do basic html cleaning (removing header etc)
[318]	237	my $html = "";
	238	$self->HB_read_html_file ($htmlfile, \$html);
[3542]	239	# html is in utf8
[4]	240
	241	# create a new document
[18327]	242	my $doc_obj = new doc ($file, "indexed_doc", $self->{'file_rename_method'});
[4]	243
	244	# copy the book cover if it exists
[286]	245	my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg");
[796]	246	$doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg");
[7508]	247	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
[8166]	248	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileFormat", "HB");
	249	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $htmlfile));
[4]	250
[317]	251	my $cursection = $doc_obj->get_top_section();
[8121]	252
[317]	253	# add metadata for top level of document
[1244]	254	foreach my $field (keys(%$metadata)) {
[317]	255	# $metadata->{$field} may be an array reference
	256	if (ref ($metadata->{$field}) eq "ARRAY") {
[15872]	257	map {
	258	$doc_obj->add_utf8_metadata($cursection, $field, $_);
[317]	259	} @{$metadata->{$field}};
	260	} else {
[15872]	261	$doc_obj->add_utf8_metadata($cursection, $field, $metadata->{$field});
[317]	262	}
	263	}
	264
[4]	265	# process the file one section at a time
	266	my $curtoclevel = 1;
	267	my $firstsection = 1;
	268	while (length ($html) > 0) {
[339]	269	if ($html =~ s/^.?(?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s))<<TOC(\d+)>>\s(.*?)<p\b/<p/i) {
[4]	270	my $toclevel = $3;
	271	my $title = $4;
	272	my $sectiontext = "";
[321]	273	if ($html =~ s/^(.?)((?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s)*)<<TOC\d+>>)/$2/i) {
[4]	274	$sectiontext = $1;
	275	} else {
	276	$sectiontext = $html;
	277	$html = "";
	278	}
	279
	280	# remove tags and extra spaces from the title
	281	$title =~ s/<\/?[^>]+>//g;
	282	$title =~ s/^\s+\|\s+$//g;
	283
	284	# close any sections below the current level and
	285	# create a new section (special case for the firstsection)
	286	while (($curtoclevel > $toclevel) \|\|
	287	(!$firstsection && $curtoclevel == $toclevel)) {
	288	$cursection = $doc_obj->get_parent_section ($cursection);
	289	$curtoclevel--;
	290	}
	291	if ($curtoclevel+1 < $toclevel) {
[1424]	292	print $outhandle "WARNING - jump in toc levels in $htmlfile " .
[4]	293	"from $curtoclevel to $toclevel\n";
	294	}
	295	while ($curtoclevel < $toclevel) {
	296	$curtoclevel++;
	297	$cursection =
	298	$doc_obj->insert_section($doc_obj->get_end_child($cursection));
	299	}
	300
	301	# add the metadata to this section
[15872]	302	$doc_obj->add_utf8_metadata($cursection, "Title", $title);
[286]	303
[4]	304	# clean up the section html
	305	$sectiontext = $self->HB_clean_section($sectiontext);
	306
	307	# associate any files
[286]	308	map { $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $1), $1)
[1020]	309	if /_httpdocimg_\/([^\"]+)\"/; 0; }
	310	split (/(_httpdocimg_\/[^\"]+\")/, $sectiontext);
[4]	311
	312	# add the text for this section
[3542]	313	$doc_obj->add_utf8_text ($cursection, $sectiontext);
[4]	314	} else {
[1424]	315	print $outhandle "WARNING - leftover text\n" , $self->shorten($html),
[4]	316	"\nin $htmlfile\n";
	317	last;
	318	}
[317]	319	$firstsection = 0;
[4]	320	}
	321
	322	# add a OID
[17026]	323	$self->add_OID($doc_obj);
[4]	324
	325	# process the document
[288]	326	$processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm"));
[4]	327
	328	return 1; # processed the file
	329	}
	330
	331
	332	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: