Context Navigation

source: branches/New_Config_Format-branch/gsdl/perllib/plugins/HBPlug.pm@ 1279

Last change on this file since 1279 was 1279, checked in by sjboddie, 24 years ago
merged changes to trunk into New_Config_Format branch
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 9.2 KB

Line
1	###########################################################################
2	#
3	# HBPlug.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# plugin which processes an HTML book directory
27
28	# This plugin is used by the Humanity Library collections and does not handle
29	# input encodings other than ascii or extended ascii
30
31	# this code is kind of ugly and could no doubt be made to run faster, by leaving
32	# it in this state I hope to encourage people to make their collections use
33	# HBSPlug instead ;-)
34
35	# Use HBSPlug if creating a new collection and marking up files like the
36	# Humanity Library collections. HBSPlug accepts all input encodings but
37	# expects the marked up files to be cleaner than those used by the
38	# Humanity Library collections
39
40	package HBPlug;
41
42	use ghtml;
43	use BasPlug;
44	use util;
45	use doc;
46
47
48	sub BEGIN {
49	@ISA = ('BasPlug');
50	}
51
52	sub new {
53	my ($class) = @_;
54	my $self = new BasPlug ("HBPlug", @_);
55
56	return bless $self, $class;
57	}
58
59	sub init {
60	my $self = shift (@_);
61	my ($verbosity) = @_;
62
63	$self->BasPlug::init();
64
65	# this plugin only handles ascii encodings
66	if ($self->{'input_encoding'} !~ /^(iso_8859_1\|ascii)$/) {
67	die "ERROR: HBPlug can handle only iso_8859_1 or ascii encodings.\n" .
68	$self->{'input_encoding'} . " is not an acceptable input_encoding value\n";
69	}
70	}
71
72	# this is included only to prevent warnings being printed out
73	# from BasPlug::init. The process_exp is not used by this plugin
74	sub get_default_process_exp {
75	my $self = shift (@_);
76
77	return "This plugin does not use a process_exp\n";
78	}
79
80
81	sub HB_read_html_file {
82	my $self = shift (@_);
83	my ($htmlfile, $text) = @_;
84
85	# load in the file
86	if (!open (FILE, $htmlfile)) {
87	print STDERR "ERROR - could not open $htmlfile\n";
88	return;
89	}
90
91	my $foundbody = 0;
92	$self->HB_gettext (\$foundbody, $text, "FILE");
93	close FILE;
94
95	# just in case there was no <body> tag
96	if (!$foundbody) {
97	$foundbody = 1;
98	open (FILE, $htmlfile) \|\| return;
99	$self->HB_gettext (\$foundbody, $text, "FILE");
100	close FILE;
101	}
102	}
103
104	sub HB_gettext {
105	my $self = shift (@_);
106	my ($foundbody, $text, $handle) = @_;
107
108	my $line = "";
109	while (defined ($line = <$handle>)) {
110	# look for body tag
111	if (!$$foundbody) {
112	if ($line =~ s/^.<body[^>]>//i) {
113	$$foundbody = 1;
114	} else {
115	next;
116	}
117	}
118
119	# check for symbol fonts
120	if ($line =~ /<font [^>]?face\s=\s*\"?(\w+)\"?/i) {
121	my $font = $1;
122	print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
123	if ($font !~ /^arial$/i);
124	}
125
126	$line =~ s/<\/p>//ig; # remove </p> tags
127	$line =~ s/<\/?(body\|html\|font)\b[^>]*>//ig; # remove any unwanted tags
128
129	# convert any alphanumeric character entities to their extended
130	# ascii equivalent for indexing purposes
131	&ghtml::convertcharentities ($line);
132
133	$$text .= $line;
134	}
135	$$text =~ s/\s+/ /g; # remove \n's
136	}
137
138	sub HB_clean_section {
139	my $self = shift (@_);
140	my ($section) = @_;
141
142	# remove tags without a starting tag from the section
143	my ($tag, $tagstart);
144	while ($section =~ /<\/([^>]{1,10})>/) {
145	$tag = $1;
146	$tagstart = index($section, "<$tag");
147	last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
148	$section =~ s/<\/$tag>//;
149	}
150
151	# remove extra paragraph tags
152	while ($section =~ s/<p\b[^>]>\s<p\b/<p/ig) {}
153
154	# remove extra stuff at the end of the section
155	while ($section =~ s/(<u>\|<i>\|<b>\|<p\b[^>]*>\| \|\s)$//i) {}
156
157	# add a newline at the beginning of each paragraph
158	$section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
159
160	# add a newline every 80 characters at a word boundary
161	# Note: this regular expression puts a line feed before
162	# the last word in each section, even when it is not
163	# needed.
164	$section =~ s/(.{1,80})\s/$1\n/g;
165
166	# fix up the image links
167	$section =~ s/<img[^>]?src=\"?([^\">]+)\"?[^>]>/
168	<center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
169	$section =~ s/<<I>>\s*([^\.]+\.(png\|jpg\|gif))/
170	<center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
171
172	return $section;
173	}
174
175
176	sub shorten {
177	my $self = shift (@_);
178	my ($text) = @_;
179
180	return "\"$text\"" if (length($text) < 100);
181
182	return "\"" . substr ($text, 0, 50) . "\" ... \"" .
183	substr ($text, length($text)-50) . "\"";
184	}
185
186	# if input_encoding is ascii we can call add_utf8_metadata
187	# directly but if it's iso_8859_1 (the default) we need to call
188	# add_metadata so that the ascii2utf8 conversion is done first
189	# this should speed things up a little if processing an ascii only
190	# document with input_encoding set to ascii
191	sub HB_add_metadata {
192	my $self = shift (@_);
193	my ($doc_obj, $cursection, $field, $value) = @_;
194
195	if ($self->{'input_encoding'} eq "ascii") {
196	$doc_obj->add_utf8_metadata ($cursection, $field, $value);
197	} else {
198	$doc_obj->add_metadata ($cursection, $field, $value);
199	}
200	}
201
202	# return number of files processed, undef if can't process
203	# Note that $base_dir might be "" and that $file might
204	# include directories
205	sub read {
206	my $self = shift (@_);
207	my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
208
209	# get the html filename and see if this is an HTML Book...
210	my $jobnumber = $file;
211	if ($file =~ /[\\\/]/) {
212	($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
213	}
214	return undef unless defined $jobnumber;
215	my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
216	return undef unless -e $htmlfile;
217
218	print STDERR "HBPlug: processing $file\n";
219
220	# read in the file and do basic html cleaning (removing header etc)
221	my $html = "";
222	$self->HB_read_html_file ($htmlfile, \$html);
223
224	# create a new document
225	my $doc_obj = new doc ($file, "indexed_doc");
226
227	# copy the book cover if it exists
228	my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg");
229	$doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg");
230
231	my $cursection = $doc_obj->get_top_section();
232
233	# add metadata for top level of document
234	foreach my $field (keys(%$metadata)) {
235	# $metadata->{$field} may be an array reference
236	if (ref ($metadata->{$field}) eq "ARRAY") {
237	map {
238	$self->HB_add_metadata ($doc_obj, $cursection, $field, $_);
239	} @{$metadata->{$field}};
240	} else {
241	$self->HB_add_metadata ($doc_obj, $cursection, $field, $metadata->{$field});
242	}
243	}
244
245	# process the file one section at a time
246	my $curtoclevel = 1;
247	my $firstsection = 1;
248	while (length ($html) > 0) {
249	if ($html =~ s/^.?(?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s))<<TOC(\d+)>>\s(.*?)<p\b/<p/i) {
250	my $toclevel = $3;
251	my $title = $4;
252	my $sectiontext = "";
253	if ($html =~ s/^(.?)((?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s)*)<<TOC\d+>>)/$2/i) {
254	$sectiontext = $1;
255	} else {
256	$sectiontext = $html;
257	$html = "";
258	}
259
260	# remove tags and extra spaces from the title
261	$title =~ s/<\/?[^>]+>//g;
262	$title =~ s/^\s+\|\s+$//g;
263
264	# close any sections below the current level and
265	# create a new section (special case for the firstsection)
266	while (($curtoclevel > $toclevel) \|\|
267	(!$firstsection && $curtoclevel == $toclevel)) {
268	$cursection = $doc_obj->get_parent_section ($cursection);
269	$curtoclevel--;
270	}
271	if ($curtoclevel+1 < $toclevel) {
272	print STDERR "WARNING - jump in toc levels in $htmlfile " .
273	"from $curtoclevel to $toclevel\n";
274	}
275	while ($curtoclevel < $toclevel) {
276	$curtoclevel++;
277	$cursection =
278	$doc_obj->insert_section($doc_obj->get_end_child($cursection));
279	}
280
281	# add the metadata to this section
282	$self->HB_add_metadata ($doc_obj, $cursection, "Title", $title);
283
284	# clean up the section html
285	$sectiontext = $self->HB_clean_section($sectiontext);
286
287	# associate any files
288	map { $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $1), $1)
289	if /_httpdocimg_\/([^\"]+)\"/; 0; }
290	split (/(_httpdocimg_\/[^\"]+\")/, $sectiontext);
291
292	# add the text for this section
293	if ($self->{'input_encoding'} eq "ascii") {
294	$doc_obj->add_utf8_text ($cursection, $sectiontext);
295	} else {
296	$doc_obj->add_text ($cursection, $sectiontext);
297	}
298	} else {
299	print STDERR "WARNING - leftover text\n" , $self->shorten($html),
300	"\nin $htmlfile\n";
301	last;
302	}
303	$firstsection = 0;
304	}
305
306	# add a OID
307	$doc_obj->set_OID ();
308
309	# process the document
310	$processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm"));
311
312	return 1; # processed the file
313	}
314
315
316	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: