Context Navigation

source: trunk/gsdl/perllib/plugins/HBPlug.pm@ 5845

Last change on this file since 5845 was 5680, checked in by mdewsnip, 21 years ago
Moved plugin descriptions into the resource bundle (perllib/strings.rb), for translation by UNESCO.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 10.2 KB

Line
1	###########################################################################
2	#
3	# HBPlug.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# plugin which processes an HTML book directory
27
28	# This plugin is used by the Humanity Library collections and does not handle
29	# input encodings other than ascii or extended ascii
30
31	# this code is kind of ugly and could no doubt be made to run faster, by leaving
32	# it in this state I hope to encourage people to make their collections use
33	# HBSPlug instead ;-)
34
35	# Use HBSPlug if creating a new collection and marking up files like the
36	# Humanity Library collections. HBSPlug accepts all input encodings but
37	# expects the marked up files to be cleaner than those used by the
38	# Humanity Library collections
39
40	# 12/05/02 Added usage datastructure - John Thompson
41
42	package HBPlug;
43
44	use ghtml;
45	use BasPlug;
46	use unicode;
47	use util;
48	use doc;
49
50
51	sub BEGIN {
52	@ISA = ('BasPlug');
53	}
54
55	my $options = { 'name' => "HBPlug",
56	'desc' => "{HBPlug.desc}",
57	'inherits' => "yes" };
58
59	sub new {
60	my ($class) = @_;
61	my $self = new BasPlug ("HBPlug", @_);
62
63	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
64	my $option_list = $self->{'option_list'};
65	push( @{$option_list}, $options );
66
67	return bless $self, $class;
68	}
69
70	sub init {
71	my $self = shift (@_);
72	my ($verbosity, $outhandle) = @_;
73
74	$self->BasPlug::init($verbosity, $outhandle);
75	$self->{'input_encoding'} = "iso_8859_1";
76
77	# this plugin only handles ascii encodings
78	if ($self->{'input_encoding'} !~ /^(iso_8859_1\|ascii)$/) {
79	die "ERROR: HBPlug can handle only iso_8859_1 or ascii encodings.\n" .
80	$self->{'input_encoding'} . " is not an acceptable input_encoding value\n";
81	}
82	}
83
84	# this is included only to prevent warnings being printed out
85	# from BasPlug::init. The process_exp is not used by this plugin
86	sub get_default_process_exp {
87	my $self = shift (@_);
88
89	return "This plugin does not use a process_exp\n";
90	}
91
92
93	sub HB_read_html_file {
94	my $self = shift (@_);
95	my ($htmlfile, $text) = @_;
96
97	# load in the file
98	if (!open (FILE, $htmlfile)) {
99	my $outhandle = $self->{'outhandle'};
100	print $outhandle "ERROR - could not open $htmlfile\n";
101	return;
102	}
103
104	my $foundbody = 0;
105	$self->HB_gettext (\$foundbody, $text, "FILE");
106	close FILE;
107
108	# just in case there was no <body> tag
109	if (!$foundbody) {
110	$foundbody = 1;
111	open (FILE, $htmlfile) \|\| return;
112	$self->HB_gettext (\$foundbody, $text, "FILE");
113	close FILE;
114	}
115	# text is in utf8
116	}
117
118	# converts the text to utf8, as ghtml does that for é etc.
119	sub HB_gettext {
120	my $self = shift (@_);
121	my ($foundbody, $text, $handle) = @_;
122	my $outhandle = $self->{'outhandle'};
123
124	my $line = "";
125	while (defined ($line = <$handle>)) {
126	# look for body tag
127	if (!$$foundbody) {
128	if ($line =~ s/^.<body[^>]>//i) {
129	$$foundbody = 1;
130	} else {
131	next;
132	}
133	}
134
135	# check for symbol fonts
136	if ($line =~ /<font [^>]?face\s=\s*\"?(\w+)\"?/i) {
137	my $font = $1;
138	print $outhandle "HBPlug::HB_gettext - warning removed font $font\n"
139	if ($font !~ /^arial$/i);
140	}
141
142	$line =~ s/<\/p>//ig; # remove </p> tags
143	$line =~ s/<\/?(body\|html\|font)\b[^>]*>//ig; # remove any unwanted tags
144
145	$$text .= $line;
146	}
147	#
148	if ($self->{'input_encoding'} eq "iso_8859_1") {
149	# convert to utf-8
150	$$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
151	}
152	# convert any alphanumeric character entities to their utf-8
153	# equivalent for indexing purposes
154	&ghtml::convertcharentities ($$text);
155
156	$$text =~ s/\s+/ /g; # remove \n's
157	}
158
159	sub HB_clean_section {
160	my $self = shift (@_);
161	my ($section) = @_;
162
163	# remove tags without a starting tag from the section
164	my ($tag, $tagstart);
165	while ($section =~ /<\/([^>]{1,10})>/) {
166	$tag = $1;
167	$tagstart = index($section, "<$tag");
168	last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
169	$section =~ s/<\/$tag>//;
170	}
171
172	# remove extra paragraph tags
173	while ($section =~ s/<p\b[^>]>\s<p\b/<p/ig) {}
174
175	# remove extra stuff at the end of the section
176	while ($section =~ s/(<u>\|<i>\|<b>\|<p\b[^>]*>\| \|\s)$//i) {}
177
178	# add a newline at the beginning of each paragraph
179	$section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
180
181	# add a newline every 80 characters at a word boundary
182	# Note: this regular expression puts a line feed before
183	# the last word in each section, even when it is not
184	# needed.
185	$section =~ s/(.{1,80})\s/$1\n/g;
186
187	# fix up the image links
188	$section =~ s/<img[^>]?src=\"?([^\">]+)\"?[^>]>/
189	<center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
190	$section =~ s/<<I>>\s*([^\.]+\.(png\|jpg\|gif))/
191	<center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
192
193	return $section;
194	}
195
196
197	sub shorten {
198	my $self = shift (@_);
199	my ($text) = @_;
200
201	return "\"$text\"" if (length($text) < 100);
202
203	return "\"" . substr ($text, 0, 50) . "\" ... \"" .
204	substr ($text, length($text)-50) . "\"";
205	}
206
207	# if input_encoding is ascii we can call add_utf8_metadata
208	# directly but if it's iso_8859_1 (the default) we need to call
209	# add_metadata so that the ascii2utf8 conversion is done first
210	# this should speed things up a little if processing an ascii only
211	# document with input_encoding set to ascii
212	sub HB_add_metadata {
213	my $self = shift (@_);
214	my ($doc_obj, $cursection, $field, $value) = @_;
215
216	# All text should now be in utf-8
217	# if ($self->{'input_encoding'} eq "ascii") {
218	$doc_obj->add_utf8_metadata ($cursection, $field, $value);
219	# } else {
220	# $doc_obj->add_metadata ($cursection, $field, $value);
221	# }
222	}
223
224	# return number of files processed, undef if can't process
225	# Note that $base_dir might be "" and that $file might
226	# include directories
227	sub read {
228	my $self = shift (@_);
229	my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
230	my $outhandle = $self->{'outhandle'};
231
232	# get the html filename and see if this is an HTML Book...
233	my $jobnumber = $file;
234	if ($file =~ /[\\\/]/) {
235	($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
236	}
237	return undef unless defined $jobnumber;
238	my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
239	return undef unless -e $htmlfile;
240
241	print $outhandle "HBPlug: processing $file\n";
242
243	# read in the file and do basic html cleaning (removing header etc)
244	my $html = "";
245	$self->HB_read_html_file ($htmlfile, \$html);
246	# html is in utf8
247
248	# create a new document
249	my $doc_obj = new doc ($file, "indexed_doc");
250	$doc_obj->set_OIDtype ($processor->{'OIDtype'});
251
252	# copy the book cover if it exists
253	my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg");
254	$doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg");
255
256	my $cursection = $doc_obj->get_top_section();
257
258	# add metadata for top level of document
259	foreach my $field (keys(%$metadata)) {
260	# $metadata->{$field} may be an array reference
261	if (ref ($metadata->{$field}) eq "ARRAY") {
262	map {
263	$self->HB_add_metadata ($doc_obj, $cursection, $field, $_);
264	} @{$metadata->{$field}};
265	} else {
266	$self->HB_add_metadata ($doc_obj, $cursection, $field, $metadata->{$field});
267	}
268	}
269
270	# process the file one section at a time
271	my $curtoclevel = 1;
272	my $firstsection = 1;
273	while (length ($html) > 0) {
274	if ($html =~ s/^.?(?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s))<<TOC(\d+)>>\s(.*?)<p\b/<p/i) {
275	my $toclevel = $3;
276	my $title = $4;
277	my $sectiontext = "";
278	if ($html =~ s/^(.?)((?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s)*)<<TOC\d+>>)/$2/i) {
279	$sectiontext = $1;
280	} else {
281	$sectiontext = $html;
282	$html = "";
283	}
284
285	# remove tags and extra spaces from the title
286	$title =~ s/<\/?[^>]+>//g;
287	$title =~ s/^\s+\|\s+$//g;
288
289	# close any sections below the current level and
290	# create a new section (special case for the firstsection)
291	while (($curtoclevel > $toclevel) \|\|
292	(!$firstsection && $curtoclevel == $toclevel)) {
293	$cursection = $doc_obj->get_parent_section ($cursection);
294	$curtoclevel--;
295	}
296	if ($curtoclevel+1 < $toclevel) {
297	print $outhandle "WARNING - jump in toc levels in $htmlfile " .
298	"from $curtoclevel to $toclevel\n";
299	}
300	while ($curtoclevel < $toclevel) {
301	$curtoclevel++;
302	$cursection =
303	$doc_obj->insert_section($doc_obj->get_end_child($cursection));
304	}
305
306	# add the metadata to this section
307	$self->HB_add_metadata ($doc_obj, $cursection, "Title", $title);
308
309	# clean up the section html
310	$sectiontext = $self->HB_clean_section($sectiontext);
311
312	# associate any files
313	map { $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $1), $1)
314	if /_httpdocimg_\/([^\"]+)\"/; 0; }
315	split (/(_httpdocimg_\/[^\"]+\")/, $sectiontext);
316
317	# add the text for this section
318	# All read text should now be in utf-8
319	# if ($self->{'input_encoding'} eq "ascii") {
320	$doc_obj->add_utf8_text ($cursection, $sectiontext);
321	# } else {
322	# $doc_obj->add_text ($cursection, $sectiontext);
323	# }
324	} else {
325	print $outhandle "WARNING - leftover text\n" , $self->shorten($html),
326	"\nin $htmlfile\n";
327	last;
328	}
329	$firstsection = 0;
330	}
331
332	# add a OID
333	$doc_obj->set_OID ();
334
335	# process the document
336	$processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm"));
337
338	return 1; # processed the file
339	}
340
341
342	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: