Context Navigation

source: trunk/gsdl/perllib/plugins/HBPlug.pm@ 5096

Last change on this file since 5096 was 4744, checked in by mdewsnip, 21 years ago
Tidied up and structures (representing the options of the plugin) in preparation for removing the print_usage() routines.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 10.7 KB

Line
1	###########################################################################
2	#
3	# HBPlug.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# plugin which processes an HTML book directory
27
28	# This plugin is used by the Humanity Library collections and does not handle
29	# input encodings other than ascii or extended ascii
30
31	# this code is kind of ugly and could no doubt be made to run faster, by leaving
32	# it in this state I hope to encourage people to make their collections use
33	# HBSPlug instead ;-)
34
35	# Use HBSPlug if creating a new collection and marking up files like the
36	# Humanity Library collections. HBSPlug accepts all input encodings but
37	# expects the marked up files to be cleaner than those used by the
38	# Humanity Library collections
39
40	# 12/05/02 Added usage datastructure - John Thompson
41
42	package HBPlug;
43
44	use ghtml;
45	use BasPlug;
46	use unicode;
47	use util;
48	use doc;
49
50
51	sub BEGIN {
52	@ISA = ('BasPlug');
53	}
54
55	my $options = { 'name' => "HBPlug",
56	'desc' => "Plugin which processes an HTML book directory. This plugin is used by the Humanity Library collections and does not handle input encodings other than ascii or extended ascii. This code is kind of ugly and could no doubt be made to run faster, by leaving it in this state I hope to encourage people to make their collections use HBSPlug instead ;-)\n\nUse HBSPlug if creating a new collection and marking up files like the Humanity Library collections. HBSPlug accepts all input encodings but expects the marked up files to be cleaner than those used by the Humanity Library collections",
57	'inherits' => "yes" };
58
59	sub new {
60	my ($class) = @_;
61	my $self = new BasPlug ("HBPlug", @_);
62
63	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
64	my $option_list = $self->{'option_list'};
65	push( @{$option_list}, $options );
66
67	return bless $self, $class;
68	}
69
70	sub init {
71	my $self = shift (@_);
72	my ($verbosity, $outhandle) = @_;
73
74	$self->BasPlug::init($verbosity, $outhandle);
75	$self->{'input_encoding'} = "iso_8859_1";
76
77	# this plugin only handles ascii encodings
78	if ($self->{'input_encoding'} !~ /^(iso_8859_1\|ascii)$/) {
79	die "ERROR: HBPlug can handle only iso_8859_1 or ascii encodings.\n" .
80	$self->{'input_encoding'} . " is not an acceptable input_encoding value\n";
81	}
82	}
83
84	# this is included only to prevent warnings being printed out
85	# from BasPlug::init. The process_exp is not used by this plugin
86	sub get_default_process_exp {
87	my $self = shift (@_);
88
89	return "This plugin does not use a process_exp\n";
90	}
91
92
93	sub HB_read_html_file {
94	my $self = shift (@_);
95	my ($htmlfile, $text) = @_;
96
97	# load in the file
98	if (!open (FILE, $htmlfile)) {
99	my $outhandle = $self->{'outhandle'};
100	print $outhandle "ERROR - could not open $htmlfile\n";
101	return;
102	}
103
104	my $foundbody = 0;
105	$self->HB_gettext (\$foundbody, $text, "FILE");
106	close FILE;
107
108	# just in case there was no <body> tag
109	if (!$foundbody) {
110	$foundbody = 1;
111	open (FILE, $htmlfile) \|\| return;
112	$self->HB_gettext (\$foundbody, $text, "FILE");
113	close FILE;
114	}
115	# text is in utf8
116	}
117
118	# converts the text to utf8, as ghtml does that for é etc.
119	sub HB_gettext {
120	my $self = shift (@_);
121	my ($foundbody, $text, $handle) = @_;
122	my $outhandle = $self->{'outhandle'};
123
124	my $line = "";
125	while (defined ($line = <$handle>)) {
126	# look for body tag
127	if (!$$foundbody) {
128	if ($line =~ s/^.<body[^>]>//i) {
129	$$foundbody = 1;
130	} else {
131	next;
132	}
133	}
134
135	# check for symbol fonts
136	if ($line =~ /<font [^>]?face\s=\s*\"?(\w+)\"?/i) {
137	my $font = $1;
138	print $outhandle "HBPlug::HB_gettext - warning removed font $font\n"
139	if ($font !~ /^arial$/i);
140	}
141
142	$line =~ s/<\/p>//ig; # remove </p> tags
143	$line =~ s/<\/?(body\|html\|font)\b[^>]*>//ig; # remove any unwanted tags
144
145	$$text .= $line;
146	}
147	#
148	if ($self->{'input_encoding'} eq "iso_8859_1") {
149	# convert to utf-8
150	$$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
151	}
152	# convert any alphanumeric character entities to their utf-8
153	# equivalent for indexing purposes
154	&ghtml::convertcharentities ($$text);
155
156	$$text =~ s/\s+/ /g; # remove \n's
157	}
158
159	sub HB_clean_section {
160	my $self = shift (@_);
161	my ($section) = @_;
162
163	# remove tags without a starting tag from the section
164	my ($tag, $tagstart);
165	while ($section =~ /<\/([^>]{1,10})>/) {
166	$tag = $1;
167	$tagstart = index($section, "<$tag");
168	last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
169	$section =~ s/<\/$tag>//;
170	}
171
172	# remove extra paragraph tags
173	while ($section =~ s/<p\b[^>]>\s<p\b/<p/ig) {}
174
175	# remove extra stuff at the end of the section
176	while ($section =~ s/(<u>\|<i>\|<b>\|<p\b[^>]*>\| \|\s)$//i) {}
177
178	# add a newline at the beginning of each paragraph
179	$section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
180
181	# add a newline every 80 characters at a word boundary
182	# Note: this regular expression puts a line feed before
183	# the last word in each section, even when it is not
184	# needed.
185	$section =~ s/(.{1,80})\s/$1\n/g;
186
187	# fix up the image links
188	$section =~ s/<img[^>]?src=\"?([^\">]+)\"?[^>]>/
189	<center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
190	$section =~ s/<<I>>\s*([^\.]+\.(png\|jpg\|gif))/
191	<center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
192
193	return $section;
194	}
195
196
197	sub shorten {
198	my $self = shift (@_);
199	my ($text) = @_;
200
201	return "\"$text\"" if (length($text) < 100);
202
203	return "\"" . substr ($text, 0, 50) . "\" ... \"" .
204	substr ($text, length($text)-50) . "\"";
205	}
206
207	# if input_encoding is ascii we can call add_utf8_metadata
208	# directly but if it's iso_8859_1 (the default) we need to call
209	# add_metadata so that the ascii2utf8 conversion is done first
210	# this should speed things up a little if processing an ascii only
211	# document with input_encoding set to ascii
212	sub HB_add_metadata {
213	my $self = shift (@_);
214	my ($doc_obj, $cursection, $field, $value) = @_;
215
216	# All text should now be in utf-8
217	# if ($self->{'input_encoding'} eq "ascii") {
218	$doc_obj->add_utf8_metadata ($cursection, $field, $value);
219	# } else {
220	# $doc_obj->add_metadata ($cursection, $field, $value);
221	# }
222	}
223
224	# return number of files processed, undef if can't process
225	# Note that $base_dir might be "" and that $file might
226	# include directories
227	sub read {
228	my $self = shift (@_);
229	my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
230	my $outhandle = $self->{'outhandle'};
231
232	# get the html filename and see if this is an HTML Book...
233	my $jobnumber = $file;
234	if ($file =~ /[\\\/]/) {
235	($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
236	}
237	return undef unless defined $jobnumber;
238	my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
239	return undef unless -e $htmlfile;
240
241	print $outhandle "HBPlug: processing $file\n";
242
243	# read in the file and do basic html cleaning (removing header etc)
244	my $html = "";
245	$self->HB_read_html_file ($htmlfile, \$html);
246	# html is in utf8
247
248	# create a new document
249	my $doc_obj = new doc ($file, "indexed_doc");
250	$doc_obj->set_OIDtype ($processor->{'OIDtype'});
251
252	# copy the book cover if it exists
253	my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg");
254	$doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg");
255
256	my $cursection = $doc_obj->get_top_section();
257
258	# add metadata for top level of document
259	foreach my $field (keys(%$metadata)) {
260	# $metadata->{$field} may be an array reference
261	if (ref ($metadata->{$field}) eq "ARRAY") {
262	map {
263	$self->HB_add_metadata ($doc_obj, $cursection, $field, $_);
264	} @{$metadata->{$field}};
265	} else {
266	$self->HB_add_metadata ($doc_obj, $cursection, $field, $metadata->{$field});
267	}
268	}
269
270	# process the file one section at a time
271	my $curtoclevel = 1;
272	my $firstsection = 1;
273	while (length ($html) > 0) {
274	if ($html =~ s/^.?(?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s))<<TOC(\d+)>>\s(.*?)<p\b/<p/i) {
275	my $toclevel = $3;
276	my $title = $4;
277	my $sectiontext = "";
278	if ($html =~ s/^(.?)((?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s)*)<<TOC\d+>>)/$2/i) {
279	$sectiontext = $1;
280	} else {
281	$sectiontext = $html;
282	$html = "";
283	}
284
285	# remove tags and extra spaces from the title
286	$title =~ s/<\/?[^>]+>//g;
287	$title =~ s/^\s+\|\s+$//g;
288
289	# close any sections below the current level and
290	# create a new section (special case for the firstsection)
291	while (($curtoclevel > $toclevel) \|\|
292	(!$firstsection && $curtoclevel == $toclevel)) {
293	$cursection = $doc_obj->get_parent_section ($cursection);
294	$curtoclevel--;
295	}
296	if ($curtoclevel+1 < $toclevel) {
297	print $outhandle "WARNING - jump in toc levels in $htmlfile " .
298	"from $curtoclevel to $toclevel\n";
299	}
300	while ($curtoclevel < $toclevel) {
301	$curtoclevel++;
302	$cursection =
303	$doc_obj->insert_section($doc_obj->get_end_child($cursection));
304	}
305
306	# add the metadata to this section
307	$self->HB_add_metadata ($doc_obj, $cursection, "Title", $title);
308
309	# clean up the section html
310	$sectiontext = $self->HB_clean_section($sectiontext);
311
312	# associate any files
313	map { $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $1), $1)
314	if /_httpdocimg_\/([^\"]+)\"/; 0; }
315	split (/(_httpdocimg_\/[^\"]+\")/, $sectiontext);
316
317	# add the text for this section
318	# All read text should now be in utf-8
319	# if ($self->{'input_encoding'} eq "ascii") {
320	$doc_obj->add_utf8_text ($cursection, $sectiontext);
321	# } else {
322	# $doc_obj->add_text ($cursection, $sectiontext);
323	# }
324	} else {
325	print $outhandle "WARNING - leftover text\n" , $self->shorten($html),
326	"\nin $htmlfile\n";
327	last;
328	}
329	$firstsection = 0;
330	}
331
332	# add a OID
333	$doc_obj->set_OID ();
334
335	# process the document
336	$processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm"));
337
338	return 1; # processed the file
339	}
340
341
342	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: