Context Navigation

source: gsdl/trunk/perllib/plugins/HBPlug.pm@ 14661

Last change on this file since 14661 was 12270, checked in by kjdon, 18 years ago
set_OIDtype now takes two arguments, the type and the metadata (used if type=assigned)
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 10.9 KB

Line
1	###########################################################################
2	#
3	# HBPlug.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# plugin which processes an HTML book directory
27
28	# This plugin is used by the Humanity Library collections and does not handle
29	# input encodings other than ascii or extended ascii
30
31	# this code is kind of ugly and could no doubt be made to run faster, by leaving
32	# it in this state I hope to encourage people to make their collections use
33	# HBSPlug instead ;-)
34
35	# Use HBSPlug if creating a new collection and marking up files like the
36	# Humanity Library collections. HBSPlug accepts all input encodings but
37	# expects the marked up files to be cleaner than those used by the
38	# Humanity Library collections
39
40	package HBPlug;
41
42	use ghtml;
43	use BasPlug;
44	use unicode;
45	use util;
46	use doc;
47
48	use strict;
49	no strict 'refs'; # allow filehandles to be variables and viceversa
50
# Runs at compile time: declare BasPlug as the parent class of HBPlug so that
# inherited methods resolve through it.
sub BEGIN {
    @HBPlug::ISA = ('BasPlug');
}
54
# Argument descriptions contributed by this plugin. Only process_exp is
# declared; its default comes from get_default_process_exp() below.
my $arguments =
    [ { 'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp() }
    ];

# Option record describing the plugin itself; new() appends it to the
# caller-supplied "OptList".
my $options = { 'name'     => "HBPlug",
                'desc'     => "{HBPlug.desc}",
                'abstract' => "no",
                'inherits' => "yes",
                'args'     => $arguments };
68
# Constructor.
#
# Arguments:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the BasPlug constructor
#   $hashArgOptLists - hash ref; this plugin's argument and option
#                      descriptions are appended to its "ArgList" and
#                      "OptList" entries
#
# Returns the object built by BasPlug->new, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) {
        push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push(@{$hashArgOptLists->{"OptList"}}, $options);
    }

    # Explicit class-method call: the indirect-object form "new BasPlug(...)"
    # is ambiguous because this package defines its own new().
    my $self = BasPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
81
# Initialise the plugin.
#
# Arguments:
#   $verbosity, $outhandle - forwarded unchanged to BasPlug::init
#
# After the base-class initialisation the input encoding is forced to
# iso_8859_1. Dies if the resulting encoding is neither iso_8859_1 nor ascii.
sub init {
    my $self = shift (@_);
    my ($verbosity, $outhandle) = @_;

    $self->BasPlug::init($verbosity, $outhandle);
    $self->{'input_encoding'} = "iso_8859_1";

    # this plugin only handles ascii encodings
    if ($self->{'input_encoding'} !~ /^(iso_8859_1|ascii)$/) {
        die "ERROR: HBPlug can handle only iso_8859_1 or ascii encodings.\n" .
            $self->{'input_encoding'} . " is not an acceptable input_encoding value\n";
    }
}
95
# Supplies the default for the process_exp argument.
# This is included only to prevent warnings being printed out from
# BasPlug::init; the process_exp is not used by this plugin. Any arguments
# are ignored, so it may be called as a plain function or as a method.
sub get_default_process_exp {
    return "This plugin does not use a process_exp\n";
}
103
104
# Read $htmlfile and append its cleaned-up contents to the string that $text
# refers to.
#
# Arguments:
#   $htmlfile - path of the HTML file to load
#   $text     - reference to the string that receives the text
#
# The file is first read looking for a <body> tag (see HB_gettext). If none
# is found the file is read a second time with the "body found" flag already
# set, so that the whole file is taken. If the first open fails an error is
# printed to $self->{'outhandle'}; if the second open fails the sub returns
# silently. Nothing useful is returned.
sub HB_read_html_file {
    my $self = shift (@_);
    my ($htmlfile, $text) = @_;

    # load in the file (three-arg open: the name is never parsed as a mode)
    my $fh;
    if (!open ($fh, '<', $htmlfile)) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "ERROR - could not open $htmlfile\n";
        return;
    }

    my $foundbody = 0;
    $self->HB_gettext (\$foundbody, $text, $fh);
    close $fh;

    # just in case there was no <body> tag
    if (!$foundbody) {
        $foundbody = 1;
        open ($fh, '<', $htmlfile) or return;
        $self->HB_gettext (\$foundbody, $text, $fh);
        close $fh;
    }
    # text is in utf8
}
129
# Read lines from $handle and append the cleaned text to $$text.
# Converts the text to utf8, as ghtml does that for &eacute; etc.
#
# Arguments:
#   $foundbody - reference to a flag; while it is false, lines are skipped
#                until one contains a <body ...> tag, at which point the flag
#                is set and everything up to and including the tag is dropped
#   $text      - reference to the string being built up
#   $handle    - file handle to read from (a handle or a handle name)
#
# Per line: a warning is printed for any <font face=...> other than arial,
# then </p> tags and body/html/font tags are removed. After the last line
# the whole of $$text is converted to utf-8 (when input_encoding is
# iso_8859_1), passed to ghtml::convertcharentities, and every run of
# whitespace is collapsed to a single space.
sub HB_gettext {
    my $self = shift (@_);
    my ($foundbody, $text, $handle) = @_;
    my $outhandle = $self->{'outhandle'};

    my $line = "";
    while (defined ($line = <$handle>)) {
        # look for body tag
        if (!$$foundbody) {
            if ($line =~ s/^.*<body[^>]*>//i) {
                $$foundbody = 1;
            } else {
                next;
            }
        }

        # check for symbol fonts
        if ($line =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
            my $font = $1;
            print $outhandle "HBPlug::HB_gettext - warning removed font $font\n"
                if ($font !~ /^arial$/i);
        }

        $line =~ s/<\/p>//ig; # remove </p> tags
        $line =~ s/<\/?(body|html|font)\b[^>]*>//ig; # remove any unwanted tags

        $$text .= $line;
    }

    if ($self->{'input_encoding'} eq "iso_8859_1") {
        # convert to utf-8
        $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
    }
    # convert any alphanumeric character entities to their utf-8
    # equivalent for indexing purposes
    &ghtml::convertcharentities ($$text);

    $$text =~ s/\s+/ /g; # remove \n's
}
170
# Tidy the HTML of one section and return the result.
#
# Arguments:
#   $section - the section's HTML text
#
# Steps, in order: drop closing tags that have no earlier opening tag,
# collapse doubled <p> tags, strip trailing opening tags / &nbsp; /
# whitespace, put a blank line before each paragraph, wrap at roughly 80
# characters, and rewrite image references to centred
# <img src="_httpdocimg_/..."> links.
sub HB_clean_section {
    my $self = shift (@_);
    my ($section) = @_;

    # remove tags without a starting tag from the section
    # (stops at the first closing tag that does have an earlier opening tag)
    my ($tag, $tagstart);
    while ($section =~ /<\/([^>]{1,10})>/) {
        $tag = $1;
        $tagstart = index($section, "<$tag");
        last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
        # \Q..\E: the tag is literal text; if it were used as a pattern, a
        # metacharacter in it could stop the removal from matching and the
        # loop would never end
        $section =~ s/<\/\Q$tag\E>//;
    }

    # remove extra paragraph tags
    while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}

    # remove extra stuff at the end of the section
    while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}

    # add a newline at the beginning of each paragraph
    $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;

    # add a newline every 80 characters at a word boundary
    # Note: this regular expression puts a line feed before
    # the last word in each section, even when it is not
    # needed.
    $section =~ s/(.{1,80})\s/$1\n/g;

    # fix up the image links (each replacement starts on a new line)
    $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/\n<center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
    $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/\n<center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;

    return $section;
}
207
208
# Return a quoted, display-friendly version of $text for log messages.
#
# Text shorter than 100 characters is returned whole inside double quotes.
# Longer text is reduced to its first 50 and last 50 characters, each part
# quoted and the two joined by " ... ".
sub shorten {
    my $self = shift (@_);
    my ($text) = @_;

    if (length($text) < 100) {
        return "\"$text\"";
    }

    my $head = substr ($text, 0, 50);
    my $tail = substr ($text, length($text) - 50);
    return "\"" . $head . "\" ... \"" . $tail . "\"";
}
218
# Attach one metadata value to a section of the document.
#
# Arguments:
#   $doc_obj    - document object; must provide add_utf8_metadata
#   $cursection - section the metadata belongs to
#   $field      - metadata field name
#   $value      - metadata value
#
# All text handled by this plugin is expected to be utf-8 by the time it
# gets here, so the value is always stored with add_utf8_metadata regardless
# of the plugin's input_encoding.
sub HB_add_metadata {
    my $self = shift (@_);
    my ($doc_obj, $cursection, $field, $value) = @_;

    $doc_obj->add_utf8_metadata ($cursection, $field, $value);
}
235
# Process one HTML book directory.
#
# Returns the number of files processed (1), or undef if this plugin cannot
# process $file. Note that $base_dir might be "" and that $file might
# include directories.
#
# Arguments:
#   $pluginfo, $maxdocs, $total_count - accepted but not used here
#   $base_dir  - directory that $file is relative to
#   $file      - book directory; its last path component is the job number,
#                and the book's HTML is expected at <file>/<jobnumber>.htm
#   $metadata  - hash ref of field => value (or array ref of values) added to
#                the top section of the document
#   $processor - object whose OIDtype/OIDmetadata entries are read and whose
#                process method receives the finished document
#   $gli       - when true, a <Processing .../> line is written to STDERR
#
# The HTML is split into sections at &lt;&lt;TOCn&gt;&gt; markers, where n is
# the nesting level; the text after a marker up to the next <p is the
# section title.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the html filename and see if this is an HTML Book...
    my $jobnumber = $file;
    if ($file =~ /[\\\/]/) {
        ($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
    }
    return undef unless defined $jobnumber;
    my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
    return undef unless -e $htmlfile;

    print STDERR "<Processing n='$file' p='HBPlug'>\n" if ($gli);
    print $outhandle "HBPlug: processing $file\n";

    # read in the file and do basic html cleaning (removing header etc)
    my $html = "";
    $self->HB_read_html_file ($htmlfile, \$html);
    # html is in utf8

    # create a new document
    my $doc_obj = doc->new($file, "indexed_doc");
    $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});

    # associate the book cover
    # NOTE(review): no existence check is made on the cover file here;
    # presumably associate_file copes with a missing file -- confirm.
    my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg");
    $doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg");
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileFormat", "HB");
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $htmlfile));

    my $cursection = $doc_obj->get_top_section();

    # add metadata for top level of document
    foreach my $field (keys(%$metadata)) {
        # $metadata->{$field} may be an array reference
        if (ref ($metadata->{$field}) eq "ARRAY") {
            foreach my $value (@{$metadata->{$field}}) {
                $self->HB_add_metadata ($doc_obj, $cursection, $field, $value);
            }
        } else {
            $self->HB_add_metadata ($doc_obj, $cursection, $field, $metadata->{$field});
        }
    }

    # process the file one section at a time
    my $curtoclevel = 1;
    my $firstsection = 1;
    while (length ($html) > 0) {
        # consume everything up to and including the next TOC marker and its
        # title; $3 is the level, $4 the title
        if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
            my $toclevel = $3;
            my $title = $4;
            my $sectiontext = "";
            # the section body runs up to the following TOC marker, which is
            # left in $html for the next iteration; with no further marker
            # the remainder of the document is the body
            if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
                $sectiontext = $1;
            } else {
                $sectiontext = $html;
                $html = "";
            }

            # remove tags and extra spaces from the title
            $title =~ s/<\/?[^>]+>//g;
            $title =~ s/^\s+|\s+$//g;

            # close any sections below the current level and
            # create a new section (special case for the firstsection)
            while (($curtoclevel > $toclevel) ||
                   (!$firstsection && $curtoclevel == $toclevel)) {
                $cursection = $doc_obj->get_parent_section ($cursection);
                $curtoclevel--;
            }
            if ($curtoclevel+1 < $toclevel) {
                print $outhandle "WARNING - jump in toc levels in $htmlfile " .
                    "from $curtoclevel to $toclevel\n";
            }
            while ($curtoclevel < $toclevel) {
                $curtoclevel++;
                $cursection =
                    $doc_obj->insert_section($doc_obj->get_end_child($cursection));
            }

            # add the metadata to this section
            $self->HB_add_metadata ($doc_obj, $cursection, "Title", $title);

            # clean up the section html
            $sectiontext = $self->HB_clean_section($sectiontext);

            # associate any files referenced through _httpdocimg_ links
            while ($sectiontext =~ /_httpdocimg_\/([^\"]+)\"/g) {
                my $image = $1;
                $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $image), $image);
            }

            # add the text for this section
            # All read text should now be in utf-8
            $doc_obj->add_utf8_text ($cursection, $sectiontext);
        } else {
            print $outhandle "WARNING - leftover text\n" , $self->shorten($html),
                "\nin $htmlfile\n";
            last;
        }
        $firstsection = 0;
    }

    # add a OID
    $doc_obj->set_OID ();

    # process the document
    $processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm"));

    return 1; # processed the file
}
356
357
358	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: