Context Navigation

source: trunk/gsdl/perllib/plugins/HBPlug.pm@ 10254

Last change on this file since 10254 was 10254, checked in by kjdon, 19 years ago
added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 11.0 KB

Line
1	###########################################################################
2	#
3	# HBPlug.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# plugin which processes an HTML book directory
27
28	# This plugin is used by the Humanity Library collections and does not handle
29	# input encodings other than ascii or extended ascii
30
31	# this code is kind of ugly and could no doubt be made to run faster, by leaving
32	# it in this state I hope to encourage people to make their collections use
33	# HBSPlug instead ;-)
34
35	# Use HBSPlug if creating a new collection and marking up files like the
36	# Humanity Library collections. HBSPlug accepts all input encodings but
37	# expects the marked up files to be cleaner than those used by the
38	# Humanity Library collections
39
40	package HBPlug;
41
42	use ghtml;
43	use BasPlug;
44	use unicode;
45	use util;
46	use doc;
47
48	use strict;
49	no strict 'refs'; # allow filehandles to be variables and viceversa
50
# Establish the inheritance chain at compile time: HBPlug derives
# from BasPlug.
BEGIN {
    @HBPlug::ISA = qw(BasPlug);
}
54
# Argument descriptions contributed by this plugin. The only entry is
# process_exp, whose default comes from get_default_process_exp().
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasPlug.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
];

# Option record describing the plugin itself; 'args' points at the
# argument list above.
my $options = {
    'name'     => 'HBPlug',
    'desc'     => '{HBPlug.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
68
# Constructor.
#
# Parameters:
#   $class           - class name the new object is blessed into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through untouched to BasPlug's constructor
#   $hashArgOptLists - hash ref; this plugin's argument descriptions are
#                      appended to its "ArgList" entry and its option
#                      record to its "OptList" entry
#
# Returns the BasPlug-constructed object re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) {
        push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push(@{$hashArgOptLists->{"OptList"}}, $options);
    }

    # The pushes above autovivify $hashArgOptLists when the caller did
    # not supply one, so it is always defined here and the three
    # argument form is the only one that can ever be needed.
    # Class->new is used instead of the indirect-object form
    # "new BasPlug(...)", whose parsing depends on which names happen
    # to be known at compile time.
    my $self = BasPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
81
# Initialise the plugin.
#
# Parameters:
#   $verbosity, $outhandle - forwarded unchanged to BasPlug::init
#
# After the base class has been initialised the input encoding is
# forced to "iso_8859_1". Dies if the resulting encoding is neither
# iso_8859_1 nor ascii.
sub init {
    my $self = shift (@_);
    my ($verbosity, $outhandle) = @_;

    $self->BasPlug::init($verbosity, $outhandle);
    $self->{'input_encoding'} = "iso_8859_1";

    # this plugin only handles ascii encodings.
    # The alternation must use an unescaped "|": with "\|" the pattern
    # only matches the literal string "iso_8859_1|ascii", which rejects
    # the value assigned just above and makes init() always die.
    if ($self->{'input_encoding'} !~ /^(?:iso_8859_1|ascii)$/) {
        die "ERROR: HBPlug can handle only iso_8859_1 or ascii encodings.\n" .
            $self->{'input_encoding'} . " is not an acceptable input_encoding value\n";
    }
}
95
# HBPlug never consults a process_exp. This stub exists only so that
# BasPlug::init has a default value and stays quiet; it hands back a
# fixed explanatory string.
sub get_default_process_exp {
    my ($self) = @_;

    my $placeholder = "This plugin does not use a process_exp\n";
    return $placeholder;
}
103
104
# Read an HTML book file and append its cleaned body text to $$text.
#
# Parameters:
#   $htmlfile - path of the file to read
#   $text     - scalar ref that HB_gettext appends the text to
#
# The file is first read looking for a <body> tag. If none is found it
# is read a second time with the "body found" flag already set, so the
# whole file is treated as body text.
#
# If the file cannot be opened an error is printed to the plugin's
# outhandle and the sub returns without touching $$text further.
sub HB_read_html_file {
    my $self = shift (@_);
    my ($htmlfile, $text) = @_;

    my $foundbody = 0;

    # pass 1 searches for <body>; pass 2 only runs when pass 1 found none
    foreach my $pass (1, 2) {
        # three-argument open with a lexical handle: the file name can
        # no longer be interpreted as an open mode or pipe, and no
        # global FILE handle is shared with other code
        my $fh;
        if (!open ($fh, '<', $htmlfile)) {
            my $outhandle = $self->{'outhandle'};
            print $outhandle "ERROR - could not open $htmlfile\n";
            return;
        }

        $self->HB_gettext (\$foundbody, $text, $fh);
        close $fh;

        last if $foundbody;

        # just in case there was no <body> tag
        $foundbody = 1;
    }
    # text is in utf8
}
129
# Read lines from $handle, strip unwanted markup and append the result
# to $$text.
#
# Parameters:
#   $foundbody - scalar ref to a flag. While it is false, lines are
#                skipped until one contains a <body ...> tag; everything
#                up to and including that tag is dropped and the flag is
#                set.
#   $text      - scalar ref the cleaned text is appended to
#   $handle    - file handle to read from
#
# For every kept line: a warning is printed to the outhandle when a
# <font face=...> names a font other than arial, </p> tags are removed,
# and <body>, <html> and <font> tags (opening or closing) are removed.
#
# After all lines are read, $$text is converted from iso_8859_1 to
# utf-8 when that is the configured input encoding, character entities
# are handed to ghtml::convertcharentities, and every run of whitespace
# is collapsed to a single space.
sub HB_gettext {
    my $self = shift (@_);
    my ($foundbody, $text, $handle) = @_;
    my $outhandle = $self->{'outhandle'};

    while (defined (my $line = <$handle>)) {
        # look for body tag. The quantifiers matter: without them the
        # pattern demands exactly one character before "<body" and
        # exactly one inside the tag, so a plain "<body>" or a tag with
        # attributes is never recognised.
        if (!$$foundbody) {
            next unless $line =~ s/^.*<body[^>]*>//i;
            $$foundbody = 1;
        }

        # check for symbol fonts
        if ($line =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
            my $font = $1;
            print $outhandle "HBPlug::HB_gettext - warning removed font $font\n"
                if ($font !~ /^arial$/i);
        }

        $line =~ s/<\/p>//ig;   # remove </p> tags
        # remove any unwanted tags (real alternation, not literal pipes)
        $line =~ s/<\/?(?:body|html|font)\b[^>]*>//ig;

        $$text .= $line;
    }

    if ($self->{'input_encoding'} eq "iso_8859_1") {
        # convert to utf-8
        $$text = unicode::unicode2utf8(unicode::convert2unicode("iso_8859_1", $text));
    }

    # convert any alphanumeric character entities to their utf-8
    # equivalent for indexing purposes
    ghtml::convertcharentities ($$text);

    $$text =~ s/\s+/ /g; # remove \n's
}
170
# Tidy the HTML of one section and return the cleaned copy.
#
# Parameters:
#   $section - section HTML (a copy is modified; the caller's value is
#              left alone)
#
# Steps, in order:
#   1. drop leading closing tags that have no opening tag before them
#   2. collapse an empty paragraph followed by another <p into one <p
#   3. strip trailing <u>, <i>, <b>, <p ...>, "&nbsp;" and whitespace
#   4. put a blank line in front of every paragraph
#   5. wrap at whitespace so lines are at most 80 characters
#   6. rewrite <img ...> tags and <<I>> image markers into centred
#      images whose src starts with _httpdocimg_/
sub HB_clean_section {
    my $self = shift (@_);
    my ($section) = @_;

    # remove tags without a starting tag from the section
    while ($section =~ /<\/([^>]{1,10})>/) {
        my $tag = $1;
        my $tagstart = index($section, "<$tag");
        last if (($tagstart >= 0) && ($tagstart < index($section, "</$tag")));
        # \Q...\E: the tag text comes from the document; left unquoted,
        # a metacharacter in it either kills the regex compile or stops
        # the substitution from matching, which makes this loop spin
        # forever
        $section =~ s/<\/\Q$tag\E>//;
    }

    # remove extra paragraph tags
    while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}

    # remove extra stuff at the end of the section
    # NOTE(review): the alternative spelled "&nbsp;" was a bare space in
    # the copy this was restored from (already covered by \s); it looks
    # like a decoded entity - confirm against the repository.
    while ($section =~ s/(?:<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}

    # add a newline at the beginning of each paragraph
    $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;

    # add a newline every 80 characters at a word boundary
    # Note: this regular expression puts a line feed before
    # the last word in each section, even when it is not
    # needed.
    $section =~ s/(.{1,80})\s/$1\n/g;

    # fix up the image links
    $section =~ s{<img[^>]*?src="?([^">]+)"?[^>]*>}
                 {\n<center><img src="_httpdocimg_/$1"></center><br>}ig;

    # image markers: accepted both as literal <<I>> and with the angle
    # brackets entity-encoded.
    # NOTE(review): confirm which spelling the collections really use.
    $section =~ s{(?:&lt;&lt;|<<)I(?:&gt;&gt;|>>)\s*([^\.]+\.(?:png|jpg|gif))}
                 {\n<center><img src="_httpdocimg_/$1"></center><br>}ig;

    return $section;
}
207
208
# Produce a quoted, display-friendly version of $text. Text shorter
# than 100 characters is quoted whole; anything longer is reduced to
# its first 50 and last 50 characters, each quoted, joined by " ... ".
sub shorten {
    my ($self, $text) = @_;

    my $len = length($text);
    if ($len >= 100) {
        my $head = substr($text, 0, 50);
        my $tail = substr($text, $len - 50);
        return '"' . $head . '" ... "' . $tail . '"';
    }

    return '"' . $text . '"';
}
218
# Attach one metadata value to a section of the document.
#
# Parameters:
#   $doc_obj    - document object receiving the metadata
#   $cursection - section the metadata belongs to
#   $field      - metadata field name
#   $value      - metadata value
#
# The value is always stored through add_utf8_metadata, i.e. without
# any further encoding conversion, whatever input_encoding is set to.
# Returns whatever add_utf8_metadata returns.
sub HB_add_metadata {
    my ($self, $doc_obj, $cursection, $field, $value) = @_;

    return $doc_obj->add_utf8_metadata ($cursection, $field, $value);
}
235
# Process one HTML book directory.
#
# return number of files processed, undef if can't process
# Note that $base_dir might be "" and that $file might
# include directories
#
# Parameters:
#   $pluginfo    - not used by this sub
#   $base_dir    - directory that $file is relative to
#   $file        - the book directory; its last path component is the
#                  "job number", and the book's HTML is expected at
#                  <base_dir>/<file>/<jobnumber>.htm
#   $metadata    - hash ref of field => value (or array ref of values)
#                  added to the top section of the document
#   $processor   - object whose OIDtype entry is copied to the document
#                  and whose process() method receives the finished
#                  document
#   $maxdocs, $total_count - not used by this sub
#   $gli         - when true, a <Processing> line is printed to STDERR
#
# Returns undef when the job number cannot be derived or the .htm file
# does not exist, 1 once the document has been handed to the processor.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the html filename and see if this is an HTML Book...
    my $jobnumber = $file;
    if ($file =~ /[\\\/]/) {
        ($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
    }
    return undef unless defined $jobnumber;
    my $htmlfile = util::filename_cat($base_dir, $file, "$jobnumber.htm");
    return undef unless -e $htmlfile;

    print STDERR "<Processing n='$file' p='HBPlug'>\n" if ($gli);
    print $outhandle "HBPlug: processing $file\n";

    # read in the file and do basic html cleaning (removing header etc)
    my $html = "";
    $self->HB_read_html_file ($htmlfile, \$html);
    # html is in utf8

    # create a new document (Class->new rather than indirect-object
    # syntax, which is parsed by guesswork)
    my $doc_obj = doc->new($file, "indexed_doc");
    $doc_obj->set_OIDtype ($processor->{'OIDtype'});

    # copy the book cover if it exists
    my $bookcover = util::filename_cat($base_dir, $file, "$jobnumber.jpg");
    if (-e $bookcover) {
        $doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg");
    }
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileFormat", "HB");
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $htmlfile));

    my $cursection = $doc_obj->get_top_section();

    # add metadata for top level of document
    foreach my $field (keys(%$metadata)) {
        # $metadata->{$field} may be an array reference
        my @values = (ref ($metadata->{$field}) eq "ARRAY")
                   ? @{$metadata->{$field}}
                   : ($metadata->{$field});
        foreach my $value (@values) {
            $self->HB_add_metadata ($doc_obj, $cursection, $field, $value);
        }
    }

    # Building blocks of the section-marker patterns below.
    # A marker looks like <<TOCn>>; it is accepted both literally and
    # with the angle brackets entity-encoded.
    # NOTE(review): confirm which spelling the collections really use.
    my $marker_open  = qr/(?:&lt;&lt;|<<)/;
    my $marker_close = qr/(?:&gt;&gt;|>>)/;
    # optional paragraph tag plus any bold/italic/underline tags or
    # whitespace that may sit directly in front of a marker
    my $marker_lead  = qr/(?:<p\b[^>]*>)?(?:<b>|<i>|<u>|\s)*/i;

    # process the file one section at a time
    my $curtoclevel = 1;
    my $firstsection = 1;
    while (length ($html) > 0) {
        # consume everything up to and including the next marker and its
        # title; the title runs up to the following <p
        if ($html =~ s/^.*?${marker_lead}${marker_open}TOC(\d+)${marker_close}\s*(.*?)<p\b/<p/i) {
            my $toclevel = $1;
            my $title = $2;
            my $sectiontext = "";
            # the section text is whatever precedes the next marker; the
            # marker itself is left in $html for the next iteration
            if ($html =~ s/^(.*?)(${marker_lead}${marker_open}TOC\d+${marker_close})/$2/i) {
                $sectiontext = $1;
            } else {
                $sectiontext = $html;
                $html = "";
            }

            # remove tags and extra spaces from the title
            $title =~ s/<\/?[^>]+>//g;
            $title =~ s/^\s+|\s+$//g;

            # close any sections below the current level and
            # create a new section (special case for the firstsection)
            while (($curtoclevel > $toclevel) ||
                   (!$firstsection && $curtoclevel == $toclevel)) {
                $cursection = $doc_obj->get_parent_section ($cursection);
                $curtoclevel--;
            }
            if ($curtoclevel+1 < $toclevel) {
                print $outhandle "WARNING - jump in toc levels in $htmlfile " .
                    "from $curtoclevel to $toclevel\n";
            }
            while ($curtoclevel < $toclevel) {
                $curtoclevel++;
                $cursection =
                    $doc_obj->insert_section($doc_obj->get_end_child($cursection));
            }

            # add the metadata to this section
            $self->HB_add_metadata ($doc_obj, $cursection, "Title", $title);

            # clean up the section html
            $sectiontext = $self->HB_clean_section($sectiontext);

            # associate any files: every _httpdocimg_/<name>" occurrence
            # names an image relative to the book directory. The capture
            # is copied before calling out so the callee cannot clobber it.
            while ($sectiontext =~ /_httpdocimg_\/([^\"]+)\"/g) {
                my $image = $1;
                $doc_obj->associate_file(util::filename_cat ($base_dir, $file, $image), $image);
            }

            # add the text for this section
            # All read text should now be in utf-8
            $doc_obj->add_utf8_text ($cursection, $sectiontext);
        } else {
            print $outhandle "WARNING - leftover text\n" , $self->shorten($html),
                "\nin $htmlfile\n";
            last;
        }
        $firstsection = 0;
    }

    # add a OID
    $doc_obj->set_OID ();

    # process the document
    $processor->process($doc_obj, util::filename_cat($file, "$jobnumber.htm"));

    return 1; # processed the file
}
356
357
358	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: