Context Navigation

source: gsdl/trunk/perllib/plugins/HBPlugin.pm@ 15872

Last change on this file since 15872 was 15872, checked in by kjdon, 16 years ago
plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Alos, I haven't really tested anything yet...
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 10.0 KB

Line
1	###########################################################################
2	#
3	# HBPlugin.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# plugin which processes an HTML book directory
27
28	# This plugin is used by the Humanity Library collections and does not handle
29	# input encodings other than ascii or extended ascii
30
31	# this code is kind of ugly and could no doubt be made to run faster, by leaving
32	# it in this state I hope to encourage people to make their collections use
33	# HBSPlug instead ;-)
34
35	# Use HBSPlug if creating a new collection and marking up files like the
36	# Humanity Library collections. HBSPlug accepts all input encodings but
37	# expects the marked up files to be cleaner than those used by the
38	# Humanity Library collections
39
40	package HBPlugin;
41
42	use ghtml;
43	use BasePlugin;
44	use unicode;
45	use util;
46	use doc;
47
48	use strict;
49	no strict 'refs'; # allow filehandles to be variables and viceversa
50
51	sub BEGIN {
52	@HBPlugin::ISA = ('BasePlugin');
53	}
54	my $encoding_list =
55	[ { 'name' => "ascii",
56	'desc' => "{ReadTextFile.input_encoding.ascii}" },
57	{ 'name' => "iso_8859_1",
58	'desc' => "Latin1 (western languages)" } ];
59
60	my $arguments =
61	[ { 'name' => "process_exp",
62	'desc' => "{BasePlugin.process_exp}",
63	'type' => "regexp",
64	'reqd' => "no",
65	'deft' => &get_default_process_exp() },
66	{ 'name' => "input_encoding",
67	'desc' => "{ReadTextFile.input_encoding}",
68	'type' => "enum",
69	'deft' => "iso_8859_1",
70	'list' => $encoding_list,
71	'reqd' => "no" }
72	];
73
74	my $options = { 'name' => "HBPlugin",
75	'desc' => "{HBPlugin.desc}",
76	'abstract' => "no",
77	'inherits' => "yes",
78	'args' => $arguments };
79
80	sub new {
81	my ($class) = shift (@_);
82	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
83	push(@$pluginlist, $class);
84
85	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
86	push(@{$hashArgOptLists->{"OptList"}},$options);
87
88	my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
89
90	return bless $self, $class;
91	}
92
93	# this is included only to prevent warnings being printed out
94	# from BasePlugin::init. The process_exp is not used by this plugin
95	sub get_default_process_exp {
96	my $self = shift (@_);
97
98	return "This plugin does not use a process_exp\n";
99	}
100
101
102	sub HB_read_html_file {
103	my $self = shift (@_);
104	my ($htmlfile, $text) = @_;
105
106	# load in the file
107	if (!open (FILE, $htmlfile)) {
108	my $outhandle = $self->{'outhandle'};
109	print $outhandle "ERROR - could not open $htmlfile\n";
110	return;
111	}
112
113	my $foundbody = 0;
114	$self->HB_gettext (\$foundbody, $text, "FILE");
115	close FILE;
116
117	# just in case there was no <body> tag
118	if (!$foundbody) {
119	$foundbody = 1;
120	open (FILE, $htmlfile) \|\| return;
121	$self->HB_gettext (\$foundbody, $text, "FILE");
122	close FILE;
123	}
124	# text is in utf8
125	}
126
127	# converts the text to utf8, as ghtml does that for é etc.
128	sub HB_gettext {
129	my $self = shift (@_);
130	my ($foundbody, $text, $handle) = @_;
131	my $outhandle = $self->{'outhandle'};
132
133	my $line = "";
134	while (defined ($line = <$handle>)) {
135	# look for body tag
136	if (!$$foundbody) {
137	if ($line =~ s/^.<body[^>]>//i) {
138	$$foundbody = 1;
139	} else {
140	next;
141	}
142	}
143
144	# check for symbol fonts
145	if ($line =~ /<font [^>]?face\s=\s*\"?(\w+)\"?/i) {
146	my $font = $1;
147	print $outhandle "HBPlugin::HB_gettext - warning removed font $font\n"
148	if ($font !~ /^arial$/i);
149	}
150
151	$line =~ s/<\/p>//ig; # remove </p> tags
152	$line =~ s/<\/?(body\|html\|font)\b[^>]*>//ig; # remove any unwanted tags
153
154	$$text .= $line;
155	}
156	#
157	if ($self->{'input_encoding'} eq "iso_8859_1") {
158	# convert to utf-8
159	$$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
160	}
161	# convert any alphanumeric character entities to their utf-8
162	# equivalent for indexing purposes
163	&ghtml::convertcharentities ($$text);
164
165	$$text =~ s/\s+/ /g; # remove \n's
166	}
167
168	sub HB_clean_section {
169	my $self = shift (@_);
170	my ($section) = @_;
171
172	# remove tags without a starting tag from the section
173	my ($tag, $tagstart);
174	while ($section =~ /<\/([^>]{1,10})>/) {
175	$tag = $1;
176	$tagstart = index($section, "<$tag");
177	last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
178	$section =~ s/<\/$tag>//;
179	}
180
181	# remove extra paragraph tags
182	while ($section =~ s/<p\b[^>]>\s<p\b/<p/ig) {}
183
184	# remove extra stuff at the end of the section
185	while ($section =~ s/(<u>\|<i>\|<b>\|<p\b[^>]*>\| \|\s)$//i) {}
186
187	# add a newline at the beginning of each paragraph
188	$section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
189
190	# add a newline every 80 characters at a word boundary
191	# Note: this regular expression puts a line feed before
192	# the last word in each section, even when it is not
193	# needed.
194	$section =~ s/(.{1,80})\s/$1\n/g;
195
196	# fix up the image links
197	$section =~ s/<img[^>]?src=\"?([^\">]+)\"?[^>]>/
198	<center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
199	$section =~ s/<<I>>\s*([^\.]+\.(png\|jpg\|gif))/
200	<center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
201
202	return $section;
203	}
204
205
206	sub shorten {
207	my $self = shift (@_);
208	my ($text) = @_;
209
210	return "\"$text\"" if (length($text) < 100);
211
212	return "\"" . substr ($text, 0, 50) . "\" ... \"" .
213	substr ($text, length($text)-50) . "\"";
214	}
215
216	# return number of files processed, undef if can't process
217	# Note that $base_dir might be "" and that $file might
218	# include directories
219	sub read {
220	my $self = shift (@_);
221	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
222	my $outhandle = $self->{'outhandle'};
223
224	# get the html filename and see if this is an HTML Book...
225	my $jobnumber = $file;
226	if ($file =~ /[\\\/]/) {
227	($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
228	}
229	return undef unless defined $jobnumber;
230	my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
231	return undef unless -e $htmlfile;
232
233	print STDERR "<Processing n='$file' p='HBPlugin'>\n" if ($gli);
234	print $outhandle "HBPlugin: processing $file\n";
235
236	# read in the file and do basic html cleaning (removing header etc)
237	my $html = "";
238	$self->HB_read_html_file ($htmlfile, \$html);
239	# html is in utf8
240
241	# create a new document
242	my $doc_obj = new doc ($file, "indexed_doc");
243	$doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
244
245	# copy the book cover if it exists
246	my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg");
247	$doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg");
248	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
249	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileFormat", "HB");
250	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $htmlfile));
251
252	my $cursection = $doc_obj->get_top_section();
253
254	# add metadata for top level of document
255	foreach my $field (keys(%$metadata)) {
256	# $metadata->{$field} may be an array reference
257	if (ref ($metadata->{$field}) eq "ARRAY") {
258	map {
259	$doc_obj->add_utf8_metadata($cursection, $field, $_);
260	} @{$metadata->{$field}};
261	} else {
262	$doc_obj->add_utf8_metadata($cursection, $field, $metadata->{$field});
263	}
264	}
265
266	# process the file one section at a time
267	my $curtoclevel = 1;
268	my $firstsection = 1;
269	while (length ($html) > 0) {
270	if ($html =~ s/^.?(?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s))<<TOC(\d+)>>\s(.*?)<p\b/<p/i) {
271	my $toclevel = $3;
272	my $title = $4;
273	my $sectiontext = "";
274	if ($html =~ s/^(.?)((?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s)*)<<TOC\d+>>)/$2/i) {
275	$sectiontext = $1;
276	} else {
277	$sectiontext = $html;
278	$html = "";
279	}
280
281	# remove tags and extra spaces from the title
282	$title =~ s/<\/?[^>]+>//g;
283	$title =~ s/^\s+\|\s+$//g;
284
285	# close any sections below the current level and
286	# create a new section (special case for the firstsection)
287	while (($curtoclevel > $toclevel) \|\|
288	(!$firstsection && $curtoclevel == $toclevel)) {
289	$cursection = $doc_obj->get_parent_section ($cursection);
290	$curtoclevel--;
291	}
292	if ($curtoclevel+1 < $toclevel) {
293	print $outhandle "WARNING - jump in toc levels in $htmlfile " .
294	"from $curtoclevel to $toclevel\n";
295	}
296	while ($curtoclevel < $toclevel) {
297	$curtoclevel++;
298	$cursection =
299	$doc_obj->insert_section($doc_obj->get_end_child($cursection));
300	}
301
302	# add the metadata to this section
303	$doc_obj->add_utf8_metadata($cursection, "Title", $title);
304
305	# clean up the section html
306	$sectiontext = $self->HB_clean_section($sectiontext);
307
308	# associate any files
309	map { $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $1), $1)
310	if /_httpdocimg_\/([^\"]+)\"/; 0; }
311	split (/(_httpdocimg_\/[^\"]+\")/, $sectiontext);
312
313	# add the text for this section
314	$doc_obj->add_utf8_text ($cursection, $sectiontext);
315	} else {
316	print $outhandle "WARNING - leftover text\n" , $self->shorten($html),
317	"\nin $htmlfile\n";
318	last;
319	}
320	$firstsection = 0;
321	}
322
323	# add a OID
324	$doc_obj->set_OID ();
325
326	# process the document
327	$processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm"));
328
329	return 1; # processed the file
330	}
331
332
333	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: