Context Navigation

source: gsdl/trunk/perllib/plugins/HBPlugin.pm@ 17026

Last change on this file since 17026 was 17026, checked in by kjdon, 16 years ago
OID generation modifications: OIDtype and OIDmetadata options now available for plugins as well as import. OIDtype for plugins defaults to auto - if set to auto, then use the values from import. All plugins now call self->add_OID instead of doc_obj->set_OID. This sets the doc_obj OIDtype so that doesn't need to be donein other places any more. all plugins have the get_oid_hash_type method - normally returns hash_on_file, but can be overridden to return hash_on_ga_xml for those plugins that don't want hashing on file (MP3,OggVorbis...)
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 10.0 KB

Line
1	###########################################################################
2	#
3	# HBPlugin.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# plugin which processes an HTML book directory
27
28	# This plugin is used by the Humanity Library collections and does not handle
29	# input encodings other than ascii or extended ascii
30
31	# this code is kind of ugly and could no doubt be made to run faster, by leaving
32	# it in this state I hope to encourage people to make their collections use
33	# HBSPlug instead ;-)
34
35	# Use HBSPlug if creating a new collection and marking up files like the
36	# Humanity Library collections. HBSPlug accepts all input encodings but
37	# expects the marked up files to be cleaner than those used by the
38	# Humanity Library collections
39
40	package HBPlugin;
41
42	use ghtml;
43	use BasePlugin;
44	use unicode;
45	use util;
46	use doc;
47
48	use strict;
49	no strict 'refs'; # allow filehandles to be variables and viceversa
50
51	sub BEGIN {
52	@HBPlugin::ISA = ('BasePlugin');
53	}
54	my $encoding_list =
55	[ { 'name' => "ascii",
56	'desc' => "{BasePlugin.encoding.ascii}" },
57	{ 'name' => "iso_8859_1",
58	'desc' => "{HBPlugin.encoding.iso_8859_1}" } ];
59
60	my $arguments =
61	[ { 'name' => "process_exp",
62	'desc' => "{BasePlugin.process_exp}",
63	'type' => "regexp",
64	'reqd' => "no",
65	'deft' => &get_default_process_exp() },
66	{ 'name' => "input_encoding",
67	'desc' => "{ReadTextFile.input_encoding}",
68	'type' => "enum",
69	'deft' => "iso_8859_1",
70	'list' => $encoding_list,
71	'reqd' => "no" }
72	];
73
74	my $options = { 'name' => "HBPlugin",
75	'desc' => "{HBPlugin.desc}",
76	'abstract' => "no",
77	'inherits' => "yes",
78	'args' => $arguments };
79
80	sub new {
81	my ($class) = shift (@_);
82	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
83	push(@$pluginlist, $class);
84
85	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
86	push(@{$hashArgOptLists->{"OptList"}},$options);
87
88	my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
89
90	return bless $self, $class;
91	}
92
93	# this is included only to prevent warnings being printed out
94	# from BasePlugin::init. The process_exp is not used by this plugin
95	sub get_default_process_exp {
96	my $self = shift (@_);
97
98	return "This plugin does not use a process_exp\n";
99	}
100
101
102	sub HB_read_html_file {
103	my $self = shift (@_);
104	my ($htmlfile, $text) = @_;
105
106	# load in the file
107	if (!open (FILE, $htmlfile)) {
108	my $outhandle = $self->{'outhandle'};
109	print $outhandle "ERROR - could not open $htmlfile\n";
110	return;
111	}
112
113	my $foundbody = 0;
114	$self->HB_gettext (\$foundbody, $text, "FILE");
115	close FILE;
116
117	# just in case there was no <body> tag
118	if (!$foundbody) {
119	$foundbody = 1;
120	open (FILE, $htmlfile) \|\| return;
121	$self->HB_gettext (\$foundbody, $text, "FILE");
122	close FILE;
123	}
124	# text is in utf8
125	}
126
127	# converts the text to utf8, as ghtml does that for é etc.
128	sub HB_gettext {
129	my $self = shift (@_);
130	my ($foundbody, $text, $handle) = @_;
131	my $outhandle = $self->{'outhandle'};
132
133	my $line = "";
134	while (defined ($line = <$handle>)) {
135	# look for body tag
136	if (!$$foundbody) {
137	if ($line =~ s/^.<body[^>]>//i) {
138	$$foundbody = 1;
139	} else {
140	next;
141	}
142	}
143
144	# check for symbol fonts
145	if ($line =~ /<font [^>]?face\s=\s*\"?(\w+)\"?/i) {
146	my $font = $1;
147	print $outhandle "HBPlugin::HB_gettext - warning removed font $font\n"
148	if ($font !~ /^arial$/i);
149	}
150
151	$line =~ s/<\/p>//ig; # remove </p> tags
152	$line =~ s/<\/?(body\|html\|font)\b[^>]*>//ig; # remove any unwanted tags
153
154	$$text .= $line;
155	}
156	#
157	if ($self->{'input_encoding'} eq "iso_8859_1") {
158	# convert to utf-8
159	$$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
160	}
161	# convert any alphanumeric character entities to their utf-8
162	# equivalent for indexing purposes
163	&ghtml::convertcharentities ($$text);
164
165	$$text =~ s/\s+/ /g; # remove \n's
166	}
167
168	sub HB_clean_section {
169	my $self = shift (@_);
170	my ($section) = @_;
171
172	# remove tags without a starting tag from the section
173	my ($tag, $tagstart);
174	while ($section =~ /<\/([^>]{1,10})>/) {
175	$tag = $1;
176	$tagstart = index($section, "<$tag");
177	last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
178	$section =~ s/<\/$tag>//;
179	}
180
181	# remove extra paragraph tags
182	while ($section =~ s/<p\b[^>]>\s<p\b/<p/ig) {}
183
184	# remove extra stuff at the end of the section
185	while ($section =~ s/(<u>\|<i>\|<b>\|<p\b[^>]*>\| \|\s)$//i) {}
186
187	# add a newline at the beginning of each paragraph
188	$section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
189
190	# add a newline every 80 characters at a word boundary
191	# Note: this regular expression puts a line feed before
192	# the last word in each section, even when it is not
193	# needed.
194	$section =~ s/(.{1,80})\s/$1\n/g;
195
196	# fix up the image links
197	$section =~ s/<img[^>]?src=\"?([^\">]+)\"?[^>]>/
198	<center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
199	$section =~ s/<<I>>\s*([^\.]+\.(png\|jpg\|gif))/
200	<center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
201
202	return $section;
203	}
204
205
206	sub shorten {
207	my $self = shift (@_);
208	my ($text) = @_;
209
210	return "\"$text\"" if (length($text) < 100);
211
212	return "\"" . substr ($text, 0, 50) . "\" ... \"" .
213	substr ($text, length($text)-50) . "\"";
214	}
215
216	# return number of files processed, undef if can't process
217	# Note that $base_dir might be "" and that $file might
218	# include directories
219	sub read {
220	my $self = shift (@_);
221	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
222	my $outhandle = $self->{'outhandle'};
223
224	# get the html filename and see if this is an HTML Book...
225	my $jobnumber = $file;
226	if ($file =~ /[\\\/]/) {
227	($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
228	}
229	return undef unless defined $jobnumber;
230	my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
231	return undef unless -e $htmlfile;
232
233	print STDERR "<Processing n='$file' p='HBPlugin'>\n" if ($gli);
234	print $outhandle "HBPlugin: processing $file\n";
235
236	# read in the file and do basic html cleaning (removing header etc)
237	my $html = "";
238	$self->HB_read_html_file ($htmlfile, \$html);
239	# html is in utf8
240
241	# create a new document
242	my $doc_obj = new doc ($file, "indexed_doc");
243
244	# copy the book cover if it exists
245	my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg");
246	$doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg");
247	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
248	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileFormat", "HB");
249	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $htmlfile));
250
251	my $cursection = $doc_obj->get_top_section();
252
253	# add metadata for top level of document
254	foreach my $field (keys(%$metadata)) {
255	# $metadata->{$field} may be an array reference
256	if (ref ($metadata->{$field}) eq "ARRAY") {
257	map {
258	$doc_obj->add_utf8_metadata($cursection, $field, $_);
259	} @{$metadata->{$field}};
260	} else {
261	$doc_obj->add_utf8_metadata($cursection, $field, $metadata->{$field});
262	}
263	}
264
265	# process the file one section at a time
266	my $curtoclevel = 1;
267	my $firstsection = 1;
268	while (length ($html) > 0) {
269	if ($html =~ s/^.?(?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s))<<TOC(\d+)>>\s(.*?)<p\b/<p/i) {
270	my $toclevel = $3;
271	my $title = $4;
272	my $sectiontext = "";
273	if ($html =~ s/^(.?)((?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s)*)<<TOC\d+>>)/$2/i) {
274	$sectiontext = $1;
275	} else {
276	$sectiontext = $html;
277	$html = "";
278	}
279
280	# remove tags and extra spaces from the title
281	$title =~ s/<\/?[^>]+>//g;
282	$title =~ s/^\s+\|\s+$//g;
283
284	# close any sections below the current level and
285	# create a new section (special case for the firstsection)
286	while (($curtoclevel > $toclevel) \|\|
287	(!$firstsection && $curtoclevel == $toclevel)) {
288	$cursection = $doc_obj->get_parent_section ($cursection);
289	$curtoclevel--;
290	}
291	if ($curtoclevel+1 < $toclevel) {
292	print $outhandle "WARNING - jump in toc levels in $htmlfile " .
293	"from $curtoclevel to $toclevel\n";
294	}
295	while ($curtoclevel < $toclevel) {
296	$curtoclevel++;
297	$cursection =
298	$doc_obj->insert_section($doc_obj->get_end_child($cursection));
299	}
300
301	# add the metadata to this section
302	$doc_obj->add_utf8_metadata($cursection, "Title", $title);
303
304	# clean up the section html
305	$sectiontext = $self->HB_clean_section($sectiontext);
306
307	# associate any files
308	map { $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $1), $1)
309	if /_httpdocimg_\/([^\"]+)\"/; 0; }
310	split (/(_httpdocimg_\/[^\"]+\")/, $sectiontext);
311
312	# add the text for this section
313	$doc_obj->add_utf8_text ($cursection, $sectiontext);
314	} else {
315	print $outhandle "WARNING - leftover text\n" , $self->shorten($html),
316	"\nin $htmlfile\n";
317	last;
318	}
319	$firstsection = 0;
320	}
321
322	# add a OID
323	$self->add_OID($doc_obj);
324
325	# process the document
326	$processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm"));
327
328	return 1; # processed the file
329	}
330
331
332	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: