Context Navigation

source: trunk/gsdl/perllib/plugins/HBPlug.pm@ 7243

Last change on this file since 7243 was 6408, checked in by jmt12, 20 years ago
Added two new attributes for script arguments. HiddenGLI controls whether the argument will be visible at all in GLI, while ModeGLI defines the lowest detail mode under which the argument will be visible (only really for import and buildcol). Also ensured that the scripts were reporting their correct default process expressions, and further refined argument types by adding the catagory regexp for any regular expression (which can then be hidden under lower detail modes in GLI)
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 10.6 KB

Line
1	###########################################################################
2	#
3	# HBPlug.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# plugin which processes an HTML book directory
27
28	# This plugin is used by the Humanity Library collections and does not handle
29	# input encodings other than ascii or extended ascii
30
31	# this code is kind of ugly and could no doubt be made to run faster, by leaving
32	# it in this state I hope to encourage people to make their collections use
33	# HBSPlug instead ;-)
34
35	# Use HBSPlug if creating a new collection and marking up files like the
36	# Humanity Library collections. HBSPlug accepts all input encodings but
37	# expects the marked up files to be cleaner than those used by the
38	# Humanity Library collections
39
40	# 12/05/02 Added usage datastructure - John Thompson
41
42	package HBPlug;
43
44	use ghtml;
45	use BasPlug;
46	use unicode;
47	use util;
48	use doc;
49
50
51	sub BEGIN {
52	@ISA = ('BasPlug');
53	}
54
55	my $arguments =
56	[ { 'name' => "process_exp",
57	'desc' => "{BasPlug.process_exp}",
58	'type' => "regexp",
59	'reqd' => "no",
60	'deft' => &get_default_process_exp() }
61	];
62
63	my $options = { 'name' => "HBPlug",
64	'desc' => "{HBPlug.desc}",
65	'abstract' => "no",
66	'inherits' => "yes",
67	'args' => $arguments };
68
69	sub new {
70	my ($class) = @_;
71	my $self = new BasPlug ("HBPlug", @_);
72	$self->{'plugin_type'} = "HBPlug";
73	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
74	my $option_list = $self->{'option_list'};
75	push( @{$option_list}, $options );
76
77	return bless $self, $class;
78	}
79
80	sub init {
81	my $self = shift (@_);
82	my ($verbosity, $outhandle) = @_;
83
84	$self->BasPlug::init($verbosity, $outhandle);
85	$self->{'input_encoding'} = "iso_8859_1";
86
87	# this plugin only handles ascii encodings
88	if ($self->{'input_encoding'} !~ /^(iso_8859_1\|ascii)$/) {
89	die "ERROR: HBPlug can handle only iso_8859_1 or ascii encodings.\n" .
90	$self->{'input_encoding'} . " is not an acceptable input_encoding value\n";
91	}
92	}
93
94	# this is included only to prevent warnings being printed out
95	# from BasPlug::init. The process_exp is not used by this plugin
96	sub get_default_process_exp {
97	my $self = shift (@_);
98
99	return "This plugin does not use a process_exp\n";
100	}
101
102
103	sub HB_read_html_file {
104	my $self = shift (@_);
105	my ($htmlfile, $text) = @_;
106
107	# load in the file
108	if (!open (FILE, $htmlfile)) {
109	my $outhandle = $self->{'outhandle'};
110	print $outhandle "ERROR - could not open $htmlfile\n";
111	return;
112	}
113
114	my $foundbody = 0;
115	$self->HB_gettext (\$foundbody, $text, "FILE");
116	close FILE;
117
118	# just in case there was no <body> tag
119	if (!$foundbody) {
120	$foundbody = 1;
121	open (FILE, $htmlfile) \|\| return;
122	$self->HB_gettext (\$foundbody, $text, "FILE");
123	close FILE;
124	}
125	# text is in utf8
126	}
127
128	# converts the text to utf8, as ghtml does that for é etc.
129	sub HB_gettext {
130	my $self = shift (@_);
131	my ($foundbody, $text, $handle) = @_;
132	my $outhandle = $self->{'outhandle'};
133
134	my $line = "";
135	while (defined ($line = <$handle>)) {
136	# look for body tag
137	if (!$$foundbody) {
138	if ($line =~ s/^.<body[^>]>//i) {
139	$$foundbody = 1;
140	} else {
141	next;
142	}
143	}
144
145	# check for symbol fonts
146	if ($line =~ /<font [^>]?face\s=\s*\"?(\w+)\"?/i) {
147	my $font = $1;
148	print $outhandle "HBPlug::HB_gettext - warning removed font $font\n"
149	if ($font !~ /^arial$/i);
150	}
151
152	$line =~ s/<\/p>//ig; # remove </p> tags
153	$line =~ s/<\/?(body\|html\|font)\b[^>]*>//ig; # remove any unwanted tags
154
155	$$text .= $line;
156	}
157	#
158	if ($self->{'input_encoding'} eq "iso_8859_1") {
159	# convert to utf-8
160	$$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
161	}
162	# convert any alphanumeric character entities to their utf-8
163	# equivalent for indexing purposes
164	&ghtml::convertcharentities ($$text);
165
166	$$text =~ s/\s+/ /g; # remove \n's
167	}
168
169	sub HB_clean_section {
170	my $self = shift (@_);
171	my ($section) = @_;
172
173	# remove tags without a starting tag from the section
174	my ($tag, $tagstart);
175	while ($section =~ /<\/([^>]{1,10})>/) {
176	$tag = $1;
177	$tagstart = index($section, "<$tag");
178	last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
179	$section =~ s/<\/$tag>//;
180	}
181
182	# remove extra paragraph tags
183	while ($section =~ s/<p\b[^>]>\s<p\b/<p/ig) {}
184
185	# remove extra stuff at the end of the section
186	while ($section =~ s/(<u>\|<i>\|<b>\|<p\b[^>]*>\| \|\s)$//i) {}
187
188	# add a newline at the beginning of each paragraph
189	$section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
190
191	# add a newline every 80 characters at a word boundary
192	# Note: this regular expression puts a line feed before
193	# the last word in each section, even when it is not
194	# needed.
195	$section =~ s/(.{1,80})\s/$1\n/g;
196
197	# fix up the image links
198	$section =~ s/<img[^>]?src=\"?([^\">]+)\"?[^>]>/
199	<center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
200	$section =~ s/<<I>>\s*([^\.]+\.(png\|jpg\|gif))/
201	<center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
202
203	return $section;
204	}
205
206
207	sub shorten {
208	my $self = shift (@_);
209	my ($text) = @_;
210
211	return "\"$text\"" if (length($text) < 100);
212
213	return "\"" . substr ($text, 0, 50) . "\" ... \"" .
214	substr ($text, length($text)-50) . "\"";
215	}
216
217	# if input_encoding is ascii we can call add_utf8_metadata
218	# directly but if it's iso_8859_1 (the default) we need to call
219	# add_metadata so that the ascii2utf8 conversion is done first
220	# this should speed things up a little if processing an ascii only
221	# document with input_encoding set to ascii
222	sub HB_add_metadata {
223	my $self = shift (@_);
224	my ($doc_obj, $cursection, $field, $value) = @_;
225
226	# All text should now be in utf-8
227	# if ($self->{'input_encoding'} eq "ascii") {
228	$doc_obj->add_utf8_metadata ($cursection, $field, $value);
229	# } else {
230	# $doc_obj->add_metadata ($cursection, $field, $value);
231	# }
232	}
233
234	# return number of files processed, undef if can't process
235	# Note that $base_dir might be "" and that $file might
236	# include directories
237	sub read {
238	my $self = shift (@_);
239	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $gli) = @_;
240	my $outhandle = $self->{'outhandle'};
241
242	# get the html filename and see if this is an HTML Book...
243	my $jobnumber = $file;
244	if ($file =~ /[\\\/]/) {
245	($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
246	}
247	return undef unless defined $jobnumber;
248	my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
249	return undef unless -e $htmlfile;
250
251	print STDERR "<Processing n='$file' p='HBPlug'>\n" if ($gli);
252	print $outhandle "HBPlug: processing $file\n";
253
254	# read in the file and do basic html cleaning (removing header etc)
255	my $html = "";
256	$self->HB_read_html_file ($htmlfile, \$html);
257	# html is in utf8
258
259	# create a new document
260	my $doc_obj = new doc ($file, "indexed_doc");
261	$doc_obj->set_OIDtype ($processor->{'OIDtype'});
262
263	# copy the book cover if it exists
264	my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg");
265	$doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg");
266	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "$self->{'plugin_type'}", "1");
267
268	my $cursection = $doc_obj->get_top_section();
269
270	# add metadata for top level of document
271	foreach my $field (keys(%$metadata)) {
272	# $metadata->{$field} may be an array reference
273	if (ref ($metadata->{$field}) eq "ARRAY") {
274	map {
275	$self->HB_add_metadata ($doc_obj, $cursection, $field, $_);
276	} @{$metadata->{$field}};
277	} else {
278	$self->HB_add_metadata ($doc_obj, $cursection, $field, $metadata->{$field});
279	}
280	}
281
282	# process the file one section at a time
283	my $curtoclevel = 1;
284	my $firstsection = 1;
285	while (length ($html) > 0) {
286	if ($html =~ s/^.?(?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s))<<TOC(\d+)>>\s(.*?)<p\b/<p/i) {
287	my $toclevel = $3;
288	my $title = $4;
289	my $sectiontext = "";
290	if ($html =~ s/^(.?)((?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s)*)<<TOC\d+>>)/$2/i) {
291	$sectiontext = $1;
292	} else {
293	$sectiontext = $html;
294	$html = "";
295	}
296
297	# remove tags and extra spaces from the title
298	$title =~ s/<\/?[^>]+>//g;
299	$title =~ s/^\s+\|\s+$//g;
300
301	# close any sections below the current level and
302	# create a new section (special case for the firstsection)
303	while (($curtoclevel > $toclevel) \|\|
304	(!$firstsection && $curtoclevel == $toclevel)) {
305	$cursection = $doc_obj->get_parent_section ($cursection);
306	$curtoclevel--;
307	}
308	if ($curtoclevel+1 < $toclevel) {
309	print $outhandle "WARNING - jump in toc levels in $htmlfile " .
310	"from $curtoclevel to $toclevel\n";
311	}
312	while ($curtoclevel < $toclevel) {
313	$curtoclevel++;
314	$cursection =
315	$doc_obj->insert_section($doc_obj->get_end_child($cursection));
316	}
317
318	# add the metadata to this section
319	$self->HB_add_metadata ($doc_obj, $cursection, "Title", $title);
320
321	# clean up the section html
322	$sectiontext = $self->HB_clean_section($sectiontext);
323
324	# associate any files
325	map { $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $1), $1)
326	if /_httpdocimg_\/([^\"]+)\"/; 0; }
327	split (/(_httpdocimg_\/[^\"]+\")/, $sectiontext);
328
329	# add the text for this section
330	# All read text should now be in utf-8
331	# if ($self->{'input_encoding'} eq "ascii") {
332	$doc_obj->add_utf8_text ($cursection, $sectiontext);
333	# } else {
334	# $doc_obj->add_text ($cursection, $sectiontext);
335	# }
336	} else {
337	print $outhandle "WARNING - leftover text\n" , $self->shorten($html),
338	"\nin $htmlfile\n";
339	last;
340	}
341	$firstsection = 0;
342	}
343
344	# add a OID
345	$doc_obj->set_OID ();
346
347	# process the document
348	$processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm"));
349
350	return 1; # processed the file
351	}
352
353
354	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: