source: gsdl/trunk/perllib/plugins/HBPlug.pm@ 14661

Last change on this file since 14661 was 12270, checked in by kjdon, 18 years ago

set_OIDtype now takes two arguments, the type and the metadata (used if type=assigned)

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 10.9 KB
Line 
1###########################################################################
2#
3# HBPlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# plugin which processes an HTML book directory
27
28# This plugin is used by the Humanity Library collections and does not handle
29# input encodings other than ascii or extended ascii
30
# This code is kind of ugly and could no doubt be made to run faster. By leaving
# it in this state I hope to encourage people to make their collections use
# HBSPlug instead ;-)
34
35# Use HBSPlug if creating a new collection and marking up files like the
36# Humanity Library collections. HBSPlug accepts all input encodings but
37# expects the marked up files to be cleaner than those used by the
38# Humanity Library collections
39
40package HBPlug;
41
42use ghtml;
43use BasPlug;
44use unicode;
45use util;
46use doc;
47
48use strict;
49no strict 'refs'; # allow filehandles to be variables and viceversa
50
# Set up inheritance at compile time: HBPlug derives from BasPlug.
BEGIN {
    @HBPlug::ISA = qw(BasPlug);
}
54
# Argument descriptions contributed by this plugin.  Only process_exp is
# declared here; its default comes from get_default_process_exp().
# NOTE(review): the "{...}" desc values look like keys into a strings
# resource rather than literal descriptions -- confirm against BasPlug.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
];

# Option block describing the plugin itself; it carries the argument list
# defined above under 'args'.
my $options = {
    'name'     => "HBPlug",
    'desc'     => "{HBPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
68
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to the BasPlug constructor
#   $hashArgOptLists - hash ref; this plugin's argument descriptions are
#                      appended to its "ArgList" entry and its option block
#                      to its "OptList" entry
#
# Returns the object built by BasPlug's constructor, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) {
        push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push(@{$hashArgOptLists->{"OptList"}}, $options);
    }

    # Use an explicit method call rather than the indirect-object form
    # "new BasPlug(...)": this package defines its own sub new, which is
    # exactly the situation where the indirect form is ambiguous to parse.
    my $self = BasPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
81
# Initialises the plugin: runs BasPlug's init and then forces the input
# encoding to iso_8859_1.
#
#   $verbosity, $outhandle - handed straight to BasPlug::init
#
# Dies if the resulting input_encoding is neither iso_8859_1 nor ascii.
# NOTE(review): because the encoding is assigned immediately before the
# check, the check cannot currently fail -- confirm whether a value set
# elsewhere was meant to be honoured instead.
sub init {
    my ($self, $verbosity, $outhandle) = @_;

    $self->BasPlug::init($verbosity, $outhandle);
    $self->{'input_encoding'} = "iso_8859_1";

    # this plugin only handles ascii encodings
    my $encoding = $self->{'input_encoding'};
    if ($encoding !~ /^(iso_8859_1|ascii)$/) {
        die join('',
                 "ERROR: HBPlug can handle only iso_8859_1 or ascii encodings.\n",
                 $encoding,
                 " is not an acceptable input_encoding value\n");
    }
}
95
# Supplies a placeholder default for the process_exp argument.  It exists
# only to prevent warnings being printed from BasPlug::init; this plugin
# does not use a process_exp.
sub get_default_process_exp {
    return "This plugin does not use a process_exp\n";
}
103
104
# Reads $htmlfile and appends its cleaned-up contents to the string that
# $text refers to.
#
#   $htmlfile - path of the HTML file to load
#   $text     - scalar ref that receives the text (HB_gettext leaves it
#               in utf8)
#
# The file is first read looking for a <body> tag; if none is found it is
# read a second time with every line treated as body text.  If the file
# cannot be opened an error is printed to the plugin's outhandle and the
# sub returns without touching $$text.
sub HB_read_html_file {
    my $self = shift (@_);
    my ($htmlfile, $text) = @_;

    # load in the file.  Three-argument open with a lexical handle: the
    # filename is never interpreted as a mode or pipe specification, and
    # no package-wide bareword handle is clobbered.
    my $fh;
    if (!open ($fh, '<', $htmlfile)) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "ERROR - could not open $htmlfile\n";
        return;
    }

    my $foundbody = 0;
    $self->HB_gettext (\$foundbody, $text, $fh);
    close $fh;

    # just in case there was no <body> tag
    if (!$foundbody) {
        $foundbody = 1;
        open ($fh, '<', $htmlfile) || return;
        $self->HB_gettext (\$foundbody, $text, $fh);
        close $fh;
    }
    # text is in utf8
}
129
# Reads every line from $handle, strips unwanted markup and appends the
# result to $$text.
#
#   $foundbody - scalar ref used as a flag; while it is false, lines are
#                skipped until one contains a <body ...> tag, at which point
#                the flag is set and everything up to and including that tag
#                is dropped from the line
#   $text      - scalar ref the cleaned text is appended to
#   $handle    - filehandle (or filehandle name) to read from
#
# After reading, the whole of $$text is converted to utf8 when the input
# encoding is iso_8859_1 (ghtml does that for &eacute; etc.), character
# entities are converted, and all runs of whitespace are collapsed to a
# single space.
sub HB_gettext {
    my ($self, $foundbody, $text, $handle) = @_;
    my $outhandle = $self->{'outhandle'};

    while (defined (my $row = <$handle>)) {
        # look for body tag
        unless ($$foundbody) {
            next unless $row =~ s/^.*<body[^>]*>//i;
            $$foundbody = 1;
        }

        # check for symbol fonts
        if ($row =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
            my $face = $1;
            if ($face !~ /^arial$/i) {
                print $outhandle "HBPlug::HB_gettext - warning removed font $face\n";
            }
        }

        $row =~ s/<\/p>//ig;                         # remove </p> tags
        $row =~ s/<\/?(body|html|font)\b[^>]*>//ig;  # remove any unwanted tags

        $$text .= $row;
    }

    # convert to utf-8
    if ($self->{'input_encoding'} eq "iso_8859_1") {
        my $as_unicode = &unicode::convert2unicode("iso_8859_1", $text);
        $$text = &unicode::unicode2utf8($as_unicode);
    }

    # convert any alphanumeric character entities to their utf-8
    # equivalent for indexing purposes
    &ghtml::convertcharentities ($$text);

    $$text =~ s/\s+/ /g; # remove \n's
}
170
# Tidies the HTML of a single section and returns the cleaned string.
#
#   $section - the section's HTML text
#
# Steps, in order: drop leading closing tags that have no opening tag
# before them, collapse repeated <p> tags, trim trailing markup/whitespace,
# put a blank line before each paragraph, wrap lines at roughly 80
# characters, and rewrite image references to point at _httpdocimg_.
sub HB_clean_section {
    my $self = shift (@_);
    my ($section) = @_;

    # remove tags without a starting tag from the section
    while ($section =~ /<\/([^>]{1,10})>/) {
        my $tag = $1;
        my $tagstart = index($section, "<$tag");
        last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
        # \Q...\E: the tag text is arbitrary document content.  Unquoted,
        # a tag containing regex metacharacters could fail to match itself
        # (leaving this loop spinning forever) or be an invalid pattern.
        $section =~ s/<\/\Q$tag\E>//;
    }

    # remove extra paragraph tags
    while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}

    # remove extra stuff at the end of the section
    while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}

    # add a newline at the beginning of each paragraph
    $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;

    # add a newline every 80 characters at a word boundary
    # Note: this regular expression puts a line feed before
    # the last word in each section, even when it is not
    # needed.
    $section =~ s/(.{1,80})\s/$1\n/g;

    # fix up the image links (the replacement deliberately starts with a
    # line break followed by a space, as it always has)
    $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/\n <center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
    $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/\n <center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;

    return $section;
}
207
208
# Returns $text wrapped in double quotes for use in log messages.  Text of
# 100 characters or more is abbreviated to its first 50 and last 50
# characters, each quoted, joined by " ... ".
sub shorten {
    my ($self, $text) = @_;

    my $len = length($text);
    if ($len >= 100) {
        my $head = substr ($text, 0, 50);
        my $tail = substr ($text, $len - 50);
        return join('', "\"", $head, "\" ... \"", $tail, "\"");
    }

    return "\"" . $text . "\"";
}
218
# Adds one metadata value to a section of the document.
#
#   $doc_obj    - document object the metadata is added to
#   $cursection - section the metadata belongs to
#   $field      - metadata field name
#   $value      - metadata value
#
# All text should already be in utf-8 by the time this is called, so the
# value is always handed to add_utf8_metadata, whatever the
# input_encoding is.
sub HB_add_metadata {
    my ($self, $doc_obj, $cursection, $field, $value) = @_;

    return $doc_obj->add_utf8_metadata ($cursection, $field, $value);
}
235
# Processes one HTML book directory.
#
#   $pluginfo    - not used by this sub
#   $base_dir    - base directory; might be ""
#   $file        - path of the book directory relative to $base_dir; might
#                  include directories
#   $metadata    - hash ref of metadata for the top level of the document;
#                  a value may be a single value or an array ref of values
#   $processor   - object whose process() method receives the finished
#                  document; its OIDtype and OIDmetadata entries are read
#   $maxdocs     - not used by this sub
#   $total_count - not used by this sub
#   $gli         - when true, a <Processing ...> line is printed to STDERR
#
# Returns the number of files processed (1), or undef if this plugin can't
# process $file, i.e. when <file>/<last component of file>.htm does not
# exist.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the html filename and see if this is an HTML Book...
    my $jobnumber = $file;
    if ($file =~ /[\\\/]/) {
        ($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
    }
    return undef unless defined $jobnumber;
    my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
    return undef unless -e $htmlfile;

    print STDERR "<Processing n='$file' p='HBPlug'>\n" if ($gli);
    print $outhandle "HBPlug: processing $file\n";

    # read in the file and do basic html cleaning (removing header etc)
    my $html = "";
    $self->HB_read_html_file ($htmlfile, \$html);
    # html is in utf8

    # create a new document
    my $doc_obj = doc->new($file, "indexed_doc");
    $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});

    # copy the book cover if it exists
    my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg");
    if (-e $bookcover) {
        $doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg");
    }
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileFormat", "HB");
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $htmlfile));

    my $cursection = $doc_obj->get_top_section();

    # add metadata for top level of document
    foreach my $field (keys(%$metadata)) {
        # $metadata->{$field} may be an array reference
        my $entry = $metadata->{$field};
        my @values = (ref ($entry) eq "ARRAY") ? @$entry : ($entry);
        foreach my $value (@values) {
            $self->HB_add_metadata ($doc_obj, $cursection, $field, $value);
        }
    }

    # process the file one section at a time
    my $curtoclevel = 1;
    my $firstsection = 1;
    while (length ($html) > 0) {
        # strip everything up to and including the next <<TOCn>> marker and
        # its title, leaving $html starting at the following <p
        if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
            # copy the captures now: the next substitution overwrites them
            my $toclevel = $3;
            my $title = $4;
            my $sectiontext = "";
            # the section body runs up to the next TOC marker, or to the
            # end of the text if there is none
            if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
                $sectiontext = $1;
            } else {
                $sectiontext = $html;
                $html = "";
            }

            # remove tags and extra spaces from the title
            $title =~ s/<\/?[^>]+>//g;
            $title =~ s/^\s+|\s+$//g;

            # close any sections below the current level and
            # create a new section (special case for the firstsection)
            while (($curtoclevel > $toclevel) ||
                   (!$firstsection && $curtoclevel == $toclevel)) {
                $cursection = $doc_obj->get_parent_section ($cursection);
                $curtoclevel--;
            }
            if ($curtoclevel+1 < $toclevel) {
                print $outhandle "WARNING - jump in toc levels in $htmlfile " .
                    "from $curtoclevel to $toclevel\n";
            }
            while ($curtoclevel < $toclevel) {
                $curtoclevel++;
                $cursection =
                    $doc_obj->insert_section($doc_obj->get_end_child($cursection));
            }

            # add the metadata to this section
            $self->HB_add_metadata ($doc_obj, $cursection, "Title", $title);

            # clean up the section html
            $sectiontext = $self->HB_clean_section($sectiontext);

            # associate any files: one call per _httpdocimg_ reference
            # found in the cleaned section text
            while ($sectiontext =~ /_httpdocimg_\/([^\"]+)\"/g) {
                my $image = $1;
                $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $image), $image);
            }

            # add the text for this section
            # All read text should now be in utf-8
            $doc_obj->add_utf8_text ($cursection, $sectiontext);
        } else {
            print $outhandle "WARNING - leftover text\n" , $self->shorten($html),
                "\nin $htmlfile\n";
            last;
        }
        $firstsection = 0;
    }

    # add a OID
    $doc_obj->set_OID ();

    # process the document
    $processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm"));

    return 1; # processed the file
}
356
357
3581;
Note: See TracBrowser for help on using the repository browser.