source: trunk/gsdl/perllib/plugins/HBPlug.pm@ 5845

Last change on this file since 5845 was 5680, checked in by mdewsnip, 21 years ago

Moved plugin descriptions into the resource bundle (perllib/strings.rb), for translation by UNESCO.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 10.2 KB
Line 
1###########################################################################
2#
3# HBPlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# plugin which processes an HTML book directory
27
28# This plugin is used by the Humanity Library collections and does not handle
29# input encodings other than ascii or extended ascii
30
31# this code is kind of ugly and could no doubt be made to run faster, by leaving
32# it in this state I hope to encourage people to make their collections use
33# HBSPlug instead ;-)
34
35# Use HBSPlug if creating a new collection and marking up files like the
36# Humanity Library collections. HBSPlug accepts all input encodings but
37# expects the marked up files to be cleaner than those used by the
38# Humanity Library collections
39
40# 12/05/02 Added usage datastructure - John Thompson
41
42package HBPlug;
43
44use ghtml;
45use BasPlug;
46use unicode;
47use util;
48use doc;
49
50
51sub BEGIN {
52 @ISA = ('BasPlug');
53}
54
55my $options = { 'name' => "HBPlug",
56 'desc' => "{HBPlug.desc}",
57 'inherits' => "yes" };
58
59sub new {
60 my ($class) = @_;
61 my $self = new BasPlug ("HBPlug", @_);
62
63 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
64 my $option_list = $self->{'option_list'};
65 push( @{$option_list}, $options );
66
67 return bless $self, $class;
68}
69
70sub init {
71 my $self = shift (@_);
72 my ($verbosity, $outhandle) = @_;
73
74 $self->BasPlug::init($verbosity, $outhandle);
75 $self->{'input_encoding'} = "iso_8859_1";
76
77 # this plugin only handles ascii encodings
78 if ($self->{'input_encoding'} !~ /^(iso_8859_1|ascii)$/) {
79 die "ERROR: HBPlug can handle only iso_8859_1 or ascii encodings.\n" .
80 $self->{'input_encoding'} . " is not an acceptable input_encoding value\n";
81 }
82}
83
84# this is included only to prevent warnings being printed out
85# from BasPlug::init. The process_exp is not used by this plugin
86sub get_default_process_exp {
87 my $self = shift (@_);
88
89 return "This plugin does not use a process_exp\n";
90}
91
92
93sub HB_read_html_file {
94 my $self = shift (@_);
95 my ($htmlfile, $text) = @_;
96
97 # load in the file
98 if (!open (FILE, $htmlfile)) {
99 my $outhandle = $self->{'outhandle'};
100 print $outhandle "ERROR - could not open $htmlfile\n";
101 return;
102 }
103
104 my $foundbody = 0;
105 $self->HB_gettext (\$foundbody, $text, "FILE");
106 close FILE;
107
108 # just in case there was no <body> tag
109 if (!$foundbody) {
110 $foundbody = 1;
111 open (FILE, $htmlfile) || return;
112 $self->HB_gettext (\$foundbody, $text, "FILE");
113 close FILE;
114 }
115 # text is in utf8
116}
117
118# converts the text to utf8, as ghtml does that for &eacute; etc.
119sub HB_gettext {
120 my $self = shift (@_);
121 my ($foundbody, $text, $handle) = @_;
122 my $outhandle = $self->{'outhandle'};
123
124 my $line = "";
125 while (defined ($line = <$handle>)) {
126 # look for body tag
127 if (!$$foundbody) {
128 if ($line =~ s/^.*<body[^>]*>//i) {
129 $$foundbody = 1;
130 } else {
131 next;
132 }
133 }
134
135 # check for symbol fonts
136 if ($line =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
137 my $font = $1;
138 print $outhandle "HBPlug::HB_gettext - warning removed font $font\n"
139 if ($font !~ /^arial$/i);
140 }
141
142 $line =~ s/<\/p>//ig; # remove </p> tags
143 $line =~ s/<\/?(body|html|font)\b[^>]*>//ig; # remove any unwanted tags
144
145 $$text .= $line;
146 }
147 #
148 if ($self->{'input_encoding'} eq "iso_8859_1") {
149 # convert to utf-8
150 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
151 }
152 # convert any alphanumeric character entities to their utf-8
153 # equivalent for indexing purposes
154 &ghtml::convertcharentities ($$text);
155
156 $$text =~ s/\s+/ /g; # remove \n's
157}
158
159sub HB_clean_section {
160 my $self = shift (@_);
161 my ($section) = @_;
162
163 # remove tags without a starting tag from the section
164 my ($tag, $tagstart);
165 while ($section =~ /<\/([^>]{1,10})>/) {
166 $tag = $1;
167 $tagstart = index($section, "<$tag");
168 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
169 $section =~ s/<\/$tag>//;
170 }
171
172 # remove extra paragraph tags
173 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
174
175 # remove extra stuff at the end of the section
176 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
177
178 # add a newline at the beginning of each paragraph
179 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
180
181 # add a newline every 80 characters at a word boundary
182 # Note: this regular expression puts a line feed before
183 # the last word in each section, even when it is not
184 # needed.
185 $section =~ s/(.{1,80})\s/$1\n/g;
186
187 # fix up the image links
188 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
189 <center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
190 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
191 <center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
192
193 return $section;
194}
195
196
197sub shorten {
198 my $self = shift (@_);
199 my ($text) = @_;
200
201 return "\"$text\"" if (length($text) < 100);
202
203 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
204 substr ($text, length($text)-50) . "\"";
205}
206
207# if input_encoding is ascii we can call add_utf8_metadata
208# directly but if it's iso_8859_1 (the default) we need to call
209# add_metadata so that the ascii2utf8 conversion is done first
210# this should speed things up a little if processing an ascii only
211# document with input_encoding set to ascii
212sub HB_add_metadata {
213 my $self = shift (@_);
214 my ($doc_obj, $cursection, $field, $value) = @_;
215
216# All text should now be in utf-8
217# if ($self->{'input_encoding'} eq "ascii") {
218 $doc_obj->add_utf8_metadata ($cursection, $field, $value);
219# } else {
220# $doc_obj->add_metadata ($cursection, $field, $value);
221# }
222}
223
224# return number of files processed, undef if can't process
225# Note that $base_dir might be "" and that $file might
226# include directories
227sub read {
228 my $self = shift (@_);
229 my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
230 my $outhandle = $self->{'outhandle'};
231
232 # get the html filename and see if this is an HTML Book...
233 my $jobnumber = $file;
234 if ($file =~ /[\\\/]/) {
235 ($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
236 }
237 return undef unless defined $jobnumber;
238 my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
239 return undef unless -e $htmlfile;
240
241 print $outhandle "HBPlug: processing $file\n";
242
243 # read in the file and do basic html cleaning (removing header etc)
244 my $html = "";
245 $self->HB_read_html_file ($htmlfile, \$html);
246 # html is in utf8
247
248 # create a new document
249 my $doc_obj = new doc ($file, "indexed_doc");
250 $doc_obj->set_OIDtype ($processor->{'OIDtype'});
251
252 # copy the book cover if it exists
253 my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg");
254 $doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg");
255
256 my $cursection = $doc_obj->get_top_section();
257
258 # add metadata for top level of document
259 foreach my $field (keys(%$metadata)) {
260 # $metadata->{$field} may be an array reference
261 if (ref ($metadata->{$field}) eq "ARRAY") {
262 map {
263 $self->HB_add_metadata ($doc_obj, $cursection, $field, $_);
264 } @{$metadata->{$field}};
265 } else {
266 $self->HB_add_metadata ($doc_obj, $cursection, $field, $metadata->{$field});
267 }
268 }
269
270 # process the file one section at a time
271 my $curtoclevel = 1;
272 my $firstsection = 1;
273 while (length ($html) > 0) {
274 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
275 my $toclevel = $3;
276 my $title = $4;
277 my $sectiontext = "";
278 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
279 $sectiontext = $1;
280 } else {
281 $sectiontext = $html;
282 $html = "";
283 }
284
285 # remove tags and extra spaces from the title
286 $title =~ s/<\/?[^>]+>//g;
287 $title =~ s/^\s+|\s+$//g;
288
289 # close any sections below the current level and
290 # create a new section (special case for the firstsection)
291 while (($curtoclevel > $toclevel) ||
292 (!$firstsection && $curtoclevel == $toclevel)) {
293 $cursection = $doc_obj->get_parent_section ($cursection);
294 $curtoclevel--;
295 }
296 if ($curtoclevel+1 < $toclevel) {
297 print $outhandle "WARNING - jump in toc levels in $htmlfile " .
298 "from $curtoclevel to $toclevel\n";
299 }
300 while ($curtoclevel < $toclevel) {
301 $curtoclevel++;
302 $cursection =
303 $doc_obj->insert_section($doc_obj->get_end_child($cursection));
304 }
305
306 # add the metadata to this section
307 $self->HB_add_metadata ($doc_obj, $cursection, "Title", $title);
308
309 # clean up the section html
310 $sectiontext = $self->HB_clean_section($sectiontext);
311
312 # associate any files
313 map { $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $1), $1)
314 if /_httpdocimg_\/([^\"]+)\"/; 0; }
315 split (/(_httpdocimg_\/[^\"]+\")/, $sectiontext);
316
317 # add the text for this section
318# All read text should now be in utf-8
319# if ($self->{'input_encoding'} eq "ascii") {
320 $doc_obj->add_utf8_text ($cursection, $sectiontext);
321# } else {
322# $doc_obj->add_text ($cursection, $sectiontext);
323# }
324 } else {
325 print $outhandle "WARNING - leftover text\n" , $self->shorten($html),
326 "\nin $htmlfile\n";
327 last;
328 }
329 $firstsection = 0;
330 }
331
332 # add a OID
333 $doc_obj->set_OID ();
334
335 # process the document
336 $processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm"));
337
338 return 1; # processed the file
339}
340
341
3421;
Note: See TracBrowser for help on using the repository browser.