source: trunk/gsdl/perllib/plugins/HBPlug.pm@ 10254

Last change on this file since 10254 was 10254, checked in by kjdon, 19 years ago

added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 11.0 KB
Line 
1###########################################################################
2#
3# HBPlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# plugin which processes an HTML book directory
27
28# This plugin is used by the Humanity Library collections and does not handle
29# input encodings other than ascii or extended ascii
30
31# this code is kind of ugly and could no doubt be made to run faster, by leaving
32# it in this state I hope to encourage people to make their collections use
33# HBSPlug instead ;-)
34
35# Use HBSPlug if creating a new collection and marking up files like the
36# Humanity Library collections. HBSPlug accepts all input encodings but
37# expects the marked up files to be cleaner than those used by the
38# Humanity Library collections
39
40package HBPlug;
41
42use ghtml;
43use BasPlug;
44use unicode;
45use util;
46use doc;
47
48use strict;
49no strict 'refs'; # allow filehandles to be variables and viceversa
50
# Declare BasPlug as this package's parent class. Done in a BEGIN block so
# the inheritance chain is in place while the rest of the file compiles.
BEGIN {
    @HBPlug::ISA = qw(BasPlug);
}
54
# Argument descriptors this plugin adds to the argument list handed to
# BasPlug. Only process_exp is declared; its default comes from
# get_default_process_exp(), which returns a placeholder string.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
];

# Plugin description record pushed onto the option list in new().
my $options = {
    'name'     => "HBPlug",
    'desc'     => "{HBPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
68
# Constructor.
#
# Parameters:
#   $class           - class name the resulting object is blessed into
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through to BasPlug->new unchanged
#   $hashArgOptLists - hash ref (optional); this plugin's argument and
#                      option descriptors are appended to its "ArgList"
#                      and "OptList" entries
#
# Returns the object built by BasPlug->new, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # Note: these pushes autovivify $hashArgOptLists when the caller did not
    # supply one, so the "defined" test below is then true as well.
    if (defined $arguments) {
        push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push(@{$hashArgOptLists->{"OptList"}}, $options);
    }

    # Explicit Class->method call instead of the indirect-object form
    # ("new BasPlug(...)"), whose parsing depends on what names happen to be
    # known to the compiler at this point.
    my $self = (defined $hashArgOptLists)
        ? BasPlug->new($pluginlist, $inputargs, $hashArgOptLists)
        : BasPlug->new($pluginlist, $inputargs);

    return bless $self, $class;
}
81
# Initialise the plugin: run BasPlug's init, then pin the input encoding.
#
# Parameters:
#   $verbosity, $outhandle - forwarded unchanged to BasPlug::init
#
# The encoding is always forced to iso_8859_1 here, so the sanity check that
# follows can only fire if that assignment is ever changed.
sub init {
    my ($self, $verbosity, $outhandle) = @_;

    $self->BasPlug::init($verbosity, $outhandle);
    $self->{'input_encoding'} = "iso_8859_1";

    # this plugin only handles ascii encodings
    my $encoding = $self->{'input_encoding'};
    if ($encoding !~ /^(iso_8859_1|ascii)$/) {
        die "ERROR: HBPlug can handle only iso_8859_1 or ascii encodings.\n"
          . $encoding . " is not an acceptable input_encoding value\n";
    }
}
95
96# this is included only to prevent warnings being printed out
97# from BasPlug::init. The process_exp is not used by this plugin
# Placeholder default for the process_exp argument. The invocant, if any, is
# ignored; the returned text is a human-readable notice, not a real regexp.
sub get_default_process_exp {
    my ($self) = @_;

    my $notice = "This plugin does not use a process_exp\n";
    return $notice;
}
103
104
# Read an HTML book file and append its cleaned-up body to a text buffer.
#
# Parameters:
#   $htmlfile - path of the HTML file to read
#   $text     - scalar ref; the extracted text is appended to it by HB_gettext
#
# The file is first scanned for a <body> tag, with everything before it
# discarded. If no <body> tag is found the file is read a second time with
# the "found" flag preset, so the whole file is taken as body text.
#
# On an open failure an error is printed to $self->{'outhandle'} and the sub
# returns without touching $$text.
sub HB_read_html_file {
    my $self = shift (@_);
    my ($htmlfile, $text) = @_;

    # Three-argument open with a lexical handle: the file name can never be
    # interpreted as an open mode, and the handle is passed by reference
    # instead of as a package-relative symbolic name.
    my $fh;
    if (!open ($fh, '<', $htmlfile)) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "ERROR - could not open $htmlfile\n";
        return;
    }

    my $foundbody = 0;
    $self->HB_gettext (\$foundbody, $text, $fh);
    close $fh;

    # just in case there was no <body> tag
    if (!$foundbody) {
        $foundbody = 1;
        if (!open ($fh, '<', $htmlfile)) {
            my $outhandle = $self->{'outhandle'};
            print $outhandle "ERROR - could not open $htmlfile\n";
            return;
        }
        $self->HB_gettext (\$foundbody, $text, $fh);
        close $fh;
    }
    # text is in utf8
}
129
130# converts the text to utf8, as ghtml does that for &eacute; etc.
# Append the body text read from a file handle to a buffer, then normalise
# the whole buffer.
#
# Parameters:
#   $foundbody - scalar ref to a flag; while false, lines are skipped until
#                one contains a <body ...> tag, at which point it is set to 1
#   $text      - scalar ref to the buffer that accumulates the text
#   $handle    - file handle (or handle name) to read lines from
#
# After reading, the buffer is converted from iso_8859_1 to utf-8 when that
# is the configured input encoding, character entities are converted via
# ghtml::convertcharentities, and every run of whitespace is collapsed to a
# single space.
sub HB_gettext {
    my ($self, $foundbody, $text, $handle) = @_;
    my $outhandle = $self->{'outhandle'};

    while (defined (my $line = <$handle>)) {
        # Until the body starts, drop whole lines; on the line carrying the
        # <body> tag keep only what follows the tag.
        unless ($$foundbody) {
            next unless ($line =~ s/^.*<body[^>]*>//i);
            $$foundbody = 1;
        }

        # check for symbol fonts
        if ($line =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
            my $font = $1;
            if ($font !~ /^arial$/i) {
                print $outhandle "HBPlug::HB_gettext - warning removed font $font\n";
            }
        }

        $line =~ s/<\/p>//ig;                         # remove </p> tags
        $line =~ s/<\/?(body|html|font)\b[^>]*>//ig;  # remove any unwanted tags

        $$text .= $line;
    }

    # convert to utf-8
    if ($self->{'input_encoding'} eq "iso_8859_1") {
        $$text = &unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
    }

    # convert any alphanumeric character entities to their utf-8
    # equivalent for indexing purposes
    &ghtml::convertcharentities ($$text);

    $$text =~ s/\s+/ /g; # remove \n's
}
170
# Tidy the HTML of one section and return the cleaned copy.
#
# Parameters:
#   $section - the section's HTML text (not modified in the caller)
#
# Steps, in order: strip closing tags that have no earlier opening tag,
# collapse repeated paragraph tags, trim trailing markup and whitespace,
# put a blank line before each paragraph, wrap lines at roughly 80
# characters, and rewrite image references to _httpdocimg_ links.
sub HB_clean_section {
    my $self = shift (@_);
    my ($section) = @_;

    # remove tags without a starting tag from the section
    # (stops at the first closing tag that does have an earlier opener)
    my ($tag, $tagstart);
    while ($section =~ /<\/([^>]{1,10})>/) {
        $tag = $1;
        $tagstart = index($section, "<$tag");
        last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
        # \Q...\E: the tag name is arbitrary text from the document. Without
        # quoting, a name containing regex metacharacters (e.g. "a+") would
        # either fail to compile or never match, leaving this loop spinning
        # on the same closing tag forever.
        $section =~ s/<\/\Q$tag\E>//;
    }

    # remove extra paragraph tags
    while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}

    # remove extra stuff at the end of the section
    while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}

    # add a newline at the beginning of each paragraph
    $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;

    # add a newline every 80 characters at a word boundary
    # Note: this regular expression puts a line feed before
    # the last word in each section, even when it is not
    # needed.
    $section =~ s/(.{1,80})\s/$1\n/g;

    # fix up the image links; each replacement starts with a line break
    # followed by whitespace, then the centred image markup
    $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/\n <center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
    $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/\n <center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;

    return $section;
}
207
208
# Produce a quoted preview of a string for log messages.
#
# Strings shorter than 100 characters are returned whole inside double
# quotes; longer ones are reduced to their first and last 50 characters,
# each quoted, joined by " ... ".
sub shorten {
    my ($self, $text) = @_;

    my $length = length($text);
    if ($length >= 100) {
        my $head = substr ($text, 0, 50);
        my $tail = substr ($text, $length - 50);
        return "\"$head\" ... \"$tail\"";
    }

    return "\"$text\"";
}
218
# Adds one metadata value to a section of the document.
# All text handled by this plugin should already be in utf-8 by the
# time it gets here, so add_utf8_metadata is called regardless of
# input_encoding. (Earlier versions called add_metadata for
# iso_8859_1 input so that the conversion to utf-8 was done first.)
# Attach a single metadata value to a section of a document object.
#
# Parameters:
#   $doc_obj    - document object providing add_utf8_metadata
#   $cursection - section identifier the metadata belongs to
#   $field      - metadata field name
#   $value      - metadata value
#
# Returns whatever $doc_obj->add_utf8_metadata returns.
sub HB_add_metadata {
    my ($self, $doc_obj, $cursection, $field, $value) = @_;

    # All text should now be in utf-8, so the value is stored as-is with no
    # encoding-dependent branch.
    return $doc_obj->add_utf8_metadata ($cursection, $field, $value);
}
235
236# return number of files processed, undef if can't process
237# Note that $base_dir might be "" and that $file might
238# include directories
# Process one HTML book directory.
#
# Parameters:
#   $pluginfo    - not used in this sub
#   $base_dir    - base directory (may be "")
#   $file        - book directory relative to $base_dir (may include
#                  directories); its last path component is the "job number"
#   $metadata    - hash ref of field => value or field => array ref of
#                  values, added to the top section of the document
#   $processor   - object providing {'OIDtype'} and a process() method
#   $maxdocs, $total_count - not used in this sub
#   $gli         - when true, a <Processing ...> marker is printed to STDERR
#
# Returns 1 once the document has been handed to the processor, or undef
# when <base_dir>/<file>/<jobnumber>.htm does not exist (i.e. this is not an
# HTML book this plugin can process).
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the html filename and see if this is an HTML Book...
    my $jobnumber = $file;
    if ($file =~ /[\\\/]/) {
        ($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
    }
    # explicit undef kept: the documented contract is "undef if can't process"
    return undef unless defined $jobnumber;
    my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
    return undef unless -e $htmlfile;

    print STDERR "<Processing n='$file' p='HBPlug'>\n" if ($gli);
    print $outhandle "HBPlug: processing $file\n";

    # read in the file and do basic html cleaning (removing header etc)
    my $html = "";
    $self->HB_read_html_file ($htmlfile, \$html);
    # html is in utf8

    # create a new document
    my $doc_obj = doc->new ($file, "indexed_doc");
    $doc_obj->set_OIDtype ($processor->{'OIDtype'});

    # associate the book cover
    # NOTE(review): the file is associated unconditionally; no existence
    # check is made here - confirm that associate_file tolerates a missing file.
    my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg");
    $doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg");
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileFormat", "HB");
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $htmlfile));

    my $cursection = $doc_obj->get_top_section();

    # add metadata for top level of document
    foreach my $field (keys(%$metadata)) {
        # $metadata->{$field} may be an array reference
        my $entry  = $metadata->{$field};
        my @values = (ref ($entry) eq "ARRAY") ? @$entry : ($entry);
        foreach my $value (@values) {
            $self->HB_add_metadata ($doc_obj, $cursection, $field, $value);
        }
    }

    # process the file one section at a time
    my $curtoclevel = 1;
    my $firstsection = 1;
    while (length ($html) > 0) {
        # Consume everything up to and including the next <<TOCn>> marker and
        # its title; the title runs up to the following <p tag.
        if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
            my $toclevel = $3;
            my $title = $4;
            my $sectiontext = "";
            # The section body is everything before the next TOC marker (the
            # marker itself is left in $html), or all remaining text.
            if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
                $sectiontext = $1;
            } else {
                $sectiontext = $html;
                $html = "";
            }

            # remove tags and extra spaces from the title
            $title =~ s/<\/?[^>]+>//g;
            $title =~ s/^\s+|\s+$//g;

            # close any sections below the current level and
            # create a new section (special case for the firstsection)
            while (($curtoclevel > $toclevel) ||
                   (!$firstsection && $curtoclevel == $toclevel)) {
                $cursection = $doc_obj->get_parent_section ($cursection);
                $curtoclevel--;
            }
            if ($curtoclevel+1 < $toclevel) {
                print $outhandle "WARNING - jump in toc levels in $htmlfile " .
                    "from $curtoclevel to $toclevel\n";
            }
            while ($curtoclevel < $toclevel) {
                $curtoclevel++;
                $cursection =
                    $doc_obj->insert_section($doc_obj->get_end_child($cursection));
            }

            # add the metadata to this section
            $self->HB_add_metadata ($doc_obj, $cursection, "Title", $title);

            # clean up the section html
            $sectiontext = $self->HB_clean_section($sectiontext);

            # associate any files referenced through _httpdocimg_ links;
            # the capture is copied to a lexical before it is handed to
            # other code that may run its own pattern matches
            while ($sectiontext =~ /_httpdocimg_\/([^\"]+)\"/g) {
                my $image = $1;
                $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $image), $image);
            }

            # add the text for this section
            # All read text should now be in utf-8
            $doc_obj->add_utf8_text ($cursection, $sectiontext);
        } else {
            print $outhandle "WARNING - leftover text\n" , $self->shorten($html),
                "\nin $htmlfile\n";
            last;
        }
        $firstsection = 0;
    }

    # add a OID
    $doc_obj->set_OID ();

    # process the document
    $processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm"));

    return 1; # processed the file
}
356
357
3581;
Note: See TracBrowser for help on using the repository browser.