source: gsdl/trunk/perllib/plugins/HBPlugin.pm@ 15872

Last change on this file since 15872 was 15872, checked in by kjdon, 16 years ago

plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Alos, I haven't really tested anything yet...

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 10.0 KB
Line 
1###########################################################################
2#
3# HBPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# plugin which processes an HTML book directory
27
28# This plugin is used by the Humanity Library collections and does not handle
29# input encodings other than ascii or extended ascii
30
31# this code is kind of ugly and could no doubt be made to run faster, by leaving
32# it in this state I hope to encourage people to make their collections use
33# HBSPlug instead ;-)
34
35# Use HBSPlug if creating a new collection and marking up files like the
36# Humanity Library collections. HBSPlug accepts all input encodings but
37# expects the marked up files to be cleaner than those used by the
38# Humanity Library collections
39
40package HBPlugin;
41
42use ghtml;
43use BasePlugin;
44use unicode;
45use util;
46use doc;
47
48use strict;
49no strict 'refs'; # allow filehandles to be variables and viceversa
50
51sub BEGIN {
52 @HBPlugin::ISA = ('BasePlugin');
53}
54my $encoding_list =
55 [ { 'name' => "ascii",
56 'desc' => "{ReadTextFile.input_encoding.ascii}" },
57 { 'name' => "iso_8859_1",
58 'desc' => "Latin1 (western languages)" } ];
59
60my $arguments =
61 [ { 'name' => "process_exp",
62 'desc' => "{BasePlugin.process_exp}",
63 'type' => "regexp",
64 'reqd' => "no",
65 'deft' => &get_default_process_exp() },
66 { 'name' => "input_encoding",
67 'desc' => "{ReadTextFile.input_encoding}",
68 'type' => "enum",
69 'deft' => "iso_8859_1",
70 'list' => $encoding_list,
71 'reqd' => "no" }
72 ];
73
74my $options = { 'name' => "HBPlugin",
75 'desc' => "{HBPlugin.desc}",
76 'abstract' => "no",
77 'inherits' => "yes",
78 'args' => $arguments };
79
80sub new {
81 my ($class) = shift (@_);
82 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
83 push(@$pluginlist, $class);
84
85 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
86 push(@{$hashArgOptLists->{"OptList"}},$options);
87
88 my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
89
90 return bless $self, $class;
91}
92
93# this is included only to prevent warnings being printed out
94# from BasePlugin::init. The process_exp is not used by this plugin
95sub get_default_process_exp {
96 my $self = shift (@_);
97
98 return "This plugin does not use a process_exp\n";
99}
100
101
102sub HB_read_html_file {
103 my $self = shift (@_);
104 my ($htmlfile, $text) = @_;
105
106 # load in the file
107 if (!open (FILE, $htmlfile)) {
108 my $outhandle = $self->{'outhandle'};
109 print $outhandle "ERROR - could not open $htmlfile\n";
110 return;
111 }
112
113 my $foundbody = 0;
114 $self->HB_gettext (\$foundbody, $text, "FILE");
115 close FILE;
116
117 # just in case there was no <body> tag
118 if (!$foundbody) {
119 $foundbody = 1;
120 open (FILE, $htmlfile) || return;
121 $self->HB_gettext (\$foundbody, $text, "FILE");
122 close FILE;
123 }
124 # text is in utf8
125}
126
127# converts the text to utf8, as ghtml does that for &eacute; etc.
128sub HB_gettext {
129 my $self = shift (@_);
130 my ($foundbody, $text, $handle) = @_;
131 my $outhandle = $self->{'outhandle'};
132
133 my $line = "";
134 while (defined ($line = <$handle>)) {
135 # look for body tag
136 if (!$$foundbody) {
137 if ($line =~ s/^.*<body[^>]*>//i) {
138 $$foundbody = 1;
139 } else {
140 next;
141 }
142 }
143
144 # check for symbol fonts
145 if ($line =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
146 my $font = $1;
147 print $outhandle "HBPlugin::HB_gettext - warning removed font $font\n"
148 if ($font !~ /^arial$/i);
149 }
150
151 $line =~ s/<\/p>//ig; # remove </p> tags
152 $line =~ s/<\/?(body|html|font)\b[^>]*>//ig; # remove any unwanted tags
153
154 $$text .= $line;
155 }
156 #
157 if ($self->{'input_encoding'} eq "iso_8859_1") {
158 # convert to utf-8
159 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
160 }
161 # convert any alphanumeric character entities to their utf-8
162 # equivalent for indexing purposes
163 &ghtml::convertcharentities ($$text);
164
165 $$text =~ s/\s+/ /g; # remove \n's
166}
167
168sub HB_clean_section {
169 my $self = shift (@_);
170 my ($section) = @_;
171
172 # remove tags without a starting tag from the section
173 my ($tag, $tagstart);
174 while ($section =~ /<\/([^>]{1,10})>/) {
175 $tag = $1;
176 $tagstart = index($section, "<$tag");
177 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
178 $section =~ s/<\/$tag>//;
179 }
180
181 # remove extra paragraph tags
182 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
183
184 # remove extra stuff at the end of the section
185 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
186
187 # add a newline at the beginning of each paragraph
188 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
189
190 # add a newline every 80 characters at a word boundary
191 # Note: this regular expression puts a line feed before
192 # the last word in each section, even when it is not
193 # needed.
194 $section =~ s/(.{1,80})\s/$1\n/g;
195
196 # fix up the image links
197 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
198 <center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
199 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
200 <center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
201
202 return $section;
203}
204
205
206sub shorten {
207 my $self = shift (@_);
208 my ($text) = @_;
209
210 return "\"$text\"" if (length($text) < 100);
211
212 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
213 substr ($text, length($text)-50) . "\"";
214}
215
216# return number of files processed, undef if can't process
217# Note that $base_dir might be "" and that $file might
218# include directories
219sub read {
220 my $self = shift (@_);
221 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
222 my $outhandle = $self->{'outhandle'};
223
224 # get the html filename and see if this is an HTML Book...
225 my $jobnumber = $file;
226 if ($file =~ /[\\\/]/) {
227 ($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
228 }
229 return undef unless defined $jobnumber;
230 my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
231 return undef unless -e $htmlfile;
232
233 print STDERR "<Processing n='$file' p='HBPlugin'>\n" if ($gli);
234 print $outhandle "HBPlugin: processing $file\n";
235
236 # read in the file and do basic html cleaning (removing header etc)
237 my $html = "";
238 $self->HB_read_html_file ($htmlfile, \$html);
239 # html is in utf8
240
241 # create a new document
242 my $doc_obj = new doc ($file, "indexed_doc");
243 $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
244
245 # copy the book cover if it exists
246 my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg");
247 $doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg");
248 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
249 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileFormat", "HB");
250 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $htmlfile));
251
252 my $cursection = $doc_obj->get_top_section();
253
254 # add metadata for top level of document
255 foreach my $field (keys(%$metadata)) {
256 # $metadata->{$field} may be an array reference
257 if (ref ($metadata->{$field}) eq "ARRAY") {
258 map {
259 $doc_obj->add_utf8_metadata($cursection, $field, $_);
260 } @{$metadata->{$field}};
261 } else {
262 $doc_obj->add_utf8_metadata($cursection, $field, $metadata->{$field});
263 }
264 }
265
266 # process the file one section at a time
267 my $curtoclevel = 1;
268 my $firstsection = 1;
269 while (length ($html) > 0) {
270 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
271 my $toclevel = $3;
272 my $title = $4;
273 my $sectiontext = "";
274 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
275 $sectiontext = $1;
276 } else {
277 $sectiontext = $html;
278 $html = "";
279 }
280
281 # remove tags and extra spaces from the title
282 $title =~ s/<\/?[^>]+>//g;
283 $title =~ s/^\s+|\s+$//g;
284
285 # close any sections below the current level and
286 # create a new section (special case for the firstsection)
287 while (($curtoclevel > $toclevel) ||
288 (!$firstsection && $curtoclevel == $toclevel)) {
289 $cursection = $doc_obj->get_parent_section ($cursection);
290 $curtoclevel--;
291 }
292 if ($curtoclevel+1 < $toclevel) {
293 print $outhandle "WARNING - jump in toc levels in $htmlfile " .
294 "from $curtoclevel to $toclevel\n";
295 }
296 while ($curtoclevel < $toclevel) {
297 $curtoclevel++;
298 $cursection =
299 $doc_obj->insert_section($doc_obj->get_end_child($cursection));
300 }
301
302 # add the metadata to this section
303 $doc_obj->add_utf8_metadata($cursection, "Title", $title);
304
305 # clean up the section html
306 $sectiontext = $self->HB_clean_section($sectiontext);
307
308 # associate any files
309 map { $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $1), $1)
310 if /_httpdocimg_\/([^\"]+)\"/; 0; }
311 split (/(_httpdocimg_\/[^\"]+\")/, $sectiontext);
312
313 # add the text for this section
314 $doc_obj->add_utf8_text ($cursection, $sectiontext);
315 } else {
316 print $outhandle "WARNING - leftover text\n" , $self->shorten($html),
317 "\nin $htmlfile\n";
318 last;
319 }
320 $firstsection = 0;
321 }
322
323 # add a OID
324 $doc_obj->set_OID ();
325
326 # process the document
327 $processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm"));
328
329 return 1; # processed the file
330}
331
332
3331;
Note: See TracBrowser for help on using the repository browser.