source: trunk/gsdl/perllib/plugins/HBPlug.pm@ 2484

Last change on this file since 2484 was 2327, checked in by sjboddie, 23 years ago

* empty log message *

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 9.5 KB
Line 
1###########################################################################
2#
3# HBPlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# plugin which processes an HTML book directory
27
28# This plugin is used by the Humanity Library collections and does not handle
29# input encodings other than ascii or extended ascii
30
31# this code is kind of ugly and could no doubt be made to run faster, by leaving
32# it in this state I hope to encourage people to make their collections use
33# HBSPlug instead ;-)
34
35# Use HBSPlug if creating a new collection and marking up files like the
36# Humanity Library collections. HBSPlug accepts all input encodings but
37# expects the marked up files to be cleaner than those used by the
38# Humanity Library collections
39
40package HBPlug;
41
42use ghtml;
43use BasPlug;
44use util;
45use doc;
46
47
# Establish the inheritance chain at compile time: HBPlug is a BasPlug.
BEGIN {
    our @ISA = qw(BasPlug);
}
51
# Constructor.
#
# Builds the underlying object with BasPlug's constructor and re-blesses it
# into $class.  Returns the new plugin object.
sub new {
    my ($class) = @_;

    # $class was copied, not shifted, so @_ still starts with it: BasPlug's
    # constructor receives "HBPlug" followed by the complete original
    # argument list.
    #
    # The explicit Class->method form is used rather than the indirect
    # object form ("new BasPlug (...)"), which Perl has to guess how to
    # parse -- and this sub is itself called "new".
    my $self = BasPlug->new ("HBPlug", @_);

    return bless $self, $class;
}
58
# Initialise the plugin.
#
# $verbosity and $outhandle are handed straight to BasPlug::init.  Afterwards
# the object's input_encoding is checked: this plugin dies unless it is
# exactly "iso_8859_1" or "ascii".
sub init {
    my ($self, $verbosity, $outhandle) = @_;

    $self->BasPlug::init($verbosity, $outhandle);

    # this plugin only handles ascii encodings
    my $encoding = $self->{'input_encoding'};
    if ($encoding !~ /^(iso_8859_1|ascii)$/) {
        die "ERROR: HBPlug can handle only iso_8859_1 or ascii encodings.\n" .
            "$encoding is not an acceptable input_encoding value\n";
    }
}
71
# HBPlug never consults a process_exp; this method is only here so that
# BasPlug::init has something to pick up and prints no warning.
# Returns a fixed explanatory string.
sub get_default_process_exp {
    my ($self) = @_;

    my $placeholder = "This plugin does not use a process_exp\n";
    return $placeholder;
}
79
80
# Read the HTML file $htmlfile and append its cleaned-up contents (as produced
# by HB_gettext) to the string referenced by $text.
#
#   $htmlfile - path of the file to read
#   $text     - reference to the string the text is appended to
#
# The file is first read looking for a <body> tag, keeping only what follows
# it.  If no <body> tag was found the file is read a second time and
# everything in it is kept.
#
# If the file cannot be opened an error is printed to the plugin's outhandle
# and the sub returns without touching $$text any further.
sub HB_read_html_file {
    my $self = shift (@_);
    my ($htmlfile, $text) = @_;

    # pass 1 starts with "no body seen yet"; pass 2 (only reached when pass 1
    # never found a <body> tag) pretends the body has already started
    foreach my $assume_body (0, 1) {
        # three-argument open with a lexical handle: the file name is taken
        # literally (no whitespace stripping, no mode characters) and no
        # global FILE handle is left behind
        my $fh;
        if (!open ($fh, '<', $htmlfile)) {
            my $outhandle = $self->{'outhandle'};
            print $outhandle "ERROR - could not open $htmlfile\n";
            return;
        }

        my $foundbody = $assume_body;
        $self->HB_gettext (\$foundbody, $text, $fh);
        close ($fh);

        return if $foundbody;
    }
    return;
}
104
# Read lines from $handle, strip unwanted markup and append the result to the
# string referenced by $text.
#
#   $foundbody - reference to a flag; while it is false, lines are discarded
#                until one containing a <body ...> tag is seen (the flag is
#                then set and the remainder of that line is kept)
#   $text      - reference to the string being built up
#   $handle    - file handle to read from
#
# Once all lines have been read, every run of whitespace in $$text (newlines
# included) is collapsed to a single space.
sub HB_gettext {
    my ($self, $foundbody, $text, $handle) = @_;
    my $outhandle = $self->{'outhandle'};

    while (defined (my $line = <$handle>)) {
        # skip everything up to and including the body tag
        unless ($$foundbody) {
            next unless ($line =~ s/^.*<body[^>]*>//i);
            $$foundbody = 1;
        }

        # warn about any font other than arial (e.g. symbol fonts); the
        # font tag itself is stripped a few lines further down
        if ($line =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
            my $font = $1;
            if ($font !~ /^arial$/i) {
                print $outhandle "HBPlug::HB_gettext - warning removed font $font\n";
            }
        }

        # drop closing paragraph tags and any body, html or font tags
        $line =~ s/<\/p>//ig;
        $line =~ s/<\/?(body|html|font)\b[^>]*>//ig;

        # convert any alphanumeric character entities to their extended
        # ascii equivalent for indexing purposes (the return value is not
        # used, so this presumably rewrites $line in place)
        &ghtml::convertcharentities ($line);

        $$text .= $line;
    }

    # remove \n's
    $$text =~ s/\s+/ /g;
}
139
# Tidy up the HTML of a single section and return the cleaned text.
#
#   $section - the section's HTML as one string
#
# Steps, in order: remove leading closing tags that have no opening tag
# before them, collapse doubled <p> tags, trim markup and white space off the
# end, put a blank line before each paragraph, wrap the text at roughly 80
# columns and rewrite image references to use the _httpdocimg_ prefix.
sub HB_clean_section {
    my $self = shift (@_);
    my ($section) = @_;

    # remove tags without a starting tag from the section
    # (stops at the first closing tag whose opening tag precedes it)
    my ($tag, $tagstart);
    while ($section =~ /<\/([^>]{1,10})>/) {
        $tag = $1;
        $tagstart = index($section, "<$tag");
        last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
        # \Q..\E: the tag is literal text taken from the document.  Without
        # quoting, a tag containing regex metacharacters either fails to
        # match (and this loop never terminates) or is an invalid pattern.
        $section =~ s/<\/\Q$tag\E>//;
    }

    # remove extra paragraph tags
    while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}

    # remove extra stuff at the end of the section
    while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}

    # add a newline at the beginning of each paragraph
    $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;

    # add a newline every 80 characters at a word boundary
    # Note: this regular expression puts a line feed before
    # the last word in each section, even when it is not
    # needed.
    $section =~ s/(.{1,80})\s/$1\n/g;

    # fix up the image links: both real <img> tags and the
    # "&lt;&lt;I&gt;&gt; filename" markers become a centred image that
    # starts on a line of its own
    $section =~ s{<img[^>]*?src=\"?([^\">]+)\"?[^>]*>}
                 {\n <center><img src=\"_httpdocimg_/$1\"></center><br>}ig;
    $section =~ s{&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))}
                 {\n <center><img src=\"_httpdocimg_/$1\"></center><br>}ig;

    return $section;
}
176
177
# Return a double-quoted copy of $text that is short enough for a log
# message.  Text under 100 characters is returned whole; anything longer is
# reduced to its first and last 50 characters, each part quoted, with " ... "
# in between.
sub shorten {
    my ($self, $text) = @_;

    if (length($text) >= 100) {
        my $head = substr ($text, 0, 50);
        my $tail = substr ($text, -50);
        return qq{"$head" ... "$tail"};
    }

    return qq{"$text"};
}
187
# Attach the metadata pair ($field, $value) to section $cursection of
# $doc_obj.
#
# When input_encoding is "ascii" the value can be stored as it is, so
# add_utf8_metadata is called directly.  For iso_8859_1 (the default)
# add_metadata is called instead so that the ascii2utf8 conversion is done
# first.  This should speed things up a little when processing an ascii only
# document with input_encoding set to ascii.
sub HB_add_metadata {
    my ($self, $doc_obj, $cursection, $field, $value) = @_;

    my $method = ($self->{'input_encoding'} eq "ascii")
        ? 'add_utf8_metadata'
        : 'add_metadata';

    return $doc_obj->$method ($cursection, $field, $value);
}
203
# Process one HTML book directory.
#
# return number of files processed, undef if can't process
# Note that $base_dir might be "" and that $file might
# include directories
#
#   $pluginfo  - not used by this plugin
#   $base_dir  - directory that $file is relative to
#   $file      - the book directory; its last path component is the job
#                number, and the book itself is "<jobnumber>.htm" inside it
#   $metadata  - hash reference of metadata for the top level of the
#                document; a value may be a reference to an array of values
#   $processor - object whose process method is given the finished document
#                and whose OIDtype entry is copied to the document
#
# The book is split into sections at its "&lt;&lt;TOCn&gt;&gt;" markers,
# where n is the nesting level of the section that follows.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the html filename and see if this is an HTML Book...
    my $jobnumber = $file;
    if ($file =~ /[\\\/]/) {
        ($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
    }
    return undef unless defined $jobnumber;
    my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
    return undef unless -e $htmlfile;

    print $outhandle "HBPlug: processing $file\n";

    # read in the file and do basic html cleaning (removing header etc)
    my $html = "";
    $self->HB_read_html_file ($htmlfile, \$html);

    # create a new document
    my $doc_obj = doc->new ($file, "indexed_doc");
    $doc_obj->set_OIDtype ($processor->{'OIDtype'});

    # copy the book cover if it exists (a missing cover is simply skipped
    # rather than being associated with the document anyway)
    my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg");
    if (-e $bookcover) {
        $doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg");
    }

    my $cursection = $doc_obj->get_top_section();

    # add metadata for top level of document
    foreach my $field (keys(%$metadata)) {
        # $metadata->{$field} may be an array reference
        if (ref ($metadata->{$field}) eq "ARRAY") {
            foreach my $value (@{$metadata->{$field}}) {
                $self->HB_add_metadata ($doc_obj, $cursection, $field, $value);
            }
        } else {
            $self->HB_add_metadata ($doc_obj, $cursection, $field, $metadata->{$field});
        }
    }

    # process the file one section at a time
    my $curtoclevel = 1;
    my $firstsection = 1;
    while (length ($html) > 0) {
        # strip everything up to and including the next TOC marker and its
        # title; the title runs up to the next <p tag, which is kept
        if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
            my $toclevel = $3;
            my $title = $4;
            my $sectiontext = "";
            # the section text is everything up to the following TOC marker
            # (which is left in place for the next iteration), or all the
            # remaining text if there are no more markers
            if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
                $sectiontext = $1;
            } else {
                $sectiontext = $html;
                $html = "";
            }

            # remove tags and extra spaces from the title
            $title =~ s/<\/?[^>]+>//g;
            $title =~ s/^\s+|\s+$//g;

            # close any sections below the current level and
            # create a new section (special case for the firstsection)
            while (($curtoclevel > $toclevel) ||
                   (!$firstsection && $curtoclevel == $toclevel)) {
                $cursection = $doc_obj->get_parent_section ($cursection);
                $curtoclevel--;
            }
            if ($curtoclevel+1 < $toclevel) {
                print $outhandle "WARNING - jump in toc levels in $htmlfile " .
                    "from $curtoclevel to $toclevel\n";
            }
            while ($curtoclevel < $toclevel) {
                $curtoclevel++;
                $cursection =
                    $doc_obj->insert_section($doc_obj->get_end_child($cursection));
            }

            # add the metadata to this section
            $self->HB_add_metadata ($doc_obj, $cursection, "Title", $title);

            # clean up the section html
            $sectiontext = $self->HB_clean_section($sectiontext);

            # associate any files: every image link written by
            # HB_clean_section has the form _httpdocimg_/<name>"
            while ($sectiontext =~ /_httpdocimg_\/([^\"]+)\"/g) {
                my $image = $1;
                $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $image), $image);
            }

            # add the text for this section
            if ($self->{'input_encoding'} eq "ascii") {
                $doc_obj->add_utf8_text ($cursection, $sectiontext);
            } else {
                $doc_obj->add_text ($cursection, $sectiontext);
            }
        } else {
            # no (further) TOC marker: report what could not be placed
            print $outhandle "WARNING - leftover text\n" , $self->shorten($html),
                "\nin $htmlfile\n";
            last;
        }
        $firstsection = 0;
    }

    # add a OID
    $doc_obj->set_OID ();

    # process the document
    $processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm"));

    return 1; # processed the file
}
318
319
3201;
Note: See TracBrowser for help on using the repository browser.