source: branches/New_Config_Format-branch/gsdl/perllib/plugins/HBPlug.pm@ 1279

Last change on this file since 1279 was 1279, checked in by sjboddie, 24 years ago

merged changes to trunk into New_Config_Format branch

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 9.2 KB
Line 
1###########################################################################
2#
3# HBPlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# plugin which processes an HTML book directory
27
28# This plugin is used by the Humanity Library collections and does not handle
29# input encodings other than ascii or extended ascii
30
31# this code is kind of ugly and could no doubt be made to run faster, by leaving
32# it in this state I hope to encourage people to make their collections use
33# HBSPlug instead ;-)
34
35# Use HBSPlug if creating a new collection and marking up files like the
36# Humanity Library collections. HBSPlug accepts all input encodings but
37# expects the marked up files to be cleaner than those used by the
38# Humanity Library collections
39
40package HBPlug;
41
42use ghtml;
43use BasPlug;
44use util;
45use doc;
46
47
48sub BEGIN {
49 @ISA = ('BasPlug');
50}
51
52sub new {
53 my ($class) = @_;
54 my $self = new BasPlug ("HBPlug", @_);
55
56 return bless $self, $class;
57}
58
59sub init {
60 my $self = shift (@_);
61 my ($verbosity) = @_;
62
63 $self->BasPlug::init();
64
65 # this plugin only handles ascii encodings
66 if ($self->{'input_encoding'} !~ /^(iso_8859_1|ascii)$/) {
67 die "ERROR: HBPlug can handle only iso_8859_1 or ascii encodings.\n" .
68 $self->{'input_encoding'} . " is not an acceptable input_encoding value\n";
69 }
70}
71
72# this is included only to prevent warnings being printed out
73# from BasPlug::init. The process_exp is not used by this plugin
74sub get_default_process_exp {
75 my $self = shift (@_);
76
77 return "This plugin does not use a process_exp\n";
78}
79
80
81sub HB_read_html_file {
82 my $self = shift (@_);
83 my ($htmlfile, $text) = @_;
84
85 # load in the file
86 if (!open (FILE, $htmlfile)) {
87 print STDERR "ERROR - could not open $htmlfile\n";
88 return;
89 }
90
91 my $foundbody = 0;
92 $self->HB_gettext (\$foundbody, $text, "FILE");
93 close FILE;
94
95 # just in case there was no <body> tag
96 if (!$foundbody) {
97 $foundbody = 1;
98 open (FILE, $htmlfile) || return;
99 $self->HB_gettext (\$foundbody, $text, "FILE");
100 close FILE;
101 }
102}
103
104sub HB_gettext {
105 my $self = shift (@_);
106 my ($foundbody, $text, $handle) = @_;
107
108 my $line = "";
109 while (defined ($line = <$handle>)) {
110 # look for body tag
111 if (!$$foundbody) {
112 if ($line =~ s/^.*<body[^>]*>//i) {
113 $$foundbody = 1;
114 } else {
115 next;
116 }
117 }
118
119 # check for symbol fonts
120 if ($line =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
121 my $font = $1;
122 print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
123 if ($font !~ /^arial$/i);
124 }
125
126 $line =~ s/<\/p>//ig; # remove </p> tags
127 $line =~ s/<\/?(body|html|font)\b[^>]*>//ig; # remove any unwanted tags
128
129 # convert any alphanumeric character entities to their extended
130 # ascii equivalent for indexing purposes
131 &ghtml::convertcharentities ($line);
132
133 $$text .= $line;
134 }
135 $$text =~ s/\s+/ /g; # remove \n's
136}
137
138sub HB_clean_section {
139 my $self = shift (@_);
140 my ($section) = @_;
141
142 # remove tags without a starting tag from the section
143 my ($tag, $tagstart);
144 while ($section =~ /<\/([^>]{1,10})>/) {
145 $tag = $1;
146 $tagstart = index($section, "<$tag");
147 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
148 $section =~ s/<\/$tag>//;
149 }
150
151 # remove extra paragraph tags
152 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
153
154 # remove extra stuff at the end of the section
155 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
156
157 # add a newline at the beginning of each paragraph
158 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
159
160 # add a newline every 80 characters at a word boundary
161 # Note: this regular expression puts a line feed before
162 # the last word in each section, even when it is not
163 # needed.
164 $section =~ s/(.{1,80})\s/$1\n/g;
165
166 # fix up the image links
167 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
168 <center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
169 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
170 <center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
171
172 return $section;
173}
174
175
176sub shorten {
177 my $self = shift (@_);
178 my ($text) = @_;
179
180 return "\"$text\"" if (length($text) < 100);
181
182 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
183 substr ($text, length($text)-50) . "\"";
184}
185
186# if input_encoding is ascii we can call add_utf8_metadata
187# directly but if it's iso_8859_1 (the default) we need to call
188# add_metadata so that the ascii2utf8 conversion is done first
189# this should speed things up a little if processing an ascii only
190# document with input_encoding set to ascii
191sub HB_add_metadata {
192 my $self = shift (@_);
193 my ($doc_obj, $cursection, $field, $value) = @_;
194
195 if ($self->{'input_encoding'} eq "ascii") {
196 $doc_obj->add_utf8_metadata ($cursection, $field, $value);
197 } else {
198 $doc_obj->add_metadata ($cursection, $field, $value);
199 }
200}
201
202# return number of files processed, undef if can't process
203# Note that $base_dir might be "" and that $file might
204# include directories
205sub read {
206 my $self = shift (@_);
207 my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
208
209 # get the html filename and see if this is an HTML Book...
210 my $jobnumber = $file;
211 if ($file =~ /[\\\/]/) {
212 ($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
213 }
214 return undef unless defined $jobnumber;
215 my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
216 return undef unless -e $htmlfile;
217
218 print STDERR "HBPlug: processing $file\n";
219
220 # read in the file and do basic html cleaning (removing header etc)
221 my $html = "";
222 $self->HB_read_html_file ($htmlfile, \$html);
223
224 # create a new document
225 my $doc_obj = new doc ($file, "indexed_doc");
226
227 # copy the book cover if it exists
228 my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg");
229 $doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg");
230
231 my $cursection = $doc_obj->get_top_section();
232
233 # add metadata for top level of document
234 foreach my $field (keys(%$metadata)) {
235 # $metadata->{$field} may be an array reference
236 if (ref ($metadata->{$field}) eq "ARRAY") {
237 map {
238 $self->HB_add_metadata ($doc_obj, $cursection, $field, $_);
239 } @{$metadata->{$field}};
240 } else {
241 $self->HB_add_metadata ($doc_obj, $cursection, $field, $metadata->{$field});
242 }
243 }
244
245 # process the file one section at a time
246 my $curtoclevel = 1;
247 my $firstsection = 1;
248 while (length ($html) > 0) {
249 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
250 my $toclevel = $3;
251 my $title = $4;
252 my $sectiontext = "";
253 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
254 $sectiontext = $1;
255 } else {
256 $sectiontext = $html;
257 $html = "";
258 }
259
260 # remove tags and extra spaces from the title
261 $title =~ s/<\/?[^>]+>//g;
262 $title =~ s/^\s+|\s+$//g;
263
264 # close any sections below the current level and
265 # create a new section (special case for the firstsection)
266 while (($curtoclevel > $toclevel) ||
267 (!$firstsection && $curtoclevel == $toclevel)) {
268 $cursection = $doc_obj->get_parent_section ($cursection);
269 $curtoclevel--;
270 }
271 if ($curtoclevel+1 < $toclevel) {
272 print STDERR "WARNING - jump in toc levels in $htmlfile " .
273 "from $curtoclevel to $toclevel\n";
274 }
275 while ($curtoclevel < $toclevel) {
276 $curtoclevel++;
277 $cursection =
278 $doc_obj->insert_section($doc_obj->get_end_child($cursection));
279 }
280
281 # add the metadata to this section
282 $self->HB_add_metadata ($doc_obj, $cursection, "Title", $title);
283
284 # clean up the section html
285 $sectiontext = $self->HB_clean_section($sectiontext);
286
287 # associate any files
288 map { $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $1), $1)
289 if /_httpdocimg_\/([^\"]+)\"/; 0; }
290 split (/(_httpdocimg_\/[^\"]+\")/, $sectiontext);
291
292 # add the text for this section
293 if ($self->{'input_encoding'} eq "ascii") {
294 $doc_obj->add_utf8_text ($cursection, $sectiontext);
295 } else {
296 $doc_obj->add_text ($cursection, $sectiontext);
297 }
298 } else {
299 print STDERR "WARNING - leftover text\n" , $self->shorten($html),
300 "\nin $htmlfile\n";
301 last;
302 }
303 $firstsection = 0;
304 }
305
306 # add a OID
307 $doc_obj->set_OID ();
308
309 # process the document
310 $processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm"));
311
312 return 1; # processed the file
313}
314
315
3161;
Note: See TracBrowser for help on using the repository browser.