source: trunk/gsdl/perllib/plugins/HBPlug.pm@ 5096

Last change on this file since 5096 was 4744, checked in by mdewsnip, 21 years ago

Tidied up and structures (representing the options of the plugin) in preparation for removing the print_usage() routines.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 10.7 KB
Line 
1###########################################################################
2#
3# HBPlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# plugin which processes an HTML book directory
27
28# This plugin is used by the Humanity Library collections and does not handle
29# input encodings other than ascii or extended ascii
30
31# this code is kind of ugly and could no doubt be made to run faster, by leaving
32# it in this state I hope to encourage people to make their collections use
33# HBSPlug instead ;-)
34
35# Use HBSPlug if creating a new collection and marking up files like the
36# Humanity Library collections. HBSPlug accepts all input encodings but
37# expects the marked up files to be cleaner than those used by the
38# Humanity Library collections
39
40# 12/05/02 Added usage datastructure - John Thompson
41
42package HBPlug;
43
44use ghtml;
45use BasPlug;
46use unicode;
47use util;
48use doc;
49
50
51sub BEGIN {
52 @ISA = ('BasPlug');
53}
54
55my $options = { 'name' => "HBPlug",
56 'desc' => "Plugin which processes an HTML book directory. This plugin is used by the Humanity Library collections and does not handle input encodings other than ascii or extended ascii. This code is kind of ugly and could no doubt be made to run faster, by leaving it in this state I hope to encourage people to make their collections use HBSPlug instead ;-)\n\nUse HBSPlug if creating a new collection and marking up files like the Humanity Library collections. HBSPlug accepts all input encodings but expects the marked up files to be cleaner than those used by the Humanity Library collections",
57 'inherits' => "yes" };
58
59sub new {
60 my ($class) = @_;
61 my $self = new BasPlug ("HBPlug", @_);
62
63 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
64 my $option_list = $self->{'option_list'};
65 push( @{$option_list}, $options );
66
67 return bless $self, $class;
68}
69
70sub init {
71 my $self = shift (@_);
72 my ($verbosity, $outhandle) = @_;
73
74 $self->BasPlug::init($verbosity, $outhandle);
75 $self->{'input_encoding'} = "iso_8859_1";
76
77 # this plugin only handles ascii encodings
78 if ($self->{'input_encoding'} !~ /^(iso_8859_1|ascii)$/) {
79 die "ERROR: HBPlug can handle only iso_8859_1 or ascii encodings.\n" .
80 $self->{'input_encoding'} . " is not an acceptable input_encoding value\n";
81 }
82}
83
84# this is included only to prevent warnings being printed out
85# from BasPlug::init. The process_exp is not used by this plugin
86sub get_default_process_exp {
87 my $self = shift (@_);
88
89 return "This plugin does not use a process_exp\n";
90}
91
92
93sub HB_read_html_file {
94 my $self = shift (@_);
95 my ($htmlfile, $text) = @_;
96
97 # load in the file
98 if (!open (FILE, $htmlfile)) {
99 my $outhandle = $self->{'outhandle'};
100 print $outhandle "ERROR - could not open $htmlfile\n";
101 return;
102 }
103
104 my $foundbody = 0;
105 $self->HB_gettext (\$foundbody, $text, "FILE");
106 close FILE;
107
108 # just in case there was no <body> tag
109 if (!$foundbody) {
110 $foundbody = 1;
111 open (FILE, $htmlfile) || return;
112 $self->HB_gettext (\$foundbody, $text, "FILE");
113 close FILE;
114 }
115 # text is in utf8
116}
117
118# converts the text to utf8, as ghtml does that for &eacute; etc.
119sub HB_gettext {
120 my $self = shift (@_);
121 my ($foundbody, $text, $handle) = @_;
122 my $outhandle = $self->{'outhandle'};
123
124 my $line = "";
125 while (defined ($line = <$handle>)) {
126 # look for body tag
127 if (!$$foundbody) {
128 if ($line =~ s/^.*<body[^>]*>//i) {
129 $$foundbody = 1;
130 } else {
131 next;
132 }
133 }
134
135 # check for symbol fonts
136 if ($line =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
137 my $font = $1;
138 print $outhandle "HBPlug::HB_gettext - warning removed font $font\n"
139 if ($font !~ /^arial$/i);
140 }
141
142 $line =~ s/<\/p>//ig; # remove </p> tags
143 $line =~ s/<\/?(body|html|font)\b[^>]*>//ig; # remove any unwanted tags
144
145 $$text .= $line;
146 }
147 #
148 if ($self->{'input_encoding'} eq "iso_8859_1") {
149 # convert to utf-8
150 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
151 }
152 # convert any alphanumeric character entities to their utf-8
153 # equivalent for indexing purposes
154 &ghtml::convertcharentities ($$text);
155
156 $$text =~ s/\s+/ /g; # remove \n's
157}
158
159sub HB_clean_section {
160 my $self = shift (@_);
161 my ($section) = @_;
162
163 # remove tags without a starting tag from the section
164 my ($tag, $tagstart);
165 while ($section =~ /<\/([^>]{1,10})>/) {
166 $tag = $1;
167 $tagstart = index($section, "<$tag");
168 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
169 $section =~ s/<\/$tag>//;
170 }
171
172 # remove extra paragraph tags
173 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
174
175 # remove extra stuff at the end of the section
176 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
177
178 # add a newline at the beginning of each paragraph
179 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
180
181 # add a newline every 80 characters at a word boundary
182 # Note: this regular expression puts a line feed before
183 # the last word in each section, even when it is not
184 # needed.
185 $section =~ s/(.{1,80})\s/$1\n/g;
186
187 # fix up the image links
188 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
189 <center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
190 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
191 <center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
192
193 return $section;
194}
195
196
197sub shorten {
198 my $self = shift (@_);
199 my ($text) = @_;
200
201 return "\"$text\"" if (length($text) < 100);
202
203 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
204 substr ($text, length($text)-50) . "\"";
205}
206
207# if input_encoding is ascii we can call add_utf8_metadata
208# directly but if it's iso_8859_1 (the default) we need to call
209# add_metadata so that the ascii2utf8 conversion is done first
210# this should speed things up a little if processing an ascii only
211# document with input_encoding set to ascii
212sub HB_add_metadata {
213 my $self = shift (@_);
214 my ($doc_obj, $cursection, $field, $value) = @_;
215
216# All text should now be in utf-8
217# if ($self->{'input_encoding'} eq "ascii") {
218 $doc_obj->add_utf8_metadata ($cursection, $field, $value);
219# } else {
220# $doc_obj->add_metadata ($cursection, $field, $value);
221# }
222}
223
224# return number of files processed, undef if can't process
225# Note that $base_dir might be "" and that $file might
226# include directories
227sub read {
228 my $self = shift (@_);
229 my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
230 my $outhandle = $self->{'outhandle'};
231
232 # get the html filename and see if this is an HTML Book...
233 my $jobnumber = $file;
234 if ($file =~ /[\\\/]/) {
235 ($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
236 }
237 return undef unless defined $jobnumber;
238 my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
239 return undef unless -e $htmlfile;
240
241 print $outhandle "HBPlug: processing $file\n";
242
243 # read in the file and do basic html cleaning (removing header etc)
244 my $html = "";
245 $self->HB_read_html_file ($htmlfile, \$html);
246 # html is in utf8
247
248 # create a new document
249 my $doc_obj = new doc ($file, "indexed_doc");
250 $doc_obj->set_OIDtype ($processor->{'OIDtype'});
251
252 # copy the book cover if it exists
253 my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg");
254 $doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg");
255
256 my $cursection = $doc_obj->get_top_section();
257
258 # add metadata for top level of document
259 foreach my $field (keys(%$metadata)) {
260 # $metadata->{$field} may be an array reference
261 if (ref ($metadata->{$field}) eq "ARRAY") {
262 map {
263 $self->HB_add_metadata ($doc_obj, $cursection, $field, $_);
264 } @{$metadata->{$field}};
265 } else {
266 $self->HB_add_metadata ($doc_obj, $cursection, $field, $metadata->{$field});
267 }
268 }
269
270 # process the file one section at a time
271 my $curtoclevel = 1;
272 my $firstsection = 1;
273 while (length ($html) > 0) {
274 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
275 my $toclevel = $3;
276 my $title = $4;
277 my $sectiontext = "";
278 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
279 $sectiontext = $1;
280 } else {
281 $sectiontext = $html;
282 $html = "";
283 }
284
285 # remove tags and extra spaces from the title
286 $title =~ s/<\/?[^>]+>//g;
287 $title =~ s/^\s+|\s+$//g;
288
289 # close any sections below the current level and
290 # create a new section (special case for the firstsection)
291 while (($curtoclevel > $toclevel) ||
292 (!$firstsection && $curtoclevel == $toclevel)) {
293 $cursection = $doc_obj->get_parent_section ($cursection);
294 $curtoclevel--;
295 }
296 if ($curtoclevel+1 < $toclevel) {
297 print $outhandle "WARNING - jump in toc levels in $htmlfile " .
298 "from $curtoclevel to $toclevel\n";
299 }
300 while ($curtoclevel < $toclevel) {
301 $curtoclevel++;
302 $cursection =
303 $doc_obj->insert_section($doc_obj->get_end_child($cursection));
304 }
305
306 # add the metadata to this section
307 $self->HB_add_metadata ($doc_obj, $cursection, "Title", $title);
308
309 # clean up the section html
310 $sectiontext = $self->HB_clean_section($sectiontext);
311
312 # associate any files
313 map { $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $1), $1)
314 if /_httpdocimg_\/([^\"]+)\"/; 0; }
315 split (/(_httpdocimg_\/[^\"]+\")/, $sectiontext);
316
317 # add the text for this section
318# All read text should now be in utf-8
319# if ($self->{'input_encoding'} eq "ascii") {
320 $doc_obj->add_utf8_text ($cursection, $sectiontext);
321# } else {
322# $doc_obj->add_text ($cursection, $sectiontext);
323# }
324 } else {
325 print $outhandle "WARNING - leftover text\n" , $self->shorten($html),
326 "\nin $htmlfile\n";
327 last;
328 }
329 $firstsection = 0;
330 }
331
332 # add a OID
333 $doc_obj->set_OID ();
334
335 # process the document
336 $processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm"));
337
338 return 1; # processed the file
339}
340
341
3421;
Note: See TracBrowser for help on using the repository browser.