1 | ###########################################################################
|
---|
2 | #
|
---|
3 | # HBPlug.pm --
|
---|
4 | # A component of the Greenstone digital library software
|
---|
5 | # from the New Zealand Digital Library Project at the
|
---|
6 | # University of Waikato, New Zealand.
|
---|
7 | #
|
---|
8 | # Copyright (C) 1999 New Zealand Digital Library Project
|
---|
9 | #
|
---|
10 | # This program is free software; you can redistribute it and/or modify
|
---|
11 | # it under the terms of the GNU General Public License as published by
|
---|
12 | # the Free Software Foundation; either version 2 of the License, or
|
---|
13 | # (at your option) any later version.
|
---|
14 | #
|
---|
15 | # This program is distributed in the hope that it will be useful,
|
---|
16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
18 | # GNU General Public License for more details.
|
---|
19 | #
|
---|
20 | # You should have received a copy of the GNU General Public License
|
---|
21 | # along with this program; if not, write to the Free Software
|
---|
22 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
23 | #
|
---|
24 | ###########################################################################
|
---|
25 |
|
---|
26 | # plugin which processes an HTML book directory
|
---|
27 |
|
---|
28 | # This plugin is used by the Humanity Library collections and does not handle
|
---|
29 | # input encodings other than ascii or extended ascii
|
---|
30 |
|
---|
# This code is kind of ugly and could no doubt be made to run faster. By leaving
# it in this state I hope to encourage people to make their collections use
# HBSPlug instead ;-)
|
---|
34 |
|
---|
35 | # Use HBSPlug if creating a new collection and marking up files like the
|
---|
36 | # Humanity Library collections. HBSPlug accepts all input encodings but
|
---|
37 | # expects the marked up files to be cleaner than those used by the
|
---|
38 | # Humanity Library collections
|
---|
39 |
|
---|
40 | # 12/05/02 Added usage datastructure - John Thompson
|
---|
41 |
|
---|
42 | package HBPlug;
|
---|
43 |
|
---|
44 | use ghtml;
|
---|
45 | use BasPlug;
|
---|
46 | use unicode;
|
---|
47 | use util;
|
---|
48 | use doc;
|
---|
49 |
|
---|
50 |
|
---|
# Set up inheritance at compile time: this package derives from BasPlug.
BEGIN {
    @ISA = qw(BasPlug);
}
|
---|
54 |
|
---|
# Usage/option description for this plugin. new() appends it to the
# option_list of the object built by BasPlug.
my $options = {
    'name'     => "HBPlug",
    'desc'     => "Plugin which processes an HTML book directory. This plugin is used by the Humanity Library collections and does not handle input encodings other than ascii or extended ascii. This code is kind of ugly and could no doubt be made to run faster, by leaving it in this state I hope to encourage people to make their collections use HBSPlug instead ;-)\n\nUse HBSPlug if creating a new collection and marking up files like the Humanity Library collections. HBSPlug accepts all input encodings but expects the marked up files to be cleaner than those used by the Humanity Library collections",
    'inherits' => "yes",
};
|
---|
58 |
|
---|
# Constructor.
#
# Builds the underlying object through BasPlug's constructor, registers this
# plugin's option description on it, and reblesses the result into $class.
# Note that the complete argument list (class name included) is forwarded to
# BasPlug after the plugin name.
sub new {
    my ($class) = @_;

    my $self = BasPlug->new("HBPlug", @_);

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $inherited_options = $self->{'option_list'};
    push (@{$inherited_options}, $options);

    return bless ($self, $class);
}
|
---|
69 |
|
---|
# Initialise the plugin.
#
# Delegates to BasPlug::init with the given verbosity and output handle, then
# pins input_encoding to iso_8859_1. The check that follows dies if the
# encoding is anything other than iso_8859_1 or ascii.
sub init {
    my ($self, $verbosity, $outhandle) = @_;

    $self->BasPlug::init($verbosity, $outhandle);

    $self->{'input_encoding'} = "iso_8859_1";
    my $encoding = $self->{'input_encoding'};

    # this plugin only handles ascii encodings
    if ($encoding !~ /^(iso_8859_1|ascii)$/) {
        die "ERROR: HBPlug can handle only iso_8859_1 or ascii encodings.\n" .
            $encoding . " is not an acceptable input_encoding value\n";
    }
}
|
---|
83 |
|
---|
# The process_exp is not used by this plugin; this method exists only so
# that BasPlug::init does not print warnings about it being missing.
# Returns a fixed explanatory string.
sub get_default_process_exp {
    my ($self) = @_;

    return "This plugin does not use a process_exp\n";
}
|
---|
91 |
|
---|
92 |
|
---|
# Load $htmlfile and append its cleaned-up text to the string referenced by
# $text.
#
#   $htmlfile - path of the HTML file to read
#   $text     - reference to the scalar that receives the text
#
# The file is first read looking for a <body> tag (HB_gettext skips
# everything before it). If no <body> tag was found, the file is read a
# second time with the "found" flag already set so that the whole file is
# taken. If the file cannot be opened, an error is printed to the plugin's
# outhandle and the sub returns without touching $text.
sub HB_read_html_file {
    my $self = shift (@_);
    my ($htmlfile, $text) = @_;

    # Three-argument open with a lexical handle: the filename is used
    # verbatim (no mode characters or whitespace trimming are interpreted
    # from it) and no package-global handle is involved.
    my $fh;
    if (!open ($fh, '<', $htmlfile)) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "ERROR - could not open $htmlfile\n";
        return;
    }

    my $foundbody = 0;
    $self->HB_gettext (\$foundbody, $text, $fh);
    close ($fh);

    # just in case there was no <body> tag
    if (!$foundbody) {
        $foundbody = 1;
        open ($fh, '<', $htmlfile) or return;
        $self->HB_gettext (\$foundbody, $text, $fh);
        close ($fh);
    }
    # HB_gettext converts iso_8859_1 input to utf8, so the text is utf8 here
}
|
---|
117 |
|
---|
# Read every line from $handle and append the cleaned text to the string
# referenced by $text.
#
#   $foundbody - reference to a flag; while it is false, lines are discarded
#                until one containing a <body> tag is seen (the flag is then
#                set and only the part after the tag is kept)
#   $text      - reference to the scalar being built up
#   $handle    - filehandle to read from
#
# </p>, <body>, <html> and <font> tags are stripped; a warning is printed to
# the outhandle for any <font face=...> other than arial. After reading, the
# accumulated text is converted to utf8 when input_encoding is iso_8859_1
# (ghtml converts character entities to utf8, so the text must match),
# character entities are converted, and every run of whitespace is collapsed
# to one space.
sub HB_gettext {
    my ($self, $foundbody, $text, $handle) = @_;
    my $outhandle = $self->{'outhandle'};

    while (defined (my $line = <$handle>)) {
        # look for body tag
        unless ($$foundbody) {
            next unless ($line =~ s/^.*<body[^>]*>//i);
            $$foundbody = 1;
        }

        # check for symbol fonts
        if ($line =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
            my $font = $1;
            if ($font !~ /^arial$/i) {
                print $outhandle "HBPlug::HB_gettext - warning removed font $font\n";
            }
        }

        # remove </p> tags, then any other unwanted tags
        $line =~ s/<\/p>//ig;
        $line =~ s/<\/?(body|html|font)\b[^>]*>//ig;

        $$text .= $line;
    }

    # convert to utf-8
    if ($self->{'input_encoding'} eq "iso_8859_1") {
        $$text = &unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
    }

    # convert any alphanumeric character entities to their utf-8
    # equivalent for indexing purposes
    &ghtml::convertcharentities ($$text);

    # remove \n's (all whitespace runs become a single space)
    $$text =~ s/\s+/ /g;
}
|
---|
158 |
|
---|
# Tidy up the HTML of one section and return the cleaned string.
#
#   $section - the section's HTML text
#
# Steps, in order:
#   1. Remove leading closing tags that have no earlier opening tag. The loop
#      stops at the first closing tag whose opener does appear before it, so
#      orphan closing tags after that point are left alone.
#   2. Collapse runs of consecutive <p> tags into one.
#   3. Strip dangling <u>, <i>, <b>, <p ...> tags and whitespace from the end.
#   4. Put a blank line before each paragraph and wrap lines at roughly 80
#      characters.
#   5. Rewrite <img ...> tags and <<I>> image markers into centred images
#      whose src is prefixed with _httpdocimg_/.
sub HB_clean_section {
    my $self = shift (@_);
    my ($section) = @_;

    # remove tags without a starting tag from the section
    my ($tag, $tagstart);
    while ($section =~ /<\/([^>]{1,10})>/) {
        $tag = $1;
        $tagstart = index($section, "<$tag");
        last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
        # \Q...\E: the tag name is literal text, not a pattern. Without the
        # quoting, a name containing a regex metacharacter (e.g. "a?") would
        # never match its own closing tag and this loop would not terminate.
        $section =~ s/<\/\Q$tag\E>//;
    }

    # remove extra paragraph tags
    while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}

    # remove extra stuff at the end of the section
    # NOTE(review): the bare " " alternative is redundant with \s; it may
    # have been a non-breaking-space entity originally - confirm against the
    # upstream file.
    while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>| |\s)$//i) {}

    # add a newline at the beginning of each paragraph
    $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;

    # add a newline every 80 characters at a word boundary
    # Note: this regular expression puts a line feed before
    # the last word in each section, even when it is not
    # needed.
    $section =~ s/(.{1,80})\s/$1\n/g;

    # fix up the image links (each replacement starts on a new line)
    $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/\n<center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
    $section =~ s/<<I>>\s*([^\.]+\.(png|jpg|gif))/\n<center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;

    return $section;
}
|
---|
195 |
|
---|
196 |
|
---|
# Return a double-quoted rendering of $text for use in messages.
#
# Text shorter than 100 characters is returned whole, wrapped in quotes.
# Longer text is reduced to its first 50 and last 50 characters, each
# quoted, joined by " ... ".
sub shorten {
    my ($self, $text) = @_;

    my $size = length($text);
    if ($size >= 100) {
        my $head = substr ($text, 0, 50);
        my $tail = substr ($text, $size - 50);
        return "\"" . $head . "\" ... \"" . $tail . "\"";
    }

    return "\"$text\"";
}
|
---|
206 |
|
---|
# Attach one metadata value to a section of a document.
#
#   $doc_obj    - the document object
#   $cursection - the section to attach the metadata to
#   $field      - metadata field name
#   $value      - metadata value
#
# An earlier version chose between add_metadata and add_utf8_metadata based
# on input_encoding. All text should now be in utf-8 by the time it gets
# here, so add_utf8_metadata is always used.
sub HB_add_metadata {
    my ($self, $doc_obj, $cursection, $field, $value) = @_;

    return $doc_obj->add_utf8_metadata ($cursection, $field, $value);
}
|
---|
223 |
|
---|
# return number of files processed, undef if can't process
# Note that $base_dir might be "" and that $file might
# include directories
#
# $file names an HTML book directory. The job number is the last path
# component of $file, and the book itself is expected at
# <base_dir>/<file>/<jobnumber>.htm; if that file does not exist this plugin
# declines by returning undef. Otherwise the HTML is loaded, split into
# sections at each <<TOCn>> marker (n giving the nesting level), and a doc
# object is built with one section per marker, then handed to
# $processor->process.
sub read {
    my ($self, $pluginfo, $base_dir, $file, $metadata, $processor) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the html filename and see if this is an HTML Book...
    my $jobnumber = $file;
    if ($file =~ /[\\\/]/) {
        ($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
    }
    return undef unless defined $jobnumber;

    my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
    return undef unless -e $htmlfile;

    print $outhandle "HBPlug: processing $file\n";

    # read in the file and do basic html cleaning (removing header etc)
    # (HB_gettext leaves the text in utf8)
    my $html = "";
    $self->HB_read_html_file ($htmlfile, \$html);

    # create a new document
    my $doc_obj = doc->new ($file, "indexed_doc");
    $doc_obj->set_OIDtype ($processor->{'OIDtype'});

    # associate the book cover image (<jobnumber>.jpg) as cover.jpg
    my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg");
    $doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg");

    my $cursection = $doc_obj->get_top_section();

    # add metadata for top level of document; a field's value may be either
    # a single value or a reference to an array of values
    foreach my $field (keys(%$metadata)) {
        if (ref ($metadata->{$field}) eq "ARRAY") {
            foreach my $value (@{$metadata->{$field}}) {
                $self->HB_add_metadata ($doc_obj, $cursection, $field, $value);
            }
        } else {
            $self->HB_add_metadata ($doc_obj, $cursection, $field, $metadata->{$field});
        }
    }

    # process the file one section at a time
    my $curtoclevel = 1;
    my $firstsection = 1;
    while (length ($html) > 0) {
        # Consume everything up to and including the next <<TOCn>> marker and
        # its title; the title runs up to the following <p, which is kept.
        if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)<<TOC(\d+)>>\s*(.*?)<p\b/<p/i) {
            my ($toclevel, $title) = ($3, $4);

            # The section body is everything before the next marker, or all
            # of the remaining text when there is no further marker.
            my $sectiontext = "";
            if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)<<TOC\d+>>)/$2/i) {
                $sectiontext = $1;
            } else {
                $sectiontext = $html;
                $html = "";
            }

            # remove tags and extra spaces from the title
            $title =~ s/<\/?[^>]+>//g;
            $title =~ s/^\s+|\s+$//g;

            # close any sections below the current level and
            # create a new section (special case for the firstsection)
            while (($curtoclevel > $toclevel) ||
                   (!$firstsection && $curtoclevel == $toclevel)) {
                $cursection = $doc_obj->get_parent_section ($cursection);
                $curtoclevel--;
            }
            if ($curtoclevel+1 < $toclevel) {
                print $outhandle "WARNING - jump in toc levels in $htmlfile " .
                    "from $curtoclevel to $toclevel\n";
            }
            while ($curtoclevel < $toclevel) {
                $curtoclevel++;
                $cursection =
                    $doc_obj->insert_section($doc_obj->get_end_child($cursection));
            }

            # add the metadata to this section
            $self->HB_add_metadata ($doc_obj, $cursection, "Title", $title);

            # clean up the section html
            $sectiontext = $self->HB_clean_section($sectiontext);

            # associate any files: every _httpdocimg_/<name>" reference in
            # the cleaned text names a file inside the book directory
            foreach my $piece (split (/(_httpdocimg_\/[^\"]+\")/, $sectiontext)) {
                next unless ($piece =~ /_httpdocimg_\/([^\"]+)\"/);
                my $image = $1;
                $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $image), $image);
            }

            # add the text for this section
            # All read text should now be in utf-8
            $doc_obj->add_utf8_text ($cursection, $sectiontext);
        } else {
            print $outhandle "WARNING - leftover text\n" , $self->shorten($html),
                "\nin $htmlfile\n";
            last;
        }
        $firstsection = 0;
    }

    # add a OID
    $doc_obj->set_OID ();

    # process the document
    $processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm"));

    return 1; # processed the file
}
|
---|
340 |
|
---|
341 |
|
---|
342 | 1;
|
---|