source: trunk/gsdl/perllib/plugins/HBPlug.pm@ 7243

Last change on this file since 7243 was 6408, checked in by jmt12, 20 years ago

Added two new attributes for script arguments. HiddenGLI controls whether the argument will be visible at all in GLI, while ModeGLI defines the lowest detail mode under which the argument will be visible (only really for import and buildcol). Also ensured that the scripts were reporting their correct default process expressions, and further refined argument types by adding the catagory regexp for any regular expression (which can then be hidden under lower detail modes in GLI)

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 10.6 KB
Line 
1###########################################################################
2#
3# HBPlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# plugin which processes an HTML book directory
27
28# This plugin is used by the Humanity Library collections and does not handle
29# input encodings other than ascii or extended ascii
30
31# this code is kind of ugly and could no doubt be made to run faster, by leaving
32# it in this state I hope to encourage people to make their collections use
33# HBSPlug instead ;-)
34
35# Use HBSPlug if creating a new collection and marking up files like the
36# Humanity Library collections. HBSPlug accepts all input encodings but
37# expects the marked up files to be cleaner than those used by the
38# Humanity Library collections
39
40# 12/05/02 Added usage datastructure - John Thompson
41
42package HBPlug;
43
44use ghtml;
45use BasPlug;
46use unicode;
47use util;
48use doc;
49
50
51sub BEGIN {
52 @ISA = ('BasPlug');
53}
54
55my $arguments =
56 [ { 'name' => "process_exp",
57 'desc' => "{BasPlug.process_exp}",
58 'type' => "regexp",
59 'reqd' => "no",
60 'deft' => &get_default_process_exp() }
61 ];
62
63my $options = { 'name' => "HBPlug",
64 'desc' => "{HBPlug.desc}",
65 'abstract' => "no",
66 'inherits' => "yes",
67 'args' => $arguments };
68
69sub new {
70 my ($class) = @_;
71 my $self = new BasPlug ("HBPlug", @_);
72 $self->{'plugin_type'} = "HBPlug";
73 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
74 my $option_list = $self->{'option_list'};
75 push( @{$option_list}, $options );
76
77 return bless $self, $class;
78}
79
80sub init {
81 my $self = shift (@_);
82 my ($verbosity, $outhandle) = @_;
83
84 $self->BasPlug::init($verbosity, $outhandle);
85 $self->{'input_encoding'} = "iso_8859_1";
86
87 # this plugin only handles ascii encodings
88 if ($self->{'input_encoding'} !~ /^(iso_8859_1|ascii)$/) {
89 die "ERROR: HBPlug can handle only iso_8859_1 or ascii encodings.\n" .
90 $self->{'input_encoding'} . " is not an acceptable input_encoding value\n";
91 }
92}
93
94# this is included only to prevent warnings being printed out
95# from BasPlug::init. The process_exp is not used by this plugin
96sub get_default_process_exp {
97 my $self = shift (@_);
98
99 return "This plugin does not use a process_exp\n";
100}
101
102
103sub HB_read_html_file {
104 my $self = shift (@_);
105 my ($htmlfile, $text) = @_;
106
107 # load in the file
108 if (!open (FILE, $htmlfile)) {
109 my $outhandle = $self->{'outhandle'};
110 print $outhandle "ERROR - could not open $htmlfile\n";
111 return;
112 }
113
114 my $foundbody = 0;
115 $self->HB_gettext (\$foundbody, $text, "FILE");
116 close FILE;
117
118 # just in case there was no <body> tag
119 if (!$foundbody) {
120 $foundbody = 1;
121 open (FILE, $htmlfile) || return;
122 $self->HB_gettext (\$foundbody, $text, "FILE");
123 close FILE;
124 }
125 # text is in utf8
126}
127
128# converts the text to utf8, as ghtml does that for &eacute; etc.
129sub HB_gettext {
130 my $self = shift (@_);
131 my ($foundbody, $text, $handle) = @_;
132 my $outhandle = $self->{'outhandle'};
133
134 my $line = "";
135 while (defined ($line = <$handle>)) {
136 # look for body tag
137 if (!$$foundbody) {
138 if ($line =~ s/^.*<body[^>]*>//i) {
139 $$foundbody = 1;
140 } else {
141 next;
142 }
143 }
144
145 # check for symbol fonts
146 if ($line =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
147 my $font = $1;
148 print $outhandle "HBPlug::HB_gettext - warning removed font $font\n"
149 if ($font !~ /^arial$/i);
150 }
151
152 $line =~ s/<\/p>//ig; # remove </p> tags
153 $line =~ s/<\/?(body|html|font)\b[^>]*>//ig; # remove any unwanted tags
154
155 $$text .= $line;
156 }
157 #
158 if ($self->{'input_encoding'} eq "iso_8859_1") {
159 # convert to utf-8
160 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
161 }
162 # convert any alphanumeric character entities to their utf-8
163 # equivalent for indexing purposes
164 &ghtml::convertcharentities ($$text);
165
166 $$text =~ s/\s+/ /g; # remove \n's
167}
168
169sub HB_clean_section {
170 my $self = shift (@_);
171 my ($section) = @_;
172
173 # remove tags without a starting tag from the section
174 my ($tag, $tagstart);
175 while ($section =~ /<\/([^>]{1,10})>/) {
176 $tag = $1;
177 $tagstart = index($section, "<$tag");
178 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
179 $section =~ s/<\/$tag>//;
180 }
181
182 # remove extra paragraph tags
183 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
184
185 # remove extra stuff at the end of the section
186 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
187
188 # add a newline at the beginning of each paragraph
189 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
190
191 # add a newline every 80 characters at a word boundary
192 # Note: this regular expression puts a line feed before
193 # the last word in each section, even when it is not
194 # needed.
195 $section =~ s/(.{1,80})\s/$1\n/g;
196
197 # fix up the image links
198 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
199 <center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
200 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
201 <center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
202
203 return $section;
204}
205
206
207sub shorten {
208 my $self = shift (@_);
209 my ($text) = @_;
210
211 return "\"$text\"" if (length($text) < 100);
212
213 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
214 substr ($text, length($text)-50) . "\"";
215}
216
217# if input_encoding is ascii we can call add_utf8_metadata
218# directly but if it's iso_8859_1 (the default) we need to call
219# add_metadata so that the ascii2utf8 conversion is done first
220# this should speed things up a little if processing an ascii only
221# document with input_encoding set to ascii
222sub HB_add_metadata {
223 my $self = shift (@_);
224 my ($doc_obj, $cursection, $field, $value) = @_;
225
226# All text should now be in utf-8
227# if ($self->{'input_encoding'} eq "ascii") {
228 $doc_obj->add_utf8_metadata ($cursection, $field, $value);
229# } else {
230# $doc_obj->add_metadata ($cursection, $field, $value);
231# }
232}
233
234# return number of files processed, undef if can't process
235# Note that $base_dir might be "" and that $file might
236# include directories
237sub read {
238 my $self = shift (@_);
239 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $gli) = @_;
240 my $outhandle = $self->{'outhandle'};
241
242 # get the html filename and see if this is an HTML Book...
243 my $jobnumber = $file;
244 if ($file =~ /[\\\/]/) {
245 ($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
246 }
247 return undef unless defined $jobnumber;
248 my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
249 return undef unless -e $htmlfile;
250
251 print STDERR "<Processing n='$file' p='HBPlug'>\n" if ($gli);
252 print $outhandle "HBPlug: processing $file\n";
253
254 # read in the file and do basic html cleaning (removing header etc)
255 my $html = "";
256 $self->HB_read_html_file ($htmlfile, \$html);
257 # html is in utf8
258
259 # create a new document
260 my $doc_obj = new doc ($file, "indexed_doc");
261 $doc_obj->set_OIDtype ($processor->{'OIDtype'});
262
263 # copy the book cover if it exists
264 my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg");
265 $doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg");
266 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "$self->{'plugin_type'}", "1");
267
268 my $cursection = $doc_obj->get_top_section();
269
270 # add metadata for top level of document
271 foreach my $field (keys(%$metadata)) {
272 # $metadata->{$field} may be an array reference
273 if (ref ($metadata->{$field}) eq "ARRAY") {
274 map {
275 $self->HB_add_metadata ($doc_obj, $cursection, $field, $_);
276 } @{$metadata->{$field}};
277 } else {
278 $self->HB_add_metadata ($doc_obj, $cursection, $field, $metadata->{$field});
279 }
280 }
281
282 # process the file one section at a time
283 my $curtoclevel = 1;
284 my $firstsection = 1;
285 while (length ($html) > 0) {
286 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
287 my $toclevel = $3;
288 my $title = $4;
289 my $sectiontext = "";
290 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
291 $sectiontext = $1;
292 } else {
293 $sectiontext = $html;
294 $html = "";
295 }
296
297 # remove tags and extra spaces from the title
298 $title =~ s/<\/?[^>]+>//g;
299 $title =~ s/^\s+|\s+$//g;
300
301 # close any sections below the current level and
302 # create a new section (special case for the firstsection)
303 while (($curtoclevel > $toclevel) ||
304 (!$firstsection && $curtoclevel == $toclevel)) {
305 $cursection = $doc_obj->get_parent_section ($cursection);
306 $curtoclevel--;
307 }
308 if ($curtoclevel+1 < $toclevel) {
309 print $outhandle "WARNING - jump in toc levels in $htmlfile " .
310 "from $curtoclevel to $toclevel\n";
311 }
312 while ($curtoclevel < $toclevel) {
313 $curtoclevel++;
314 $cursection =
315 $doc_obj->insert_section($doc_obj->get_end_child($cursection));
316 }
317
318 # add the metadata to this section
319 $self->HB_add_metadata ($doc_obj, $cursection, "Title", $title);
320
321 # clean up the section html
322 $sectiontext = $self->HB_clean_section($sectiontext);
323
324 # associate any files
325 map { $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $1), $1)
326 if /_httpdocimg_\/([^\"]+)\"/; 0; }
327 split (/(_httpdocimg_\/[^\"]+\")/, $sectiontext);
328
329 # add the text for this section
330# All read text should now be in utf-8
331# if ($self->{'input_encoding'} eq "ascii") {
332 $doc_obj->add_utf8_text ($cursection, $sectiontext);
333# } else {
334# $doc_obj->add_text ($cursection, $sectiontext);
335# }
336 } else {
337 print $outhandle "WARNING - leftover text\n" , $self->shorten($html),
338 "\nin $htmlfile\n";
339 last;
340 }
341 $firstsection = 0;
342 }
343
344 # add a OID
345 $doc_obj->set_OID ();
346
347 # process the document
348 $processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm"));
349
350 return 1; # processed the file
351}
352
353
3541;
Note: See TracBrowser for help on using the repository browser.