source: trunk/gsdl/perllib/plugins/HBPlug.pm@ 620

Last change on this file since 620 was 537, checked in by sjboddie, 25 years ago

added GPL headers

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.8 KB
Line 
1###########################################################################
2#
3# HBPlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# plugin which process an HTML book directory
27
28package HBPlug;
29
30use plugin;
31use html;
32use BasPlug;
33use util;
34use lang;
35use doc;
36use cfgread;
37
38
39sub BEGIN {
40 @ISA = ('BasPlug');
41}
42
43sub new {
44 my ($class) = @_;
45 $self = new BasPlug ();
46
47 return bless $self, $class;
48}
49
50sub is_recursive {
51 my $self = shift (@_);
52
53 return 0; # this is not a recursive plugin
54}
55
56sub HB_read_html_file {
57 my $self = shift (@_);
58 my ($htmlfile, $text) = @_;
59
60 # load in the file
61 if (!open (FILE, $htmlfile)) {
62 print STDERR "ERROR - could not open $htmlfile\n";
63 return;
64 }
65
66 my $foundbody = 0;
67 $self->HB_gettext (\$foundbody, $text, FILE);
68 close FILE;
69
70 # just in case there was no <body> tag
71 if (!$foundbody) {
72 $foundbody = 1;
73 open (FILE, $htmlfile) || return;
74 $self->HB_gettext (\$foundbody, $text, FILE);
75 close FILE;
76 }
77}
78
79sub HB_gettext {
80 my $self = shift (@_);
81 my ($foundbody, $text, $handle) = @_;
82
83 my $line = "";
84 while (defined ($line = <$handle>)) {
85 # look for body tag
86 if (!$$foundbody) {
87 if ($line =~ s/^.*<body[^>]*>//i) {
88 $$foundbody = 1;
89 } else {
90 next;
91 }
92 }
93
94 # check for symbol fonts
95 if ($line =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
96 my $font = $1;
97 print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
98 if ($font !~ /^arial$/i);
99 }
100
101 $line =~ s/<\/p>//ig; # remove </p> tags
102 $line =~ s/<\/?(body|html|font)\b[^>]*>//ig; # remove any unwanted tags
103
104 # convert any alphanumeric character entities to their extended
105 # ascii equivalent for indexing purposes
106 &html::convertcharentities ($line);
107
108 $$text .= $line;
109 }
110 $$text =~ s/\s+/ /g; # remove \n's
111}
112
113sub HB_clean_section {
114 my $self = shift (@_);
115 my ($section) = @_;
116
117 # remove tags without a starting tag from the section
118 my ($tag, $tagstart);
119 while ($section =~ /<\/([^>]{1,10})>/) {
120 $tag = $1;
121 $tagstart = index($section, "<$tag");
122 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
123 $section =~ s/<\/$tag>//;
124 }
125
126 # remove extra paragraph tags
127 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
128
129 # remove extra stuff at the end of the section
130 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
131
132 # add a newline at the beginning of each paragraph
133 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
134
135 # add a newline every 80 characters at a word boundary
136 # Note: this regular expression puts a line feed before
137 # the last word in each section, even when it is not
138 # needed.
139 $section =~ s/(.{1,80})\s/$1\n/g;
140
141 # fix up the image links
142 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
143 <center><img src=\"_httpcollection_\/archives\/_thisOID_\/$1\"><\/center><br>/ig;
144 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
145 <center><img src=\"_httpcollection_\/archives\/_thisOID_\/$1\"><\/center><br>/ig;
146
147 return $section;
148}
149
150
151sub shorten {
152 my $self = shift (@_);
153 my ($text) = @_;
154
155 return "\"$text\"" if (length($text) < 100);
156
157 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
158 substr ($text, length($text)-50) . "\"";
159}
160
161
162# return number of files processed, undef if can't process
163# Note that $base_dir might be "" and that $file might
164# include directories
165sub read {
166 my $self = shift (@_);
167 my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
168
169 # get the html filename and see if this is an HTML Book...
170 my $jobnumber = $file;
171 if ($file =~ /[\\\/]/) {
172 ($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
173 }
174 return undef unless defined $jobnumber;
175 my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
176 return undef unless -e $htmlfile;
177
178 print STDERR "HBPlug: processing $file\n";
179
180 # read in the file and do basic html cleaning (removing header etc)
181 my $html = "";
182 $self->HB_read_html_file ($htmlfile, \$html);
183
184 # create a new document
185 my $doc_obj = new doc ($file, "indexed_doc");
186
187 # copy the book cover if it exists
188 my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg");
189 $doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg")
190 if -e $bookcover;
191
192 my $cursection = $doc_obj->get_top_section();
193
194 # add metadata for top level of document
195 foreach $field (keys(%$metadata)) {
196 # $metadata->{$field} may be an array reference
197 if (ref ($metadata->{$field}) eq "ARRAY") {
198 map {
199 $doc_obj->add_metadata ($cursection, $field, $_);
200 } @{$metadata->{$field}};
201 } else {
202 $doc_obj->add_metadata ($cursection, $field, $metadata->{$field});
203 }
204 }
205 # need to add <classifytype> to each book as we'll be using 'Hierarchy'
206 # for HB collections rather than the default ('Book')
207 $doc_obj->add_metadata ($cursection, 'classifytype', 'Hierarchy');
208
209
210 # process the file one section at a time
211 my $curtoclevel = 1;
212 my $firstsection = 1;
213 while (length ($html) > 0) {
214 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
215 my $toclevel = $3;
216 my $title = $4;
217 my $sectiontext = "";
218 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
219 $sectiontext = $1;
220 } else {
221 $sectiontext = $html;
222 $html = "";
223 }
224
225 # remove tags and extra spaces from the title
226 $title =~ s/<\/?[^>]+>//g;
227 $title =~ s/^\s+|\s+$//g;
228
229 # close any sections below the current level and
230 # create a new section (special case for the firstsection)
231 while (($curtoclevel > $toclevel) ||
232 (!$firstsection && $curtoclevel == $toclevel)) {
233 $cursection = $doc_obj->get_parent_section ($cursection);
234 $curtoclevel--;
235 }
236 if ($curtoclevel+1 < $toclevel) {
237 print STDERR "WARNING - jump in toc levels in $htmlfile " .
238 "from $curtoclevel to $toclevel\n";
239 }
240 while ($curtoclevel < $toclevel) {
241 $curtoclevel++;
242 $cursection =
243 $doc_obj->insert_section($doc_obj->get_end_child($cursection));
244 }
245
246 # add the metadata to this section
247 $doc_obj->add_metadata ($cursection, "Title", $title);
248
249 # clean up the section html
250 $sectiontext = $self->HB_clean_section($sectiontext);
251
252 # associate any files
253 map { $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $1), $1)
254 if /_httpcollection_\/archives\/_thisOID_\/([^\"]+)\"/; 0; }
255 split (/(_httpcollection_\/archives\/_thisOID_\/[^\"]+\")/, $sectiontext);
256
257 # add the text for this section
258 $doc_obj->add_text ($cursection, $sectiontext);
259
260 } else {
261 print STDERR "WARNING - leftover text\n" , $self->shorten($html),
262 "\nin $htmlfile\n";
263 last;
264 }
265 $firstsection = 0;
266 }
267
268 # add a OID
269 $doc_obj->set_OID ();
270
271 # process the document
272 $processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm"));
273
274 return 1; # processed the file
275}
276
277
2781;
Note: See TracBrowser for help on using the repository browser.