source: gsdl/trunk/perllib/plugins/HBPlugin.pm@ 18327

Last change on this file since 18327 was 18327, checked in by ak19, 15 years ago

Extra parameter to new doc(): the renaming method to be used on the file (base64 or URL encoding).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 10.0 KB
RevLine 
[537]1###########################################################################
2#
[15872]3# HBPlugin.pm --
[537]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[1244]26# plugin which processes an HTML book directory
[4]27
[1244]28# This plugin is used by the Humanity Library collections and does not handle
29# input encodings other than ascii or extended ascii
30
31# this code is kind of ugly and could no doubt be made to run faster, by leaving
32# it in this state I hope to encourage people to make their collections use
33# HBSPlug instead ;-)
34
35# Use HBSPlug if creating a new collection and marking up files like the
36# Humanity Library collections. HBSPlug accepts all input encodings but
37# expects the marked up files to be cleaner than those used by the
38# Humanity Library collections
39
[15872]40package HBPlugin;
[4]41
[1010]42use ghtml;
[15872]43use BasePlugin;
[3542]44use unicode;
[286]45use util;
[288]46use doc;
[4]47
[10254]48use strict;
49no strict 'refs'; # allow filehandles to be variables and viceversa
[288]50
[4]51sub BEGIN {
[15872]52 @HBPlugin::ISA = ('BasePlugin');
[4]53}
[15872]54my $encoding_list =
55 [ { 'name' => "ascii",
[16019]56 'desc' => "{BasePlugin.encoding.ascii}" },
[15872]57 { 'name' => "iso_8859_1",
[16019]58 'desc' => "{HBPlugin.encoding.iso_8859_1}" } ];
[15872]59
[6408]60my $arguments =
61 [ { 'name' => "process_exp",
[15872]62 'desc' => "{BasePlugin.process_exp}",
[6408]63 'type' => "regexp",
64 'reqd' => "no",
[15872]65 'deft' => &get_default_process_exp() },
66 { 'name' => "input_encoding",
67 'desc' => "{ReadTextFile.input_encoding}",
68 'type' => "enum",
69 'deft' => "iso_8859_1",
70 'list' => $encoding_list,
71 'reqd' => "no" }
[6408]72 ];
73
[15872]74my $options = { 'name' => "HBPlugin",
75 'desc' => "{HBPlugin.desc}",
[6408]76 'abstract' => "no",
77 'inherits' => "yes",
78 'args' => $arguments };
[3540]79
[4]80sub new {
[10218]81 my ($class) = shift (@_);
82 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
83 push(@$pluginlist, $class);
[4]84
[15872]85 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
86 push(@{$hashArgOptLists->{"OptList"}},$options);
[10218]87
[15872]88 my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
[10218]89
[4]90 return bless $self, $class;
91}
92
[1244]93# this is included only to prevent warnings being printed out
[15872]94# from BasePlugin::init. The process_exp is not used by this plugin
[1244]95sub get_default_process_exp {
96 my $self = shift (@_);
97
98 return "This plugin does not use a process_exp\n";
99}
100
101
[4]102sub HB_read_html_file {
103 my $self = shift (@_);
[318]104 my ($htmlfile, $text) = @_;
[4]105
106 # load in the file
107 if (!open (FILE, $htmlfile)) {
[1424]108 my $outhandle = $self->{'outhandle'};
109 print $outhandle "ERROR - could not open $htmlfile\n";
[318]110 return;
[4]111 }
112
[318]113 my $foundbody = 0;
[1244]114 $self->HB_gettext (\$foundbody, $text, "FILE");
[318]115 close FILE;
116
117 # just in case there was no <body> tag
118 if (!$foundbody) {
119 $foundbody = 1;
120 open (FILE, $htmlfile) || return;
[1244]121 $self->HB_gettext (\$foundbody, $text, "FILE");
[318]122 close FILE;
123 }
[3542]124 # text is in utf8
[318]125}
126
[3542]127# converts the text to utf8, as ghtml does that for &eacute; etc.
[318]128sub HB_gettext {
129 my $self = shift (@_);
130 my ($foundbody, $text, $handle) = @_;
[1424]131 my $outhandle = $self->{'outhandle'};
[318]132
[4]133 my $line = "";
[318]134 while (defined ($line = <$handle>)) {
[4]135 # look for body tag
[318]136 if (!$$foundbody) {
[4]137 if ($line =~ s/^.*<body[^>]*>//i) {
[318]138 $$foundbody = 1;
[4]139 } else {
140 next;
141 }
142 }
[318]143
[4]144 # check for symbol fonts
145 if ($line =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
146 my $font = $1;
[15872]147 print $outhandle "HBPlugin::HB_gettext - warning removed font $font\n"
[4]148 if ($font !~ /^arial$/i);
149 }
150
151 $line =~ s/<\/p>//ig; # remove </p> tags
152 $line =~ s/<\/?(body|html|font)\b[^>]*>//ig; # remove any unwanted tags
153
[318]154 $$text .= $line;
[4]155 }
[3542]156 #
157 if ($self->{'input_encoding'} eq "iso_8859_1") {
158 # convert to utf-8
159 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
160 }
161 # convert any alphanumeric character entities to their utf-8
162 # equivalent for indexing purposes
163 &ghtml::convertcharentities ($$text);
164
[318]165 $$text =~ s/\s+/ /g; # remove \n's
[4]166}
167
168sub HB_clean_section {
169 my $self = shift (@_);
170 my ($section) = @_;
171
172 # remove tags without a starting tag from the section
173 my ($tag, $tagstart);
174 while ($section =~ /<\/([^>]{1,10})>/) {
175 $tag = $1;
176 $tagstart = index($section, "<$tag");
177 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
178 $section =~ s/<\/$tag>//;
179 }
180
181 # remove extra paragraph tags
182 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
183
184 # remove extra stuff at the end of the section
185 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
186
187 # add a newline at the beginning of each paragraph
188 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
189
190 # add a newline every 80 characters at a word boundary
191 # Note: this regular expression puts a line feed before
192 # the last word in each section, even when it is not
193 # needed.
194 $section =~ s/(.{1,80})\s/$1\n/g;
195
[317]196 # fix up the image links
197 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
[1020]198 <center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
[317]199 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
[1020]200 <center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
[4]201
202 return $section;
203}
204
205
206sub shorten {
207 my $self = shift (@_);
208 my ($text) = @_;
209
210 return "\"$text\"" if (length($text) < 100);
211
212 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
213 substr ($text, length($text)-50) . "\"";
214}
215
[317]216# return number of files processed, undef if can't process
[4]217# Note that $base_dir might be "" and that $file might
218# include directories
219sub read {
220 my $self = shift (@_);
[16392]221 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[1424]222 my $outhandle = $self->{'outhandle'};
[4]223
[288]224 # get the html filename and see if this is an HTML Book...
225 my $jobnumber = $file;
[4]226 if ($file =~ /[\\\/]/) {
227 ($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
228 }
[317]229 return undef unless defined $jobnumber;
[286]230 my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
[317]231 return undef unless -e $htmlfile;
[4]232
[15872]233 print STDERR "<Processing n='$file' p='HBPlugin'>\n" if ($gli);
234 print $outhandle "HBPlugin: processing $file\n";
[4]235
236 # read in the file and do basic html cleaning (removing header etc)
[318]237 my $html = "";
238 $self->HB_read_html_file ($htmlfile, \$html);
[3542]239 # html is in utf8
[4]240
241 # create a new document
[18327]242 my $doc_obj = new doc ($file, "indexed_doc", $self->{'file_rename_method'});
[4]243
244 # copy the book cover if it exists
[286]245 my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg");
[796]246 $doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg");
[7508]247 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
[8166]248 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileFormat", "HB");
249 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $htmlfile));
[4]250
[317]251 my $cursection = $doc_obj->get_top_section();
[8121]252
[317]253 # add metadata for top level of document
[1244]254 foreach my $field (keys(%$metadata)) {
[317]255 # $metadata->{$field} may be an array reference
256 if (ref ($metadata->{$field}) eq "ARRAY") {
[15872]257 map {
258 $doc_obj->add_utf8_metadata($cursection, $field, $_);
[317]259 } @{$metadata->{$field}};
260 } else {
[15872]261 $doc_obj->add_utf8_metadata($cursection, $field, $metadata->{$field});
[317]262 }
263 }
264
[4]265 # process the file one section at a time
266 my $curtoclevel = 1;
267 my $firstsection = 1;
268 while (length ($html) > 0) {
[339]269 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
[4]270 my $toclevel = $3;
271 my $title = $4;
272 my $sectiontext = "";
[321]273 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
[4]274 $sectiontext = $1;
275 } else {
276 $sectiontext = $html;
277 $html = "";
278 }
279
280 # remove tags and extra spaces from the title
281 $title =~ s/<\/?[^>]+>//g;
282 $title =~ s/^\s+|\s+$//g;
283
284 # close any sections below the current level and
285 # create a new section (special case for the firstsection)
286 while (($curtoclevel > $toclevel) ||
287 (!$firstsection && $curtoclevel == $toclevel)) {
288 $cursection = $doc_obj->get_parent_section ($cursection);
289 $curtoclevel--;
290 }
291 if ($curtoclevel+1 < $toclevel) {
[1424]292 print $outhandle "WARNING - jump in toc levels in $htmlfile " .
[4]293 "from $curtoclevel to $toclevel\n";
294 }
295 while ($curtoclevel < $toclevel) {
296 $curtoclevel++;
297 $cursection =
298 $doc_obj->insert_section($doc_obj->get_end_child($cursection));
299 }
300
301 # add the metadata to this section
[15872]302 $doc_obj->add_utf8_metadata($cursection, "Title", $title);
[286]303
[4]304 # clean up the section html
305 $sectiontext = $self->HB_clean_section($sectiontext);
306
307 # associate any files
[286]308 map { $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $1), $1)
[1020]309 if /_httpdocimg_\/([^\"]+)\"/; 0; }
310 split (/(_httpdocimg_\/[^\"]+\")/, $sectiontext);
[4]311
312 # add the text for this section
[3542]313 $doc_obj->add_utf8_text ($cursection, $sectiontext);
[4]314 } else {
[1424]315 print $outhandle "WARNING - leftover text\n" , $self->shorten($html),
[4]316 "\nin $htmlfile\n";
317 last;
318 }
[317]319 $firstsection = 0;
[4]320 }
321
322 # add a OID
[17026]323 $self->add_OID($doc_obj);
[4]324
325 # process the document
[288]326 $processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm"));
[4]327
328 return 1; # processed the file
329}
330
331
3321;
Note: See TracBrowser for help on using the repository browser.