###########################################################################
#
# HBPlugin.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# plugin which processes an HTML book directory

# This plugin is used by the Humanity Library collections and does not handle
# input encodings other than ascii or extended ascii

# this code is kind of ugly and could no doubt be made to run faster, by leaving
# it in this state I hope to encourage people to make their collections use
# HBSPlug instead ;-)

# Use HBSPlug if creating a new collection and marking up files like the
# Humanity Library collections.
# HBSPlug accepts all input encodings but
# expects the marked up files to be cleaner than those used by the
# Humanity Library collections

package HBPlugin;

use ghtml;
use BasePlugin;
use unicode;
use util;
use doc;

use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa

sub BEGIN {
    @HBPlugin::ISA = ('BasePlugin');
}

# Values offered for the input_encoding option.
my $encoding_list =
    [ { 'name' => "ascii",
        'desc' => "{BasePlugin.encoding.ascii}" },
      { 'name' => "iso_8859_1",
        'desc' => "{HBPlugin.encoding.iso_8859_1}" } ];

# Option descriptions pushed onto the shared "ArgList" by new().
my $arguments =
    [ { 'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => &get_default_process_exp() },
      { 'name' => "input_encoding",
        'desc' => "{ReadTextFile.input_encoding}",
        'type' => "enum",
        'deft' => "iso_8859_1",
        'list' => $encoding_list,
        'reqd' => "no" }
      ];

# Plugin description block pushed onto the shared "OptList" by new().
my $options = { 'name'     => "HBPlugin",
                'desc'     => "{HBPlugin.desc}",
                'abstract' => "no",
                'inherits' => "yes",
                'args'     => $arguments };

# Constructor.
# Appends this class name to @$pluginlist and this plugin's argument and
# option tables to $hashArgOptLists, builds the object through BasePlugin
# and re-blesses it into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # explicit class-method call; the indirect-object form
    # "new BasePlugin(...)" depends on how the parser guesses
    my $self = BasePlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}

# this is included only to prevent warnings being printed out
# from BasePlugin::init. The process_exp is not used by this plugin
sub get_default_process_exp {
    my $self = shift (@_);

    return "This plugin does not use a process_exp\n";
}

# Reads $htmlfile and appends its processed content to the scalar that
# $text refers to (the processing itself is done by HB_gettext).
# The first pass keeps only what follows the <body> tag; if no such tag
# was seen, the file is read again from the start and every line is kept.
# If the file cannot be opened an error is printed to $self->{'outhandle'}
# and the sub returns without touching $$text.
# The return value carries no meaning.
sub HB_read_html_file {
    my $self = shift (@_);
    my ($htmlfile, $text) = @_;

    # load in the file
    # (three-argument open with a lexical handle: the file name can no
    # longer be interpreted as an open mode, and no global FILE handle
    # is shared with other code)
    my $fh;
    if (!open ($fh, '<', $htmlfile)) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "ERROR - could not open $htmlfile\n";
        return;
    }

    my $foundbody = 0;
    $self->HB_gettext (\$foundbody, $text, $fh);

    # just in case there was no <body> tag
    if (!$foundbody) {
        $foundbody = 1;
        # rewind instead of opening the file a second time
        if (!seek ($fh, 0, 0)) {
            close $fh;
            return;
        }
        $self->HB_gettext (\$foundbody, $text, $fh);
    }
    close $fh;
    # text is in utf8
}

# Reads every line from $handle and appends the kept ones to $$text.
#   $foundbody - reference to a flag; while it is false, lines are skipped
#                until one contains a <body> tag, at which point the flag
#                is set to 1 and the rest of that line is kept
#   $text      - reference to the scalar that accumulates the text
#   $handle    - filehandle to read from (a handle name also works because
#                strict refs is disabled in this package)
# After reading, the whole of $$text is converted from iso_8859_1 to utf8
# when $self->{'input_encoding'} says so, passed through
# ghtml::convertcharentities, and has each run of whitespace replaced by a
# single space.
# converts the text to utf8, as ghtml does that for character entities
# (e.g. &eacute;)
sub HB_gettext {
    my $self = shift (@_);
    my ($foundbody, $text, $handle) = @_;
    my $outhandle = $self->{'outhandle'};

    my $line = "";
    while (defined ($line = <$handle>)) {
        # look for body tag
        if (!$$foundbody) {
            # drop everything up to and including the <body ...> tag
            if ($line =~ s/^.*<body\b[^>]*>//i) {
                $$foundbody = 1;
            } else {
                next;
            }
        }

        # check for symbol fonts
        if ($line =~ /<font\b[^>]*?face\s*=\s*\"?(\w+)\"?/i) {
            my $font = $1;
            print $outhandle "HBPlugin::HB_gettext - warning removed font $font\n"
                if ($font !~ /^arial$/i);
        }

        $line =~ s/<\/p>//ig;   # remove </p> tags
        $line =~ s/<\/?(body|html|font)\b[^>]*>//ig; # remove any unwanted tags

        $$text .= $line;
    }

    if ($self->{'input_encoding'} eq "iso_8859_1") {
        # convert to utf-8
        $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
    }
    # convert any alphanumeric character entities to their utf-8
    # equivalent for indexing purposes
    &ghtml::convertcharentities ($$text);

    $$text =~ s/\s+/ /g; # remove \n's
}

sub HB_clean_section {
    my $self = shift (@_);
    my ($section) = @_;

    # remove tags without a starting tag from the section
    my ($tag, $tagstart);
    while ($section =~ /<\/([^>]{1,10})>/) {
        $tag = $1;
        $tagstart = index($section, "<$tag");
        last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
        $section =~ s/<\/$tag>//;
    }

    # remove extra paragraph tags
    # NOTE(review): from here on the regexes in this file appear to have
    # lost their HTML tag text, and the rest of this sub looks truncated -
    # restore it from a clean copy of the file; TODO confirm
    while ($section =~ s/]*>\s*
|||
]*>| |\s)$//i) {} # add a newline at the beginning of each paragraph $section =~ s/(.)\s*
]*?src=\"?([^\">]+)\"?[^>]*>/
]*>)?((|||\s)*)<<TOC(\d+)>>\s*(.*?)
]*>)?((|||\s)*)<<TOC\d+>>)/$2/i) { $sectiontext = $1; } else { $sectiontext = $html; $html = ""; } # remove tags and extra spaces from the title $title =~ s/<\/?[^>]+>//g; $title =~ s/^\s+|\s+$//g; # close any sections below the current level and # create a new section (special case for the firstsection) while (($curtoclevel > $toclevel) || (!$firstsection && $curtoclevel == $toclevel)) { $cursection = $doc_obj->get_parent_section ($cursection); $curtoclevel--; } if ($curtoclevel+1 < $toclevel) { print $outhandle "WARNING - jump in toc levels in $htmlfile " . "from $curtoclevel to $toclevel\n"; } while ($curtoclevel < $toclevel) { $curtoclevel++; $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)); } # add the metadata to this section $doc_obj->add_utf8_metadata($cursection, "Title", $title); # clean up the section html $sectiontext = $self->HB_clean_section($sectiontext); # associate any files map { $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $1), $1) if /_httpdocimg_\/([^\"]+)\"/; 0; } split (/(_httpdocimg_\/[^\"]+\")/, $sectiontext); # add the text for this section $doc_obj->add_utf8_text ($cursection, $sectiontext); } else { print $outhandle "WARNING - leftover text\n" , $self->shorten($html), "\nin $htmlfile\n"; last; } $firstsection = 0; } # add a OID $self->add_OID($doc_obj); # process the document $processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm")); return 1; # processed the file } 1;