###########################################################################
#
# HBPlug.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# plugin which processes an HTML book directory

# This plugin is used by the Humanity Library collections and does not handle
# input encodings other than ascii or extended ascii

# this code is kind of ugly and could no doubt be made to run faster, by leaving
# it in this state I hope to encourage people to make their collections use
# HBSPlug instead ;-)

# Use HBSPlug if creating a new collection and marking up files like the
# Humanity Library collections.
# HBSPlug accepts all input encodings but
# expects the marked up files to be cleaner than those used by the
# Humanity Library collections

package HBPlug;

use ghtml;
use BasPlug;
use unicode;
use util;
use doc;

use strict;
no strict 'refs'; # allow filehandles to be variables and vice versa

# HBPlug is a BasPlug subclass.
sub BEGIN {
    @HBPlug::ISA = ('BasPlug');
}

# Argument descriptions contributed by this plugin.  process_exp is listed
# only so that a default exists; its default value is a placeholder string.
my $arguments = [
    { 'name' => "process_exp",
      'desc' => "{BasPlug.process_exp}",
      'type' => "regexp",
      'reqd' => "no",
      'deft' => &get_default_process_exp() }
];

# Option block describing the plugin itself.
my $options = {
    'name'     => "HBPlug",
    'desc'     => "{HBPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments
};

# Constructor.
#   $class           - class name being constructed
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through untouched to BasPlug->new
#   $hashArgOptLists - hash ref; this plugin's arguments are appended to its
#                      "ArgList" entry and its options to its "OptList" entry
# Returns the object built by BasPlug->new, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) {
        push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push(@{$hashArgOptLists->{"OptList"}}, $options);
    }

    # Explicit class-method call rather than the ambiguous indirect-object
    # form "new BasPlug(...)".
    my $self = BasPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}

# Initialise the plugin: run BasPlug::init with the given verbosity and
# output handle, then force the input encoding to iso_8859_1.
sub init {
    my $self = shift (@_);
    my ($verbosity, $outhandle) = @_;

    $self->BasPlug::init($verbosity, $outhandle);
    $self->{'input_encoding'} = "iso_8859_1";

    # this plugin only handles ascii encodings
    # (the value was assigned just above, so this guard cannot currently
    # fire; it is kept as a safety net should the assignment ever change)
    if ($self->{'input_encoding'} !~ /^(iso_8859_1|ascii)$/) {
        die "ERROR: HBPlug can handle only iso_8859_1 or ascii encodings.\n" .
            $self->{'input_encoding'} . " is not an acceptable input_encoding value\n";
    }
}

# this is included only to prevent warnings being printed out
# from BasPlug::init.
# The process_exp is not used by this plugin
sub get_default_process_exp {
    my $self = shift (@_);

    return "This plugin does not use a process_exp\n";
}

# Read $htmlfile and append its cleaned-up body to the scalar referenced by
# $text.  The file is first read looking for a <body> tag (everything before
# it is discarded); if no such tag was seen, the file is read a second time
# and kept in full.
#   $htmlfile - path of the file to read
#   $text     - scalar ref that receives the text (utf8 on return)
# Prints an error to the plugin's outhandle and returns if the file cannot
# be opened.
sub HB_read_html_file {
    my $self = shift (@_);
    my ($htmlfile, $text) = @_;

    # load in the file
    # Three-argument open with a lexical handle: the file name can no longer
    # be interpreted as a mode/pipe, and no package-global FILE is clobbered.
    my $fh;
    if (!open ($fh, '<', $htmlfile)) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "ERROR - could not open $htmlfile\n";
        return;
    }

    my $foundbody = 0;
    $self->HB_gettext (\$foundbody, $text, $fh);
    close ($fh);

    # just in case there was no <body> tag
    if (!$foundbody) {
        $foundbody = 1;
        open ($fh, '<', $htmlfile) || return;
        $self->HB_gettext (\$foundbody, $text, $fh);
        close ($fh);
    }
    # text is in utf8
}

# Read every line from $handle, drop everything up to the <body> tag (unless
# $$foundbody is already true), strip </p>, body, html and font tags, and
# append the result to $$text.  Afterwards $$text is converted from
# iso_8859_1 to utf8 when that is the plugin's input encoding, character
# entities are converted by ghtml::convertcharentities, and every run of
# whitespace is collapsed to one space.
# The text is converted to utf8 because ghtml does that for entities such
# as &eacute;.
#   $foundbody - scalar ref; set to 1 once the body tag has been seen
#   $text      - scalar ref the text is appended to
#   $handle    - filehandle (or filehandle name) to read from
sub HB_gettext {
    my $self = shift (@_);
    my ($foundbody, $text, $handle) = @_;
    my $outhandle = $self->{'outhandle'};

    my $line = "";
    while (defined ($line = <$handle>)) {
        # look for body tag
        if (!$$foundbody) {
            # NOTE(review): the tag name in this pattern was stripped from
            # the copy under review ("s/^.*]*>//i"); "<body[^>" is
            # reconstructed from the comment above - confirm against upstream.
            if ($line =~ s/^.*<body[^>]*>//i) {
                $$foundbody = 1;
            } else {
                next;
            }
        }

        # check for symbol fonts
        # NOTE(review): the start of this pattern was stripped as well
        # ("/]*?face..."); "<font\b[^>" is a reconstruction - confirm.
        if ($line =~ /<font\b[^>]*?face\s*=\s*\"?(\w+)\"?/i) {
            my $font = $1;
            print $outhandle "HBPlug::HB_gettext - warning removed font $font\n"
                if ($font !~ /^arial$/i);
        }

        $line =~ s/<\/p>//ig;                          # remove </p> tags
        $line =~ s/<\/?(body|html|font)\b[^>]*>//ig;   # remove any unwanted tags

        $$text .= $line;
    }

    if ($self->{'input_encoding'} eq "iso_8859_1") {
        # convert to utf-8
        $$text = &unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
    }

    # convert any alphanumeric character entities to their utf-8
    # equivalent for indexing purposes
    &ghtml::convertcharentities ($$text);

    $$text =~ s/\s+/ /g; # remove \n's
}

# Tidy the html of one section and return the cleaned string.
#   $section - html text of the section
# Steps: drop closing tags that have no opening tag before them, collapse
# doubled paragraph tags, trim markup/whitespace off the end, put blank
# lines before paragraphs and rewrite image references so that they point
# at _httpdocimg_/.
#
# NOTE(review): every pattern below that contains a literal html tag had
# that tag stripped from the copy under review.  They are rebuilt from what
# survived (the "]*>" residue, the "|||" alternation, "<\/center>", and the
# _httpdocimg_ scan in read()); confirm each against upstream.
sub HB_clean_section {
    my $self = shift (@_);
    my ($section) = @_;

    # remove tags without a starting tag from the section
    my ($tag, $tagstart);
    while ($section =~ /<\/([^>]{1,10})>/) {
        $tag = $1;
        $tagstart = index($section, "<$tag");
        last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
        # \Q..\E: the tag text comes straight from the document and may
        # contain regex metacharacters (e.g. "</(>"), which used to abort
        # the import with a regex compilation error.
        $section =~ s/<\/\Q$tag\E>//;
    }

    # remove extra paragraph tags
    1 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig);

    # remove extra markup and whitespace at the end of the section
    1 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i);

    # add a newline at the beginning of each paragraph
    $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;

    # NOTE(review): the text between the substitution above and the one
    # below was lost in the copy under review; upstream may have had further
    # statements here - compare before relying on this function.

    # fix up the image links
    $section =~ s{<img[^>]*?src="?([^">]+)"?[^>]*>}
                 {\n<center><img src="_httpdocimg_/$1"></center><br>}ig;
    $section =~ s{&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))}
                 {\n<center><img src="_httpdocimg_/$1"></center><br>}ig;

    return $section;
}

# Return $text in double quotes for use in messages.  Text of 100 characters
# or more is abbreviated to its first and last 50 characters.
sub shorten {
    my $self = shift (@_);
    my ($text) = @_;

    return "\"$text\"" if (length($text) < 100);

    my $head = substr ($text, 0, 50);
    my $tail = substr ($text, length($text) - 50);
    return "\"" . $head . "\" ... \"" . $tail . "\"";
}

# Add one metadata value to a section of the document.
#   $doc_obj    - document object
#   $cursection - section to add the metadata to
#   $field      - metadata name
#   $value      - metadata value
# All text has been converted to utf-8 by the time this is called, so
# add_utf8_metadata is used unconditionally; the older ascii/iso_8859_1
# distinction (add_metadata for iso_8859_1) no longer applies.
sub HB_add_metadata {
    my $self = shift (@_);
    my ($doc_obj, $cursection, $field, $value) = @_;

    $doc_obj->add_utf8_metadata ($cursection, $field, $value);
}

# return number of files processed, undef if can't process
# Note that $base_dir might be "" and that $file might
# include directories
#
# $file is treated as an HTML book directory: the book is the file
# "<last path component of $file>.htm" inside it.  The html is split into
# sections at its <<TOCn>> markers (n is the nesting level, the rest of the
# marker's paragraph is the section title), a doc object is built from them
# and handed to $processor->process.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the html filename and see if this is an HTML Book...
    my $jobnumber = $file;
    if ($file =~ /[\\\/]/) {
        ($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
    }
    return undef unless defined $jobnumber;
    my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
    return undef unless -e $htmlfile;

    # NOTE(review): in the copy under review this string is just "\n"; an
    # angle-bracketed progress tag for GLI was probably stripped from it.
    # Restore it from upstream.
    print STDERR "\n" if ($gli);
    print $outhandle "HBPlug: processing $file\n";

    # read in the file and do basic html cleaning (removing header etc)
    my $html = "";
    $self->HB_read_html_file ($htmlfile, \$html);
    # html is in utf8

    # create a new document
    my $doc_obj = doc->new($file, "indexed_doc");
    $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});

    # copy the book cover if it exists
    my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg");
    if (-e $bookcover) {
        $doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg");
    }

    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileFormat", "HB");
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $htmlfile));

    my $cursection = $doc_obj->get_top_section();

    # add metadata for top level of document
    foreach my $field (keys(%$metadata)) {
        # $metadata->{$field} may be an array reference
        if (ref ($metadata->{$field}) eq "ARRAY") {
            foreach my $value (@{$metadata->{$field}}) {
                $self->HB_add_metadata ($doc_obj, $cursection, $field, $value);
            }
        } else {
            $self->HB_add_metadata ($doc_obj, $cursection, $field, $metadata->{$field});
        }
    }

    # process the file one section at a time
    my $curtoclevel = 1;
    my $firstsection = 1;
    while (length ($html) > 0) {
        # NOTE(review): both patterns below, and the three declarations
        # between them, were damaged in the copy under review (html tags
        # stripped, entities decoded, text in between lost).  They are
        # rebuilt from the surviving residue and from the way $toclevel,
        # $title and $sectiontext are used further down - confirm against
        # upstream.  The markers are matched in entity form (&lt;&lt;TOCn&gt;&gt;)
        # on the assumption that convertcharentities leaves &lt;/&gt; alone.
        if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
            my $toclevel = $3;
            my $title = $4;
            my $sectiontext = "";

            # the section runs up to the next TOC marker, or to the end of
            # the text if this is the last one
            if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
                $sectiontext = $1;
            } else {
                $sectiontext = $html;
                $html = "";
            }

            # remove tags and extra spaces from the title
            $title =~ s/<\/?[^>]+>//g;
            $title =~ s/^\s+|\s+$//g;

            # close any sections below the current level and
            # create a new section (special case for the firstsection)
            while (($curtoclevel > $toclevel) ||
                   (!$firstsection && $curtoclevel == $toclevel)) {
                $cursection = $doc_obj->get_parent_section ($cursection);
                $curtoclevel--;
            }
            if ($curtoclevel+1 < $toclevel) {
                print $outhandle "WARNING - jump in toc levels in $htmlfile " .
                    "from $curtoclevel to $toclevel\n";
            }
            while ($curtoclevel < $toclevel) {
                $curtoclevel++;
                $cursection =
                    $doc_obj->insert_section($doc_obj->get_end_child($cursection));
            }

            # add the metadata to this section
            $self->HB_add_metadata ($doc_obj, $cursection, "Title", $title);

            # clean up the section html
            $sectiontext = $self->HB_clean_section($sectiontext);

            # associate any files
            # (every _httpdocimg_/NAME" reference left by HB_clean_section)
            while ($sectiontext =~ /_httpdocimg_\/([^\"]+)\"/g) {
                my $image = $1;
                $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $image), $image);
            }

            # add the text for this section
            # All read text should now be in utf-8
            $doc_obj->add_utf8_text ($cursection, $sectiontext);
        } else {
            print $outhandle "WARNING - leftover text\n" , $self->shorten($html),
                "\nin $htmlfile\n";
            last;
        }
        $firstsection = 0;
    }

    # add a OID
    $doc_obj->set_OID ();

    # process the document
    $processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm"));

    return 1; # processed the file
}

1;