########################################################################### # # HBPlug.pm -- # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### # plugin which process an HTML book directory package HBPlug; use plugin; use ghtml; use BasPlug; use util; use lang; use doc; use cfgread; sub BEGIN { @ISA = ('BasPlug'); } sub new { my ($class) = @_; $self = new BasPlug (); return bless $self, $class; } sub is_recursive { my $self = shift (@_); return 0; # this is not a recursive plugin } sub HB_read_html_file { my $self = shift (@_); my ($htmlfile, $text) = @_; # load in the file if (!open (FILE, $htmlfile)) { print STDERR "ERROR - could not open $htmlfile\n"; return; } my $foundbody = 0; $self->HB_gettext (\$foundbody, $text, FILE); close FILE; # just in case there was no tag if (!$foundbody) { $foundbody = 1; open (FILE, $htmlfile) || return; $self->HB_gettext (\$foundbody, $text, FILE); close FILE; } } sub HB_gettext { my $self = shift (@_); my ($foundbody, $text, $handle) = @_; my $line = ""; while (defined ($line = <$handle>)) { # look for body tag if (!$$foundbody) { if ($line =~ s/^.*]*>//i) { $$foundbody = 1; } else { next; } } # check for symbol fonts if ($line =~ /]*?face\s*=\s*\"?(\w+)\"?/i) { my $font = $1; print STDERR "HBPlug::HB_gettext - warning removed font $font\n" if ($font !~ /^arial$/i); } $line =~ s/<\/p>//ig; # remove

tags $line =~ s/<\/?(body|html|font)\b[^>]*>//ig; # remove any unwanted tags # convert any alphanumeric character entities to their extended # ascii equivalent for indexing purposes &ghtml::convertcharentities ($line); $$text .= $line; } $$text =~ s/\s+/ /g; # remove \n's } sub HB_clean_section { my $self = shift (@_); my ($section) = @_; # remove tags without a starting tag from the section my ($tag, $tagstart); while ($section =~ /<\/([^>]{1,10})>/) { $tag = $1; $tagstart = index($section, "<$tag"); last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag"))); $section =~ s/<\/$tag>//; } # remove extra paragraph tags while ($section =~ s/]*>\s*|||]*>| |\s)$//i) {} # add a newline at the beginning of each paragraph $section =~ s/(.)\s*]*?src=\"?([^\">]+)\"?[^>]*>/
<\/center>
/ig; $section =~ s/<<I>>\s*([^\.]+\.(png|jpg|gif))/
<\/center>
/ig; return $section; } sub shorten { my $self = shift (@_); my ($text) = @_; return "\"$text\"" if (length($text) < 100); return "\"" . substr ($text, 0, 50) . "\" ... \"" . substr ($text, length($text)-50) . "\""; } # return number of files processed, undef if can't process # Note that $base_dir might be "" and that $file might # include directories sub read { my $self = shift (@_); my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_; # get the html filename and see if this is an HTML Book... my $jobnumber = $file; if ($file =~ /[\\\/]/) { ($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/; } return undef unless defined $jobnumber; my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm"); return undef unless -e $htmlfile; print STDERR "HBPlug: processing $file\n"; # read in the file and do basic html cleaning (removing header etc) my $html = ""; $self->HB_read_html_file ($htmlfile, \$html); # create a new document my $doc_obj = new doc ($file, "indexed_doc"); # copy the book cover if it exists my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg"); $doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg"); my $cursection = $doc_obj->get_top_section(); # add metadata for top level of document foreach $field (keys(%$metadata)) { # $metadata->{$field} may be an array reference if (ref ($metadata->{$field}) eq "ARRAY") { map { $doc_obj->add_metadata ($cursection, $field, $_); } @{$metadata->{$field}}; } else { $doc_obj->add_metadata ($cursection, $field, $metadata->{$field}); } } # process the file one section at a time my $curtoclevel = 1; my $firstsection = 1; while (length ($html) > 0) { if ($html =~ s/^.*?(?:]*>)?((|||\s)*)<<TOC(\d+)>>\s*(.*?)]*>)?((|||\s)*)<<TOC\d+>>)/$2/i) { $sectiontext = $1; } else { $sectiontext = $html; $html = ""; } # remove tags and extra spaces from the title $title =~ s/<\/?[^>]+>//g; $title =~ s/^\s+|\s+$//g; # close any sections below the current level and # create a new section (special case for the firstsection) while (($curtoclevel > $toclevel) || (!$firstsection && $curtoclevel == $toclevel)) { $cursection = $doc_obj->get_parent_section ($cursection); $curtoclevel--; } if ($curtoclevel+1 < $toclevel) { print STDERR "WARNING - jump in toc levels in $htmlfile " . "from $curtoclevel to $toclevel\n"; } while ($curtoclevel < $toclevel) { $curtoclevel++; $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)); } # add the metadata to this section $doc_obj->add_metadata ($cursection, "Title", $title); # clean up the section html $sectiontext = $self->HB_clean_section($sectiontext); # associate any files map { $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $1), $1) if /_httpdocimg_\/([^\"]+)\"/; 0; } split (/(_httpdocimg_\/[^\"]+\")/, $sectiontext); # add the text for this section $doc_obj->add_text ($cursection, $sectiontext); } else { print STDERR "WARNING - leftover text\n" , $self->shorten($html), "\nin $htmlfile\n"; last; } $firstsection = 0; } # add a OID $doc_obj->set_OID (); # process the document $processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm")); return 1; # processed the file } 1;