# plugin which processes an HTML book directory
package HBPlug;
use html;
use BasPlug;
# Compile-time setup: register BasPlug as the only parent class of this
# package so that inherited methods resolve through it.
BEGIN {
    @ISA = qw(BasPlug);
}
# Constructor.
# Builds the object with the BasPlug constructor and re-blesses it into
# $class, so method lookup starts at the class that was asked for.
#   $class - name of the class to bless the new object into
# Returns the blessed object.
sub new {
    my ($class) = @_;

    # Keep $self lexical: without "my" it is the package global
    # $HBPlug::self, which every constructor call would overwrite.
    # The arrow call also avoids the ambiguous indirect-object form
    # "new BasPlug ()".
    my $self = BasPlug->new();

    return bless $self, $class;
}
# Reports whether this plugin is a recursive plugin.
# The invocant is taken off the argument list but never consulted.
# Always returns 0 (this plugin is not recursive).
sub is_recursive {
    my $plugin = shift;
    return 0;
}
sub HB_read_html_file {
my $self = shift (@_);
my ($htmlfile) = @_;
# load in the file
if (!open (FILE, $htmlfile)) {
print STDERR "ERROR - could not open $htmlfile\n";
return "";
}
my $line = "";
my $file = "";
my $foundbody = 0;
while (defined ($line =
]*>\s*
|||
]*>| |\s)$//i) {} # add a newline at the beginning of each paragraph $section =~ s/(.)\s*
]*?src=\"?([^\">]+)\"?[^>]*>/
/ig;
# $section =~ s/<<I>>\s*([^\.]+\.(png|jpg|gif))/
/ig;
$section =~ s/]*?src=\"?([^\">]+)\"?[^>]*>/
]*>((|||\s)*)<<TOC(\d+)>>\s*(.*?)
]*>((|||\s)*)<<TOC\d+>>)/$2/i) { $sectiontext = $1; } else { $sectiontext = $html; $html = ""; } # remove tags and extra spaces from the title $title =~ s/<\/?[^>]+>//g; $title =~ s/^\s+|\s+$//g; # close any sections below the current level and # create a new section (special case for the firstsection) while (($curtoclevel > $toclevel) || (!$firstsection && $curtoclevel == $toclevel)) { $cursection = $doc_obj->get_parent_section ($cursection); $curtoclevel--; } if ($curtoclevel+1 < $toclevel) { print STDERR "WARNING - jump in toc levels in $htmlfile " . "from $curtoclevel to $toclevel\n"; } while ($curtoclevel < $toclevel) { $curtoclevel++; $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)); } # add the metadata to this section if (!$firstsection || !defined($metadata->{'Title'})) { $doc_obj->add_metadata ($cursection, "Title", $title); } if ($firstsection) { foreach $field (keys(%$metadata)) { # Subjects may be a colon separated list if ($field eq "Subject") { my @subjects = split /:/, $metadata->{'Subject'}; foreach $subject (@subjects) { $doc_obj->add_metadata ($cursection, 'Subject', $subject); } } else { $doc_obj->add_metadata ($cursection, $field, $metadata->{$field}); } } $firstsection = 0; } # clean up the section html $sectiontext = $self->HB_clean_section($sectiontext); # associate any files # map { $doc_obj->associate_file("$base_dir$file/$1", $1) # if /_linkOID_\(_thisOID_\/([^\)]+)\)/; 0; } # split (/(_linkOID_\(_thisOID_\/[^\)]+\))/, $sectiontext); map { $doc_obj->associate_file("$base_dir$file/$1", $1) if /_httpcollection_\/archives\/_thisOID_\/([^\"]+)\"/; 0; } split (/(_httpcollection_\/archives\/_thisOID_\/[^\"]+\")/, $sectiontext); # add the text for this section $doc_obj->add_text ($cursection, $sectiontext); } else { print STDERR "WARNING - leftover text\n" , $self->shorten($html), "\nin $htmlfile\n"; last; } } # add a OID $doc_obj->set_OID (); print STDERR "OID: ", $doc_obj->get_OID(), "\n"; # process the document 
$processor->process($doc_obj); return 1; # processed the file } 1;