# plugin which processes an HTML book directory
#
# An HTML book is a directory NAME/ containing NAME.htm (the book text, in
# which each section heading is a paragraph starting with an escaped
# &lt;&lt;TOCn&gt;&gt; marker, n being the table-of-contents level), an
# optional NAME.jpg cover image, and any images the text refers to.
#
# NOTE(review): several regular expressions in this file were restored from
# a copy in which HTML-like text (tags and entities) had been stripped.  The
# restored patterns follow the surviving fragments and comments, but should
# be confirmed against version control.

package HBPlug;

use strict;
use warnings;

use html;
use BasPlug;

our @ISA;

sub BEGIN {
    @ISA = ('BasPlug');
}

# Constructor: builds a BasPlug object and re-blesses it into $class.
sub new {
    my ($class) = @_;

    # Arrow syntax on purpose: "new BasPlug ()" is indirect object syntax,
    # which is ambiguous inside a package that defines its own new().
    my $self = BasPlug->new();

    return bless $self, $class;
}

# Always returns 0.
sub is_recursive {
    my $self = shift (@_);

    return 0; # this is not a recursive plugin
}

# Reads $htmlfile and returns everything after the opening <body> tag as a
# single string, with </p>, body, html and font tags removed and every run
# of whitespace collapsed to one space.
# Returns "" (after printing an error to STDERR) if the file cannot be
# opened.
sub HB_read_html_file {
    my $self = shift (@_);
    my ($htmlfile) = @_;

    # load in the file (three-argument open so that characters in the file
    # name are never interpreted as an open mode)
    my $fh;
    if (!open ($fh, '<', $htmlfile)) {
        print STDERR "ERROR - could not open $htmlfile\n";
        return "";
    }

    my $file = "";
    my $foundbody = 0;
    while (defined (my $line = <$fh>)) {
        # look for body tag; everything up to and including it is dropped
        if (!$foundbody) {
            if ($line =~ s/^.*<body[^>]*>//i) {
                $foundbody = 1;
            } else {
                next;
            }
        }

        # check for symbol fonts
        if ($line =~ /<font\b[^>]*?face\s*=\s*\"?(\w+)\"?/i) {
            my $font = $1;
            print STDERR "HBPlug::HB_read_html_file - warning font $font used\n"
                if ($font !~ /^arial$/i);
        }

        $line =~ s/<\/p>//ig; # remove </p> tags
        $line =~ s/<\/?(body|html|font)\b[^>]*>//ig; # remove any unwanted tags

        # convert any alphanumeric character entities to their extended
        # ascii equivalent for indexing purposes
        # (the return value is ignored, so this presumably edits $line in
        # place - verify against html.pm)
        &html::convertcharentities ($line);

        $file .= $line;
    }

    close $fh;

    $file =~ s/\s+/ /g; # remove \n's

    return $file;
}

# Tidies the HTML of one section and returns the cleaned text:
#   - strips leading closing tags that have no opening tag before them
#   - collapses consecutive paragraph tags
#   - strips trailing formatting tags, &nbsp; and whitespace
#   - starts each paragraph on a new line
#   - rewrites image references to point into the collection archives
sub HB_clean_section {
    my $self = shift (@_);
    my ($section) = @_;

    # remove tags without a starting tag from the section
    # (stops at the first closing tag that does have an opening tag before it)
    while ($section =~ /<\/([^>]{1,10})>/) {
        my $tag = $1;
        my $tagstart = index($section, "<$tag");
        last if (($tagstart >= 0) && ($tagstart < index($section, "</$tag")));

        # \Q...\E: the tag name is arbitrary document text.  Unquoted, a
        # name containing regex metacharacters either fails to match (and
        # this loop never ends) or aborts with a regex compile error.
        $section =~ s/<\/\Q$tag\E>//;
    }

    # remove extra paragraph tags
    while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}

    # remove extra stuff at the end of the section
    while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}

    # add a newline at the beginning of each paragraph
    $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;

    # NOTE(review): the text between the rule above and the image rules
    # below was damaged in the copy this file was restored from; confirm
    # that no further clean-up rule belongs here.

    # fix up the image links: both real <img> tags and escaped
    # &lt;&lt;I&gt;&gt; markers followed by an image file name become a
    # centred image whose URL points into the collection archives (read()
    # looks for exactly this URL prefix when associating files)
    $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/<center><img src=\"_httpcollection_\/archives\/_thisOID_\/$1\"><\/center><br>/ig;
    $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/<center><img src=\"_httpcollection_\/archives\/_thisOID_\/$1\"><\/center><br>/ig;

    return $section;
}

# Returns $text in double quotes; text of 100 characters or more is
# abbreviated to its first and last 50 characters.
sub shorten {
    my $self = shift (@_);
    my ($text) = @_;

    return "\"$text\"" if (length($text) < 100);

    return "\"" . substr ($text, 0, 50) . "\" ... \"" .
        substr ($text, length($text)-50) . "\"";
}

# return 1 if processed, 0 if not processed
# Note that $base_dir might be "" and that $file might
# include directories
#
# $file is treated as an HTML book when "$base_dir$file/NAME.htm" exists,
# NAME being the last path component of $file.  The book is split into
# sections at its TOC markers, $metadata is attached to the first section,
# and the finished document is handed to $processor->process().
# $pluginfo is not used here.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;

    # get the html filename and see if this is a HTML Book...
    my ($jobnumber);
    if ($file =~ /[\\\/]/) {
        ($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
    } else {
        ($jobnumber) = $file =~ /^([^\\\/]+)$/;
    }
    return 0 unless defined $jobnumber;
    my $htmlfile = "$base_dir$file/$jobnumber.htm";
    return 0 unless -e $htmlfile;

    print STDERR "HBPlug: processing $file\n";

    # read in the file and do basic html cleaning (removing header etc)
    my $html = $self->HB_read_html_file ($htmlfile);

    # create a new document
    my $doc_obj = doc->new ($file, "indexed_doc");

    # copy the book cover if it exists
    my $bookcover = "$base_dir$file/$jobnumber.jpg";
    $doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg")
        if -e $bookcover;

    # process the file one section at a time
    my $curtoclevel = 1;
    my $cursection = $doc_obj->get_top_section();
    my $firstsection = 1;
    while (length ($html) > 0) {
        # consume everything up to and including the next heading paragraph
        # (TOC marker plus title), leaving $html at the following <p>
        if ($html =~ s/^.*?<p\b[^>]*>((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
            my $toclevel = $3;
            my $title = $4;
            my $sectiontext = "";

            # the section body runs up to the next heading paragraph, which
            # is put back ($2) for the next iteration; with no further
            # heading, the rest of the book is the body
            if ($html =~ s/^(.*?)(<p\b[^>]*>((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
                $sectiontext = $1;
            } else {
                $sectiontext = $html;
                $html = "";
            }

            # remove tags and extra spaces from the title
            $title =~ s/<\/?[^>]+>//g;
            $title =~ s/^\s+|\s+$//g;

            # close any sections below the current level and
            # create a new section (special case for the firstsection)
            while (($curtoclevel > $toclevel) ||
                   (!$firstsection && $curtoclevel == $toclevel)) {
                $cursection = $doc_obj->get_parent_section ($cursection);
                $curtoclevel--;
            }
            if ($curtoclevel+1 < $toclevel) {
                print STDERR "WARNING - jump in toc levels in $htmlfile " .
                    "from $curtoclevel to $toclevel\n";
            }
            while ($curtoclevel < $toclevel) {
                $curtoclevel++;
                $cursection =
                    $doc_obj->insert_section($doc_obj->get_end_child($cursection));
            }

            # add the metadata to this section
            # (a Title supplied in $metadata wins over the first heading)
            if (!$firstsection || !defined($metadata->{'Title'})) {
                $doc_obj->add_metadata ($cursection, "Title", $title);
            }
            if ($firstsection) {
                foreach my $field (keys(%$metadata)) {
                    # Subjects may be a colon separated list
                    if ($field eq "Subject") {
                        my @subjects = split /:/, $metadata->{'Subject'};
                        foreach my $subject (@subjects) {
                            $doc_obj->add_metadata ($cursection, 'Subject', $subject);
                        }
                    } else {
                        $doc_obj->add_metadata ($cursection, $field, $metadata->{$field});
                    }
                }
                $firstsection = 0;
            }

            # clean up the section html
            $sectiontext = $self->HB_clean_section($sectiontext);

            # associate any files
#           map { $doc_obj->associate_file("$base_dir$file/$1", $1)
#                     if /_linkOID_$_thisOID_\/([^$]+)\)/; 0; }
#               split (/(_linkOID_$_thisOID_\/[^$]+\))/, $sectiontext);
            foreach my $piece (split (/(_httpcollection_\/archives\/_thisOID_\/[^\"]+\")/, $sectiontext)) {
                next unless $piece =~ /_httpcollection_\/archives\/_thisOID_\/([^\"]+)\"/;

                # copy the capture first: $1 is a global that a regex run
                # inside associate_file() could change under us
                my $linked = $1;
                $doc_obj->associate_file("$base_dir$file/$linked", $linked);
            }

            # add the text for this section
            $doc_obj->add_text ($cursection, $sectiontext);
        } else {
            # nothing left that looks like a heading: report it and stop
            print STDERR "WARNING - leftover text\n" , $self->shorten($html),
                "\nin $htmlfile\n";
            last;
        }
    }

    # add a OID
    $doc_obj->set_OID ();
    print STDERR "OID: ", $doc_obj->get_OID(), "\n";

    # process the document
    $processor->process($doc_obj);

    return 1; # processed the file
}

1;