########################################################################### # # PrePlug.pm -- # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### # plugin that processes simple html files like those output # by prescript when converting postscript to html. # prescript output has

tags separating paragraphs # and tags separating pages. # it may also have tags at top or bottom of # some or all pages. # if a .html1 version of a .html file exists then that version # will be used so that the author name can be extracted # (.html1 files were generated by software which automatically # extracts author names from .html files). # also looks for a .info file containing metadata (as used by # the cstr collection) package PrePlug; use BasPlug; use sorttools; sub BEGIN { @ISA = ('BasPlug'); } sub new { my ($class) = @_; $self = new BasPlug (); return bless $self, $class; } sub is_recursive { my $self = shift (@_); return 0; # this is not a recursive plugin } # return number of files processed, undef if can't process # Note that $base_dir might be "" and that $file might # include directories sub read { my $self = shift (@_); my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_; my $filename = &util::filename_cat($base_dir, $file); return 0 if ($filename =~ /\.(html1|html1\.gz|info|gif)/); return undef unless ($filename =~ /\.(html(\.gz)?)$/i && (-e $filename)); my $gz = 0; $gz = 1 if (defined $2 && $2 eq ".gz"); my $filename1 = $filename; if ($gz) { $filename1 =~ s/\.html\.gz$/\.html1\.gz/; } else { $filename1 .= "1"; } $filename = $filename1 if (-e $filename1); print STDERR "PrePlug: processing $filename\n" if $processor->{'verbosity'}; # create a new document my $doc_obj = new doc ($file, "indexed_doc"); if ($gz) { open (FILE, "zcat $filename |") || die "PrePlug::read - zcat can't open $filename\n"; } else { open (FILE, $filename) || die "PrePlug::read - can't open $filename\n"; } my $cursection = $doc_obj->get_top_section(); my $text = ""; my $line = ""; my $pagenumber = 1; my @creators = (); my $numlines = 0; while (defined ($line = )) { if ($filename =~ /html1$/) { while ($text =~ s/<_author_search_\([^\)]*\)>([^<]*)<\/a>/$1/i) { push (@creators, $1); } } # numlines sorts out documents that just have a single tag at the end if ($line =~ //i && $numlines < 200) { $numlines = -1000; $cursection = $doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_top_section())); $text =~ s/\s*

\s*

\s*\s*(\d+)<\/b>\s*<\/center>\s*//i; # if (defined $1 && $1 != $pagenumber) { # print STDERR "PrePlug Warning: Pagenumbers don't line up in $filename\n"; # } $doc_obj->add_text ($cursection, $text); $doc_obj->add_metadata ($cursection, "Title", $pagenumber); $pagenumber ++; $text = ""; } else { $text .= $line; $numlines ++; } } close FILE; if ($cursection eq $doc_obj->get_top_section()) { # there weren't any tags # see if there were any tags to split on if ($text =~ //i) { while ($text =~ s/^(.*?)\s*

\s*

\s*\s*\d+<\/b>\s*<\/center>\s*//i) { $cursection = $doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_top_section())); $doc_obj->add_text ($cursection, $text); $doc_obj->add_metadata ($cursection, "Title", $pagenumber); $pagenumber ++; } } # otherwise we'll just have to split pages on set number of lines else { my $pagetext = ""; my $line = ""; my $count = 0; while (length ($text) && $text =~ s/^(.*?)(\n|$)//) { $line = $1; $line = "" unless defined $line; $pagetext .= $line . "\n"; if (($count >= 50 && $line !~ /\w/) || $count == 80) { $cursection = $doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_top_section())); $doc_obj->add_text ($cursection, $pagetext); $doc_obj->add_metadata ($cursection, "Title", $pagenumber); $pagetext = ""; $pagenumber ++; $count = 0; } else { $count ++; } } } } # final section if ($text =~ /\w/) { $cursection = $doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_top_section())); $text =~ s/\s*

\s*

\s*\s*(\d+)<\/b>\s*<\/center>\s*//; # if (defined $1 && $1 != $pagenumber) { # print STDERR "PrePlug Warning: Pagenumbers don't line up in $filename\n"; # } $doc_obj->add_text ($cursection, $text); $doc_obj->add_metadata ($cursection, "Title", $pagenumber); } # add meta data for top level of document map {$doc_obj->add_metadata ($doc_obj->get_top_section(), "Creator", $_)} @creators; my ($filesuff) = $filename =~ /^(.*?)\.html1?/; if (!-e $filesuff . ".info") { print STDERR "Preplug Warning: $filename has no corresponding .info file\n"; } else { my ($dir) = $filesuff =~ /^(.*?)~?[^~]*$/; open (INFO, $filesuff . ".info") || die "PrePlug::read - can't open $filesuff.info\n"; my $line = ""; while (defined ($line = )) { chop $line; my ($key, $value) = $line =~ /^<([^>]*)>(.*)$/; next if $key =~ /^(pages|compressedsize|\/?info|size)$/i; next if $value !~ /\w/; if ($key =~ /^abstract$/i) { $key = "Description"; } elsif ($key =~ /^url$/i) { $key = "Source"; } elsif ($key =~ /^filedate$/i) { $key = "Date"; my ($day, $month, $year) = split /\//, $value; $value = &sorttools::format_date ($day, $month, $year); } elsif ($key =~ /^transferdate$/i) { my ($day, $month, $year) = split /\//, $value; $value = &sorttools::format_date ($day, $month, $year); } elsif ($key =~ /^facsimiles$/i) { $value =~ s/^\s+//; $value =~ s/\s+$//; my @facsimiles = split /\s+/, $value; $value = join ",", @facsimiles; foreach $facsimile (@facsimiles) { # assume images are in the same directory my $imagefile = $dir . "~" . $facsimile; if (-e $imagefile) { my ($imagetype) = $facsimile =~ /\.([^\.]*)$/; $doc_obj->associate_file($imagefile, $facsimile, "image/" . $imagetype); } else { print STDERR "PrePlug: Warning - facsimile file $imagefile doesn't exist\n"; next; } } } elsif ($key =~ /^figures$/i) { $value =~ s/^\s+//; $value =~ s/\s+$//; my @figures = split /\s+/, $value; foreach $figure (@figures) { # assume images are in the same directory my $imagefile = $dir . "~" . $figure; if (-e $imagefile) { my ($imagetype) = $figure =~ /\.([^\.]*)$/; $doc_obj->associate_file($imagefile, $figure, "image/" . $imagetype); } else { print STDERR "PrePlug: Warning - figure file $imagefile doesn't exist\n"; next; } $value = join ",", @figures; } } $doc_obj->add_metadata ($doc_obj->get_top_section(), $key, $value); } close INFO; } foreach $field (keys(%$metadata)) { # $metadata->{$field} may be an array reference if (ref ($metadata->{$field}) eq "ARRAY") { map { $doc_obj->add_metadata ($doc_obj->get_top_section(), $field, $_); } @{$metadata->{$field}}; } else { $doc_obj->add_metadata ($doc_obj->get_top_section(), $field, $metadata->{$field}); } } # add OID $doc_obj->set_OID (); # process the document $processor->process($doc_obj); return 1; # processed the file } 1;