###########################################################################
#
# PrePlug.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
# plugin that processes simple html files like those output
# by prescript when converting postscript to html.
# prescript output has
tags separating paragraphs
# and tags separating pages.
# it may also have tags at top or bottom of
# some or all pages.
# if a .html1 version of a .html file exists then that version
# will be used so that the author name can be extracted
# (.html1 files were generated by software which automatically
# extracts author names from .html files).
# also looks for a .info file containing metadata (as used by
# the cstr collection)
package PrePlug;
use BasPlug;
use sorttools;
sub BEGIN {
@ISA = ('BasPlug');
}
sub new {
my ($class) = @_;
$self = new BasPlug ();
return bless $self, $class;
}
sub is_recursive {
my $self = shift (@_);
return 0; # this is not a recursive plugin
}
# return number of files processed, undef if can't process
# Note that $base_dir might be "" and that $file might
# include directories
sub read {
my $self = shift (@_);
my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
my $filename = &util::filename_cat($base_dir, $file);
return 0 if ($filename =~ /\.(html1|html1\.gz|info|gif)/);
return undef unless ($filename =~ /\.(html(\.gz)?)$/i && (-e $filename));
my $gz = 0;
$gz = 1 if (defined $2 && $2 eq ".gz");
my $filename1 = $filename;
if ($gz) {
$filename1 =~ s/\.html\.gz$/\.html1\.gz/;
} else {
$filename1 .= "1";
}
$filename = $filename1 if (-e $filename1);
print STDERR "PrePlug: processing $filename\n" if $processor->{'verbosity'};
# create a new document
my $doc_obj = new doc ($file, "indexed_doc");
if ($gz) {
open (FILE, "zcat $filename |") || die "PrePlug::read - zcat can't open $filename\n";
} else {
open (FILE, $filename) || die "PrePlug::read - can't open $filename\n";
}
my $cursection = $doc_obj->get_top_section();
my $text = "";
my $line = "";
my $pagenumber = 1;
my @creators = ();
my $numlines = 0;
while (defined ($line = )) {
if ($filename =~ /html1$/) {
while ($text =~ s/<_author_search_\([^\)]*\)>([^<]*)<\/a>/$1/i) {
push (@creators, $1);
}
}
# numlines sorts out documents that just have a single tag at the end
if ($line =~ //i && $numlines < 200) {
$numlines = -1000;
$cursection =
$doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_top_section()));
$text =~ s/\s*\s*
\s*\s*(\d+)<\/b>\s*<\/center>\s*//i;
# if (defined $1 && $1 != $pagenumber) {
# print STDERR "PrePlug Warning: Pagenumbers don't line up in $filename\n";
# }
$doc_obj->add_text ($cursection, $text);
$doc_obj->add_metadata ($cursection, "Title", $pagenumber);
$pagenumber ++;
$text = "";
} else {
$text .= $line;
$numlines ++;
}
}
close FILE;
if ($cursection eq $doc_obj->get_top_section()) {
# there weren't any tags
# see if there were any tags to split on
if ($text =~ //i) {
while ($text =~ s/^(.*?)\s*\s*
\s*\s*\d+<\/b>\s*<\/center>\s*//i) {
$cursection =
$doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_top_section()));
$doc_obj->add_text ($cursection, $text);
$doc_obj->add_metadata ($cursection, "Title", $pagenumber);
$pagenumber ++;
}
}
# otherwise we'll just have to split pages on set number of lines
else {
my $pagetext = "";
my $line = "";
my $count = 0;
while (length ($text) && $text =~ s/^(.*?)(\n|$)//) {
$line = $1;
$line = "" unless defined $line;
$pagetext .= $line . "\n";
if (($count >= 50 && $line !~ /\w/) || $count == 80) {
$cursection =
$doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_top_section()));
$doc_obj->add_text ($cursection, $pagetext);
$doc_obj->add_metadata ($cursection, "Title", $pagenumber);
$pagetext = "";
$pagenumber ++;
$count = 0;
} else {
$count ++;
}
}
}
}
# final section
if ($text =~ /\w/) {
$cursection =
$doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_top_section()));
$text =~ s/\s*\s*
\s*\s*(\d+)<\/b>\s*<\/center>\s*//;
# if (defined $1 && $1 != $pagenumber) {
# print STDERR "PrePlug Warning: Pagenumbers don't line up in $filename\n";
# }
$doc_obj->add_text ($cursection, $text);
$doc_obj->add_metadata ($cursection, "Title", $pagenumber);
}
# add meta data for top level of document
map {$doc_obj->add_metadata ($doc_obj->get_top_section(), "Creator", $_)} @creators;
my ($filesuff) = $filename =~ /^(.*?)\.html1?/;
if (!-e $filesuff . ".info") {
print STDERR "Preplug Warning: $filename has no corresponding .info file\n";
} else {
my ($dir) = $filesuff =~ /^(.*?)~?[^~]*$/;
open (INFO, $filesuff . ".info") || die "PrePlug::read - can't open $filesuff.info\n";
my $line = "";
while (defined ($line = )) {
chop $line;
my ($key, $value) = $line =~ /^<([^>]*)>(.*)$/;
next if $key =~ /^(pages|compressedsize|\/?info|size)$/i;
next if $value !~ /\w/;
if ($key =~ /^abstract$/i) {
$key = "Description";
} elsif ($key =~ /^url$/i) {
$key = "Source";
} elsif ($key =~ /^filedate$/i) {
$key = "Date";
my ($day, $month, $year) = split /\//, $value;
$value = &sorttools::format_date ($day, $month, $year);
} elsif ($key =~ /^transferdate$/i) {
my ($day, $month, $year) = split /\//, $value;
$value = &sorttools::format_date ($day, $month, $year);
} elsif ($key =~ /^facsimiles$/i) {
$value =~ s/^\s+//;
$value =~ s/\s+$//;
my @facsimiles = split /\s+/, $value;
$value = join ",", @facsimiles;
foreach $facsimile (@facsimiles) {
# assume images are in the same directory
my $imagefile = $dir . "~" . $facsimile;
if (-e $imagefile) {
my ($imagetype) = $facsimile =~ /\.([^\.]*)$/;
$doc_obj->associate_file($imagefile, $facsimile, "image/" . $imagetype);
} else {
print STDERR "PrePlug: Warning - facsimile file $imagefile doesn't exist\n";
next;
}
}
} elsif ($key =~ /^figures$/i) {
$value =~ s/^\s+//;
$value =~ s/\s+$//;
my @figures = split /\s+/, $value;
foreach $figure (@figures) {
# assume images are in the same directory
my $imagefile = $dir . "~" . $figure;
if (-e $imagefile) {
my ($imagetype) = $figure =~ /\.([^\.]*)$/;
$doc_obj->associate_file($imagefile, $figure, "image/" . $imagetype);
} else {
print STDERR "PrePlug: Warning - figure file $imagefile doesn't exist\n";
next;
}
$value = join ",", @figures;
}
}
$doc_obj->add_metadata ($doc_obj->get_top_section(), $key, $value);
}
close INFO;
}
foreach $field (keys(%$metadata)) {
# $metadata->{$field} may be an array reference
if (ref ($metadata->{$field}) eq "ARRAY") {
map {
$doc_obj->add_metadata ($doc_obj->get_top_section(), $field, $_);
} @{$metadata->{$field}};
} else {
$doc_obj->add_metadata ($doc_obj->get_top_section(), $field, $metadata->{$field});
}
}
# add OID
$doc_obj->set_OID ();
# process the document
$processor->process($doc_obj);
return 1; # processed the file
}
1;