# plugin which process an HTML book directory
package GMLPlug;
use html;
use BasPlug;
use util;
use doc;
sub BEGIN {
@ISA = ('BasPlug');
}
sub new {
my ($class) = @_;
$self = new BasPlug ();
return bless $self, $class;
}
sub is_recursive {
my $self = shift (@_);
return 0; # this is not a recursive plugin
}
sub _unescape_text {
my ($text) = @_;
# special characters in the gml encoding
$text =~ s/<//g;
$text =~ s/"/\"/g;
$text =~ s/&/&/g; # this has to be last...
return $text;
}
# return number of files processed, undef if can't process
# Note that $base_dir might be "" and that $file might
# include directories
sub read {
my $self = shift (@_);
my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
my $fullname = &util::filename_cat ($base_dir, $file);
# see if this is a gml book
return undef unless (-f $fullname && $fullname =~ /\.gml$/i);
my ($parent_dir) = $fullname =~ /^(.*)\/[^\/]+.gml$/;
# create a new document
my $doc_obj = new doc ();
my $section = $doc_obj->get_top_section();
print STDERR "GMLPlug: processing $file\n";
# read in the document
my $gml = "";
my $line = "";
if (!open (INFILE, $fullname)) {
print STDERR "GMLPlug::read - couldn't read $fullname\n";
return undef;
}
while (defined ($line = )) {
$gml .= $line;
}
close (INFILE);
# process the document
my $firstsection = 1;
while ($gml =~ /\S/) {
if ($gml =~ s/^\s*]*)>(.*?)(<\/?gsdlsection)/$3/is) {
my $tags = $1;
$tags = "" unless defined $tags;
my $text = &_unescape_text($2);
# create the section (unless this is the first section)
if (!$firstsection) {
$tags =~ s/gsdlnum\s*=\s*\"?(\d+)\"?//i;
if (defined $1) {
$section .= ".$1";
$doc_obj->create_named_section($section);
} else {
$section = $doc_obj->insert_section($doc_obj->get_end_child($section));
}
}
$firstsection = 0;
# add the tags
while ((defined $tags) && ($tags =~ s/^\s*(\w+)\s*=\s*\"([^\"]*)\"//)) {
$doc_obj->add_utf8_metadata($section, $1 , &_unescape_text($2))
if (defined $1 and defined $2);
}
# add the text
$doc_obj->add_utf8_text($section, $text)
if ((defined $text) && ($text ne ""));
} elsif ($gml =~ s/^\s*<\/gsdlsection>//) {
$section = $doc_obj->get_parent_section ($section);
} else {
print STDERR "GMLPlug::read - error in file $fullname\n";
print STDERR "text: \"$gml\"\n";
last;
}
}
# add the associated files
$assoc_files = $doc_obj->get_metadata($doc_obj->get_top_section(), "gsdlassocfile");
my ($assoc_file_info);
foreach $assoc_file_info (@$assoc_files) {
my ($assoc_file, $mime_type) = split (":", $assoc_file_info);
$doc_obj->associate_file(&util::filename_cat($parent_dir, $assoc_file),
$assoc_file, $mime_type);
}
$doc_obj->delete_metadata($doc_obj->get_top_section(), "gsdlassocfile");
# add metadata
foreach $field (keys(%$metadata)) {
# $metadata->{$field} may be an array reference
if (ref ($metadata->{$field}) eq "ARRAY") {
map {
$doc_obj->add_metadata ($doc_obj->get_top_section(), $field, $_);
} @{$metadata->{$field}};
} else {
$doc_obj->add_metadata ($doc_obj->get_top_section(), $field, $metadata->{$field});
}
}
# assume the document has an OID
# process the document
$processor->process($doc_obj, $file);
return 1; # processed the file
}
1;