########################################################################### # # XMLPlug.pm -- base class for XML plugins # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 2001 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### package XMLPlug; use BasPlug; use doc; sub BEGIN { @ISA = ('BasPlug'); unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan"); } use XML::Parser; my ($self); sub new { my $class = shift (@_); # $self is global for use within subroutines called by XML::Parser $self = new BasPlug ($class, @_); my $parser = new XML::Parser('Style' => 'Stream', 'Handlers' => {'Char' => \&Char, 'XMLDecl' => \&XMLDecl, 'Doctype' => \&Doctype, 'Default' => \&Default } ); $self->{'parser'} = $parser; return bless $self, $class; } sub read { my $self = shift (@_); my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_; my $filename = $file; $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/; if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) { $self->{'num_blocked'} ++; return 0; } if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) { return undef; } $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up $self->{'file'} = $file; $self->{'filename'} = $filename; $self->{'processor'} = $processor; $self->{'metadata'} = $metadata; eval { $self->{'parser'}->parsefile($filename); }; if ($@) { # parsefile may either croak somewhere in XML::Parser (e.g. because # the document is not well formed) or die somewhere in XMLPlug or a # derived plugin (e.g. because we're attempting to process a # document whose DOCTYPE is not meant for this plugin). For the # first case we'll print a warning and continue, for the second # we'll just continue quietly my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/; if (defined $msg) { my $outhandle = $self->{'outhandle'}; my $plugin_name = ref ($self); print $outhandle "$plugin_name failed to process $file ($msg)\n"; } return undef; } return 1; # processed the file } sub get_default_process_exp { my $self = shift (@_); return q^(?i)\.xml$^; } sub StartDocument {$self->xml_start_document(@_);} sub XMLDecl {$self->xml_xmldecl(@_);} sub Doctype {$self->xml_doctype(@_);} sub StartTag {$self->xml_start_tag(@_);} sub EndTag {$self->xml_end_tag(@_);} sub Text {$self->xml_text(@_);} sub PI {$self->xml_pi(@_);} sub EndDocument {$self->xml_end_document(@_);} sub Default {$self->xml_default(@_);} # This Char function overrides the one in XML::Parser::Stream to overcome a # problem where $expat->{Text} is treated as the return value, slowing # things down significantly in some cases. sub Char { $_[0]->{'Text'} .= $_[1]; return undef; } # Called at the beginning of the XML document. sub xml_start_document { my $self = shift(@_); my ($expat) = @_; $self->open_document(); } # Called for XML declarations sub xml_xmldecl { my $self = shift(@_); my ($expat, $version, $encoding, $standalone) = @_; } # Called for DOCTYPE declarations - use die to bail out if this doctype # is not meant for this plugin sub xml_doctype { my $self = shift(@_); my ($expat, $name, $sysid, $pubid, $internal) = @_; die "XMLPlug Cannot process XML document with DOCTYPE of $name"; } # Called for every start tag. The $_ variable will contain a copy of the # tag and the %_ variable will contain the element's attributes. sub xml_start_tag { my $self = shift(@_); my ($expat, $element) = @_; } # Called for every end tag. The $_ variable will contain a copy of the tag. sub xml_end_tag { my $self = shift(@_); my ($expat, $element) = @_; } # Called just before start or end tags with accumulated non-markup text in # the $_ variable. sub xml_text { my $self = shift(@_); my ($expat) = @_; } # Called for processing instructions. The $_ variable will contain a copy # of the pi. sub xml_pi { my $self = shift(@_); my ($expat, $target, $data) = @_; } # Called at the end of the XML document. sub xml_end_document { my $self = shift(@_); my ($expat) = @_; $self->close_document(); } # Called for any characters not handled by the above functions. sub xml_default { my $self = shift(@_); my ($expat, $text) = @_; } sub open_document { my $self = shift(@_); # create a new document $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc"); $self->{'doc_obj'}->set_OIDtype ($self->{'processor'}->{'OIDtype'}); } sub close_document { my $self = shift(@_); # include any metadata passed in from previous plugins # note that this metadata is associated with the top level section $self->extra_metadata ($self->{'doc_obj'}, $self->{'doc_obj'}->get_top_section(), $self->{'metadata'}); # do any automatic metadata extraction $self->auto_extract_metadata ($self->{'doc_obj'}); # add an OID $self->{'doc_obj'}->set_OID(); # process the document $self->{'processor'}->process($self->{'doc_obj'}); $self->{'num_processed'} ++; } 1;