########################################################################### # # XMLPlug.pm -- base class for XML plugins # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 2001 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### package XMLPlug; use BasPlug; use doc; use strict; no strict 'refs'; # allow filehandles to be variables and viceversa sub BEGIN { @XMLPlug::ISA = ('BasPlug'); unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan"); } use XMLParser; my $arguments = [ { 'name' => "process_exp", 'desc' => "{BasPlug.process_exp}", 'type' => "regexp", 'deft' => &get_default_process_exp(), 'reqd' => "no" }, { 'name' => "xslt", 'desc' => "{XMLPlug.xslt}", 'type' => "string", 'deft' => "", 'reqd' => "no" } ]; my $options = { 'name' => "XMLPlug", 'desc' => "{XMLPlug.desc}", 'abstract' => "yes", 'inherits' => "yes", 'args' => $arguments }; sub new { my ($class) = shift (@_); my ($pluginlist,$inputargs,$hashArgOptLists) = @_; push(@$pluginlist, $class); if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});} if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)}; # $self is global for use within subroutines called by XML::Parser my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists); if ($self->{'info_only'}) { # don't worry about any options etc return bless $self, $class; } my $parser = new XML::Parser('Style' => 'Stream', 'Pkg' => 'XMLPlug', 'PluginObj' => $self, 'Handlers' => {'Char' => \&Char, 'XMLDecl' => \&XMLDecl, 'Entity' => \&Entity, 'Doctype' => \&Doctype, 'Default' => \&Default, }); $self->{'parser'} = $parser; return bless $self, $class; } # the inheriting class must implement this method to tell whether to parse this doc type sub get_doctype { my $self = shift(@_); die "$self The inheriting class must implement get_doctype method"; } sub apply_xslt { my $self = shift @_; my ($xslt,$filename) = @_; my $outhandle = $self->{'outhandle'}; my $xslt_filename = $xslt; if (! -e $xslt_filename) { # Look in main site directory my $gsdlhome = $ENV{'GSDLHOME'}; $xslt_filename = &util::filename_cat($gsdlhome,$xslt); } if (! -e $xslt_filename) { # Look in collection directory my $coldir = $ENV{'GSDLCOLLECTDIR'}; $xslt_filename = &util::filename_cat($coldir,$xslt); } if (! -e $xslt_filename) { print $outhandle "Warning: Unable to find XSLT $xslt\n"; if (open(XMLIN,"<$filename")) { my $untransformed_xml = ""; while (defined (my $line = )) { $untransformed_xml .= $line; } close(XMLIN); return $untransformed_xml; } else { print $outhandle "Error: Unable to open file $filename\n"; print $outhandle " $!\n"; return ""; } } my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java"); my $jar_filename = &util::filename_cat($bin_java,"xalan.jar"); my $xslt_base_cmd = "java -jar $jar_filename"; my $xslt_cmd = "$xslt_base_cmd -IN \"$filename\" -XSL \"$xslt_filename\""; my $transformed_xml = ""; if (open(XSLT_IN,"$xslt_cmd |")) { while (defined (my $line = )) { $transformed_xml .= $line; } close(XSLT_IN); } else { print $outhandle "Error: Unable to run command $xslt_cmd\n"; print $outhandle " $!\n"; } return $transformed_xml; } sub check_doctype { my $self = shift (@_); my ($filename) = @_; if (open(XMLIN,"<$filename")) { my $doctype = $self->get_doctype(); ## check whether the doctype has the same name as the root element tag while (defined (my $line = )) { ## find the root element if ($line =~ /<([\w\d:]+)[\s>]/){ my $root = $1; if ($root !~ $doctype){ close(XMLIN); return 0; } else { close(XMLIN); return 1; } } } close(XMLIN); } return undef; # haven't found a valid line } # because we are not just using process_exp to determine whether to process or not, we need to implement this too, so that a file can be passed down if we are not actually processing it sub metadata_read { my $self = shift (@_); my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_; my $result = $self->SUPER::metadata_read($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli); if (defined $result) { # we think we are processing this, but check that we actually are my $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/; if ($self->check_doctype($filename)) { return $result; } } return undef; } sub read { my $self = shift (@_); my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; # Make sure we're processing the correct file, do blocking etc my ($block_status,$filename) = $self->read_block(@_); return $block_status if ((!defined $block_status) || ($block_status==0)); ## check the doctype to see whether we really want to process the file if (!$self->check_doctype($filename)) { # this file is not for us return undef; } $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up $self->{'base_dir'} = $base_dir; $self->{'file'} = $file; $self->{'filename'} = $filename; $self->{'processor'} = $processor; $self->{'metadata'} = $metadata; eval { my $xslt = $self->{'xslt'}; if (defined $xslt && ($xslt ne "")) { # perform xslt my $transformed_xml = $self->apply_xslt($xslt,$filename); # feed transformed file (now in memory as string) into XML parser $self->{'parser'}->parse($transformed_xml); } else { $self->{'parser'}->parsefile($filename); } }; if ($@) { # parsefile may either croak somewhere in XML::Parser (e.g. because # the document is not well formed) or die somewhere in XMLPlug or a # derived plugin (e.g. because we're attempting to process a # document whose DOCTYPE is not meant for this plugin). For the # first case we'll print a warning and continue, for the second # we'll just continue quietly print STDERR "**** Error is: $@\n"; my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/; if (defined $msg) { my $outhandle = $self->{'outhandle'}; my $plugin_name = ref ($self); print $outhandle "$plugin_name failed to process $file ($msg)\n"; } # reset ourself for the next document $self->{'section_level'}=0; print STDERR "\n" if ($gli); return -1; # error during processing } return 1; # processed the file } # the following two methods are for if you want to do the parsing from a # plugin that inherits from this. it seems that you can't call the parse # methods directly. WHY??? # # [Stefan 27/5/07] These two methods may not be necessary any more as I've # fixed XMLPlug so $self is no longer required to be a global variable # (that was why inheritance wasn't working quite right with XMLPlug I # think). I don't really know what other plugins rely on these methods # though so have left them here for now. sub parse_file { my $self = shift (@_); my ($filename) = @_; $self->{'parser'}->parsefile($filename); } sub parse_string { my $self = shift (@_); my ($xml_string) = @_; $self->{'parser'}->parse($xml_string); } sub get_default_process_exp { my $self = shift (@_); return q^(?i)\.xml$^; } sub StartDocument {$_[0]->{'PluginObj'}->xml_start_document(@_);} sub XMLDecl {$_[0]->{'PluginObj'}->xml_xmldecl(@_);} sub Entity {$_[0]->{'PluginObj'}->xml_entity(@_);} sub Doctype {$_[0]->{'PluginObj'}->xml_doctype(@_);} sub StartTag {$_[0]->{'PluginObj'}->xml_start_tag(@_);} sub EndTag {$_[0]->{'PluginObj'}->xml_end_tag(@_);} sub Text {$_[0]->{'PluginObj'}->xml_text(@_);} sub PI {$_[0]->{'PluginObj'}->xml_pi(@_);} sub EndDocument {$_[0]->{'PluginObj'}->xml_end_document(@_);} sub Default {$_[0]->{'PluginObj'}->xml_default(@_);} # This Char function overrides the one in XML::Parser::Stream to overcome a # problem where $expat->{Text} is treated as the return value, slowing # things down significantly in some cases. sub Char { use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+ $_[0]->{'Text'} .= $_[1]; return undef; } # Called at the beginning of the XML document. sub xml_start_document { my $self = shift(@_); my ($expat) = @_; $self->open_document(); } # Called for XML declarations sub xml_xmldecl { my $self = shift(@_); my ($expat, $version, $encoding, $standalone) = @_; } # Called for XML entities sub xml_entity { my $self = shift(@_); my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_; } # Called for DOCTYPE declarations - use die to bail out if this doctype # is not meant for this plugin sub xml_doctype { my $self = shift(@_); my ($expat, $name, $sysid, $pubid, $internal) = @_; die "XMLPlug Cannot process XML document with DOCTYPE of $name"; } # Called for every start tag. The $_ variable will contain a copy of the # tag and the %_ variable will contain the element's attributes. sub xml_start_tag { my $self = shift(@_); my ($expat, $element) = @_; } # Called for every end tag. The $_ variable will contain a copy of the tag. sub xml_end_tag { my $self = shift(@_); my ($expat, $element) = @_; } # Called just before start or end tags with accumulated non-markup text in # the $_ variable. sub xml_text { my $self = shift(@_); my ($expat) = @_; } # Called for processing instructions. The $_ variable will contain a copy # of the pi. sub xml_pi { my $self = shift(@_); my ($expat, $target, $data) = @_; } # Called at the end of the XML document. sub xml_end_document { my $self = shift(@_); my ($expat) = @_; $self->close_document(); } # Called for any characters not handled by the above functions. sub xml_default { my $self = shift(@_); my ($expat, $text) = @_; } sub open_document { my $self = shift(@_); # create a new document $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc"); $self->{'doc_obj'}->set_OIDtype ($self->{'processor'}->{'OIDtype'}, $self->{'processor'}->{'OIDmetadata'}); } sub close_document { my $self = shift(@_); my $doc_obj = $self->{'doc_obj'}; # include any metadata passed in from previous plugins # note that this metadata is associated with the top level section $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $self->{'metadata'}); # do any automatic metadata extraction $self->auto_extract_metadata ($doc_obj); # add an OID $doc_obj->set_OID(); $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}"); $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML"); # process the document $self->{'processor'}->process($doc_obj); $self->{'num_processed'} ++; } 1;