########################################################################### # # BNContentePlug.pm -- plugin for import the BN-Portugal Collection # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### # BNContentePlug - 11/2004 # # # This plugin takes "mets.xml" and "record/NCB_***.xml: the file contain MARC details # about BN-Portugal ccllection. The intension is to import such a collection into GS2. package BNContentePlug; use BasPlug; use plugin; #use ghtml; use XMLParser; use XML::Parser; sub BEGIN { @ISA = ('BasPlug'); unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan"); } my $arguments = [ { 'name' => "process_exp", 'desc' => "{BasPlug.process_exp}", 'type' => "string", 'deft' => &get_default_process_exp(), 'reqd' => "no" }, { 'name' => "only_first_doc", 'desc' => "{BNContentePlug.only_first_doc}", 'type' => "flag", 'reqd' => "no" }, { 'name' => "first_inoder_ext", 'desc' => "{BNContentePlug.first_inorder_ext}", 'type' => "flag", 'reqd' => "no" }, { 'name' => "first_inorder_mime", 'desc' => "{BNContentePlug.first_inorder_mime}", 'type' => "flag", 'reqd' => "no" }, { 'name' => "block_exp", 'desc' => "{BasPlug.block_exp}", 'type' => "string", 'deft' => &get_default_block_exp(), 'reqd' => "no" }]; my $options = { 'name' => "BNContentePlug", 'desc' => "{BNContentePlug.desc}", 'inherits' => "yes", 'args' => $arguments }; # Important variation to regular plugin structure. Need to desclare # $self as global variable to file so XMLParser callback routines # can access the content of the object. my ($self); sub get_default_process_exp { my $self = shift (@_); return q^(?i)(metsHTML\.xml)$^; } # block files sub get_default_block_exp { my $self = shift (@_); # Block all files besides contents #return q^(?i)(metsHTML\.xml|)$^; return q^(?i)((.*?)\.(.*?))$^; } sub new { my $class = shift (@_); #my $plugin_name = shift (@_); $self = new BasPlug ($class, @_); $self->{'plugin_type'} = "BNContentePlug"; my $option_list = $self->{'option_list'}; push( @{$option_list}, $options ); if (!parsargv::parse(\@_, "allow_extra_options")) { print STDERR "\nBNContentePlug uses an incorrect option.\n"; print STDERR "Check your collect.cfg configuration file.\n\n"; $self->print_txt_usage(""); # Use default resource bundle die "\n"; } #create XML::Parser object for parsing metsHTML.xml, NCB_???.xml files my $mets_parser = new XML::Parser('Style' => 'Stream', 'Handlers' => {'Doctype' => \&METS_Doctype, 'Start' => \&METS_StartTag, 'End' => \&METS_EndTag }); my $marc_parser = new XML::Parser('Style' => 'Stream', 'Handlers' => {'Char' => \&Char, 'Doctype' => \&MARC_Doctype, 'Start' => \&MARC_StartTag, 'End' => \&MARC_EndTag }); $self->{'mets_parser'} = $mets_parser; $self->{'marc_parser'} = $marc_parser; $self->{'index_file'} = ""; return bless $self, $class; } sub read_marc_content { my $self = shift (@_); my ($marc_file) = @_; # parse the Marc_file: NCB_???.xml eval{ $self->{'marc_parser'}->parsefile($marc_file); }; if ($@) { die "BNContentePlug: ERROR $marc_file is not a well formed XML file ($@)\n"; } } # Read metsHTML.xml from BN-Portugal collection sub metadata_read { my $self = shift (@_); my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs) = @_; my $outhandle = $self->{'outhandle'}; my $filename = &util::filename_cat($base_dir, $file); if ($filename !~ /metsHTML\.xml$/ || !-f $filename) { if ($filename =~ /\.xml$/i || $filename =~ /log\.txt$/i || $filename =~ /isbd\.html$/i) { $self->{'file_blocks'}->{$filename}=1; } return undef; } else { $self->{'file_blocks'}->{$filename}=1; } print $outhandle "BNContentePlug: extracting metadata from $filename\n" if $self->{'verbosity'} > 1; my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/; $self->{'dir'} = $dir; eval { $self->{'mets_parser'}->parsefile($filename); }; if ($@) { die "BNContentePlug: ERROR $filename is not a well formed XML file ($@)\n"; } # read NCB_???.xml to parse MARC records and save as metadata my $marc_file = &util::filename_cat($dir,$self->{'marc_file'}); $self->read_marc_content ($marc_file); if (defined $self->{'index_file'} && $self->{'index_file'} ne "") { my $index_file = $self->{'index_file'}; push(@$extrametakeys,$index_file); $extrametadata->{$index_file} = $self->{'saved_metadata'}; } else { print STDERR "####Warning can't find main index file\n"; } return 1; } # The BNContentePlug read() function. This function does all the right things # to make general options work for a given plugin. It calls the process() # function which does all the work specific to a plugin (like the old # read functions used to do). Most plugins should define their own # process() function and let this read() function keep control. # # Return number of files processed, undef if can't process # Note that $base_dir might be "" and that $file might # include directories sub readxxx { my $self = shift (@_); my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_; my $outhandle = $self->{'outhandle'}; my $filename = &util::filename_cat($base_dir, $file); #return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/; #return 0 if ($filename =~ /metsHTML\.xml$/); return 0 if ($filename =~ /\.xml$/); return 0 if (defined $self->{'file_blocks'}->{'filename'}); return undef; } # do plugin specific processing of doc_obj sub process { my $self = shift (@_); my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_; my $outhandle = $self->{'outhandle'}; return 1; } sub METS_Doctype { my ($expat, $name, $sysid, $pubid, $internal) = @_; die if ($name !~ /^metsHTML\.xml$/); } sub MARC_Doctype { my ($expat, $name, $sysid, $pubid, $internal) = @_; print STDERR "###MARC Name =$name\n"; #die if ($name !~ /^metsHTML\.xml$/); #die if (!$name); } sub METS_StartTag { my ($expat, $element, %attr) = @_; my @file_blocks; if ($element eq "dmdSec") { $self->{'marc_file'} = ""; } elsif ($element eq "mdRef") { my $marc_href = $attr{'xlink:href'}; $self->{'marc_file'} = $marc_href; } elsif ($element eq "FLocat"){ my $assocfiles = $attr{'xlink:href'}; if ($assocfiles =~ /index\.html$/) { my $index_file = &util::filename_cat($self->{'dir'}, $assocfiles); $self->{'index_file'} = $index_file; } else { my $link = &util::filename_cat($self->{'dir'}, $assocfiles); #$self->{'file_blocks'}->{$link} = 1; } } } sub METS_EndTag { my ($expat, $element, %attr) = @_; } sub MARC_StartTag { my ($expat, $element, %attr) = @_; if ($element eq "record") { $self->{'saved_metadata'} = {}; } elsif ($element eq "datafield") { $self->{'metaname'} = $element; $self->{'datafield'} = $attr{'tag'}; } elsif ($element eq "subfield") { $self->{'subfield'} = $attr{'code'}; $self->{'text'} = ""; } } sub MARC_EndTag { my ($expat, $element) = @_; if ($element eq "datafield") { $self->{'metaname'} = ""; } elsif ($element eq "subfield") { my $mvalue = $self->{'text'}; my $mname = $self->{'datafield'}."^".$self->{'subfield'}; #print STDERR "**** $mname = $mvalue\n"; $mvalue =~ s/\[/&\#91;/g; $mvalue =~ s/\[/&\#93;/g; if (defined $self->{'saved_metadata'}->{$mname}) { # accumulate - add value to existing value(s) if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") { push (@{$self->{'saved_metadata'}->{$mname}}, $mvalue); } else { $self->{'saved_metadata'}->{$mname} = [$self->{'saved_metadata'}->{$mname}, $mvalue]; } } else { # accumulate - add value into (currently empty) array $self->{'saved_metadata'}->{$mname} = [$mvalue]; } # store something here $self->{'subfield'} = ""; $self->{'text'} = ""; } } # This Char function overrides the one in XML::Parser::Stream to overcome a # problem where $expat->{Text} is treated as the return value, slowing # things down significantly in some cases. sub Char { $_[0]->{'Text'} .= $_[1]; if ((defined $self->{'subfield'} && ($self->{'subfield'} ne ""))) { $self->{'text'} .= $_[1]; } return undef; } 1;