########################################################################### # # MARCXMLPlug.pm # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 2001 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### # Processes MARCXML documents. Note that this plugin does no # syntax checking (though the XML::Parser module tests for # well-formedness). 
package MARCXMLPlug;

use XMLPlug;

use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa

sub BEGIN {
    @MARCXMLPlug::ISA = ('XMLPlug');
}

# Plugin-specific arguments: the path of the MARC -> metadata mapping file.
my $arguments = [
    {   'name' => "metadata_mapping_file",
        'desc' => "{MARCXMLPlug.metadata_mapping_file}",
        'type' => "string",
        'reqd' => "no"
    }
];

my $options = {
    'name'     => "MARCXMLPlug",
    'desc'     => "{MARCXMLPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments
};

# Constructor.
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to XMLPlug's constructor
#   $hashArgOptLists - hash ref; our arguments/options are appended to its
#                      "ArgList" and "OptList" entries
# Returns the XMLPlug object re-blessed into $class with the per-parse state
# fields initialised.
sub new {
    my ($class) = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);

    if (defined $arguments) {
        push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments});
    }
    if (defined $options) {
        push(@{ $hashArgOptLists->{"OptList"} }, $options);
    }

    my $self = XMLPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    $self->{'content'}          = "";    # escaped text accumulated for the current record
    $self->{'record_count'}     = 1;     # number given to the next <record>
    $self->{'language'}         = "";
    $self->{'encoding'}         = "";
    $self->{'marc_mapping'}     = {};    # tag => [code, text, code, text, ...] for the current datafield
    $self->{'current_code'}     = "";    # code attribute of the subfield being read
    $self->{'current_tag'}      = "";    # tag attribute of the datafield being read
    $self->{'current_element'}  = "";
    $self->{'metadata_mapping'} = undef; # filled in by init()
    $self->{'num_processed'}    = 0;
    $self->{'indent'}           = 0;

    return bless $self, $class;
}

# Name of the root element this plugin handles.
sub get_doctype {
    my $self = shift(@_);

    return "collection";
}

# Loads the metadata mapping file (once) and then calls the parent init().
# Each mapping line has the form "<digits><optional word char> -> <name>";
# lines starting with '#' and blank lines are ignored, anything else is
# reported on $outhandle as a parse error.
#   $verbosity, $outhandle, $failhandle - forwarded to SUPER::init
sub init {
    my $self = shift(@_);
    my ($verbosity, $outhandle, $failhandle) = @_;

    ## the mapping file has already been loaded
    if (defined $self->{'metadata_mapping'}) {
        $self->SUPER::init(@_);
        return;
    }

    my $metadata_mapping = {};

    # read in the metadata mapping file
    my $mm_file = $self->{'metadata_mapping_file'};
    if (!defined $mm_file or $mm_file eq "") {
        # no file given: fall back to etc/marctodc.txt under GSDLHOME
        $mm_file = util::filename_cat($ENV{'GSDLHOME'}, "etc", "marctodc.txt");
        $self->{'metadata_mapping_file'} = $mm_file;
    }

    if (!-e $mm_file) {
        # Report the path we looked for. (The previous code interpolated
        # $self->{'metadata_mapping'}, which is always undef at this point.)
        my $msg = "MARCXMLPlug ERROR: Can't locate mapping file \""
            . $mm_file
            . "\".\n This file should be at $mm_file\n"
            . " No marc files can be processed.\n";

        print $outhandle $msg;
        print $failhandle $msg;
        # We deliberately do not exit here: if we did, pluginfo.pl would
        # exit too. The mapping is simply left empty.
    }
    elsif (open(my $mm_fh, '<', $mm_file)) {
        my $line_number = 1;
        while (defined(my $line = <$mm_fh>)) {
            chomp $line;
            if ($line =~ m/^(\d+\w?)\s*->\s*([\w\^]+)$/) {
                my $marc_info = $1;
                my $gsdl_info = $2;
                $metadata_mapping->{$marc_info} = $gsdl_info;
            }
            elsif ($line !~ m/^\#/       # allow comments (# in first column)
                && $line !~ m/^\s*$/)    # allow blank lines
            {
                print $outhandle "Parse error on line $line_number of $mm_file:\n";
                print $outhandle " \"$line\"\n";
            }
            $line_number++;
        }
        close($mm_fh);
    }
    else {
        print STDERR "Unable to open $mm_file: $!\n";
    }

    $self->{'metadata_mapping'} = $metadata_mapping;

    ##map { print STDERR $_."=>".$metadata_mapping->{$_}."\n"; } keys %$metadata_mapping;

    $self->SUPER::init(@_);
}

# Called for DOCTYPE declarations - use die to bail out if this doctype
# is not meant for this plugin
sub xml_doctype {
    my $self = shift(@_);
    my ($expat, $name, $sysid, $pubid, $internal) = @_;

    return;
}

# Called at the start of a document: determines language/encoding for the
# file and resets the per-document counters.
sub xml_start_document {
    my $self = shift(@_);
    my ($expat, $name, $sysid, $pubid, $internal) = @_;

    my $filename = $self->{'filename'};

    my ($language, $encoding) = $self->textcat_get_language_encoding($filename);

    $self->{'language'}      = $language;
    $self->{'encoding'}      = $encoding;
    $self->{'element_count'} = 1;
    $self->{'indent'}        = 0;

    my $outhandle = $self->{'outhandle'};
    print $outhandle "MARCXMLPlug: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
    print STDERR "\n" if $self->{'gli'};
}

# Nothing to do at the end of a document; records are flushed as each
# </record> tag is seen.
sub xml_end_document {

}

# Called for each start tag.
#   $expat   - parser object (unused here)
#   $element - element name
# The raw tag text is read from $_ (NOTE(review): presumably set by the
# XML parser before this handler is invoked - confirm against XMLPlug).
# A <record> starts a new document object; <datafield>/<subfield> update the
# current tag/code; the escaped tag text is appended to the record content.
sub xml_start_tag {
    my $self    = shift;
    my $expat   = shift;
    my $element = shift;

    my $raw_tag = $_;
    my $text    = $self->escape_text($raw_tag);

    $self->{'current_element'} = $element;

    ## get all attributes of this element and store them in a map name=>value
    my %attr_map = ();
    while ($raw_tag =~ /(\w+)=\"(\w+)\"/g) {
        $attr_map{$1} = $2;
    }

    my $processor = $self->{'processor'};

    ## create a new document for each record
    if ($element eq "record") {
        my $filename = $self->{'filename'};
        my $language = $self->{'language'};
        my $encoding = $self->{'encoding'};
        my $file     = $self->{'file'};

        my $doc_obj = doc->new($filename);
        my $top     = $doc_obj->get_top_section();

        $doc_obj->set_OIDtype($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
        $doc_obj->add_utf8_metadata($top, "Language", $language);
        $doc_obj->add_utf8_metadata($top, "Encoding", $encoding);

        # last path component of the file, with either slash style
        my ($filemeta) = $file =~ /([^\\\/]+)$/;
        $doc_obj->add_utf8_metadata($top, "Source", ghtml::dmsafe($filemeta));
        $doc_obj->add_utf8_metadata($top, "SourceSegment", "$self->{'record_count'}");

        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename);
        }

        $doc_obj->add_utf8_metadata($top, "Plugin", "$self->{'plugin_type'}");
        $doc_obj->add_metadata($top, "FileFormat", "XML");

        # let the doc compute its OID, then make it unique per record
        $doc_obj->set_OID();
        $self->set_OID($doc_obj, $doc_obj->get_OID(), $self->{'record_count'});

        my $outhandle = $self->{'outhandle'};
        print $outhandle "Record $self->{'record_count'} - MARCXMLPlug: processing $self->{'file'}\n"
            if $self->{'verbosity'} > 1;

        $self->{'record_count'}++;
        $self->{'doc_obj'} = $doc_obj;
        $self->{'num_processed'}++;
    }

    ## get the marc code, for example 520
    if ($element eq "datafield") {
        if (defined $attr_map{'tag'} and $attr_map{'tag'} ne "") {
            $self->{'current_tag'} = $attr_map{'tag'};
        }
    }

    ## remember the subcode of the marc code, for example the a of 520a
    if ($element eq "subfield") {
        if (    defined $attr_map{'code'}
            and $attr_map{'code'} ne ""
            and $self->{'current_tag'} ne "")
        {
            $self->{'current_code'} = $attr_map{'code'};
        }
    }

    # indent level: record = 0, subfield = 2, everything else = 1
    if ($element eq "record") {
        $self->{'indent'} = 0;
    }
    elsif ($element eq "subfield") {
        $self->{'indent'} = 2;
    }
    else {
        $self->{'indent'} = 1;
    }

    if ($element ne "collection") {
        $self->{'content'} .= "\n" . $self->calculate_indent($self->{'indent'}) . $text;
    }
}

# Called for each end tag.
#   $expat   - parser object (unused here)
#   $element - element name
# The raw tag text is read from $_ (see xml_start_tag).
# </record> hands the finished document to the processor; </datafield> turns
# the subfields collected for that datafield into metadata according to the
# mapping loaded by init().
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my $text = $self->escape_text($_);

    if ($element eq "record" and defined $self->{'doc_obj'}) {
        # process the document
        my $processor = $self->{'processor'};
        my $doc_obj   = $self->{'doc_obj'};

        $self->{'content'} .= "\n" . $text;

        $doc_obj->add_utf8_text($doc_obj->get_top_section(), $self->{'content'});
        $processor->process($doc_obj);

        ## clean up
        $self->{'content'} = "";
        $self->{'doc_obj'} = undef;
        return;
    }

    ## map the xmlmarc to gsdl metadata
    if (    $element eq "datafield"
        and defined $self->{'doc_obj'}
        and defined $self->{'marc_mapping'})
    {
        my $metadata_mapping = $self->{'metadata_mapping'} || {};
        my $marc_mapping     = $self->{'marc_mapping'};
        my $doc_obj          = $self->{'doc_obj'};

        ##map { print STDERR $_."=>".$marc_mapping->{$_}."\n"; } keys %$marc_mapping;
        ##map { print STDERR $_."=>".$metadata_mapping->{$_}."\n"; } keys %$metadata_mapping;

        foreach my $marc_field (keys %$metadata_mapping) {
            # A mapping key is either a bare tag ("520") or a tag followed
            # by a one-character subfield code ("520a"). xml_text() stores
            # the collected subfields under the bare tag, so the key must be
            # split before the lookup; looking up "520a" directly (as the
            # previous code did) could never match a normal record.
            my $subfield;
            my $matched_field = $marc_mapping->{$marc_field};
            if ($marc_field =~ /^(\d\d\d)(\w)$/) {
                $subfield = $2;
                $matched_field = $marc_mapping->{$1} unless defined $matched_field;
            }
            next unless defined $matched_field;

            my $meta_name = $metadata_mapping->{$marc_field};
            my $meta_value;

            if (defined $subfield) {
                # $matched_field is a flat (code, text, code, text, ...)
                # list. It must be assigned as a list: the previous
                # "{ @$matched_field }" assigned a single hash reference.
                my %mapped_subfield = (@$matched_field);
                $meta_value = $mapped_subfield{$subfield};
            }
            else {
                ## get all values (the odd-indexed entries are the texts)
                for (my $i = 1; $i < @$matched_field; $i += 2) {
                    $meta_value .= $matched_field->[$i] . " ";
                }
            }

            # this datafield has no value for the requested subfield
            next unless defined $meta_value;

            ## escape [ and ]
            $meta_value =~ s/\[/\\\[/g;
            $meta_value =~ s/\]/\\\]/g;

            ##print STDERR "$meta_name=$meta_value\n";
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $meta_name, $meta_value);
        }

        ## clean up (xml_text() re-creates the hash on the next subfield)
        $self->{'marc_mapping'} = undef;
        $self->{'current_tag'}  = "";
    }

    if ($element eq "datafield") {
        $self->{'indent'} = 1;
        $self->{'content'} .= "\n" . $self->calculate_indent($self->{'indent'}) . $text;
    }
    else {
        $self->{'content'} .= $text;
    }
}

# Gives the document a per-record identifier: "<id>r<record_number>".
sub set_OID {
    my $self = shift(@_);
    my ($doc_obj, $id, $record_number) = @_;

    $doc_obj->set_OID($id . "r" . $record_number);
}

# Called for character data; the text is read from $_ (see xml_start_tag).
# Text inside a <subfield> is recorded against the current tag together with
# its subfield code; all text is appended (escaped) to the record content.
sub xml_text {
    my $self = shift(@_);
    my ($expat) = @_;

    ## store the text of a marc code, for example 520a=>A poem about....
    if (    $self->{'current_element'} eq "subfield"
        and $self->{'current_code'} ne ""
        and $_ ne "")
    {
        ## store it in the marc_mapping as a (code, text) pair
        push(@{ $self->{'marc_mapping'}->{ $self->{'current_tag'} } },
            $self->{'current_code'}, $_);
        # only the first text chunk after a <subfield> tag is recorded
        $self->{'current_code'} = "";
    }

    $self->{'content'} .= $self->escape_text($_);
}

# Returns the indentation string for nesting level $num (one four-character
# unit per level).
sub calculate_indent {
    my ($self, $num) = @_;

    my $indent = "";
    for (my $i = 0; $i < $num; $i++) {
        $indent .= "    ";
    }

    return $indent;
}

# Returns $text with the XML special characters & < > " replaced by their
# entity references.
sub escape_text {
    my ($self, $text) = @_;

    # special characters in the xml encoding
    $text =~ s/&/&amp;/g;    # this has to be first...
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/\"/&quot;/g;

    return $text;
}

1;