########################################################################### # # MARCXMLPlug.pm # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 2001 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### # Processes MARCXML documents. Note that this plugin does no # syntax checking (though the XML::Parser module tests for # well-formedness). package MARCXMLPlug; use XMLPlug; use strict; no strict 'refs'; # allow filehandles to be variables and viceversa sub BEGIN { @MARCXMLPlug::ISA = ('XMLPlug'); } my $arguments = [{'name' => "metadata_mapping_file", 'desc' => "{MARCXMLPlug.metadata_mapping_file}", 'type' => "string", 'deft' => "marctodc.txt", 'reqd' => "no" }]; my $options = { 'name' => "MARCXMLPlug", 'desc' => "{MARCXMLPlug.desc}", 'abstract' => "no", 'inherits' => "yes", 'args' => $arguments }; sub new { my ($class) = shift (@_); my ($pluginlist,$inputargs,$hashArgOptLists) = @_; push(@$pluginlist, $class); if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});} if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)}; my $self = new XMLPlug($pluginlist, $inputargs, $hashArgOptLists); $self->{'content'} = ""; $self->{'xmlcontent'} = ""; $self->{'record_count'} = 1; $self->{'language'} = ""; $self->{'encoding'} = ""; $self->{'marc_mapping'} = {}; $self->{'current_code'} = ""; $self->{'current_tag'} = ""; $self->{'current_element'} = ""; $self->{'metadata_mapping'} = undef; $self->{'num_processed'} = 0; $self->{'indent'} = 0; return bless $self, $class; } sub get_doctype { my $self = shift(@_); return "collection"; } sub _parse_marc_metadata_mapping { my $self = shift(@_); my ($mm_file,$metadata_mapping) = @_; my $outhandle = $self->{'outhandle'}; if (open(MMIN, "<$mm_file")) { my $l=0; my $line; while (defined($line=)) { $l++; chomp $line; $line =~ s/#.*$//; # strip out any comments, including end of line next if ($line =~ m/^\s*$/); $line =~ s/\s+$//; # remove any white space at end of line my $parse_error_count = 0; if ($line =~ m/^-(\d+)\s*$/) { # special "remove" rule syntax my $marc_info = $1; if (defined $metadata_mapping->{$marc_info}) { delete $metadata_mapping->{$marc_info}; } else { print $outhandle "Parse Warning: Did not file pre-existing rule $marc_info to remove"; print $outhandle " on line $l of $mm_file:\n"; print $outhandle " $line\n"; } } elsif ($line =~ m/^(.*?)->\s*([\w\^]+)$/) { my $lhs = $1; my $gsdl_info = $2; my @fields = split(/,\s*/,$lhs); my $f; while ($f = shift (@fields)) { $f =~ s/\s+$//; # remove any white space at end of line if ($f =~ m/^(\d+)\-(\d+)$/) { # number range => genrate number in range and # push on to array push(@fields,$1..$2); next; } if ($f =~ m/^(\d+)((?:(?:\$|\^)\w)*)\s*$/) { my $marc_info = $1; my $opt_sub_fields = $2; if ($opt_sub_fields ne "") { my @sub_fields = split(/\$|\^/,$opt_sub_fields); shift @sub_fields; # skip first entry, which is blank foreach my $sub_field (@sub_fields) { $metadata_mapping->{$marc_info."\$".$sub_field} = $gsdl_info; } } else { # no subfields to worry about $marc_info =~ s/\^/\$/; $metadata_mapping->{$marc_info} = $gsdl_info; } } else { $parse_error_count++; } } } else { $parse_error_count++; } if ($parse_error_count>0) { print $outhandle "Parse Error: $parse_error_count syntax error(s) on line $l of $mm_file:\n"; print $outhandle " $line\n"; } } close(MMIN); } else { print STDERR "Unable to open $mm_file: $!\n"; } } sub parse_marc_metadata_mapping { my $self = shift(@_); my ($mm_file_or_files) = @_; my $metadata_mapping = {}; if (ref ($mm_file_or_files) eq 'SCALAR') { my $mm_file = $mm_file_or_files; $self->_parse_marc_metadata_mapping($mm_file,$metadata_mapping); } else { my $mm_files = $mm_file_or_files; # Need to process files in reverse order. This is so in the # case where we have both a "collect" and "main" version, # the "collect" one tops up the main one my $mm_file; while ($mm_file = pop(@$mm_files)) { $self->_parse_marc_metadata_mapping($mm_file,$metadata_mapping); } } return $metadata_mapping; } sub init { my $self = shift (@_); my ($verbosity, $outhandle, $failhandle) = @_; ## the mapping file has already been loaded if (defined $self->{'metadata_mapping'} ){ $self->SUPER::init(@_); return; } # read in the metadata mapping file my $mm_files = &util::locate_config_files($self->{'metadata_mapping_file'}); if (scalar(@$mm_files)==0) { my $msg = "MARCXMLPlug ERROR: Can't locate mapping file \"" . $self->{'metadata_mapping_file'} . "\".\n " . " No marc files can be processed.\n"; print $outhandle $msg; print $failhandle $msg; $self->{'metadata_mapping'} = undef; # We pick up the error in process() if there is no $mm_file # If we exit here, then pluginfo.pl will exit too! } else { $self->{'metadata_mapping'} = $self->parse_marc_metadata_mapping($mm_files); } ##map { print STDERR $_."=>".$metadata_mapping->{$_}."\n"; } keys %$metadata_mapping; $self->SUPER::init(@_); } # Called for DOCTYPE declarations - use die to bail out if this doctype # is not meant for this plugin sub xml_doctype { my $self = shift(@_); my ($expat, $name, $sysid, $pubid, $internal) = @_; return; } sub xml_start_document { my $self = shift(@_); my ($expat, $name, $sysid, $pubid, $internal) = @_; my $file = $self->{'file'}; my $filename = $self->{'filename'}; my ($language, $encoding) = $self->textcat_get_language_encoding ($filename); $self->{'language'} = $language; $self->{'encoding'} = $encoding; $self->{'element_count'} = 1; $self->{'indent'} = 0; my $outhandle = $self->{'outhandle'}; print $outhandle "MARCXMLPlug: processing $self->{'file'}\n" if $self->{'verbosity'} > 1; print STDERR "\n" if $self->{'gli'}; } sub xml_end_document { } sub xml_start_tag { my $self = shift; my $expat = shift; my $element = shift; my $text = $_; my $escaped_text = $self->escape_text($_); $self->{'current_element'} = $element; ##get all atributes of this element and store it in a map name=>value my %attr_map = (); my $attrstring = $_; while ($attrstring =~ /(\w+)=\"(\w+)\"/){ $attr_map{$1}=$2; $attrstring = $'; #' } my $processor = $self->{'processor'}; ##create a new document for each record if ($element eq "record") { my $filename = $self->{'filename'}; my $language = $self->{'language'}; my $encoding = $self->{'encoding'}; my $file = $self->{'file'}; my $doc_obj = new doc($filename); $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'}); $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language); $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding); my ($filemeta) = $file =~ /([^\\\/]+)$/; $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta)); $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$self->{'record_count'}"); if ($self->{'cover_image'}) { $self->associate_cover_image($doc_obj, $filename); } $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}"); $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML"); my $outhandle = $self->{'outhandle'}; print $outhandle "Record $self->{'record_count'} - MARCXMLPlug: processing $self->{'file'}\n" if $self->{'verbosity'} > 1; $self->{'record_count'}++; $self->{'doc_obj'} = $doc_obj; $self->{'num_processed'}++; } ## get the marc code, for example 520 if ($element eq "datafield") { if (defined $attr_map{'tag'} and $attr_map{'tag'} ne ""){ $self->{'current_tag'} = $attr_map{tag}; } } ## append the subcode to the marc code for example 520a or 520b if ($element eq "subfield"){ if (defined $attr_map{'code'} and $attr_map{'code'} ne "" and $self->{'current_tag'} ne ""){ $self->{'current_code'} = $attr_map{'code'}; } } if ($element eq "record"){ $self->{'indent'} = 0; $self->{'content'} = ""; $self->{'xmlcontent'} = ""; } else { if ($element ne "subfield"){ $self->{'indent'} = 1; } else{ $self->{'indent'} = 2; } } if ($element eq "collection") { # remember the full start tag for # This is needed to wrap around each when generating its associate MARCXML file $self->{'xmlcollectiontag'} = $text; } else { $self->{'content'} .= "
" if ($element ne "record"); $self->{'content'} .= $self->calculate_indent($self->{'indent'}).$escaped_text; $self->{'xmlcontent'} .= $text; } } sub xml_end_tag { my $self = shift(@_); my ($expat, $element) = @_; my $text = $_; my $escaped_text = $self->escape_text($_); if ($element eq "record" and defined $self->{'doc_obj'}) { # process the document my $processor = $self->{'processor'}; my $doc_obj = $self->{'doc_obj'}; $self->{'content'} .= "
".$escaped_text; $self->{'xmlcontent'} .= $text; my $top_section = $doc_obj->get_top_section(); my $tmp_marcxml_filename = &util::get_tmp_filename().".xml"; if (open (XMLOUT,">$tmp_marcxml_filename")) { print XMLOUT "\n"; my $xml_content = $self->{'xmlcontent'}; $xml_content = $self->{'xmlcollectiontag'}.$xml_content."
"; print XMLOUT $xml_content; close(XMLOUT); $doc_obj->associate_file($tmp_marcxml_filename,"marcxml.xml","text/xml", $top_section); # assicate xsl style file for presentation as HTML my $xsl_filename = &util::filename_cat($ENV{'GSDLHOME'},"etc","MARC21slim2English.xsl"); $doc_obj->associate_file($xsl_filename,"MARC21slim2English.xsl","text/xml", $top_section); } else { my $outhandle = $self->{'outhandle'}; print $outhandle "Warning: Unable for write out associated MARCXML file $tmp_marcxml_filename\n"; } $self->add_OID($doc_obj, $self->{'record_count'}); $doc_obj->add_utf8_text($doc_obj->get_top_section(),$self->{'content'}); $processor->process($doc_obj); ##clean up $self->{'content'} = ""; $self->{'xmlcontent'} = ""; $self->{'doc_obj'} = undef; return; } ## map the xmlmarc to gsdl metadata if ($element eq "datafield" and defined $self->{'doc_obj'} and defined $self->{'marc_mapping'}){ my $metadata_mapping = $self->{'metadata_mapping'}; my $marc_mapping = $self->{'marc_mapping'}; my $doc_obj = $self->{'doc_obj'}; ## print STDERR "**** Marc Record\n"; ## map { print STDERR $_."=>".$marc_mapping->{$_}."\n"; } keys %$marc_mapping; ## print STDERR "**** Metadata Mapping\n"; ## map { print STDERR $_."=>".$metadata_mapping->{$_}."\n"; } keys %$metadata_mapping; foreach my $marc_field (keys %$metadata_mapping){ ## test whether this field has subfield my $subfield = undef; if ($marc_field =~ /(\d\d\d)(?:\$|\^)?(\w)/){ $marc_field = $1; $subfield = $2; } my $matched_field = $marc_mapping->{$marc_field}; if (defined $matched_field) { my $meta_name = undef; my $meta_value = undef; if (defined $subfield){ $meta_name = $metadata_mapping->{$marc_field."\$".$subfield}; $meta_value = $matched_field->{$subfield}; if (!defined $meta_value) { # record read in does not have the specified subfield next; } } else { $meta_name = $metadata_mapping->{$marc_field}; # no subfield => get all the values foreach my $value (sort keys %{$matched_field}) { $meta_value .= $matched_field->{$value} ." "; } } ## escape [ and ] $meta_value =~ s/\[/\\\[/g; $meta_value =~ s/\]/\\\]/g; ##print STDERR "$meta_name=$meta_value\n"; $doc_obj->add_utf8_metadata($doc_obj->get_top_section(),$meta_name, $meta_value); } } ##clean up $self->{'marc_mapping'} = undef; $self->{'current_tag'} = ""; } if ($element eq "datafield"){ $self->{'indent'} = 1; $self->{'content'} .= "
".$self->calculate_indent($self->{'indent'}).$escaped_text; $self->{'xmlcontent'} .= $text; } else{ $self->{'content'} .= $escaped_text; $self->{'xmlcontent'} .= $text; } } sub set_OID { my $self = shift (@_); my ($doc_obj, $record_number) = @_; # first set it to generate hash value $doc_obj->set_OID(); # then top it up with an "r" + record-number suffix my $id = $doc_obj->get_OID(); $doc_obj->set_OID($id . "r" . $record_number); } sub xml_text { my $self = shift(@_); my ($expat) = @_; my $text = $_; my $escaped_text = $self->escape_text($_); # protect against & in raw text file $text =~ s/&/&/g; # can't have & in raw form, even in 'raw' xml text ## store the text of a marc code, for exapmle 520a=>A poem about.... if ($self->{'current_element'} eq "subfield" and $self->{'current_code'} ne "" and $_ ne "" ){ ##stored it in the marc_mapping my $current_tag = $self->{'current_tag'}; my $current_code = $self->{'current_code'}; $self->{'marc_mapping'}->{$current_tag}->{$current_code} .= $_; $self->{'current_code'} = ""; } $self->{'content'} .= $escaped_text; $self->{'xmlcontent'} .= $text; } sub calculate_indent{ my ($self,$num) = @_; my $indent =""; for (my $i=0; $i<$num;$i++){ $indent .= "    "; } return $indent; } sub escape_text { my ($self,$text) = @_; # special characters in the xml encoding $text =~ s/&/&/g; # this has to be first... $text =~ s//>/g; $text =~ s/\"/"/g; return $text; } sub unescape_text { my ($self,$text) = @_; # special characters in the xml encoding $text =~ s/<//g; $text =~ s/"/\"/g; $text =~ s/&/&/g; # can't have & in raw form, even in unescaped xml! return $text; } 1;