########################################################################### # # MARCPlug.pm -- basic MARC plugin # # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 2002 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### package MARCPlug; use SplitPlug; use unicode; use util; use parsargv; use strict; no strict 'refs'; # allow filehandles to be variables and viceversa sub BEGIN { @MARCPlug::ISA = ('SplitPlug'); unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan"); } my $arguments = [ { 'name' => "metadata_mapping", 'desc' => "{MARCPlug.metadata_mapping}", 'type' => "string", 'deft' => "marctodc.txt", 'reqd' => "no" }, { 'name' => "process_exp", 'desc' => "{BasPlug.process_exp}", 'type' => "regexp", 'reqd' => "no", 'deft' => &get_default_process_exp() }, { 'name' => "split_exp", 'desc' => "{SplitPlug.split_exp}", 'type' => "regexp", 'reqd' => "no", 'deft' => &get_default_split_exp() } ]; my $options = { 'name' => "MARCPlug", 'desc' => "{MARCPlug.desc}", 'abstract' => "no", 'inherits' => "Yes", 'explodes' => "yes", 'args' => $arguments }; require MARC::Record; require MARC::Batch; #use MARC::Record; #use MARC::Batch; sub new { my ($class) = shift (@_); my ($pluginlist,$inputargs,$hashArgOptLists) = @_; push(@$pluginlist, $class); if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});} if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)}; my $self = (defined $hashArgOptLists)? new SplitPlug($pluginlist,$inputargs,$hashArgOptLists): new SplitPlug($pluginlist,$inputargs); return bless $self, $class; } sub init { my $self = shift (@_); my ($verbosity, $outhandle, $failhandle) = @_; my @metadata_mapping = (); # read in the metadata mapping file my $mm_file = &util::filename_cat( $ENV{'GSDLHOME'}, "etc", $self->{'metadata_mapping'} ); if (!-e $mm_file) { my $msg = "MARCPlug ERROR: Can't locate mapping file \"" . $self->{'metadata_mapping'} . "\".\n This file should be at $mm_file\n" . " No marc files can be processed.\n"; print $outhandle $msg; print $failhandle $msg; $self->{'metadata_mapping'} = undef; # We pick up the error in process() if there is no $mm_file # If we exit here, then pluginfo.pl will exit too! } elsif (open(MMIN, "<$mm_file")) { my $l=1; my $line; while (defined($line=)) { chomp $line; if ($line =~ m/^(\d+)\s*->\s*(\w+)$/) { my $marc_info = $1; my $gsdl_info = $2; my $mapping = { 'marc' => $marc_info, 'gsdl' => $gsdl_info }; push(@metadata_mapping,$mapping); } elsif ($line !~ m/^\#/ # allow comments (# in first column) && $line !~ m/^\s*$/) # allow blank lines { print $outhandle "Parse error on line $l of $mm_file:\n"; print $outhandle " \"$line\"\n"; } $l++ } close(MMIN); } else { print STDERR "Unable to open $mm_file: $!\n"; } $self->{'metadata_mapping'} = \@metadata_mapping; $self->SUPER::init(@_); } sub get_default_process_exp { my $self = shift (@_); return q^(?i)(\.marc)$^; } sub get_default_split_exp { # \r\n for msdos eol, \n for unix return q^\r?\n\s*\r?\n^; } # The bulk of this function is based on read_line in multiread.pm # Unable to use read_line original because it expects to get its input # from a file. Here the line to be converted is passed in as a string sub to_utf8 { my $self = shift (@_); my ($encoding, $line) = @_; if ($encoding eq "iso_8859_1") { # we'll use ascii2utf8() for this as it's faster than going # through convert2unicode() return &unicode::ascii2utf8 (\$line); } # everything else uses unicode::convert2unicode return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line)); } sub read_file { my $self = shift (@_); my ($filename, $encoding, $language, $textref) = @_; $self->{'readfile_encoding'}->{$filename} = $encoding; my @marc_entries = (); if (!-r $filename) { my $outhandle = $self->{'outhandle'}; print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'}; return; } my $batch = new MARC::Batch( 'USMARC', $filename ); while ( my $marc = $batch->next ) { push(@marc_entries,$marc); $$textref .= $marc->as_formatted(); $$textref .= "\n\n"; # for SplitPlug - see default_split_exp above... } $self->{'marc_entries'}->{$filename} = \@marc_entries; } # do plugin specific processing of doc_obj # This gets done for each record found by SplitPlug in marc files. sub process { my $self = shift (@_); my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_; my $outhandle = $self->{'outhandle'}; my $filename = &util::filename_cat($base_dir, $file); if (! defined($self->{'metadata_mapping'})) { print $outhandle "MARCPlug: no metadata file! Can't process $file\n"; return undef; } print STDERR "\n" if ($gli); print $outhandle "MARCPlug: processing $file\n" if $self->{'verbosity'} > 1; my $cursection = $doc_obj->get_top_section(); # Add fileFormat as the metadata $doc_obj->add_metadata($cursection, "FileFormat", "MARC"); my $marc_entries = $self->{'marc_entries'}->{$filename}; my $marc = shift(@$marc_entries); my $encoding = $self->{'readfile_encoding'}->{$filename}; $self->extract_metadata ($marc, $metadata, $encoding, $doc_obj, $cursection); # add spaces after the sub-field markers, for word boundaries $$textref =~ s/^(.{6} _\w)/$1 /gm; # add text to document object $$textref =~ s//>/g; $$textref = $self->to_utf8($encoding,$$textref); print $outhandle " Adding Marc Record:\n",substr($$textref,0,40), " ...\n" if $self->{'verbosity'} > 2; # line wrapping $$textref = &wrap_text_in_columns($$textref, 64); $$textref = "
\n" . $$textref . "
\n"; # HTML formatting... $doc_obj->add_utf8_text($cursection, $$textref); return 1; } sub wrap_text_in_columns { my ($text, $columnwidth) = @_; my $newtext = ""; my $linelength = 0; # Break the text into words, and display one at a time my @words = split(/ /, $text); foreach my $word (@words) { # If printing this word would exceed the column end, start a new line if (($linelength + length($word)) >= $columnwidth) { $newtext .= "\n"; $linelength = 0; } # Write the word $newtext .= " $word"; if ($word =~ /\n/) { $linelength = 0; } else { $linelength = $linelength + length(" $word"); } } $newtext .= "\n"; return $newtext; } sub extract_metadata { my $self = shift (@_); my ($marc, $metadata, $encoding, $doc_obj, $section) = @_; my $outhandle = $self->{'outhandle'}; my $metadata_mapping = $self->{'metadata_mapping'}; my $mm; foreach $mm ( @$metadata_mapping ) { my $marc_field = $mm->{'marc'}; my @metavalues = $marc->field($marc_field); if (scalar(@metavalues)>0) { my $metaname = $mm->{'gsdl'}; my $metavalue; foreach $metavalue ( @metavalues ) { my $metavalue_str = $self->to_utf8($encoding,$metavalue->as_string()); $doc_obj->add_utf8_metadata ($section, $metaname, $metavalue_str); } } } } 1;