source: trunk/gsdl/perllib/plugins/MARCPlug.pm@ 5845

Last change on this file since 5845 was 5680, checked in by mdewsnip, 21 years ago

Moved plugin descriptions into the resource bundle (perllib/strings.rb), for translation by UNESCO.

  • Property svn:keywords set to Author Date Id Revision
File size: 7.2 KB
Line 
1###########################################################################
2#
3# MARCPlug.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlug;
28
29use SplitPlug;
30
31use unicode;
32use util;
33use parsargv;
34
35sub BEGIN {
36 @ISA = ('SplitPlug');
37 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
38}
39
40my $arguments =
41 [ { 'name' => "metadata_mapping",
42 'desc' => "{MARCPlug.metadata_mapping}",
43 'type' => "string",
44 'deft' => "marctodc.txt",
45 'reqd' => "no" } ];
46
47my $options = { 'name' => "MARCPlug",
48 'desc' => "{MARCPlug.desc}",
49 'inherits' => "Yes",
50 'args' => $arguments };
51
52use MARC::Record;
53use MARC::Batch;
54
55# sub print_usage {
56# print STDERR "\n usage: plugin MARCPlug [options]\n\n";
57# print STDERR " options:\n";
58# print STDERR
59# " -metadata_mapping Name of file that includes mapping details from MARC
60# values to Greenstone metadata names. Defaults to
61# 'marctodc.txt' found in the site's etc directory.\n\n";
62# }
63
64sub new {
65 my $class = shift (@_);
66 my $self = new SplitPlug ($class, @_);
67
68 my $metadata_mapping;
69
70 if (!parsargv::parse(\@_,
71 q^metadata_map/.*/marctodc.txt^, \$metadata_mapping,
72 "allow_extra_options")) {
73
74 print STDERR "\nIncorrect options passed to MARCPlug, check your collect.cfg configuration file\n";
75 $self->print_txt_usage(""); # Use default resource bundle
76 die "\n";
77 }
78
79
80 $self->{'mm_file'} = $metadata_mapping; # relative to etc dir
81
82 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
83 my $option_list = $self->{'option_list'};
84 push( @{$option_list}, $options );
85
86 return bless $self, $class;
87}
88
89sub init {
90 my $self = shift (@_);
91 my ($verbosity, $outhandle, $failhandle) = @_;
92
93 my @metadata_mapping = ();
94
95 # read in the metadata mapping file
96 my $mm_file =
97 &util::filename_cat( $ENV{'GSDLHOME'}, "etc", $self->{'mm_file'} );
98
99 if (!-e $mm_file)
100 {
101
102 my $msg = "MARCPlug ERROR: Can't locate mapping file \"" .
103 $self->{'mm_file'} . "\".\n This file should be at $mm_file\n" .
104 " No marc files can be processed.\n";
105
106 print $outhandle $msg;
107 print $failhandle $msg;
108 $self->{'mm_file'} = undef;
109 # We pick up the error in process() if there is no $mm_file
110 # If we exit here, then pluginfo.pl will exit too!
111 }
112 elsif (open(MMIN, "<$mm_file"))
113 {
114 my $l=1;
115 my $line;
116 while (defined($line=<MMIN>))
117 {
118 chomp $line;
119 if ($line =~ m/^(\d+)\s*->\s*(\w+)$/)
120 {
121 my $marc_info = $1;
122 my $gsdl_info = $2;
123 my $mapping = { 'marc' => $marc_info, 'gsdl' => $gsdl_info };
124 push(@metadata_mapping,$mapping);
125 }
126 elsif ($line !~ m/^\#/ # allow comments (# in first column)
127 && $line !~ m/^\s*$/) # allow blank lines
128 {
129 print $outhandle "Parse error on line $l of $mm_file:\n";
130 print $outhandle " \"$line\"\n";
131 }
132 $l++
133 }
134 close(MMIN);
135 }
136 else
137 {
138 print STDERR "Unable to open $mm_file: $!\n";
139 }
140
141 $self->{'metadata_mapping'} = \@metadata_mapping;
142
143 $self->SUPER::init(@_);
144}
145
146
147sub get_default_process_exp {
148 my $self = shift (@_);
149
150 return q^(?i)(\.marc)$^;
151}
152
153
154sub get_default_split_exp {
155 # \r\n for msdos eol, \n for unix
156 return q^\r?\n\s*\r?\n^;
157}
158
159
160
161# The bulk of this function is based on read_line in multiread.pm
162# Unable to use read_line original because it expects to get its input
163# from a file. Here the line to be converted is passed in as a string
164
165sub to_utf8
166{
167 my $self = shift (@_);
168 my ($encoding, $line) = @_;
169
170 if ($encoding eq "iso_8859_1") {
171 # we'll use ascii2utf8() for this as it's faster than going
172 # through convert2unicode()
173 return &unicode::ascii2utf8 (\$line);
174 }
175
176 # everything else uses unicode::convert2unicode
177 return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
178}
179
180
181sub read_file {
182 my $self = shift (@_);
183 my ($filename, $encoding, $language, $textref) = @_;
184
185 $self->{'readfile_encoding'} = $encoding;
186
187 @marc_entries = ();
188
189 if (!-r $filename)
190 {
191 my $outhandle = $self->{'outhandle'};
192 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
193 return;
194 }
195
196 my $batch = new MARC::Batch( 'USMARC', $filename );
197 while ( my $marc = $batch->next )
198 {
199 push(@marc_entries,$marc);
200 $$textref .= $marc->as_formatted();
201 $$textref .= "\n\n"; # for SplitPlug - see default_split_exp above...
202 }
203
204 $self->{'marc_entries'} = \@marc_entries;
205}
206
207
208
209# do plugin specific processing of doc_obj
210# This gets done for each record found by SplitPlug in marc files.
211sub process {
212 my $self = shift (@_);
213 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
214 my $outhandle = $self->{'outhandle'};
215
216 if (! defined($self->{'mm_file'}))
217 {
218 print $outhandle "MARCPlug: no metadata file! Can't process $file\n";
219 return undef;
220 }
221 print $outhandle "MARCPlug: processing $file\n"
222 if $self->{'verbosity'} > 1;
223
224 my $cursection = $doc_obj->get_top_section();
225
226 my $marc_entries = $self->{'marc_entries'};
227 my $marc = shift(@$marc_entries);
228
229 $self->extract_metadata ($marc, $metadata, $doc_obj, $cursection);
230
231 # add spaces after the sub-field markers, for word boundaries
232 $$textref =~ s/^(.{6} _\w)/$1 /gm;
233
234 # add text to document object
235 $$textref =~ s/</&lt;/g;
236 $$textref =~ s/>/&gt;/g;
237
238 my $encoding = $self->{'readfile_encoding'};
239 $$textref = $self->to_utf8($encoding,$$textref);
240
241 print $outhandle " Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
242 if $self->{'verbosity'} > 2;
243
244 $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
245
246 $doc_obj->add_utf8_text($cursection, $$textref);
247
248 return 1;
249}
250
251
252
253sub extract_metadata
254{
255 my $self = shift (@_);
256 my ($marc, $metadata, $doc_obj, $section) = @_;
257 my $outhandle = $self->{'outhandle'};
258
259 my $encoding = $self->{'readfile_encoding'};
260
261 my $metadata_mapping = $self->{'metadata_mapping'};
262 my $mm;
263 foreach $mm ( @$metadata_mapping )
264 {
265 my $marc_field = $mm->{'marc'};
266 my @metavalues = $marc->field($marc_field);
267
268 if (scalar(@metavalues)>0)
269 {
270 my $metaname = $mm->{'gsdl'};
271 my $metavalue;
272 foreach $metavalue ( @metavalues )
273 {
274 $metavalue_str = $self->to_utf8($encoding,$metavalue->as_string());
275 $doc_obj->add_utf8_metadata ($thissection, $metaname, $metavalue_str);
276 }
277 }
278 }
279}
2801;
Note: See TracBrowser for help on using the repository browser.