source: gsdl/trunk/perllib/plugins/MARCPlug.pm@ 15018

Last change on this file since 15018 was 15018, checked in by davidb, 14 years ago

Marc mapping upgraded to support richer set of operations, including subfields, multiple fields in one line (separated by comma), and the removal of rules, e.g. -245 at the start of a line. A Marc to Qualified Dublin Core crosswalk from the Library of congress has been added as "etc/marc2qdc.txt". A collection can then choose to, for example, top up the mapping with its own version of the file stored in its local "etc" folder, specifying only the rules that are different. This is where a rule like "-245" might be used to override a more general rule from the main file that has all subfields in 245 mapping to one metadata item (Title). If the user specifies a different different filename -- through a plugin option -- then they are free to divise a mapping from scratch and store it in the collections local "etc" folder.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.4 KB
Line 
1###########################################################################
2#
3# MARCPlug.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlug;
28
29use SplitPlug;
30
31use unicode;
32use util;
33
34use strict;
35no strict 'refs'; # allow filehandles to be variables and viceversa
36
37sub BEGIN {
38 @MARCPlug::ISA = ('SplitPlug');
39 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
40}
41
42my $arguments =
43 [ { 'name' => "metadata_mapping",
44 'desc' => "{MARCPlug.metadata_mapping}",
45 'type' => "string",
46 'deft' => "marctodc.txt",
47 'hiddengli' = "yes", # deprecated in favour or 'metadata_mapping_file'
48 'reqd' => "no" },
49 { 'name' => "metadata_mapping_file",
50 'desc' => "{MARCXMLPlug.metadata_mapping_file}",
51 'type' => "string",
52 'deft' => "",
53 'reqd' => "no" },
54 { 'name' => "process_exp",
55 'desc' => "{BasPlug.process_exp}",
56 'type' => "regexp",
57 'reqd' => "no",
58 'deft' => &get_default_process_exp() },
59 { 'name' => "split_exp",
60 'desc' => "{SplitPlug.split_exp}",
61 'type' => "regexp",
62 'reqd' => "no",
63 'deft' => &get_default_split_exp() }
64 ];
65
66my $options = { 'name' => "MARCPlug",
67 'desc' => "{MARCPlug.desc}",
68 'abstract' => "no",
69 'inherits' => "yes",
70 'explodes' => "yes",
71 'args' => $arguments };
72
73require MARC::Record;
74require MARC::Batch;
75#use MARC::Record;
76#use MARC::Batch;
77
78sub new {
79 my ($class) = shift (@_);
80 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
81 push(@$pluginlist, $class);
82
83 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
84 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
85
86 my $self = new SplitPlug($pluginlist, $inputargs, $hashArgOptLists);
87
88 # 'metadata_mapping' was used in two ways in the plugin: as a plugin
89 # option (filename) and as a datastructure to represent the mapping.
90 # In MARXXMLPlug (written later) the two are separated: filename is
91 # represented through 'metadata_mapping_file' and the data-structure
92 # mapping left as 'metadata_mapping'
93 # 'metadata_mapping' still present (but hidden in GLI) for
94 # backwards compatibility, but 'metadata_mapping_file' is used by
95 # preference
96
97 if ($self->{'metadata_mapping_file'} eq "") {
98 # If nothing set in the new version, use the old version
99 # that defaults to 'marctodc.txt'
100 $self->{'metadata_mapping_file'} = $self->{'metadata_mapping'};
101 }
102
103 $self->{'type'} = "";
104
105 return bless $self, $class;
106}
107
108sub init {
109 my $self = shift (@_);
110 my ($verbosity, $outhandle, $failhandle) = @_;
111
112 my @metadata_mapping = ();
113
114 # read in the metadata mapping file
115
116 my $mm_file = &util::locate_config_file($self->{'metadata_mapping_file'});
117
118 if (!defined $mm_file)
119 {
120
121 my $msg = "MARCPlug ERROR: Can't locate mapping file \"" .
122 $self->{'metadata_mapping_file'} . "\".\n" .
123 " No marc files can be processed.\n";
124
125 print $outhandle $msg;
126 print $failhandle $msg;
127 $self->{'metadata_mapping'} = undef;
128 # We pick up the error in process() if there is no $mm_file
129 # If we exit here, then pluginfo.pl will exit too!
130 }
131 elsif (open(MMIN, "<$mm_file"))
132 {
133 my $l=1;
134 my $line;
135 while (defined($line=<MMIN>))
136 {
137 chomp $line;
138 if ($line =~ m/^(\d+)\s*->\s*([\w\^]+)$/)
139 {
140 my $marc_info = $1;
141 my $gsdl_info = $2;
142 my $mapping = { 'marc' => $marc_info, 'gsdl' => $gsdl_info };
143 push(@metadata_mapping,$mapping);
144 }
145 elsif ($line !~ m/^\#/ # allow comments (# in first column)
146 && $line !~ m/^\s*$/) # allow blank lines
147 {
148 print $outhandle "Parse error on line $l of $mm_file:\n";
149 print $outhandle " \"$line\"\n";
150 }
151 $l++
152 }
153 close(MMIN);
154
155 $self->{'metadata_mapping'} = \@metadata_mapping;
156 }
157 else
158 {
159 print STDERR "Unable to open $mm_file: $!\n";
160 }
161
162
163
164 $self->SUPER::init(@_);
165}
166
167
168sub get_default_process_exp {
169 my $self = shift (@_);
170
171 return q^(?i)(\.marc)$^;
172}
173
174
175sub get_default_split_exp {
176 # \r\n for msdos eol, \n for unix
177 return q^\r?\n\s*\r?\n|\[\w+\]Record type: USmarc^;
178}
179
180
181
182# The bulk of this function is based on read_line in multiread.pm
183# Unable to use read_line original because it expects to get its input
184# from a file. Here the line to be converted is passed in as a string
185
186sub to_utf8
187{
188 my $self = shift (@_);
189 my ($encoding, $line) = @_;
190
191 if ($encoding eq "utf8") {
192 # nothing needs to be done
193 return $line;
194 }
195
196 if ($encoding eq "iso_8859_1") {
197 # we'll use ascii2utf8() for this as it's faster than going
198 # through convert2unicode()
199 return &unicode::ascii2utf8 (\$line);
200 }
201
202 # everything else uses unicode::convert2unicode
203 return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
204}
205
206
207sub read_file {
208 my $self = shift (@_);
209 my ($filename, $encoding, $language, $textref) = @_;
210
211 $self->{'readfile_encoding'}->{$filename} = $encoding;
212
213
214 if (!-r $filename)
215 {
216 my $outhandle = $self->{'outhandle'};
217 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
218 return;
219 }
220
221 ##handle ascii marc
222 #test whether this is ascii marc file
223 if (open (FILE, $filename)) {
224 while (defined (my $line = <FILE>)) {
225 $$textref .= $line;
226 if ($line =~ /\[\w+\]Record type:/){
227 undef $/;
228 $$textref .= <FILE>;
229 $/ = "\n";
230 $self->{'type'} = "ascii";
231 close FILE;
232 return;
233 }
234 }
235 close FILE;
236 }
237
238
239 $$textref = "";
240 my @marc_entries = ();
241
242 my $batch = new MARC::Batch( 'USMARC', $filename );
243 while ( my $marc = $batch->next )
244 {
245 push(@marc_entries,$marc);
246 $$textref .= $marc->as_formatted();
247 $$textref .= "\n\n"; # for SplitPlug - see default_split_exp above...
248 }
249
250 $self->{'marc_entries'}->{$filename} = \@marc_entries;
251}
252
253
254
255# do plugin specific processing of doc_obj
256# This gets done for each record found by SplitPlug in marc files.
257sub process {
258 my $self = shift (@_);
259 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
260
261 my $outhandle = $self->{'outhandle'};
262 my $filename = &util::filename_cat($base_dir, $file);
263
264 if (! defined($self->{'metadata_mapping'}))
265 {
266 print $outhandle "MARCPlug: no metadata file! Can't process $file\n";
267 return undef;
268 }
269
270 print STDERR "<Processing n='$file' p='MARCPlug'>\n" if ($gli);
271 print $outhandle "MARCPlug: processing $file\n"
272 if $self->{'verbosity'} > 1;
273
274 my $cursection = $doc_obj->get_top_section();
275
276 # Add fileFormat as the metadata
277 $doc_obj->add_metadata($cursection, "FileFormat", "MARC");
278
279 my $marc_entries = $self->{'marc_entries'}->{$filename};
280 my $marc = shift(@$marc_entries);
281
282 my $encoding = $self->{'readfile_encoding'}->{$filename};
283
284 if ($self->{'type'} ne "ascii" ){
285 $self->extract_metadata ($marc, $metadata, $encoding, $doc_obj, $cursection);
286 }
287 else{
288 $self->extract_ascii_metadata ($$textref,$metadata,$doc_obj, $cursection);
289 }
290
291 # add spaces after the sub-field markers, for word boundaries
292 $$textref =~ s/^(.{6} _\w)/$1 /gm;
293
294 # add text to document object
295 $$textref =~ s/</&lt;/g;
296 $$textref =~ s/>/&gt;/g;
297
298 $$textref = $self->to_utf8($encoding,$$textref);
299
300 print $outhandle " Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
301 if $self->{'verbosity'} > 2;
302
303 # line wrapping
304 $$textref = &wrap_text_in_columns($$textref, 64);
305 $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
306
307 $doc_obj->add_utf8_text($cursection, $$textref);
308
309 return 1;
310}
311
312sub wrap_text_in_columns
313{
314 my ($text, $columnwidth) = @_;
315 my $newtext = "";
316 my $linelength = 0;
317
318 # Break the text into words, and display one at a time
319 my @words = split(/ /, $text);
320
321 foreach my $word (@words) {
322 # If printing this word would exceed the column end, start a new line
323 if (($linelength + length($word)) >= $columnwidth) {
324 $newtext .= "\n";
325 $linelength = 0;
326 }
327
328 # Write the word
329 $newtext .= " $word";
330 if ($word =~ /\n/) {
331 $linelength = 0;
332 } else {
333 $linelength = $linelength + length(" $word");
334 }
335 }
336
337 $newtext .= "\n";
338 return $newtext;
339}
340
341
342sub extract_metadata
343{
344 my $self = shift (@_);
345 my ($marc, $metadata, $encoding, $doc_obj, $section) = @_;
346 my $outhandle = $self->{'outhandle'};
347
348 if (!defined $marc){
349 return;
350 }
351
352 my $metadata_mapping = $self->{'metadata_mapping'};
353 my $mm;
354
355 foreach $mm ( @$metadata_mapping )
356 {
357 my $marc_field = $mm->{'marc'};
358
359 my @metavalues = $marc->field($marc_field);
360
361 if (scalar(@metavalues)>0)
362 {
363 my $metaname = $mm->{'gsdl'};
364 my $metavalue;
365 foreach $metavalue ( @metavalues )
366 {
367 my $metavalue_str = $self->to_utf8($encoding,$metavalue->as_string());
368 $doc_obj->add_utf8_metadata ($section, $metaname, $metavalue_str);
369 }
370 }
371 }
372}
373
374sub extract_ascii_metadata
375{
376 my $self = shift (@_);
377 my ($text, $metadata,$doc_obj, $section) = @_;
378 my $outhandle = $self->{'outhandle'};
379 my $metadata_mapping = $self->{'metadata_mapping'};
380 ## get fields
381 my @fields = split(/[\n\r]+/,$text);
382 my $marc_mapping ={};
383
384 foreach my $field (@fields){
385 if ($field ne ""){
386 $field =~ /^(\d\d\d)\s/;
387 my $code = $1;
388 $field = $';
389 ##get subfields
390 my @subfields = split(/\$/,$field);
391 my $i=0;
392 $marc_mapping->{$code} = [];
393 foreach my $subfield (@subfields){
394 if ($i == 0){
395 ##print STDERR $subfield."\n";
396 push(@{$marc_mapping->{$code}},"info");
397 push(@{$marc_mapping->{$code}},$subfield);
398
399 $i++;
400 }
401 else{
402 $subfield =~ /(\w)\s/;
403 ##print STDERR "$1=>$'\n";
404 push(@{$marc_mapping->{$code}},$1);
405 push(@{$marc_mapping->{$code}},$');
406 }
407 }
408 }
409 }
410
411
412 foreach my $mm ( @$metadata_mapping )
413 {
414 my $marc_field = $mm->{'marc'};
415
416 my $matched_field = $marc_mapping->{$marc_field};
417 my $subfield = undef;
418
419 if (defined $matched_field){
420 ## test whether this field has subfield
421 if ($marc_field =~ /\d\d\d(\w)/){
422 $subfield = $1;
423 }
424 my $metaname = $mm->{'gsdl'};
425
426 my $metavalue;
427 if (defined $subfield){
428 my %mapped_subfield = {@$matched_field};
429 $metavalue = $mapped_subfield{$subfield};
430 }
431 else{ ## get all values except info
432 my $i =0;
433 foreach my $value (@$matched_field){
434 if ($i%2 != 0 and $i != 1){
435 $metavalue .= $value." ";
436 }
437 $i++;
438 }
439 }
440
441 ## escape [ and ]
442 $metavalue =~ s/\[/\\\[/g;
443 $metavalue =~ s/\]/\\\]/g;
444 ##print STDERR "$metaname=$metavalue\n";
445 $doc_obj->add_metadata ($section, $metaname, $metavalue) ;
446 }
447
448 }
449
450}
451
452
4531;
Note: See TracBrowser for help on using the repository browser.