source: gsdl/trunk/perllib/plugins/MARCXMLPlug.pm@ 15018

Last change on this file since 15018 was 15018, checked in by davidb, 14 years ago

Marc mapping upgraded to support richer set of operations, including subfields, multiple fields in one line (separated by comma), and the removal of rules, e.g. -245 at the start of a line. A Marc to Qualified Dublin Core crosswalk from the Library of congress has been added as "etc/marc2qdc.txt". A collection can then choose to, for example, top up the mapping with its own version of the file stored in its local "etc" folder, specifying only the rules that are different. This is where a rule like "-245" might be used to override a more general rule from the main file that has all subfields in 245 mapping to one metadata item (Title). If the user specifies a different different filename -- through a plugin option -- then they are free to divise a mapping from scratch and store it in the collections local "etc" folder.

  • Property svn:keywords set to Author Date Id Revision
File size: 15.3 KB
Line 
1###########################################################################
2#
3# MARCXMLPlug.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes MARCXML documents. Note that this plugin does no
27# syntax checking (though the XML::Parser module tests for
28# well-formedness).
29
30package MARCXMLPlug;
31
32use XMLPlug;
33
34use strict;
35no strict 'refs'; # allow filehandles to be variables and viceversa
36
37sub BEGIN {
38 @MARCXMLPlug::ISA = ('XMLPlug');
39}
40
41my $arguments = [{'name' => "metadata_mapping_file",
42 'desc' => "{MARCXMLPlug.metadata_mapping_file}",
43 'type' => "string",
44 'deft' => "marctodc.txt",
45 'reqd' => "no" }];
46
47my $options = { 'name' => "MARCXMLPlug",
48 'desc' => "{MARCXMLPlug.desc}",
49 'abstract' => "no",
50 'inherits' => "yes",
51 'args' => $arguments
52 };
53
54sub new {
55 my ($class) = shift (@_);
56 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
57 push(@$pluginlist, $class);
58
59 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
60 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
61
62 my $self = new XMLPlug($pluginlist, $inputargs, $hashArgOptLists);
63
64 $self->{'content'} = "";
65 $self->{'xmlcontent'} = "";
66 $self->{'record_count'} = 1;
67 $self->{'language'} = "";
68 $self->{'encoding'} = "";
69 $self->{'marc_mapping'} = {};
70 $self->{'current_code'} = "";
71 $self->{'current_tag'} = "";
72 $self->{'current_element'} = "";
73 $self->{'metadata_mapping'} = undef;
74 $self->{'num_processed'} = 0;
75 $self->{'indent'} = 0;
76
77 return bless $self, $class;
78}
79
80sub get_doctype {
81 my $self = shift(@_);
82
83 return "collection";
84}
85
86
87sub _parse_marc_metadata_mapping
88{
89 my $self = shift(@_);
90 my ($mm_file,$metadata_mapping) = @_;
91
92 my $outhandle = $self->{'outhandle'};
93
94 if (open(MMIN, "<$mm_file"))
95 {
96 my $l=0;
97 my $line;
98 while (defined($line=<MMIN>))
99 {
100 $l++;
101 chomp $line;
102 $line =~ s/#.*$//; # strip out any comments, including end of line
103 next if ($line =~ m/^\s*$/);
104 $line =~ s/\s+$//; # remove any white space at end of line
105
106 my $parse_error_count = 0;
107 if ($line =~ m/^-(\d+)\s*$/) {
108 # special "remove" rule syntax
109 my $marc_info = $1;
110 if (defined $metadata_mapping->{$marc_info}) {
111 delete $metadata_mapping->{$marc_info};
112 }
113 else {
114 print $outhandle "Parse Warning: Did not file pre-existing rule $marc_info to remove";
115 print $outhandle " on line $l of $mm_file:\n";
116 print $outhandle " $line\n";
117 }
118 }
119 elsif ($line =~ m/^(.*?)->\s*([\w\^]+)$/)
120 {
121 my $lhs = $1;
122 my $gsdl_info = $2;
123
124 my @fields = split(/,\s*/,$lhs);
125 my $f;
126 while ($f = shift (@fields)) {
127 $f =~ s/\s+$//; # remove any white space at end of line
128
129 if ($f =~ m/^(\d+)\-(\d+)$/) {
130 # number range => genrate number in range and
131 # push on to array
132 push(@fields,$1..$2);
133 next;
134 }
135
136 if ($f =~ m/^(\d+)((?:(?:\$|\^)\w)*)\s*$/) {
137
138 my $marc_info = $1;
139 my $opt_sub_fields = $2;
140
141 if ($opt_sub_fields ne "") {
142 my @sub_fields = split(/\$|\^/,$opt_sub_fields);
143 shift @sub_fields; # skip first entry, which is blank
144
145 foreach my $sub_field (@sub_fields) {
146 $metadata_mapping->{$marc_info."\$".$sub_field} = $gsdl_info;
147 }
148 }
149 else {
150 # no subfields to worry about
151 $marc_info =~ s/\^/\$/;
152 $metadata_mapping->{$marc_info} = $gsdl_info;
153 }
154 }
155 else {
156 $parse_error_count++;
157 }
158 }
159 }
160 else
161 {
162 $parse_error_count++;
163 }
164
165 if ($parse_error_count>0) {
166
167 print $outhandle "Parse Error: $parse_error_count syntax error(s) on line $l of $mm_file:\n";
168 print $outhandle " $line\n";
169 }
170 }
171 close(MMIN);
172 }
173 else
174 {
175 print STDERR "Unable to open $mm_file: $!\n";
176 }
177}
178
179
180sub parse_marc_metadata_mapping
181{
182 my $self = shift(@_);
183 my ($mm_file_or_files) = @_;
184
185 my $metadata_mapping = {};
186
187 if (ref ($mm_file_or_files) eq 'SCALAR') {
188 my $mm_file = $mm_file_or_files;
189 $self->_parse_marc_metadata_mapping($mm_file,$metadata_mapping);
190 }
191 else {
192 my $mm_files = $mm_file_or_files;
193
194 # Need to process files in reverse order. This is so in the
195 # case where we have both a "collect" and "main" version,
196 # the "collect" one tops up the main one
197
198 my $mm_file;
199 while ($mm_file = pop(@$mm_files)) {
200 $self->_parse_marc_metadata_mapping($mm_file,$metadata_mapping);
201 }
202 }
203
204 return $metadata_mapping;
205}
206
207
208sub init {
209 my $self = shift (@_);
210 my ($verbosity, $outhandle, $failhandle) = @_;
211
212 ## the mapping file has already been loaded
213 if (defined $self->{'metadata_mapping'} ){
214 $self->SUPER::init(@_);
215 return;
216 }
217
218 # read in the metadata mapping file
219 my $mm_files = &util::locate_config_files($self->{'metadata_mapping_file'});
220
221
222 if (scalar(@$mm_files)==0)
223 {
224 my $msg = "MARCXMLPlug ERROR: Can't locate mapping file \"" .
225 $self->{'metadata_mapping_file'} . "\".\n " .
226 " No marc files can be processed.\n";
227
228 print $outhandle $msg;
229 print $failhandle $msg;
230 $self->{'metadata_mapping'} = undef;
231 # We pick up the error in process() if there is no $mm_file
232 # If we exit here, then pluginfo.pl will exit too!
233 }
234 else {
235 $self->{'metadata_mapping'} = $self->parse_marc_metadata_mapping($mm_files);
236 }
237
238
239 ##map { print STDERR $_."=>".$metadata_mapping->{$_}."\n"; } keys %$metadata_mapping;
240
241 $self->SUPER::init(@_);
242}
243
244# Called for DOCTYPE declarations - use die to bail out if this doctype
245# is not meant for this plugin
246sub xml_doctype {
247 my $self = shift(@_);
248
249 my ($expat, $name, $sysid, $pubid, $internal) = @_;
250 return;
251
252}
253
254
255sub xml_start_document {
256 my $self = shift(@_);
257
258 my ($expat, $name, $sysid, $pubid, $internal) = @_;
259
260
261 my $file = $self->{'file'};
262 my $filename = $self->{'filename'};
263
264 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
265
266 $self->{'language'} = $language;
267 $self->{'encoding'} = $encoding;
268 $self->{'element_count'} = 1;
269 $self->{'indent'} = 0;
270 my $outhandle = $self->{'outhandle'};
271 print $outhandle "MARCXMLPlug: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
272 print STDERR "<Processing n='$self->{'file'}' p='MARCXMLPlug'>\n" if $self->{'gli'};
273
274}
275
276sub xml_end_document {
277
278}
279
280sub xml_start_tag {
281 my $self = shift;
282 my $expat = shift;
283 my $element = shift;
284
285 my $text = $_;
286 my $escaped_text = $self->escape_text($_);
287
288 $self->{'current_element'} = $element;
289
290 ##get all atributes of this element and store it in a map name=>value
291 my %attr_map = ();
292 my $attrstring = $_;
293 while ($attrstring =~ /(\w+)=\"(\w+)\"/){
294 $attr_map{$1}=$2;
295 $attrstring = $'; #'
296 }
297
298
299 my $processor = $self->{'processor'};
300
301 ##create a new document for each record
302 if ($element eq "record") {
303 my $filename = $self->{'filename'};
304 my $language = $self->{'language'};
305 my $encoding = $self->{'encoding'};
306 my $file = $self->{'file'};
307 my $doc_obj = new doc($filename);
308 $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
309 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
310 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
311 my ($filemeta) = $file =~ /([^\\\/]+)$/;
312 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
313 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$self->{'record_count'}");
314 if ($self->{'cover_image'}) {
315 $self->associate_cover_image($doc_obj, $filename);
316 }
317 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
318 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML");
319
320 my $outhandle = $self->{'outhandle'};
321 print $outhandle "Record $self->{'record_count'} - MARCXMLPlug: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
322
323 $self->{'record_count'}++;
324 $self->{'doc_obj'} = $doc_obj;
325 $self->{'num_processed'}++;
326
327 }
328
329 ## get the marc code, for example 520
330 if ($element eq "datafield") {
331 if (defined $attr_map{'tag'} and $attr_map{'tag'} ne ""){
332 $self->{'current_tag'} = $attr_map{tag};
333 }
334 }
335
336
337 ## append the subcode to the marc code for example 520a or 520b
338 if ($element eq "subfield"){
339 if (defined $attr_map{'code'} and $attr_map{'code'} ne "" and $self->{'current_tag'} ne ""){
340 $self->{'current_code'} = $attr_map{'code'};
341 }
342 }
343
344 if ($element eq "record"){
345 $self->{'indent'} = 0;
346 $self->{'content'} = "";
347 $self->{'xmlcontent'} = "";
348 }
349 else {
350 if ($element ne "subfield"){
351 $self->{'indent'} = 1;
352 }
353 else{
354 $self->{'indent'} = 2;
355 }
356 }
357
358
359 if ($element eq "collection") {
360 # remember the full start tag for <collection ...>
361 # This is needed to wrap around each <record> when generating its associate MARCXML file
362
363 $self->{'xmlcollectiontag'} = $text;
364 }
365 else {
366 $self->{'content'} .= "<br/>" if ($element ne "record");
367 $self->{'content'} .= $self->calculate_indent($self->{'indent'}).$escaped_text;
368 $self->{'xmlcontent'} .= $text;
369 }
370
371}
372
373
374
375sub xml_end_tag {
376 my $self = shift(@_);
377 my ($expat, $element) = @_;
378
379 my $text = $_;
380 my $escaped_text = $self->escape_text($_);
381
382 if ($element eq "record" and defined $self->{'doc_obj'}) {
383 # process the document
384 my $processor = $self->{'processor'};
385 my $doc_obj = $self->{'doc_obj'};
386 $self->{'content'} .= "<br/>".$escaped_text;
387 $self->{'xmlcontent'} .= $text;
388
389
390 my $top_section = $doc_obj->get_top_section();
391
392 my $tmp_marcxml_filename = &util::get_tmp_filename().".xml";
393 if (open (XMLOUT,">$tmp_marcxml_filename")) {
394
395 print XMLOUT "<?xml-stylesheet type=\"text/xsl\" href=\"MARC21slim2English.xsl\"?>\n";
396 my $xml_content = $self->{'xmlcontent'};
397
398 $xml_content = $self->{'xmlcollectiontag'}.$xml_content."</collection>";
399
400 print XMLOUT $xml_content;
401
402 close(XMLOUT);
403
404 $doc_obj->associate_file($tmp_marcxml_filename,"marcxml.xml","text/xml", $top_section);
405
406 # assicate xsl style file for presentation as HTML
407 my $xsl_filename = &util::filename_cat($ENV{'GSDLHOME'},"etc","MARC21slim2English.xsl");
408 $doc_obj->associate_file($xsl_filename,"MARC21slim2English.xsl","text/xml", $top_section);
409
410 }
411 else {
412 my $outhandle = $self->{'outhandle'};
413 print $outhandle "Warning: Unable for write out associated MARCXML file $tmp_marcxml_filename\n";
414 }
415
416 $self->add_OID($doc_obj, $self->{'record_count'});
417
418 $doc_obj->add_utf8_text($doc_obj->get_top_section(),$self->{'content'});
419 $processor->process($doc_obj);
420
421 ##clean up
422 $self->{'content'} = "";
423 $self->{'xmlcontent'} = "";
424 $self->{'doc_obj'} = undef;
425 return;
426 }
427
428 ## map the xmlmarc to gsdl metadata
429 if ($element eq "datafield" and defined $self->{'doc_obj'} and defined $self->{'marc_mapping'}){
430 my $metadata_mapping = $self->{'metadata_mapping'};
431 my $marc_mapping = $self->{'marc_mapping'};
432 my $doc_obj = $self->{'doc_obj'};
433
434## print STDERR "**** Marc Record\n";
435## map { print STDERR $_."=>".$marc_mapping->{$_}."\n"; } keys %$marc_mapping;
436## print STDERR "**** Metadata Mapping\n";
437## map { print STDERR $_."=>".$metadata_mapping->{$_}."\n"; } keys %$metadata_mapping;
438
439
440 foreach my $marc_field (keys %$metadata_mapping){
441
442 ## test whether this field has subfield
443 my $subfield = undef;
444 if ($marc_field =~ /(\d\d\d)(?:\$|\^)?(\w)/){
445 $marc_field = $1;
446 $subfield = $2;
447 }
448
449 my $matched_field = $marc_mapping->{$marc_field};
450
451 if (defined $matched_field) {
452
453 my $meta_name = undef;
454 my $meta_value = undef;
455
456 if (defined $subfield){
457 $meta_name = $metadata_mapping->{$marc_field."\$".$subfield};
458
459 $meta_value = $matched_field->{$subfield};
460
461 if (!defined $meta_value) {
462 # record read in does not have the specified subfield
463 next;
464 }
465 }
466 else {
467 $meta_name = $metadata_mapping->{$marc_field};
468
469 # no subfield => get all the values
470 foreach my $value (sort keys %{$matched_field}) {
471 $meta_value .= $matched_field->{$value} ." ";
472 }
473
474 }
475
476 ## escape [ and ]
477 $meta_value =~ s/\[/\\\[/g;
478 $meta_value =~ s/\]/\\\]/g;
479 ##print STDERR "$meta_name=$meta_value\n";
480 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(),$meta_name, $meta_value);
481
482 }
483
484 }
485
486 ##clean up
487 $self->{'marc_mapping'} = undef;
488 $self->{'current_tag'} = "";
489 }
490
491 if ($element eq "datafield"){
492 $self->{'indent'} = 1;
493 $self->{'content'} .= "<br/>".$self->calculate_indent($self->{'indent'}).$escaped_text;
494 $self->{'xmlcontent'} .= $text;
495 }
496 else{
497 $self->{'content'} .= $escaped_text;
498 $self->{'xmlcontent'} .= $text;
499 }
500
501}
502
503
504sub set_OID {
505 my $self = shift (@_);
506 my ($doc_obj, $record_number) = @_;
507
508 # first set it to generate hash value
509 $doc_obj->set_OID();
510
511 # then top it up with an "r" + record-number suffix
512 my $id = $doc_obj->get_OID();
513 $doc_obj->set_OID($id . "r" . $record_number);
514}
515
516sub xml_text {
517 my $self = shift(@_);
518 my ($expat) = @_;
519
520 my $text = $_;
521 my $escaped_text = $self->escape_text($_);
522
523 # protect against & in raw text file
524 $text =~ s/&/&amp;/g; # can't have & in raw form, even in 'raw' xml text
525
526 ## store the text of a marc code, for exapmle 520a=>A poem about....
527 if ($self->{'current_element'} eq "subfield" and $self->{'current_code'} ne "" and $_ ne "" ){
528 ##stored it in the marc_mapping
529
530 my $current_tag = $self->{'current_tag'};
531 my $current_code = $self->{'current_code'};
532
533 $self->{'marc_mapping'}->{$current_tag}->{$current_code} .= $_;
534
535 $self->{'current_code'} = "";
536 }
537
538 $self->{'content'} .= $escaped_text;
539 $self->{'xmlcontent'} .= $text;
540
541}
542
543sub calculate_indent{
544 my ($self,$num) = @_;
545
546 my $indent ="";
547
548 for (my $i=0; $i<$num;$i++){
549 $indent .= "&nbsp;&nbsp;&nbsp;&nbsp;";
550 }
551
552 return $indent;
553
554}
555
556sub escape_text {
557 my ($self,$text) = @_;
558 # special characters in the xml encoding
559 $text =~ s/&/&amp;/g; # this has to be first...
560 $text =~ s/</&lt;/g;
561 $text =~ s/>/&gt;/g;
562 $text =~ s/\"/&quot;/g;
563
564 return $text;
565}
566
567
568sub unescape_text {
569 my ($self,$text) = @_;
570 # special characters in the xml encoding
571 $text =~ s/&lt;/</g;
572 $text =~ s/&gt;/>/g;
573 $text =~ s/&quot;/\"/g;
574
575 $text =~ s/&/&amp;/g; # can't have & in raw form, even in unescaped xml!
576
577 return $text;
578}
579
580
5811;
582
583
Note: See TracBrowser for help on using the repository browser.