source: gsdl/trunk/perllib/plugins/MARCXMLPlug.pm@ 15178

Last change on this file since 15178 was 15178, checked in by kjdon, 16 years ago

needed to add extra_metadata() call in xml_end_tag so that can get metadata from metadata.xml files

  • Property svn:keywords set to Author Date Id Revision
File size: 15.5 KB
Line 
1###########################################################################
2#
3# MARCXMLPlug.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes MARCXML documents. Note that this plugin does no
27# syntax checking (though the XML::Parser module tests for
28# well-formedness).
29
30package MARCXMLPlug;
31
32use XMLPlug;
33
34use strict;
35no strict 'refs'; # allow filehandles to be variables and viceversa
36
37sub BEGIN {
38 @MARCXMLPlug::ISA = ('XMLPlug');
39}
40
41my $arguments = [{'name' => "metadata_mapping_file",
42 'desc' => "{MARCXMLPlug.metadata_mapping_file}",
43 'type' => "string",
44 'deft' => "marctodc.txt",
45 'reqd' => "no" }];
46
47my $options = { 'name' => "MARCXMLPlug",
48 'desc' => "{MARCXMLPlug.desc}",
49 'abstract' => "no",
50 'inherits' => "yes",
51 'args' => $arguments
52 };
53
54sub new {
55 my ($class) = shift (@_);
56 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
57 push(@$pluginlist, $class);
58
59 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
60 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
61
62 my $self = new XMLPlug($pluginlist, $inputargs, $hashArgOptLists);
63
64 $self->{'content'} = "";
65 $self->{'xmlcontent'} = "";
66 $self->{'record_count'} = 1;
67 $self->{'language'} = "";
68 $self->{'encoding'} = "";
69 $self->{'marc_mapping'} = {};
70 $self->{'current_code'} = "";
71 $self->{'current_tag'} = "";
72 $self->{'current_element'} = "";
73 $self->{'metadata_mapping'} = undef;
74 $self->{'num_processed'} = 0;
75 $self->{'indent'} = 0;
76
77 return bless $self, $class;
78}
79
80sub get_doctype {
81 my $self = shift(@_);
82
83 return "collection";
84}
85
86
87sub _parse_marc_metadata_mapping
88{
89 my $self = shift(@_);
90 my ($mm_file,$metadata_mapping) = @_;
91
92 my $outhandle = $self->{'outhandle'};
93
94 if (open(MMIN, "<$mm_file"))
95 {
96 my $l=0;
97 my $line;
98 while (defined($line=<MMIN>))
99 {
100 $l++;
101 chomp $line;
102 $line =~ s/#.*$//; # strip out any comments, including end of line
103 next if ($line =~ m/^\s*$/);
104 $line =~ s/\s+$//; # remove any white space at end of line
105
106 my $parse_error_count = 0;
107 if ($line =~ m/^-(\d+)\s*$/) {
108 # special "remove" rule syntax
109 my $marc_info = $1;
110 if (defined $metadata_mapping->{$marc_info}) {
111 delete $metadata_mapping->{$marc_info};
112 }
113 else {
114 print $outhandle "Parse Warning: Did not file pre-existing rule $marc_info to remove";
115 print $outhandle " on line $l of $mm_file:\n";
116 print $outhandle " $line\n";
117 }
118 }
119 elsif ($line =~ m/^(.*?)->\s*([\w\^]+)$/)
120 {
121 my $lhs = $1;
122 my $gsdl_info = $2;
123
124 my @fields = split(/,\s*/,$lhs);
125 my $f;
126 while ($f = shift (@fields)) {
127 $f =~ s/\s+$//; # remove any white space at end of line
128
129 if ($f =~ m/^(\d+)\-(\d+)$/) {
130 # number range => genrate number in range and
131 # push on to array
132 push(@fields,$1..$2);
133 next;
134 }
135
136 if ($f =~ m/^(\d+)((?:(?:\$|\^)\w)*)\s*$/) {
137
138 my $marc_info = $1;
139 my $opt_sub_fields = $2;
140
141 if ($opt_sub_fields ne "") {
142 my @sub_fields = split(/\$|\^/,$opt_sub_fields);
143 shift @sub_fields; # skip first entry, which is blank
144
145 foreach my $sub_field (@sub_fields) {
146 $metadata_mapping->{$marc_info."\$".$sub_field} = $gsdl_info;
147 }
148 }
149 else {
150 # no subfields to worry about
151 $marc_info =~ s/\^/\$/;
152 $metadata_mapping->{$marc_info} = $gsdl_info;
153 }
154 }
155 else {
156 $parse_error_count++;
157 }
158 }
159 }
160 else
161 {
162 $parse_error_count++;
163 }
164
165 if ($parse_error_count>0) {
166
167 print $outhandle "Parse Error: $parse_error_count syntax error(s) on line $l of $mm_file:\n";
168 print $outhandle " $line\n";
169 }
170 }
171 close(MMIN);
172 }
173 else
174 {
175 print STDERR "Unable to open $mm_file: $!\n";
176 }
177}
178
179
180sub parse_marc_metadata_mapping
181{
182 my $self = shift(@_);
183 my ($mm_file_or_files) = @_;
184
185 my $metadata_mapping = {};
186
187 if (ref ($mm_file_or_files) eq 'SCALAR') {
188 my $mm_file = $mm_file_or_files;
189 $self->_parse_marc_metadata_mapping($mm_file,$metadata_mapping);
190 }
191 else {
192 my $mm_files = $mm_file_or_files;
193
194 # Need to process files in reverse order. This is so in the
195 # case where we have both a "collect" and "main" version,
196 # the "collect" one tops up the main one
197
198 my $mm_file;
199 while ($mm_file = pop(@$mm_files)) {
200 $self->_parse_marc_metadata_mapping($mm_file,$metadata_mapping);
201 }
202 }
203
204 return $metadata_mapping;
205}
206
207
208sub init {
209 my $self = shift (@_);
210 my ($verbosity, $outhandle, $failhandle) = @_;
211
212 ## the mapping file has already been loaded
213 if (defined $self->{'metadata_mapping'} ){
214 $self->SUPER::init(@_);
215 return;
216 }
217
218 # read in the metadata mapping file
219 my $mm_files = &util::locate_config_files($self->{'metadata_mapping_file'});
220
221
222 if (scalar(@$mm_files)==0)
223 {
224 my $msg = "MARCXMLPlug ERROR: Can't locate mapping file \"" .
225 $self->{'metadata_mapping_file'} . "\".\n " .
226 " No marc files can be processed.\n";
227
228 print $outhandle $msg;
229 print $failhandle $msg;
230 $self->{'metadata_mapping'} = undef;
231 # We pick up the error in process() if there is no $mm_file
232 # If we exit here, then pluginfo.pl will exit too!
233 }
234 else {
235 $self->{'metadata_mapping'} = $self->parse_marc_metadata_mapping($mm_files);
236 }
237
238
239 ##map { print STDERR $_."=>".$metadata_mapping->{$_}."\n"; } keys %$metadata_mapping;
240
241 $self->SUPER::init(@_);
242}
243
244# Called for DOCTYPE declarations - use die to bail out if this doctype
245# is not meant for this plugin
246sub xml_doctype {
247 my $self = shift(@_);
248
249 my ($expat, $name, $sysid, $pubid, $internal) = @_;
250 return;
251
252}
253
254
255sub xml_start_document {
256 my $self = shift(@_);
257
258 my ($expat, $name, $sysid, $pubid, $internal) = @_;
259
260
261 my $file = $self->{'file'};
262 my $filename = $self->{'filename'};
263
264 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
265
266 $self->{'language'} = $language;
267 $self->{'encoding'} = $encoding;
268 $self->{'element_count'} = 1;
269 $self->{'indent'} = 0;
270 my $outhandle = $self->{'outhandle'};
271 print $outhandle "MARCXMLPlug: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
272 print STDERR "<Processing n='$self->{'file'}' p='MARCXMLPlug'>\n" if $self->{'gli'};
273
274}
275
276sub xml_end_document {
277
278}
279
280sub xml_start_tag {
281 my $self = shift;
282 my $expat = shift;
283 my $element = shift;
284
285 my $text = $_;
286 my $escaped_text = $self->escape_text($_);
287
288 $self->{'current_element'} = $element;
289
290 ##get all atributes of this element and store it in a map name=>value
291 my %attr_map = ();
292 my $attrstring = $_;
293 while ($attrstring =~ /(\w+)=\"(\w+)\"/){
294 $attr_map{$1}=$2;
295 $attrstring = $'; #'
296 }
297
298
299 my $processor = $self->{'processor'};
300
301 ##create a new document for each record
302 if ($element eq "record") {
303 my $filename = $self->{'filename'};
304 my $language = $self->{'language'};
305 my $encoding = $self->{'encoding'};
306 my $file = $self->{'file'};
307 my $doc_obj = new doc($filename);
308 $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
309 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
310 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
311 my ($filemeta) = $file =~ /([^\\\/]+)$/;
312 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
313 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$self->{'record_count'}");
314 if ($self->{'cover_image'}) {
315 $self->associate_cover_image($doc_obj, $filename);
316 }
317 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
318 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML");
319
320 my $outhandle = $self->{'outhandle'};
321 print $outhandle "Record $self->{'record_count'} - MARCXMLPlug: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
322
323 $self->{'record_count'}++;
324 $self->{'doc_obj'} = $doc_obj;
325 $self->{'num_processed'}++;
326
327 }
328
329 ## get the marc code, for example 520
330 if ($element eq "datafield") {
331 if (defined $attr_map{'tag'} and $attr_map{'tag'} ne ""){
332 $self->{'current_tag'} = $attr_map{tag};
333 }
334 }
335
336
337 ## append the subcode to the marc code for example 520a or 520b
338 if ($element eq "subfield"){
339 if (defined $attr_map{'code'} and $attr_map{'code'} ne "" and $self->{'current_tag'} ne ""){
340 $self->{'current_code'} = $attr_map{'code'};
341 }
342 }
343
344 if ($element eq "record"){
345 $self->{'indent'} = 0;
346 $self->{'content'} = "";
347 $self->{'xmlcontent'} = "";
348 }
349 else {
350 if ($element ne "subfield"){
351 $self->{'indent'} = 1;
352 }
353 else{
354 $self->{'indent'} = 2;
355 }
356 }
357
358
359 if ($element eq "collection") {
360 # remember the full start tag for <collection ...>
361 # This is needed to wrap around each <record> when generating its associate MARCXML file
362
363 $self->{'xmlcollectiontag'} = $text;
364 }
365 else {
366 $self->{'content'} .= "<br/>" if ($element ne "record");
367 $self->{'content'} .= $self->calculate_indent($self->{'indent'}).$escaped_text;
368 $self->{'xmlcontent'} .= $text;
369 }
370
371}
372
373
374
375sub xml_end_tag {
376 my $self = shift(@_);
377 my ($expat, $element) = @_;
378
379 my $text = $_;
380 my $escaped_text = $self->escape_text($_);
381
382 if ($element eq "record" and defined $self->{'doc_obj'}) {
383 # process the document
384 my $processor = $self->{'processor'};
385 my $doc_obj = $self->{'doc_obj'};
386 $self->{'content'} .= "<br/>".$escaped_text;
387 $self->{'xmlcontent'} .= $text;
388
389
390 my $top_section = $doc_obj->get_top_section();
391
392 my $tmp_marcxml_filename = &util::get_tmp_filename().".xml";
393 if (open (XMLOUT,">$tmp_marcxml_filename")) {
394
395 print XMLOUT "<?xml-stylesheet type=\"text/xsl\" href=\"MARC21slim2English.xsl\"?>\n";
396 my $xml_content = $self->{'xmlcontent'};
397
398 $xml_content = $self->{'xmlcollectiontag'}.$xml_content."</collection>";
399
400 print XMLOUT $xml_content;
401
402 close(XMLOUT);
403
404 $doc_obj->associate_file($tmp_marcxml_filename,"marcxml.xml","text/xml", $top_section);
405
406 # assicate xsl style file for presentation as HTML
407 my $xsl_filename = &util::filename_cat($ENV{'GSDLHOME'},"etc","MARC21slim2English.xsl");
408 $doc_obj->associate_file($xsl_filename,"MARC21slim2English.xsl","text/xml", $top_section);
409
410 }
411 else {
412 my $outhandle = $self->{'outhandle'};
413 print $outhandle "Warning: Unable for write out associated MARCXML file $tmp_marcxml_filename\n";
414 }
415
416 # include any metadata passed in from previous plugins
417 # note that this metadata is associated with the top level section
418
419 $self->extra_metadata ($doc_obj,
420 $doc_obj->get_top_section(),
421 $self->{'metadata'});
422
423
424 $self->add_OID($doc_obj, $self->{'record_count'});
425
426 $doc_obj->add_utf8_text($doc_obj->get_top_section(),$self->{'content'});
427 $processor->process($doc_obj);
428
429 ##clean up
430 $self->{'content'} = "";
431 $self->{'xmlcontent'} = "";
432 $self->{'doc_obj'} = undef;
433 return;
434 }
435
436 ## map the xmlmarc to gsdl metadata
437 if ($element eq "datafield" and defined $self->{'doc_obj'} and defined $self->{'marc_mapping'}){
438 my $metadata_mapping = $self->{'metadata_mapping'};
439 my $marc_mapping = $self->{'marc_mapping'};
440 my $doc_obj = $self->{'doc_obj'};
441
442## print STDERR "**** Marc Record\n";
443## map { print STDERR $_."=>".$marc_mapping->{$_}."\n"; } keys %$marc_mapping;
444## print STDERR "**** Metadata Mapping\n";
445## map { print STDERR $_."=>".$metadata_mapping->{$_}."\n"; } keys %$metadata_mapping;
446
447
448 foreach my $marc_field (keys %$metadata_mapping){
449
450 ## test whether this field has subfield
451 my $subfield = undef;
452 if ($marc_field =~ /(\d\d\d)(?:\$|\^)?(\w)/){
453 $marc_field = $1;
454 $subfield = $2;
455 }
456
457 my $matched_field = $marc_mapping->{$marc_field};
458
459 if (defined $matched_field) {
460
461 my $meta_name = undef;
462 my $meta_value = undef;
463
464 if (defined $subfield){
465 $meta_name = $metadata_mapping->{$marc_field."\$".$subfield};
466
467 $meta_value = $matched_field->{$subfield};
468
469 if (!defined $meta_value) {
470 # record read in does not have the specified subfield
471 next;
472 }
473 }
474 else {
475 $meta_name = $metadata_mapping->{$marc_field};
476
477 # no subfield => get all the values
478 foreach my $value (sort keys %{$matched_field}) {
479 $meta_value .= $matched_field->{$value} ." ";
480 }
481
482 }
483
484 ## escape [ and ]
485 $meta_value =~ s/\[/\\\[/g;
486 $meta_value =~ s/\]/\\\]/g;
487 ##print STDERR "$meta_name=$meta_value\n";
488 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(),$meta_name, $meta_value);
489
490 }
491
492 }
493
494 ##clean up
495 $self->{'marc_mapping'} = undef;
496 $self->{'current_tag'} = "";
497 }
498
499 if ($element eq "datafield"){
500 $self->{'indent'} = 1;
501 $self->{'content'} .= "<br/>".$self->calculate_indent($self->{'indent'}).$escaped_text;
502 $self->{'xmlcontent'} .= $text;
503 }
504 else{
505 $self->{'content'} .= $escaped_text;
506 $self->{'xmlcontent'} .= $text;
507 }
508
509}
510
511
512sub set_OID {
513 my $self = shift (@_);
514 my ($doc_obj, $record_number) = @_;
515
516 # first set it to generate hash value
517 $doc_obj->set_OID();
518
519 # then top it up with an "r" + record-number suffix
520 my $id = $doc_obj->get_OID();
521 $doc_obj->set_OID($id . "r" . $record_number);
522}
523
524sub xml_text {
525 my $self = shift(@_);
526 my ($expat) = @_;
527
528 my $text = $_;
529 my $escaped_text = $self->escape_text($_);
530
531 # protect against & in raw text file
532 $text =~ s/&/&amp;/g; # can't have & in raw form, even in 'raw' xml text
533
534 ## store the text of a marc code, for exapmle 520a=>A poem about....
535 if ($self->{'current_element'} eq "subfield" and $self->{'current_code'} ne "" and $_ ne "" ){
536 ##stored it in the marc_mapping
537
538 my $current_tag = $self->{'current_tag'};
539 my $current_code = $self->{'current_code'};
540
541 $self->{'marc_mapping'}->{$current_tag}->{$current_code} .= $_;
542
543 $self->{'current_code'} = "";
544 }
545
546 $self->{'content'} .= $escaped_text;
547 $self->{'xmlcontent'} .= $text;
548
549}
550
551sub calculate_indent{
552 my ($self,$num) = @_;
553
554 my $indent ="";
555
556 for (my $i=0; $i<$num;$i++){
557 $indent .= "&nbsp;&nbsp;&nbsp;&nbsp;";
558 }
559
560 return $indent;
561
562}
563
564sub escape_text {
565 my ($self,$text) = @_;
566 # special characters in the xml encoding
567 $text =~ s/&/&amp;/g; # this has to be first...
568 $text =~ s/</&lt;/g;
569 $text =~ s/>/&gt;/g;
570 $text =~ s/\"/&quot;/g;
571
572 return $text;
573}
574
575
576sub unescape_text {
577 my ($self,$text) = @_;
578 # special characters in the xml encoding
579 $text =~ s/&lt;/</g;
580 $text =~ s/&gt;/>/g;
581 $text =~ s/&quot;/\"/g;
582
583 $text =~ s/&/&amp;/g; # can't have & in raw form, even in unescaped xml!
584
585 return $text;
586}
587
588
5891;
590
591
Note: See TracBrowser for help on using the repository browser.