source: gsdl/trunk/perllib/plugins/MARCXMLPlugin.pm@ 15918

Last change on this file since 15918 was 15872, checked in by kjdon, 16 years ago

plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Alos, I haven't really tested anything yet...

  • Property svn:keywords set to Author Date Id Revision
File size: 15.5 KB
Line 
1###########################################################################
2#
3# MARCXMLPlugin.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes MARCXML documents. Note that this plugin does no
27# syntax checking (though the XML::Parser module tests for
28# well-formedness).
29
30package MARCXMLPlugin;
31
32use ReadXMLFile;
33
34use strict;
35no strict 'refs'; # allow filehandles to be variables and viceversa
36
37sub BEGIN {
38 @MARCXMLPlugin::ISA = ('ReadXMLFile');
39}
40
41my $arguments = [{'name' => "metadata_mapping_file",
42 'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
43 'type' => "string",
44 'deft' => "marctodc.txt",
45 'reqd' => "no" }];
46
47my $options = { 'name' => "MARCXMLPlugin",
48 'desc' => "{MARCXMLPlugin.desc}",
49 'abstract' => "no",
50 'inherits' => "yes",
51 'args' => $arguments
52 };
53
54sub new {
55 my ($class) = shift (@_);
56 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
57 push(@$pluginlist, $class);
58
59 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
60 push(@{$hashArgOptLists->{"OptList"}},$options);
61
62 my $self = new ReadXMLFile($pluginlist, $inputargs, $hashArgOptLists);
63
64 $self->{'content'} = "";
65 $self->{'xmlcontent'} = "";
66 $self->{'record_count'} = 1;
67 $self->{'language'} = "";
68 $self->{'encoding'} = "";
69 $self->{'marc_mapping'} = {};
70 $self->{'current_code'} = "";
71 $self->{'current_tag'} = "";
72 $self->{'current_element'} = "";
73 $self->{'metadata_mapping'} = undef;
74 $self->{'num_processed'} = 0;
75 $self->{'indent'} = 0;
76
77 return bless $self, $class;
78}
79
80sub get_doctype {
81 my $self = shift(@_);
82
83 return "collection";
84}
85
86
87sub _parse_marc_metadata_mapping
88{
89 my $self = shift(@_);
90 my ($mm_file,$metadata_mapping) = @_;
91
92 my $outhandle = $self->{'outhandle'};
93
94 if (open(MMIN, "<$mm_file"))
95 {
96 my $l=0;
97 my $line;
98 while (defined($line=<MMIN>))
99 {
100 $l++;
101 chomp $line;
102 $line =~ s/#.*$//; # strip out any comments, including end of line
103 next if ($line =~ m/^\s*$/);
104 $line =~ s/\s+$//; # remove any white space at end of line
105
106 my $parse_error_count = 0;
107 if ($line =~ m/^-(\d+)\s*$/) {
108 # special "remove" rule syntax
109 my $marc_info = $1;
110 if (defined $metadata_mapping->{$marc_info}) {
111 delete $metadata_mapping->{$marc_info};
112 }
113 else {
114 print $outhandle "Parse Warning: Did not file pre-existing rule $marc_info to remove";
115 print $outhandle " on line $l of $mm_file:\n";
116 print $outhandle " $line\n";
117 }
118 }
119 elsif ($line =~ m/^(.*?)->\s*([\w\^]+)$/)
120 {
121 my $lhs = $1;
122 my $gsdl_info = $2;
123
124 my @fields = split(/,\s*/,$lhs);
125 my $f;
126 while ($f = shift (@fields)) {
127 $f =~ s/\s+$//; # remove any white space at end of line
128
129 if ($f =~ m/^(\d+)\-(\d+)$/) {
130 # number range => genrate number in range and
131 # push on to array
132 push(@fields,$1..$2);
133 next;
134 }
135
136 if ($f =~ m/^(\d+)((?:(?:\$|\^)\w)*)\s*$/) {
137
138 my $marc_info = $1;
139 my $opt_sub_fields = $2;
140
141 if ($opt_sub_fields ne "") {
142 my @sub_fields = split(/\$|\^/,$opt_sub_fields);
143 shift @sub_fields; # skip first entry, which is blank
144
145 foreach my $sub_field (@sub_fields) {
146 $metadata_mapping->{$marc_info."\$".$sub_field} = $gsdl_info;
147 }
148 }
149 else {
150 # no subfields to worry about
151 $marc_info =~ s/\^/\$/;
152 $metadata_mapping->{$marc_info} = $gsdl_info;
153 }
154 }
155 else {
156 $parse_error_count++;
157 }
158 }
159 }
160 else
161 {
162 $parse_error_count++;
163 }
164
165 if ($parse_error_count>0) {
166
167 print $outhandle "Parse Error: $parse_error_count syntax error(s) on line $l of $mm_file:\n";
168 print $outhandle " $line\n";
169 }
170 }
171 close(MMIN);
172 }
173 else
174 {
175 print STDERR "Unable to open $mm_file: $!\n";
176 }
177}
178
179
180sub parse_marc_metadata_mapping
181{
182 my $self = shift(@_);
183 my ($mm_file_or_files) = @_;
184
185 my $metadata_mapping = {};
186
187 if (ref ($mm_file_or_files) eq 'SCALAR') {
188 my $mm_file = $mm_file_or_files;
189 $self->_parse_marc_metadata_mapping($mm_file,$metadata_mapping);
190 }
191 else {
192 my $mm_files = $mm_file_or_files;
193
194 # Need to process files in reverse order. This is so in the
195 # case where we have both a "collect" and "main" version,
196 # the "collect" one tops up the main one
197
198 my $mm_file;
199 while ($mm_file = pop(@$mm_files)) {
200 $self->_parse_marc_metadata_mapping($mm_file,$metadata_mapping);
201 }
202 }
203
204 return $metadata_mapping;
205}
206
207
208sub init {
209 my $self = shift (@_);
210 my ($verbosity, $outhandle, $failhandle) = @_;
211
212 ## the mapping file has already been loaded
213 if (defined $self->{'metadata_mapping'} ){
214 $self->SUPER::init(@_);
215 return;
216 }
217
218 # read in the metadata mapping file
219 my $mm_files = &util::locate_config_files($self->{'metadata_mapping_file'});
220
221
222 if (scalar(@$mm_files)==0)
223 {
224 my $msg = "MARCXMLPlugin ERROR: Can't locate mapping file \"" .
225 $self->{'metadata_mapping_file'} . "\".\n " .
226 " No marc files can be processed.\n";
227
228 print $outhandle $msg;
229 print $failhandle $msg;
230 $self->{'metadata_mapping'} = undef;
231 # We pick up the error in process() if there is no $mm_file
232 # If we exit here, then pluginfo.pl will exit too!
233 }
234 else {
235 $self->{'metadata_mapping'} = $self->parse_marc_metadata_mapping($mm_files);
236 }
237
238
239 ##map { print STDERR $_."=>".$metadata_mapping->{$_}."\n"; } keys %$metadata_mapping;
240
241 $self->SUPER::init(@_);
242}
243
244# Called for DOCTYPE declarations - use die to bail out if this doctype
245# is not meant for this plugin
246sub xml_doctype {
247 my $self = shift(@_);
248
249 my ($expat, $name, $sysid, $pubid, $internal) = @_;
250 return;
251
252}
253
254
255sub xml_start_document {
256 my $self = shift(@_);
257
258 my ($expat, $name, $sysid, $pubid, $internal) = @_;
259
260
261 my $file = $self->{'file'};
262 my $filename = $self->{'filename'};
263
264 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
265
266 $self->{'language'} = $language;
267 $self->{'encoding'} = $encoding;
268 $self->{'element_count'} = 1;
269 $self->{'indent'} = 0;
270 my $outhandle = $self->{'outhandle'};
271 print $outhandle "MARCXMLPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
272 print STDERR "<Processing n='$self->{'file'}' p='MARCXMLPlugin'>\n" if $self->{'gli'};
273
274}
275
276sub xml_end_document {
277
278}
279
280sub xml_start_tag {
281 my $self = shift;
282 my $expat = shift;
283 my $element = shift;
284
285 my $text = $_;
286 my $escaped_text = $self->escape_text($_);
287
288 $self->{'current_element'} = $element;
289
290 ##get all atributes of this element and store it in a map name=>value
291 my %attr_map = ();
292 my $attrstring = $_;
293 while ($attrstring =~ /(\w+)=\"(\w+)\"/){
294 $attr_map{$1}=$2;
295 $attrstring = $'; #'
296 }
297
298
299 my $processor = $self->{'processor'};
300
301 ##create a new document for each record
302 if ($element eq "record") {
303 my $filename = $self->{'filename'};
304 my $language = $self->{'language'};
305 my $encoding = $self->{'encoding'};
306 my $file = $self->{'file'};
307 my $doc_obj = new doc($filename);
308 $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
309 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
310 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
311 my ($filemeta) = $file =~ /([^\\\/]+)$/;
312 $self->set_Source_metadata($doc_obj, $filemeta, $encoding);
313 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$self->{'record_count'}");
314 if ($self->{'cover_image'}) {
315 $self->associate_cover_image($doc_obj, $filename);
316 }
317 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
318 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML");
319
320 my $outhandle = $self->{'outhandle'};
321 print $outhandle "Record $self->{'record_count'} - MARCXMLPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
322
323 $self->{'record_count'}++;
324 $self->{'doc_obj'} = $doc_obj;
325 $self->{'num_processed'}++;
326
327 }
328
329 ## get the marc code, for example 520
330 if ($element eq "datafield") {
331 if (defined $attr_map{'tag'} and $attr_map{'tag'} ne ""){
332 $self->{'current_tag'} = $attr_map{tag};
333 }
334 }
335
336
337 ## append the subcode to the marc code for example 520a or 520b
338 if ($element eq "subfield"){
339 if (defined $attr_map{'code'} and $attr_map{'code'} ne "" and $self->{'current_tag'} ne ""){
340 $self->{'current_code'} = $attr_map{'code'};
341 }
342 }
343
344 if ($element eq "record"){
345 $self->{'indent'} = 0;
346 $self->{'content'} = "";
347 $self->{'xmlcontent'} = "";
348 }
349 else {
350 if ($element ne "subfield"){
351 $self->{'indent'} = 1;
352 }
353 else{
354 $self->{'indent'} = 2;
355 }
356 }
357
358
359 if ($element eq "collection") {
360 # remember the full start tag for <collection ...>
361 # This is needed to wrap around each <record> when generating its associate MARCXML file
362
363 $self->{'xmlcollectiontag'} = $text;
364 }
365 else {
366 $self->{'content'} .= "<br/>" if ($element ne "record");
367 $self->{'content'} .= $self->calculate_indent($self->{'indent'}).$escaped_text;
368 $self->{'xmlcontent'} .= $text;
369 }
370
371}
372
373
374
375sub xml_end_tag {
376 my $self = shift(@_);
377 my ($expat, $element) = @_;
378
379 my $text = $_;
380 my $escaped_text = $self->escape_text($_);
381
382 if ($element eq "record" and defined $self->{'doc_obj'}) {
383 # process the document
384 my $processor = $self->{'processor'};
385 my $doc_obj = $self->{'doc_obj'};
386 $self->{'content'} .= "<br/>".$escaped_text;
387 $self->{'xmlcontent'} .= $text;
388
389
390 my $top_section = $doc_obj->get_top_section();
391
392 my $tmp_marcxml_filename = &util::get_tmp_filename().".xml";
393 if (open (XMLOUT,">$tmp_marcxml_filename")) {
394
395 print XMLOUT "<?xml-stylesheet type=\"text/xsl\" href=\"MARC21slim2English.xsl\"?>\n";
396 my $xml_content = $self->{'xmlcontent'};
397
398 $xml_content = $self->{'xmlcollectiontag'}.$xml_content."</collection>";
399
400 print XMLOUT $xml_content;
401
402 close(XMLOUT);
403
404 $doc_obj->associate_file($tmp_marcxml_filename,"marcxml.xml","text/xml", $top_section);
405
406 # assicate xsl style file for presentation as HTML
407 my $xsl_filename = &util::filename_cat($ENV{'GSDLHOME'},"etc","MARC21slim2English.xsl");
408 $doc_obj->associate_file($xsl_filename,"MARC21slim2English.xsl","text/xml", $top_section);
409
410 }
411 else {
412 my $outhandle = $self->{'outhandle'};
413 print $outhandle "Warning: Unable for write out associated MARCXML file $tmp_marcxml_filename\n";
414 }
415
416 # include any metadata passed in from previous plugins
417 # note that this metadata is associated with the top level section
418
419 $self->extra_metadata ($doc_obj,
420 $doc_obj->get_top_section(),
421 $self->{'metadata'});
422
423
424 $self->add_OID($doc_obj, $self->{'record_count'});
425
426 $doc_obj->add_utf8_text($doc_obj->get_top_section(),$self->{'content'});
427 $processor->process($doc_obj);
428
429 ##clean up
430 $self->{'content'} = "";
431 $self->{'xmlcontent'} = "";
432 $self->{'doc_obj'} = undef;
433 return;
434 }
435
436 ## map the xmlmarc to gsdl metadata
437 if ($element eq "datafield" and defined $self->{'doc_obj'} and defined $self->{'marc_mapping'}){
438 my $metadata_mapping = $self->{'metadata_mapping'};
439 my $marc_mapping = $self->{'marc_mapping'};
440 my $doc_obj = $self->{'doc_obj'};
441
442## print STDERR "**** Marc Record\n";
443## map { print STDERR $_."=>".$marc_mapping->{$_}."\n"; } keys %$marc_mapping;
444## print STDERR "**** Metadata Mapping\n";
445## map { print STDERR $_."=>".$metadata_mapping->{$_}."\n"; } keys %$metadata_mapping;
446
447
448 foreach my $marc_field (keys %$metadata_mapping){
449
450 ## test whether this field has subfield
451 my $subfield = undef;
452 if ($marc_field =~ /(\d\d\d)(?:\$|\^)?(\w)/){
453 $marc_field = $1;
454 $subfield = $2;
455 }
456
457 my $matched_field = $marc_mapping->{$marc_field};
458
459 if (defined $matched_field) {
460
461 my $meta_name = undef;
462 my $meta_value = undef;
463
464 if (defined $subfield){
465 $meta_name = $metadata_mapping->{$marc_field."\$".$subfield};
466
467 $meta_value = $matched_field->{$subfield};
468
469 if (!defined $meta_value) {
470 # record read in does not have the specified subfield
471 next;
472 }
473 }
474 else {
475 $meta_name = $metadata_mapping->{$marc_field};
476
477 # no subfield => get all the values
478 foreach my $value (sort keys %{$matched_field}) {
479 $meta_value .= $matched_field->{$value} ." ";
480 }
481
482 }
483
484 ## escape [ and ]
485 $meta_value =~ s/\[/\\\[/g;
486 $meta_value =~ s/\]/\\\]/g;
487 ##print STDERR "$meta_name=$meta_value\n";
488 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(),$meta_name, $meta_value);
489
490 }
491
492 }
493
494 ##clean up
495 $self->{'marc_mapping'} = undef;
496 $self->{'current_tag'} = "";
497 }
498
499 if ($element eq "datafield"){
500 $self->{'indent'} = 1;
501 $self->{'content'} .= "<br/>".$self->calculate_indent($self->{'indent'}).$escaped_text;
502 $self->{'xmlcontent'} .= $text;
503 }
504 else{
505 $self->{'content'} .= $escaped_text;
506 $self->{'xmlcontent'} .= $text;
507 }
508
509}
510
511
512sub set_OID {
513 my $self = shift (@_);
514 my ($doc_obj, $record_number) = @_;
515
516 # first set it to generate hash value
517 $doc_obj->set_OID();
518
519 # then top it up with an "r" + record-number suffix
520 my $id = $doc_obj->get_OID();
521 $doc_obj->set_OID($id . "r" . $record_number);
522}
523
524sub xml_text {
525 my $self = shift(@_);
526 my ($expat) = @_;
527
528 my $text = $_;
529 my $escaped_text = $self->escape_text($_);
530
531 # protect against & in raw text file
532 $text =~ s/&/&amp;/g; # can't have & in raw form, even in 'raw' xml text
533
534 ## store the text of a marc code, for exapmle 520a=>A poem about....
535 if ($self->{'current_element'} eq "subfield" and $self->{'current_code'} ne "" and $_ ne "" ){
536 ##stored it in the marc_mapping
537
538 my $current_tag = $self->{'current_tag'};
539 my $current_code = $self->{'current_code'};
540
541 $self->{'marc_mapping'}->{$current_tag}->{$current_code} .= $_;
542
543 $self->{'current_code'} = "";
544 }
545
546 $self->{'content'} .= $escaped_text;
547 $self->{'xmlcontent'} .= $text;
548
549}
550
551sub calculate_indent{
552 my ($self,$num) = @_;
553
554 my $indent ="";
555
556 for (my $i=0; $i<$num;$i++){
557 $indent .= "&nbsp;&nbsp;&nbsp;&nbsp;";
558 }
559
560 return $indent;
561
562}
563
564sub escape_text {
565 my ($self,$text) = @_;
566 # special characters in the xml encoding
567 $text =~ s/&/&amp;/g; # this has to be first...
568 $text =~ s/</&lt;/g;
569 $text =~ s/>/&gt;/g;
570 $text =~ s/\"/&quot;/g;
571
572 return $text;
573}
574
575
576sub unescape_text {
577 my ($self,$text) = @_;
578 # special characters in the xml encoding
579 $text =~ s/&lt;/</g;
580 $text =~ s/&gt;/>/g;
581 $text =~ s/&quot;/\"/g;
582
583 $text =~ s/&/&amp;/g; # can't have & in raw form, even in unescaped xml!
584
585 return $text;
586}
587
588
5891;
590
591
Note: See TracBrowser for help on using the repository browser.