source: gsdl/trunk/perllib/plugins/MARCXMLPlugin.pm@ 19619

Last change on this file since 19619 was 19619, checked in by kjdon, 15 years ago

just look for a single mappign file, not for a list of them. collection version should override the main one. and use util::tidy_up_oid for assigned ids

  • Property svn:keywords set to Author Date Id Revision
File size: 13.9 KB
Line 
1###########################################################################
2#
3# MARCXMLPlugin.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes MARCXML documents. Note that this plugin does no
27# syntax checking (though the XML::Parser module tests for
28# well-formedness).
29
30package MARCXMLPlugin;
31
32use ReadXMLFile;
33use ReadTextFile;
34use marcmapping;
35
36use strict;
37no strict 'refs'; # allow filehandles to be variables and viceversa
38
39sub BEGIN {
40 @MARCXMLPlugin::ISA = ('ReadXMLFile', 'ReadTextFile');
41}
42
43my $arguments = [{'name' => "metadata_mapping_file",
44 'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
45 'type' => "string",
46 'deft' => "marc2dc.txt",
47 'reqd' => "no" }];
48
49my $options = { 'name' => "MARCXMLPlugin",
50 'desc' => "{MARCXMLPlugin.desc}",
51 'abstract' => "no",
52 'inherits' => "yes",
53 'args' => $arguments
54 };
55
56sub new {
57 my ($class) = shift (@_);
58 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
59 push(@$pluginlist, $class);
60
61 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
62 push(@{$hashArgOptLists->{"OptList"}},$options);
63
64 # we want to be able to use the textcat methods from ReadTextFile
65 # to get the language and encoding
66 new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists, 1);
67
68 my $self = new ReadXMLFile($pluginlist, $inputargs, $hashArgOptLists);
69
70 # we want to strip namespaces, so have to create a new XML parser
71 my $parser = new XML::Parser('Style' => 'Stream',
72 'Pkg' => 'ReadXMLFile',
73 'PluginObj' => $self,
74 'Namespaces' => 1, # strip out namespaces
75 'Handlers' => {'Char' => \&ReadXMLFile::Char,
76 'XMLDecl' => \&ReadXMLFile::XMLDecl,
77 'Entity' => \&ReadXMLFile::Entity,
78 'Doctype' => \&ReadXMLFile::Doctype,
79 'Default' => \&ReadXMLFile::Default
80 });
81
82 $self->{'parser'} = $parser;
83
84 $self->{'content'} = "";
85 $self->{'xmlcontent'} = "";
86 $self->{'record_count'} = 1;
87 $self->{'language'} = "";
88 $self->{'encoding'} = "";
89 $self->{'marc_mapping'} = {};
90 $self->{'current_code'} = "";
91 $self->{'current_tag'} = "";
92 $self->{'current_element'} = "";
93 $self->{'metadata_mapping'} = undef;
94 $self->{'num_processed'} = 0;
95 $self->{'indent'} = 0;
96
97 return bless $self, $class;
98}
99
100
101
102sub get_doctype {
103 my $self = shift(@_);
104
105 return "collection";
106}
107
108
109sub init {
110 my $self = shift (@_);
111 my ($verbosity, $outhandle, $failhandle) = @_;
112
113 ## the mapping file has already been loaded
114 if (defined $self->{'metadata_mapping'} ){
115 $self->SUPER::init(@_);
116 return;
117 }
118
119 # read in the metadata mapping file
120 my $mm_file = &util::locate_config_file($self->{'metadata_mapping_file'});
121
122 if (! defined $mm_file)
123 {
124 my $msg = "MARCXMLPlugin ERROR: Can't locate mapping file \"" .
125 $self->{'metadata_mapping_file'} . "\".\n " .
126 " No metadata will be extracted from MARCXML files.\n";
127
128 print $outhandle $msg;
129 print $failhandle $msg;
130 $self->{'metadata_mapping'} = undef;
131 # We pick up the error in process() if there is no $mm_file
132 # If we exit here, then pluginfo.pl will exit too!
133 }
134 else {
135 $self->{'metadata_mapping'} = &marcmapping::parse_marc_metadata_mapping($mm_file, $outhandle);
136 }
137
138
139 ##map { print STDERR $_."=>".$self->{'metadata_mapping'}->{$_}."\n"; } keys %{$self->{'metadata_mapping'}};
140
141 $self->SUPER::init(@_);
142}
143
144# Called for DOCTYPE declarations - use die to bail out if this doctype
145# is not meant for this plugin
146sub xml_doctype {
147 my $self = shift(@_);
148
149 my ($expat, $name, $sysid, $pubid, $internal) = @_;
150 return;
151
152}
153
154
155sub xml_start_document {
156 my $self = shift(@_);
157
158 my ($expat, $name, $sysid, $pubid, $internal) = @_;
159
160
161 my $file = $self->{'file'};
162 my $filename = $self->{'filename'};
163
164 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
165
166 $self->{'language'} = $language;
167 $self->{'encoding'} = $encoding;
168 $self->{'element_count'} = 1;
169 $self->{'indent'} = 0;
170 my $outhandle = $self->{'outhandle'};
171 print $outhandle "MARCXMLPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
172 print STDERR "<Processing n='$self->{'file'}' p='MARCXMLPlugin'>\n" if $self->{'gli'};
173
174 # reset the base id
175 $self->{'base_oid'} = undef;
176
177}
178
179sub xml_end_document {
180
181}
182
183sub xml_start_tag {
184 my $self = shift;
185 my $expat = shift;
186 my $element = shift;
187
188 my $text = $_;
189 my $escaped_text = $self->escape_text($_);
190
191 $self->{'current_element'} = $element;
192
193 ##get all atributes of this element and store it in a map name=>value
194 my %attr_map = ();
195 my $attrstring = $_;
196 while ($attrstring =~ /(\w+)=\"(\w+)\"/){
197 $attr_map{$1}=$2;
198 $attrstring = $'; #'
199 }
200
201
202 my $processor = $self->{'processor'};
203
204 ##create a new document for each record
205 if ($element eq "record") {
206 my $filename = $self->{'filename'};
207 my $language = $self->{'language'};
208 my $encoding = $self->{'encoding'};
209 my $file = $self->{'file'};
210 my $doc_obj = new doc($filename, undef, $self->{'file_rename_method'});
211 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
212 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
213 my ($filemeta) = $file =~ /([^\\\/]+)$/;
214 $self->set_Source_metadata($doc_obj, $filemeta, $encoding);
215 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$self->{'record_count'}");
216 if ($self->{'cover_image'}) {
217 $self->associate_cover_image($doc_obj, $filename);
218 }
219 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
220 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "MARCXML");
221
222 my $outhandle = $self->{'outhandle'};
223 print $outhandle "Record $self->{'record_count'}\n" if $self->{'verbosity'} > 1;
224
225 $self->{'record_count'}++;
226 $self->{'doc_obj'} = $doc_obj;
227 $self->{'num_processed'}++;
228 if (!defined $self->{'base_oid'}) {
229 $self->SUPER::add_OID($doc_obj);
230 $self->{'base_oid'} = $doc_obj->get_OID();
231 }
232
233
234 }
235
236 ## get the marc code, for example 520
237 if ($element eq "datafield") {
238 if (defined $attr_map{'tag'} and $attr_map{'tag'} ne ""){
239 $self->{'current_tag'} = $attr_map{tag};
240 }
241 }
242
243
244 ## append the subcode to the marc code for example 520a or 520b
245 if ($element eq "subfield"){
246 if (defined $attr_map{'code'} and $attr_map{'code'} ne "" and $self->{'current_tag'} ne ""){
247 $self->{'current_code'} = $attr_map{'code'};
248 }
249 }
250
251 if ($element eq "record"){
252 $self->{'indent'} = 0;
253 $self->{'content'} = "";
254 $self->{'xmlcontent'} = "";
255 }
256 else {
257 if ($element ne "subfield"){
258 $self->{'indent'} = 1;
259 }
260 else{
261 $self->{'indent'} = 2;
262 }
263 }
264
265
266 if ($element eq "collection") {
267 # remember the full start tag for <collection ...>
268 # This is needed to wrap around each <record> when generating its associate MARCXML file
269
270 $self->{'xmlcollectiontag'} = $text;
271 }
272 else {
273 $self->{'content'} .= "<br/>" if ($element ne "record");
274 $self->{'content'} .= $self->calculate_indent($self->{'indent'}).$escaped_text;
275 $self->{'xmlcontent'} .= $text;
276 }
277
278}
279
280
281
282sub xml_end_tag {
283 my $self = shift(@_);
284 my ($expat, $element) = @_;
285
286 my $text = $_;
287 my $escaped_text = $self->escape_text($_);
288
289 if ($element eq "record" and defined $self->{'doc_obj'}) {
290 # process the document
291 my $processor = $self->{'processor'};
292 my $doc_obj = $self->{'doc_obj'};
293 $self->{'content'} .= "<br/>".$escaped_text;
294 $self->{'xmlcontent'} .= $text;
295
296
297 my $top_section = $doc_obj->get_top_section();
298
299 my $tmp_marcxml_filename = &util::get_tmp_filename("xml");
300 if (open (XMLOUT,">$tmp_marcxml_filename")) {
301
302 print XMLOUT "<?xml-stylesheet type=\"text/xsl\" href=\"MARC21slim2English.xsl\"?>\n";
303 my $xml_content = $self->{'xmlcontent'};
304
305 $xml_content = $self->{'xmlcollectiontag'}.$xml_content."</collection>";
306
307 print XMLOUT $xml_content;
308
309 close(XMLOUT);
310
311 $doc_obj->associate_file($tmp_marcxml_filename,"marcxml.xml","text/xml", $top_section);
312
313 # assicate xsl style file for presentation as HTML
314 my $xsl_filename = &util::filename_cat($ENV{'GSDLHOME'},"etc","MARC21slim2English.xsl");
315 $doc_obj->associate_file($xsl_filename,"MARC21slim2English.xsl","text/xml", $top_section);
316
317 }
318 else {
319 my $outhandle = $self->{'outhandle'};
320 print $outhandle "Warning: Unable for write out associated MARCXML file $tmp_marcxml_filename\n";
321 }
322
323 # include any metadata passed in from previous plugins
324 # note that this metadata is associated with the top level section
325
326 $self->extra_metadata ($doc_obj,
327 $doc_obj->get_top_section(),
328 $self->{'metadata'});
329
330
331 $self->add_OID($doc_obj, $self->{'base_oid'}, $self->{'record_count'});
332
333 $doc_obj->add_utf8_text($doc_obj->get_top_section(),$self->{'content'});
334 $processor->process($doc_obj);
335
336 ##clean up
337 $self->{'content'} = "";
338 $self->{'xmlcontent'} = "";
339 $self->{'doc_obj'} = undef;
340 return;
341 }
342
343 ## map the xmlmarc to gsdl metadata
344 if ($element eq "datafield" and defined $self->{'doc_obj'} and defined $self->{'marc_mapping'} and defined $self->{'metadata_mapping'}){
345 my $metadata_mapping = $self->{'metadata_mapping'};
346 my $marc_mapping = $self->{'marc_mapping'};
347 my $doc_obj = $self->{'doc_obj'};
348
349## print STDERR "**** Marc Record\n";
350## map { print STDERR $_."=>".$marc_mapping->{$_}."\n"; } keys %$marc_mapping;
351## print STDERR "**** Metadata Mapping\n";
352## map { print STDERR $_."=>".$metadata_mapping->{$_}."\n"; } keys %$metadata_mapping;
353
354
355 foreach my $marc_field (keys %$metadata_mapping){
356
357 ## test whether this field has subfield
358 my $subfield = undef;
359 if ($marc_field =~ /(\d\d\d)(?:\$|\^)?(\w)/){
360 $marc_field = $1;
361 $subfield = $2;
362 }
363
364 my $matched_field = $marc_mapping->{$marc_field};
365
366 if (defined $matched_field) {
367
368 my $meta_name = undef;
369 my $meta_value = undef;
370
371 if (defined $subfield){
372 $meta_name = $metadata_mapping->{$marc_field."\$".$subfield};
373
374 $meta_value = $matched_field->{$subfield};
375
376 if (!defined $meta_value) {
377 # record read in does not have the specified subfield
378 next;
379 }
380 }
381 else {
382 $meta_name = $metadata_mapping->{$marc_field};
383
384 # no subfield => get all the values
385 foreach my $value (sort keys %{$matched_field}) {
386 $meta_value .= $matched_field->{$value} ." ";
387 }
388
389 }
390
391 ## escape [ and ]
392 $meta_value =~ s/\[/\\\[/g;
393 $meta_value =~ s/\]/\\\]/g;
394 ##print STDERR "$meta_name=$meta_value\n";
395 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(),$meta_name, $meta_value);
396
397 }
398
399 }
400
401 ##clean up
402 $self->{'marc_mapping'} = undef;
403 $self->{'current_tag'} = "";
404 }
405
406 if ($element eq "datafield"){
407 $self->{'indent'} = 1;
408 $self->{'content'} .= "<br/>".$self->calculate_indent($self->{'indent'}).$escaped_text;
409 $self->{'xmlcontent'} .= $text;
410 }
411 else{
412 $self->{'content'} .= $escaped_text;
413 $self->{'xmlcontent'} .= $text;
414 }
415
416}
417
418sub add_OID {
419 my $self = shift (@_);
420 my ($doc_obj, $id, $record_number) = @_;
421
422 my $full_id = $id . "r" . $record_number;
423 if ($self->{'OIDtype'} eq "assigned") {
424 my $identifier = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'OIDmetadata'});
425 if (defined $identifier && $identifier ne "") {
426 $full_id = $identifier;
427 $full_id = &util::tidy_up_oid($full_id);
428 }
429 }
430 $doc_obj->set_OID($full_id);
431}
432
433sub xml_text {
434 my $self = shift(@_);
435 my ($expat) = @_;
436
437 my $text = $_;
438 my $escaped_text = $self->escape_text($_);
439
440 # protect against & in raw text file
441 $text =~ s/&/&amp;/g; # can't have & in raw form, even in 'raw' xml text
442
443 ## store the text of a marc code, for exapmle 520a=>A poem about....
444 if ($self->{'current_element'} eq "subfield" and $self->{'current_code'} ne "" and $_ ne "" ){
445 ##stored it in the marc_mapping
446
447 my $current_tag = $self->{'current_tag'};
448 my $current_code = $self->{'current_code'};
449
450 $self->{'marc_mapping'}->{$current_tag}->{$current_code} .= $_;
451
452 $self->{'current_code'} = "";
453 }
454
455 $self->{'content'} .= $escaped_text;
456 $self->{'xmlcontent'} .= $text;
457
458}
459
460sub calculate_indent{
461 my ($self,$num) = @_;
462
463 my $indent ="";
464
465 for (my $i=0; $i<$num;$i++){
466 $indent .= "&nbsp;&nbsp;&nbsp;&nbsp;";
467 }
468
469 return $indent;
470
471}
472
473sub escape_text {
474 my ($self,$text) = @_;
475 # special characters in the xml encoding
476 $text =~ s/&/&amp;/g; # this has to be first...
477 $text =~ s/</&lt;/g;
478 $text =~ s/>/&gt;/g;
479 $text =~ s/\"/&quot;/g;
480
481 return $text;
482}
483
484
485sub unescape_text {
486 my ($self,$text) = @_;
487 # special characters in the xml encoding
488 $text =~ s/&lt;/</g;
489 $text =~ s/&gt;/>/g;
490 $text =~ s/&quot;/\"/g;
491
492 $text =~ s/&/&amp;/g; # can't have & in raw form, even in unescaped xml!
493
494 return $text;
495}
496
497
4981;
499
500
Note: See TracBrowser for help on using the repository browser.