source: gsdl/trunk/perllib/plugins/MARCXMLPlugin.pm@ 19672

Last change on this file since 19672 was 19672, checked in by kjdon, 15 years ago

if we are doing a mapping from a field with no subfield spacified, then we concatenate the values of all the subfields, separated by space. Have made it only add space between entries, not at the end of each entry (and therefore always a space at the end)

  • Property svn:keywords set to Author Date Id Revision
File size: 14.0 KB
Line 
1###########################################################################
2#
3# MARCXMLPlugin.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes MARCXML documents. Note that this plugin does no
27# syntax checking (though the XML::Parser module tests for
28# well-formedness).
29
30package MARCXMLPlugin;
31
32use ReadXMLFile;
33use ReadTextFile;
34use marcmapping;
35
36use strict;
37no strict 'refs'; # allow filehandles to be variables and viceversa
38
39sub BEGIN {
40 @MARCXMLPlugin::ISA = ('ReadXMLFile', 'ReadTextFile');
41}
42
43my $arguments = [{'name' => "metadata_mapping_file",
44 'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
45 'type' => "string",
46 'deft' => "marc2dc.txt",
47 'reqd' => "no" }];
48
49my $options = { 'name' => "MARCXMLPlugin",
50 'desc' => "{MARCXMLPlugin.desc}",
51 'abstract' => "no",
52 'inherits' => "yes",
53 'args' => $arguments
54 };
55
56sub new {
57 my ($class) = shift (@_);
58 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
59 push(@$pluginlist, $class);
60
61 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
62 push(@{$hashArgOptLists->{"OptList"}},$options);
63
64 # we want to be able to use the textcat methods from ReadTextFile
65 # to get the language and encoding
66 new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists, 1);
67
68 my $self = new ReadXMLFile($pluginlist, $inputargs, $hashArgOptLists);
69
70 # we want to strip namespaces, so have to create a new XML parser
71 my $parser = new XML::Parser('Style' => 'Stream',
72 'Pkg' => 'ReadXMLFile',
73 'PluginObj' => $self,
74 'Namespaces' => 1, # strip out namespaces
75 'Handlers' => {'Char' => \&ReadXMLFile::Char,
76 'XMLDecl' => \&ReadXMLFile::XMLDecl,
77 'Entity' => \&ReadXMLFile::Entity,
78 'Doctype' => \&ReadXMLFile::Doctype,
79 'Default' => \&ReadXMLFile::Default
80 });
81
82 $self->{'parser'} = $parser;
83
84 $self->{'content'} = "";
85 $self->{'xmlcontent'} = "";
86 $self->{'record_count'} = 1;
87 $self->{'language'} = "";
88 $self->{'encoding'} = "";
89 $self->{'marc_mapping'} = {};
90 $self->{'current_code'} = "";
91 $self->{'current_tag'} = "";
92 $self->{'current_element'} = "";
93 $self->{'metadata_mapping'} = undef;
94 $self->{'num_processed'} = 0;
95 $self->{'indent'} = 0;
96
97 return bless $self, $class;
98}
99
100
101
102sub get_doctype {
103 my $self = shift(@_);
104
105 return "collection";
106}
107
108
109sub init {
110 my $self = shift (@_);
111 my ($verbosity, $outhandle, $failhandle) = @_;
112
113 ## the mapping file has already been loaded
114 if (defined $self->{'metadata_mapping'} ){
115 $self->SUPER::init(@_);
116 return;
117 }
118
119 # read in the metadata mapping file
120 my $mm_file = &util::locate_config_file($self->{'metadata_mapping_file'});
121
122 if (! defined $mm_file)
123 {
124 my $msg = "MARCXMLPlugin ERROR: Can't locate mapping file \"" .
125 $self->{'metadata_mapping_file'} . "\".\n " .
126 " No metadata will be extracted from MARCXML files.\n";
127
128 print $outhandle $msg;
129 print $failhandle $msg;
130 $self->{'metadata_mapping'} = undef;
131 # We pick up the error in process() if there is no $mm_file
132 # If we exit here, then pluginfo.pl will exit too!
133 }
134 else {
135 $self->{'metadata_mapping'} = &marcmapping::parse_marc_metadata_mapping($mm_file, $outhandle);
136 }
137
138
139 ##map { print STDERR $_."=>".$self->{'metadata_mapping'}->{$_}."\n"; } keys %{$self->{'metadata_mapping'}};
140
141 $self->SUPER::init(@_);
142}
143
144# Called for DOCTYPE declarations - use die to bail out if this doctype
145# is not meant for this plugin
146sub xml_doctype {
147 my $self = shift(@_);
148
149 my ($expat, $name, $sysid, $pubid, $internal) = @_;
150 return;
151
152}
153
154
155sub xml_start_document {
156 my $self = shift(@_);
157
158 my ($expat, $name, $sysid, $pubid, $internal) = @_;
159
160
161 my $file = $self->{'file'};
162 my $filename = $self->{'filename'};
163
164 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
165
166 $self->{'language'} = $language;
167 $self->{'encoding'} = $encoding;
168 $self->{'element_count'} = 1;
169 $self->{'indent'} = 0;
170 my $outhandle = $self->{'outhandle'};
171 print $outhandle "MARCXMLPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
172 print STDERR "<Processing n='$self->{'file'}' p='MARCXMLPlugin'>\n" if $self->{'gli'};
173
174 # reset the base id
175 $self->{'base_oid'} = undef;
176
177}
178
179sub xml_end_document {
180
181}
182
183sub xml_start_tag {
184 my $self = shift;
185 my $expat = shift;
186 my $element = shift;
187
188 my $text = $_;
189 my $escaped_text = $self->escape_text($_);
190
191 $self->{'current_element'} = $element;
192
193 ##get all atributes of this element and store it in a map name=>value
194 my %attr_map = ();
195 my $attrstring = $_;
196 while ($attrstring =~ /(\w+)=\"(\w+)\"/){
197 $attr_map{$1}=$2;
198 $attrstring = $'; #'
199 }
200
201
202 my $processor = $self->{'processor'};
203
204 ##create a new document for each record
205 if ($element eq "record") {
206 my $filename = $self->{'filename'};
207 my $language = $self->{'language'};
208 my $encoding = $self->{'encoding'};
209 my $file = $self->{'file'};
210 my $doc_obj = new doc($filename, undef, $self->{'file_rename_method'});
211 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
212 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
213 my ($filemeta) = $file =~ /([^\\\/]+)$/;
214 $self->set_Source_metadata($doc_obj, $filemeta, $encoding);
215 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$self->{'record_count'}");
216 if ($self->{'cover_image'}) {
217 $self->associate_cover_image($doc_obj, $filename);
218 }
219 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
220 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "MARCXML");
221
222 my $outhandle = $self->{'outhandle'};
223 print $outhandle "Record $self->{'record_count'}\n" if $self->{'verbosity'} > 1;
224
225 $self->{'record_count'}++;
226 $self->{'doc_obj'} = $doc_obj;
227 $self->{'num_processed'}++;
228 if (!defined $self->{'base_oid'}) {
229 $self->SUPER::add_OID($doc_obj);
230 $self->{'base_oid'} = $doc_obj->get_OID();
231 }
232
233
234 }
235
236 ## get the marc code, for example 520
237 if ($element eq "datafield") {
238 if (defined $attr_map{'tag'} and $attr_map{'tag'} ne ""){
239 $self->{'current_tag'} = $attr_map{tag};
240 }
241 }
242
243
244 ## append the subcode to the marc code for example 520a or 520b
245 if ($element eq "subfield"){
246 if (defined $attr_map{'code'} and $attr_map{'code'} ne "" and $self->{'current_tag'} ne ""){
247 $self->{'current_code'} = $attr_map{'code'};
248 }
249 }
250
251 if ($element eq "record"){
252 $self->{'indent'} = 0;
253 $self->{'content'} = "";
254 $self->{'xmlcontent'} = "";
255 }
256 else {
257 if ($element ne "subfield"){
258 $self->{'indent'} = 1;
259 }
260 else{
261 $self->{'indent'} = 2;
262 }
263 }
264
265
266 if ($element eq "collection") {
267 # remember the full start tag for <collection ...>
268 # This is needed to wrap around each <record> when generating its associate MARCXML file
269
270 $self->{'xmlcollectiontag'} = $text;
271 }
272 else {
273 $self->{'content'} .= "<br/>" if ($element ne "record");
274 $self->{'content'} .= $self->calculate_indent($self->{'indent'}).$escaped_text;
275 $self->{'xmlcontent'} .= $text;
276 }
277
278}
279
280
281
282sub xml_end_tag {
283 my $self = shift(@_);
284 my ($expat, $element) = @_;
285
286 my $text = $_;
287 my $escaped_text = $self->escape_text($_);
288
289 if ($element eq "record" and defined $self->{'doc_obj'}) {
290 # process the document
291 my $processor = $self->{'processor'};
292 my $doc_obj = $self->{'doc_obj'};
293 $self->{'content'} .= "<br/>".$escaped_text;
294 $self->{'xmlcontent'} .= $text;
295
296
297 my $top_section = $doc_obj->get_top_section();
298
299 my $tmp_marcxml_filename = &util::get_tmp_filename("xml");
300 if (open (XMLOUT,">$tmp_marcxml_filename")) {
301
302 print XMLOUT "<?xml-stylesheet type=\"text/xsl\" href=\"MARC21slim2English.xsl\"?>\n";
303 my $xml_content = $self->{'xmlcontent'};
304
305 $xml_content = $self->{'xmlcollectiontag'}.$xml_content."</collection>";
306
307 print XMLOUT $xml_content;
308
309 close(XMLOUT);
310
311 $doc_obj->associate_file($tmp_marcxml_filename,"marcxml.xml","text/xml", $top_section);
312
313 # assicate xsl style file for presentation as HTML
314 my $xsl_filename = &util::filename_cat($ENV{'GSDLHOME'},"etc","MARC21slim2English.xsl");
315 $doc_obj->associate_file($xsl_filename,"MARC21slim2English.xsl","text/xml", $top_section);
316
317 }
318 else {
319 my $outhandle = $self->{'outhandle'};
320 print $outhandle "Warning: Unable for write out associated MARCXML file $tmp_marcxml_filename\n";
321 }
322
323 # include any metadata passed in from previous plugins
324 # note that this metadata is associated with the top level section
325
326 $self->extra_metadata ($doc_obj,
327 $doc_obj->get_top_section(),
328 $self->{'metadata'});
329
330
331 $self->add_OID($doc_obj, $self->{'base_oid'}, $self->{'record_count'});
332
333 $doc_obj->add_utf8_text($doc_obj->get_top_section(),$self->{'content'});
334 $processor->process($doc_obj);
335
336 ##clean up
337 $self->{'content'} = "";
338 $self->{'xmlcontent'} = "";
339 $self->{'doc_obj'} = undef;
340 return;
341 }
342
343 ## map the xmlmarc to gsdl metadata
344 if ($element eq "datafield" and defined $self->{'doc_obj'} and defined $self->{'marc_mapping'} and defined $self->{'metadata_mapping'}){
345 my $metadata_mapping = $self->{'metadata_mapping'};
346 my $marc_mapping = $self->{'marc_mapping'};
347 my $doc_obj = $self->{'doc_obj'};
348
349 ##print STDERR "**** Marc Record\n";
350 ##map { print STDERR $_."=>".$marc_mapping->{$_}."\n"; } keys %$marc_mapping;
351 ##print STDERR "**** Metadata Mapping\n";
352 ##map { print STDERR $_."=>".$metadata_mapping->{$_}."\n"; } keys %$metadata_mapping;
353
354
355 foreach my $marc_field (keys %$metadata_mapping){
356
357 ## test whether this field has subfield
358 my $subfield = undef;
359 if ($marc_field =~ /(\d\d\d)(?:\$|\^)?(\w)/){
360 $marc_field = $1;
361 $subfield = $2;
362 }
363
364 my $matched_field = $marc_mapping->{$marc_field};
365
366 if (defined $matched_field) {
367
368 my $meta_name = undef;
369 my $meta_value = undef;
370
371 if (defined $subfield){
372 $meta_name = $metadata_mapping->{$marc_field."\$".$subfield};
373
374 $meta_value = $matched_field->{$subfield};
375
376 if (!defined $meta_value) {
377 # record read in does not have the specified subfield
378 next;
379 }
380 }
381 else {
382 $meta_name = $metadata_mapping->{$marc_field};
383
384 # no subfield => get all the values
385 my $first = 1;
386 foreach my $value (sort keys %{$matched_field}) {
387 if ($first) {
388 $meta_value = $matched_field->{$value};
389 $first = 0;
390 } else {
391 $meta_value .= " " . $matched_field->{$value};
392 }
393 }
394
395 }
396
397 ## escape [ and ]
398 $meta_value =~ s/\[/\\\[/g;
399 $meta_value =~ s/\]/\\\]/g;
400 ##print STDERR "$meta_name=$meta_value\n";
401 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(),$meta_name, $meta_value);
402
403 }
404
405 }
406
407 ##clean up
408 $self->{'marc_mapping'} = undef;
409 $self->{'current_tag'} = "";
410 }
411
412 if ($element eq "datafield"){
413 $self->{'indent'} = 1;
414 $self->{'content'} .= "<br/>".$self->calculate_indent($self->{'indent'}).$escaped_text;
415 $self->{'xmlcontent'} .= $text;
416 }
417 else{
418 $self->{'content'} .= $escaped_text;
419 $self->{'xmlcontent'} .= $text;
420 }
421
422}
423
424sub add_OID {
425 my $self = shift (@_);
426 my ($doc_obj, $id, $record_number) = @_;
427
428 my $full_id = $id . "r" . $record_number;
429 if ($self->{'OIDtype'} eq "assigned") {
430 my $identifier = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'OIDmetadata'});
431 if (defined $identifier && $identifier ne "") {
432 $full_id = $identifier;
433 $full_id = &util::tidy_up_oid($full_id);
434 }
435 }
436 $doc_obj->set_OID($full_id);
437}
438
439sub xml_text {
440 my $self = shift(@_);
441 my ($expat) = @_;
442
443 my $text = $_;
444 my $escaped_text = $self->escape_text($_);
445
446 # protect against & in raw text file
447 $text =~ s/&/&amp;/g; # can't have & in raw form, even in 'raw' xml text
448
449 ## store the text of a marc code, for exapmle 520a=>A poem about....
450 if ($self->{'current_element'} eq "subfield" and $self->{'current_code'} ne "" and $_ ne "" ){
451 ##stored it in the marc_mapping
452
453 my $current_tag = $self->{'current_tag'};
454 my $current_code = $self->{'current_code'};
455
456 $self->{'marc_mapping'}->{$current_tag}->{$current_code} .= $_;
457
458 $self->{'current_code'} = "";
459 }
460
461 $self->{'content'} .= $escaped_text;
462 $self->{'xmlcontent'} .= $text;
463
464}
465
466sub calculate_indent{
467 my ($self,$num) = @_;
468
469 my $indent ="";
470
471 for (my $i=0; $i<$num;$i++){
472 $indent .= "&nbsp;&nbsp;&nbsp;&nbsp;";
473 }
474
475 return $indent;
476
477}
478
479sub escape_text {
480 my ($self,$text) = @_;
481 # special characters in the xml encoding
482 $text =~ s/&/&amp;/g; # this has to be first...
483 $text =~ s/</&lt;/g;
484 $text =~ s/>/&gt;/g;
485 $text =~ s/\"/&quot;/g;
486
487 return $text;
488}
489
490
491sub unescape_text {
492 my ($self,$text) = @_;
493 # special characters in the xml encoding
494 $text =~ s/&lt;/</g;
495 $text =~ s/&gt;/>/g;
496 $text =~ s/&quot;/\"/g;
497
498 $text =~ s/&/&amp;/g; # can't have & in raw form, even in unescaped xml!
499
500 return $text;
501}
502
503
5041;
505
506
Note: See TracBrowser for help on using the repository browser.