source: gsdl/trunk/perllib/plugins/MARCXMLPlugin.pm@ 18900

Last change on this file since 18900 was 18900, checked in by kjdon, 15 years ago

this relied on the fact that the parser created in ReadXMLFile strips out namepsaces. It no longer does this, so now we need to create our own xml parser that does do it.

  • Property svn:keywords set to Author Date Id Revision
File size: 14.0 KB
Line 
1###########################################################################
2#
3# MARCXMLPlugin.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes MARCXML documents. Note that this plugin does no
27# syntax checking (though the XML::Parser module tests for
28# well-formedness).
29
30package MARCXMLPlugin;
31
32use ReadXMLFile;
33use ReadTextFile;
34use marcmapping;
35
36use strict;
37no strict 'refs'; # allow filehandles to be variables and viceversa
38
39sub BEGIN {
40 @MARCXMLPlugin::ISA = ('ReadXMLFile', 'ReadTextFile');
41}
42
43my $arguments = [{'name' => "metadata_mapping_file",
44 'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
45 'type' => "string",
46 'deft' => "marc2dc.txt",
47 'reqd' => "no" }];
48
49my $options = { 'name' => "MARCXMLPlugin",
50 'desc' => "{MARCXMLPlugin.desc}",
51 'abstract' => "no",
52 'inherits' => "yes",
53 'args' => $arguments
54 };
55
56sub new {
57 my ($class) = shift (@_);
58 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
59 push(@$pluginlist, $class);
60
61 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
62 push(@{$hashArgOptLists->{"OptList"}},$options);
63
64 # we want to be able to use the textcat methods from ReadTextFile
65 # to get the language and encoding
66 new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists, 1);
67
68 my $self = new ReadXMLFile($pluginlist, $inputargs, $hashArgOptLists);
69
70 # we want to strip namespaces, so have to create a new XML parser
71 my $parser = new XML::Parser('Style' => 'Stream',
72 'Pkg' => 'ReadXMLFile',
73 'PluginObj' => $self,
74 'Namespaces' => 1, # strip out namespaces
75 'Handlers' => {'Char' => \&ReadXMLFile::Char,
76 'XMLDecl' => \&ReadXMLFile::XMLDecl,
77 'Entity' => \&ReadXMLFile::Entity,
78 'Doctype' => \&ReadXMLFile::Doctype,
79 'Default' => \&ReadXMLFile::Default
80 });
81
82 $self->{'parser'} = $parser;
83
84 $self->{'content'} = "";
85 $self->{'xmlcontent'} = "";
86 $self->{'record_count'} = 1;
87 $self->{'language'} = "";
88 $self->{'encoding'} = "";
89 $self->{'marc_mapping'} = {};
90 $self->{'current_code'} = "";
91 $self->{'current_tag'} = "";
92 $self->{'current_element'} = "";
93 $self->{'metadata_mapping'} = undef;
94 $self->{'num_processed'} = 0;
95 $self->{'indent'} = 0;
96
97 return bless $self, $class;
98}
99
100
101
102sub get_doctype {
103 my $self = shift(@_);
104
105 return "collection";
106}
107
108
109sub init {
110 my $self = shift (@_);
111 my ($verbosity, $outhandle, $failhandle) = @_;
112
113 ## the mapping file has already been loaded
114 if (defined $self->{'metadata_mapping'} ){
115 $self->SUPER::init(@_);
116 return;
117 }
118
119 # read in the metadata mapping files
120 my $mm_files = &util::locate_config_files($self->{'metadata_mapping_file'});
121
122
123 if (scalar(@$mm_files)==0)
124 {
125 my $msg = "MARCXMLPlugin ERROR: Can't locate mapping file \"" .
126 $self->{'metadata_mapping_file'} . "\".\n " .
127 " No metadata will be extracted from MARCXML files.\n";
128
129 print $outhandle $msg;
130 print $failhandle $msg;
131 $self->{'metadata_mapping'} = undef;
132 # We pick up the error in process() if there is no $mm_file
133 # If we exit here, then pluginfo.pl will exit too!
134 }
135 else {
136 $self->{'metadata_mapping'} = &marcmapping::parse_marc_metadata_mapping($mm_files, $outhandle);
137 }
138
139
140 ##map { print STDERR $_."=>".$self->{'metadata_mapping'}->{$_}."\n"; } keys %{$self->{'metadata_mapping'}};
141
142 $self->SUPER::init(@_);
143}
144
145# Called for DOCTYPE declarations - use die to bail out if this doctype
146# is not meant for this plugin
147sub xml_doctype {
148 my $self = shift(@_);
149
150 my ($expat, $name, $sysid, $pubid, $internal) = @_;
151 return;
152
153}
154
155
156sub xml_start_document {
157 my $self = shift(@_);
158
159 my ($expat, $name, $sysid, $pubid, $internal) = @_;
160
161
162 my $file = $self->{'file'};
163 my $filename = $self->{'filename'};
164
165 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
166
167 $self->{'language'} = $language;
168 $self->{'encoding'} = $encoding;
169 $self->{'element_count'} = 1;
170 $self->{'indent'} = 0;
171 my $outhandle = $self->{'outhandle'};
172 print $outhandle "MARCXMLPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
173 print STDERR "<Processing n='$self->{'file'}' p='MARCXMLPlugin'>\n" if $self->{'gli'};
174
175 # reset the base id
176 $self->{'base_oid'} = undef;
177
178}
179
180sub xml_end_document {
181
182}
183
184sub xml_start_tag {
185 my $self = shift;
186 my $expat = shift;
187 my $element = shift;
188
189 my $text = $_;
190 my $escaped_text = $self->escape_text($_);
191
192 $self->{'current_element'} = $element;
193
194 ##get all atributes of this element and store it in a map name=>value
195 my %attr_map = ();
196 my $attrstring = $_;
197 while ($attrstring =~ /(\w+)=\"(\w+)\"/){
198 $attr_map{$1}=$2;
199 $attrstring = $'; #'
200 }
201
202
203 my $processor = $self->{'processor'};
204
205 ##create a new document for each record
206 if ($element eq "record") {
207 my $filename = $self->{'filename'};
208 my $language = $self->{'language'};
209 my $encoding = $self->{'encoding'};
210 my $file = $self->{'file'};
211 my $doc_obj = new doc($filename, undef, $self->{'file_rename_method'});
212 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
213 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
214 my ($filemeta) = $file =~ /([^\\\/]+)$/;
215 $self->set_Source_metadata($doc_obj, $filemeta, $encoding);
216 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$self->{'record_count'}");
217 if ($self->{'cover_image'}) {
218 $self->associate_cover_image($doc_obj, $filename);
219 }
220 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
221 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "MARCXML");
222
223 my $outhandle = $self->{'outhandle'};
224 print $outhandle "Record $self->{'record_count'}\n" if $self->{'verbosity'} > 1;
225
226 $self->{'record_count'}++;
227 $self->{'doc_obj'} = $doc_obj;
228 $self->{'num_processed'}++;
229 if (!defined $self->{'base_oid'}) {
230 $self->SUPER::add_OID($doc_obj);
231 $self->{'base_oid'} = $doc_obj->get_OID();
232 }
233
234
235 }
236
237 ## get the marc code, for example 520
238 if ($element eq "datafield") {
239 if (defined $attr_map{'tag'} and $attr_map{'tag'} ne ""){
240 $self->{'current_tag'} = $attr_map{tag};
241 }
242 }
243
244
245 ## append the subcode to the marc code for example 520a or 520b
246 if ($element eq "subfield"){
247 if (defined $attr_map{'code'} and $attr_map{'code'} ne "" and $self->{'current_tag'} ne ""){
248 $self->{'current_code'} = $attr_map{'code'};
249 }
250 }
251
252 if ($element eq "record"){
253 $self->{'indent'} = 0;
254 $self->{'content'} = "";
255 $self->{'xmlcontent'} = "";
256 }
257 else {
258 if ($element ne "subfield"){
259 $self->{'indent'} = 1;
260 }
261 else{
262 $self->{'indent'} = 2;
263 }
264 }
265
266
267 if ($element eq "collection") {
268 # remember the full start tag for <collection ...>
269 # This is needed to wrap around each <record> when generating its associate MARCXML file
270
271 $self->{'xmlcollectiontag'} = $text;
272 }
273 else {
274 $self->{'content'} .= "<br/>" if ($element ne "record");
275 $self->{'content'} .= $self->calculate_indent($self->{'indent'}).$escaped_text;
276 $self->{'xmlcontent'} .= $text;
277 }
278
279}
280
281
282
283sub xml_end_tag {
284 my $self = shift(@_);
285 my ($expat, $element) = @_;
286
287 my $text = $_;
288 my $escaped_text = $self->escape_text($_);
289
290 if ($element eq "record" and defined $self->{'doc_obj'}) {
291 # process the document
292 my $processor = $self->{'processor'};
293 my $doc_obj = $self->{'doc_obj'};
294 $self->{'content'} .= "<br/>".$escaped_text;
295 $self->{'xmlcontent'} .= $text;
296
297
298 my $top_section = $doc_obj->get_top_section();
299
300 my $tmp_marcxml_filename = &util::get_tmp_filename("xml");
301 if (open (XMLOUT,">$tmp_marcxml_filename")) {
302
303 print XMLOUT "<?xml-stylesheet type=\"text/xsl\" href=\"MARC21slim2English.xsl\"?>\n";
304 my $xml_content = $self->{'xmlcontent'};
305
306 $xml_content = $self->{'xmlcollectiontag'}.$xml_content."</collection>";
307
308 print XMLOUT $xml_content;
309
310 close(XMLOUT);
311
312 $doc_obj->associate_file($tmp_marcxml_filename,"marcxml.xml","text/xml", $top_section);
313
314 # assicate xsl style file for presentation as HTML
315 my $xsl_filename = &util::filename_cat($ENV{'GSDLHOME'},"etc","MARC21slim2English.xsl");
316 $doc_obj->associate_file($xsl_filename,"MARC21slim2English.xsl","text/xml", $top_section);
317
318 }
319 else {
320 my $outhandle = $self->{'outhandle'};
321 print $outhandle "Warning: Unable for write out associated MARCXML file $tmp_marcxml_filename\n";
322 }
323
324 # include any metadata passed in from previous plugins
325 # note that this metadata is associated with the top level section
326
327 $self->extra_metadata ($doc_obj,
328 $doc_obj->get_top_section(),
329 $self->{'metadata'});
330
331
332 $self->add_OID($doc_obj, $self->{'base_oid'}, $self->{'record_count'});
333
334 $doc_obj->add_utf8_text($doc_obj->get_top_section(),$self->{'content'});
335 $processor->process($doc_obj);
336
337 ##clean up
338 $self->{'content'} = "";
339 $self->{'xmlcontent'} = "";
340 $self->{'doc_obj'} = undef;
341 return;
342 }
343
344 ## map the xmlmarc to gsdl metadata
345 if ($element eq "datafield" and defined $self->{'doc_obj'} and defined $self->{'marc_mapping'} and defined $self->{'metadata_mapping'}){
346 my $metadata_mapping = $self->{'metadata_mapping'};
347 my $marc_mapping = $self->{'marc_mapping'};
348 my $doc_obj = $self->{'doc_obj'};
349
350## print STDERR "**** Marc Record\n";
351## map { print STDERR $_."=>".$marc_mapping->{$_}."\n"; } keys %$marc_mapping;
352## print STDERR "**** Metadata Mapping\n";
353## map { print STDERR $_."=>".$metadata_mapping->{$_}."\n"; } keys %$metadata_mapping;
354
355
356 foreach my $marc_field (keys %$metadata_mapping){
357
358 ## test whether this field has subfield
359 my $subfield = undef;
360 if ($marc_field =~ /(\d\d\d)(?:\$|\^)?(\w)/){
361 $marc_field = $1;
362 $subfield = $2;
363 }
364
365 my $matched_field = $marc_mapping->{$marc_field};
366
367 if (defined $matched_field) {
368
369 my $meta_name = undef;
370 my $meta_value = undef;
371
372 if (defined $subfield){
373 $meta_name = $metadata_mapping->{$marc_field."\$".$subfield};
374
375 $meta_value = $matched_field->{$subfield};
376
377 if (!defined $meta_value) {
378 # record read in does not have the specified subfield
379 next;
380 }
381 }
382 else {
383 $meta_name = $metadata_mapping->{$marc_field};
384
385 # no subfield => get all the values
386 foreach my $value (sort keys %{$matched_field}) {
387 $meta_value .= $matched_field->{$value} ." ";
388 }
389
390 }
391
392 ## escape [ and ]
393 $meta_value =~ s/\[/\\\[/g;
394 $meta_value =~ s/\]/\\\]/g;
395 ##print STDERR "$meta_name=$meta_value\n";
396 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(),$meta_name, $meta_value);
397
398 }
399
400 }
401
402 ##clean up
403 $self->{'marc_mapping'} = undef;
404 $self->{'current_tag'} = "";
405 }
406
407 if ($element eq "datafield"){
408 $self->{'indent'} = 1;
409 $self->{'content'} .= "<br/>".$self->calculate_indent($self->{'indent'}).$escaped_text;
410 $self->{'xmlcontent'} .= $text;
411 }
412 else{
413 $self->{'content'} .= $escaped_text;
414 $self->{'xmlcontent'} .= $text;
415 }
416
417}
418
419sub add_OID {
420 my $self = shift (@_);
421 my ($doc_obj, $id, $record_number) = @_;
422
423 my $full_id = $id . "r" . $record_number;
424 if ($self->{'OIDtype'} eq "assigned") {
425 my $identifier = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'OIDmetadata'});
426 if (defined $identifier && $identifier ne "") {
427 $full_id = $identifier;
428 $full_id =~ s/\.//g; #remove any periods
429 if ($full_id =~ /^[\d]*$/) {
430 $full_id = "D" . $full_id;
431 print STDERR "OID only contains numbers, adding a D\n";
432 }
433 }
434 }
435 $doc_obj->set_OID($full_id);
436}
437
438sub xml_text {
439 my $self = shift(@_);
440 my ($expat) = @_;
441
442 my $text = $_;
443 my $escaped_text = $self->escape_text($_);
444
445 # protect against & in raw text file
446 $text =~ s/&/&amp;/g; # can't have & in raw form, even in 'raw' xml text
447
448 ## store the text of a marc code, for exapmle 520a=>A poem about....
449 if ($self->{'current_element'} eq "subfield" and $self->{'current_code'} ne "" and $_ ne "" ){
450 ##stored it in the marc_mapping
451
452 my $current_tag = $self->{'current_tag'};
453 my $current_code = $self->{'current_code'};
454
455 $self->{'marc_mapping'}->{$current_tag}->{$current_code} .= $_;
456
457 $self->{'current_code'} = "";
458 }
459
460 $self->{'content'} .= $escaped_text;
461 $self->{'xmlcontent'} .= $text;
462
463}
464
465sub calculate_indent{
466 my ($self,$num) = @_;
467
468 my $indent ="";
469
470 for (my $i=0; $i<$num;$i++){
471 $indent .= "&nbsp;&nbsp;&nbsp;&nbsp;";
472 }
473
474 return $indent;
475
476}
477
478sub escape_text {
479 my ($self,$text) = @_;
480 # special characters in the xml encoding
481 $text =~ s/&/&amp;/g; # this has to be first...
482 $text =~ s/</&lt;/g;
483 $text =~ s/>/&gt;/g;
484 $text =~ s/\"/&quot;/g;
485
486 return $text;
487}
488
489
490sub unescape_text {
491 my ($self,$text) = @_;
492 # special characters in the xml encoding
493 $text =~ s/&lt;/</g;
494 $text =~ s/&gt;/>/g;
495 $text =~ s/&quot;/\"/g;
496
497 $text =~ s/&/&amp;/g; # can't have & in raw form, even in unescaped xml!
498
499 return $text;
500}
501
502
5031;
504
505
Note: See TracBrowser for help on using the repository browser.