source: main/trunk/greenstone2/perllib/plugins/MARCXMLPlugin.pm@ 28803

Last change on this file since 28803 was 28803, checked in by ak19, 8 years ago

Testing with accented characters in MARC data showed up problems in how text strings were being handled in the XML-Parsing MARCXMLPlugin. These changes fix this.

  • Property svn:keywords set to Author Date Id Revision
File size: 15.5 KB
Line 
1###########################################################################
2#
3# MARCXMLPlugin.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes MARCXML documents. Note that this plugin does no
27# syntax checking (though the XML::Parser module tests for
28# well-formedness).
29
30package MARCXMLPlugin;
31
32use ReadXMLFile;
33use ReadTextFile;
34use MetadataRead;
35use marcmapping;
36
37use strict;
38no strict 'refs'; # allow filehandles to be variables and viceversa
39
40# methods with identical signatures take precedence in the order given in the ISA list.
41sub BEGIN {
42 @MARCXMLPlugin::ISA = ('MetadataRead', 'ReadXMLFile', 'ReadTextFile');
43}
44
45my $arguments = [{'name' => "metadata_mapping_file",
46 'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
47 'type' => "string",
48 'deft' => "marc2dc.txt",
49 'reqd' => "no" },
50 { 'name' => "process_exp",
51 'desc' => "{BasePlugin.process_exp}",
52 'type' => "regexp",
53 'deft' => &get_default_process_exp(),
54 'reqd' => "no" }];
55
56my $options = { 'name' => "MARCXMLPlugin",
57 'desc' => "{MARCXMLPlugin.desc}",
58 'abstract' => "no",
59 'inherits' => "yes",
60 'args' => $arguments
61 };
62
63
64sub new {
65 my ($class) = shift (@_);
66 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
67 push(@$pluginlist, $class);
68
69 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
70 push(@{$hashArgOptLists->{"OptList"}},$options);
71
72 # we want to be able to use the textcat methods from ReadTextFile
73 # to get the language and encoding
74 new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists, 1);
75
76 my $self = new ReadXMLFile($pluginlist, $inputargs, $hashArgOptLists);
77
78 # we want to strip namespaces, so have to create a new XML parser
79 my $parser = new XML::Parser('Style' => 'Stream',
80 'Pkg' => 'ReadXMLFile',
81 'PluginObj' => $self,
82 'Namespaces' => 1, # strip out namespaces
83 'Handlers' => {'Char' => \&Char,
84 'XMLDecl' => \&ReadXMLFile::XMLDecl,
85 'Entity' => \&ReadXMLFile::Entity,
86 'Doctype' => \&ReadXMLFile::Doctype,
87 'Default' => \&ReadXMLFile::Default
88 });
89
90 $self->{'parser'} = $parser;
91
92 $self->{'content'} = "";
93 $self->{'xmlcontent'} = "";
94 $self->{'record_count'} = 1;
95 $self->{'language'} = "";
96 $self->{'encoding'} = "";
97 $self->{'marc_mapping'} = {};
98 $self->{'current_code'} = "";
99 $self->{'current_tag'} = "";
100 $self->{'current_element'} = "";
101 $self->{'metadata_mapping'} = undef;
102 $self->{'num_processed'} = 0;
103 $self->{'indent'} = 0;
104
105 # in case we have individual records without a collection tag
106 $self->{'xmlcollectiontag'} = "<collection>";
107 return bless $self, $class;
108}
109
110
111sub get_default_process_exp {
112 my $self = shift (@_);
113
114 return q^(?i)\.xml$^;
115}
116
117sub get_doctype {
118 my $self = shift(@_);
119
120 return "(collection|record)";
121}
122
123
124sub init {
125 my $self = shift (@_);
126 my ($verbosity, $outhandle, $failhandle) = @_;
127
128 ## the mapping file has already been loaded
129 if (defined $self->{'metadata_mapping'} ){
130 $self->SUPER::init(@_);
131 return;
132 }
133
134 # read in the metadata mapping file
135 my $mm_file = &util::locate_config_file($self->{'metadata_mapping_file'});
136
137 if (! defined $mm_file)
138 {
139 my $msg = "MARCXMLPlugin ERROR: Can't locate mapping file \"" .
140 $self->{'metadata_mapping_file'} . "\".\n " .
141 " No metadata will be extracted from MARCXML files.\n";
142
143 print $outhandle $msg;
144 print $failhandle $msg;
145 $self->{'metadata_mapping'} = undef;
146 # We pick up the error in process() if there is no $mm_file
147 # If we exit here, then pluginfo.pl will exit too!
148 }
149 else {
150 $self->{'metadata_mapping'} = &marcmapping::parse_marc_metadata_mapping($mm_file, $outhandle);
151 }
152
153
154 ##map { print STDERR $_."=>".$self->{'metadata_mapping'}->{$_}."\n"; } keys %{$self->{'metadata_mapping'}};
155
156 $self->SUPER::init(@_);
157}
158
159
160sub Char {
161 # ReadXMLPlugin currently has 'use bytes' here, apparently to sort out
162 # an encoding issue. Possible that the time that 'use bytes' was
163 # added in (to fix a problem) our understanding of Unicode in Perl
164 # wasn't completely correct
165
166 # Trialing out this new version (without 'use bytes') here for MarcXML data
167
168 $_[0]->{'Text'} .= $_[1];
169 return undef;
170}
171
172# Called for DOCTYPE declarations - use die to bail out if this doctype
173# is not meant for this plugin
174sub xml_doctype {
175 my $self = shift(@_);
176
177 my ($expat, $name, $sysid, $pubid, $internal) = @_;
178 return;
179
180}
181
182
183sub xml_start_document {
184 my $self = shift(@_);
185
186 my ($expat, $name, $sysid, $pubid, $internal) = @_;
187
188
189 my $file = $self->{'file'};
190 my $filename = $self->{'filename'};
191
192 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
193
194 $self->{'language'} = $language;
195 $self->{'encoding'} = $encoding;
196 $self->{'element_count'} = 1;
197 $self->{'indent'} = 0;
198 my $outhandle = $self->{'outhandle'};
199 print $outhandle "MARCXMLPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
200 print STDERR "<Processing n='$self->{'file'}' p='MARCXMLPlugin'>\n" if $self->{'gli'};
201
202 # reset the base id
203 $self->{'base_oid'} = undef;
204
205}
206
207sub xml_end_document {
208
209}
210
211sub xml_start_tag {
212 my $self = shift;
213 my $expat = shift;
214 my $element = shift;
215
216 my $text = $_;
217 my $escaped_text = $self->escape_text($_);
218
219 $self->{'current_element'} = $element;
220
221 ##get all atributes of this element and store it in a map name=>value
222 my %attr_map = ();
223 my $attrstring = $_;
224 while ($attrstring =~ /(\w+)=\"(\w+)\"/){
225 $attr_map{$1}=$2;
226 $attrstring = $'; #'
227 }
228
229
230 my $processor = $self->{'processor'};
231 my $metadata = $self->{'metadata'};
232
233 ##create a new document for each record
234 if ($element eq "record") {
235 my $filename = $self->{'filename'};
236 my $language = $self->{'language'};
237 my $encoding = $self->{'encoding'};
238 my $file = $self->{'file'};
239 my $doc_obj = new doc($filename, undef, $self->{'file_rename_method'});
240 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
241 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
242
243 my ($filemeta) = $file =~ /([^\\\/]+)$/;
244 my $plugin_filename_encoding = $self->{'filename_encoding'};
245 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
246 $self->set_Source_metadata($doc_obj, $filename, $filename_encoding);
247
248 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$self->{'record_count'}");
249 if ($self->{'cover_image'}) {
250 $self->associate_cover_image($doc_obj, $filename);
251 }
252 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
253 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "MARCXML");
254
255 my $outhandle = $self->{'outhandle'};
256 print $outhandle "Record $self->{'record_count'}\n" if $self->{'verbosity'} > 1;
257
258 $self->{'record_count'}++;
259 $self->{'doc_obj'} = $doc_obj;
260 $self->{'num_processed'}++;
261 if (!defined $self->{'base_oid'}) {
262 $self->SUPER::add_OID($doc_obj);
263 $self->{'base_oid'} = $doc_obj->get_OID();
264 }
265
266
267 }
268
269 ## get the marc code, for example 520
270 if ($element eq "datafield") {
271 if (defined $attr_map{'tag'} and $attr_map{'tag'} ne ""){
272 $self->{'current_tag'} = $attr_map{tag};
273 }
274 }
275
276
277 ## append the subcode to the marc code for example 520a or 520b
278 if ($element eq "subfield"){
279 if (defined $attr_map{'code'} and $attr_map{'code'} ne "" and $self->{'current_tag'} ne ""){
280 $self->{'current_code'} = $attr_map{'code'};
281 }
282 }
283
284 if ($element eq "record"){
285 $self->{'indent'} = 0;
286 $self->{'content'} = "";
287 $self->{'xmlcontent'} = "";
288 }
289 else {
290 if ($element ne "subfield"){
291 $self->{'indent'} = 1;
292 }
293 else{
294 $self->{'indent'} = 2;
295 }
296 }
297
298
299 if ($element eq "collection") {
300 # remember the full start tag for <collection ...>
301 # This is needed to wrap around each <record> when generating its associate MARCXML file
302
303 $self->{'xmlcollectiontag'} = $text;
304 }
305 else {
306 $self->{'content'} .= "<br/>" if ($element ne "record");
307 $self->{'content'} .= $self->calculate_indent($self->{'indent'}).$escaped_text;
308 $self->{'xmlcontent'} .= $text;
309 }
310
311}
312
313
314
315sub xml_end_tag {
316 my $self = shift(@_);
317 my ($expat, $element) = @_;
318
319 my $text = $_;
320 my $escaped_text = $self->escape_text($_);
321
322 if ($element eq "record" and defined $self->{'doc_obj'}) {
323 # process the document
324 my $processor = $self->{'processor'};
325 my $doc_obj = $self->{'doc_obj'};
326 $self->{'content'} .= "<br/>".$escaped_text;
327 $self->{'xmlcontent'} .= $text;
328
329
330 my $top_section = $doc_obj->get_top_section();
331
332 my $tmp_marcxml_filename = &util::get_tmp_filename("xml");
333 if (open (XMLOUT,">$tmp_marcxml_filename")) {
334 binmode(XMLOUT,":utf8");
335
336 print XMLOUT "<?xml-stylesheet type=\"text/xsl\" href=\"MARC21slim2English.xsl\"?>\n";
337 my $xml_content = $self->{'xmlcontent'};
338
339 $xml_content = $self->{'xmlcollectiontag'}.$xml_content."</collection>";
340
341 print XMLOUT $xml_content;
342
343 close(XMLOUT);
344
345 $doc_obj->associate_file($tmp_marcxml_filename,"marcxml.xml","text/xml", $top_section);
346
347 # assicate xsl style file for presentation as HTML
348 my $xsl_filename = &util::filename_cat($ENV{'GSDLHOME'},"etc","MARC21slim2English.xsl");
349 $doc_obj->associate_file($xsl_filename,"MARC21slim2English.xsl","text/xml", $top_section);
350
351 }
352 else {
353 my $outhandle = $self->{'outhandle'};
354 print $outhandle "Warning: Unable for write out associated MARCXML file $tmp_marcxml_filename\n";
355 }
356
357 # include any metadata passed in from previous plugins
358 # note that this metadata is associated with the top level section
359
360 $self->extra_metadata ($doc_obj,
361 $doc_obj->get_top_section(),
362 $self->{'metadata'});
363
364
365 $self->add_OID($doc_obj, $self->{'base_oid'}, $self->{'record_count'});
366
367 $doc_obj->add_utf8_text($doc_obj->get_top_section(),$self->{'content'});
368 $processor->process($doc_obj);
369
370 ##clean up
371 $self->{'content'} = "";
372 $self->{'xmlcontent'} = "";
373 $self->{'doc_obj'} = undef;
374 return;
375 }
376
377 ## map the xmlmarc to gsdl metadata
378 if ($element eq "datafield" and defined $self->{'doc_obj'} and defined $self->{'marc_mapping'} and defined $self->{'metadata_mapping'}){
379 my $metadata_mapping = $self->{'metadata_mapping'};
380 my $marc_mapping = $self->{'marc_mapping'};
381 my $doc_obj = $self->{'doc_obj'};
382
383 ##print STDERR "**** Marc Record\n";
384 ##map { print STDERR $_."=>".$marc_mapping->{$_}."\n"; } keys %$marc_mapping;
385 ##print STDERR "**** Metadata Mapping\n";
386 ##map { print STDERR $_."=>".$metadata_mapping->{$_}."\n"; } keys %$metadata_mapping;
387
388
389 foreach my $marc_field (keys %$metadata_mapping){
390
391 ## test whether this field has subfield
392 my $subfield = undef;
393 if ($marc_field =~ /(\d\d\d)(?:\$|\^)?(\w)/){
394 $marc_field = $1;
395 $subfield = $2;
396 }
397
398 my $matched_field = $marc_mapping->{$marc_field};
399
400 if (defined $matched_field) {
401
402 my $meta_name = undef;
403 my $meta_value = undef;
404
405 if (defined $subfield){
406 $meta_name = $metadata_mapping->{$marc_field."\$".$subfield};
407
408 $meta_value = $matched_field->{$subfield};
409
410 if (!defined $meta_value) {
411 # record read in does not have the specified subfield
412 next;
413 }
414 }
415 else {
416 $meta_name = $metadata_mapping->{$marc_field};
417
418 # no subfield => get all the values
419 my $first = 1;
420 foreach my $value (sort keys %{$matched_field}) {
421 if ($first) {
422 $meta_value = $matched_field->{$value};
423 $first = 0;
424 } else {
425 $meta_value .= " " . $matched_field->{$value};
426 }
427 }
428
429 }
430
431 my $gs_mode = ($ENV{'GSDL3SRCHOME'}) ? "gs3" : "gs2";
432
433 if ($gs_mode eq "gs2") {
434 ## escape [ and ]
435 $meta_value =~ s/\[/\\\[/g;
436 $meta_value =~ s/\]/\\\]/g;
437
438 # The following is how MARCPlug does this
439 # If this is really OK for Greenstone2, then consider
440 # to switching to this, as this would mean we
441 # wouldn't need a special gs2/gs3 test here!
442
443# $meta_value =~ s/\[/&\#091;/g;
444# $meta_value =~ s/\]/&\#093;/g;
445
446 ##print STDERR "$meta_name=$meta_value\n";
447 }
448
449 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(),$meta_name, $meta_value);
450
451 }
452
453 }
454
455 ##clean up
456 $self->{'marc_mapping'} = undef;
457 $self->{'current_tag'} = "";
458 }
459
460 if ($element eq "datafield"){
461 $self->{'indent'} = 1;
462 $self->{'content'} .= "<br/>".$self->calculate_indent($self->{'indent'}).$escaped_text;
463 $self->{'xmlcontent'} .= $text;
464 }
465 else{
466 $self->{'content'} .= $escaped_text;
467 $self->{'xmlcontent'} .= $text;
468 }
469
470}
471
472sub add_OID {
473 my $self = shift (@_);
474 my ($doc_obj, $id, $record_number) = @_;
475
476 my $full_id = $id . "r" . $record_number;
477 if ($self->{'OIDtype'} eq "assigned") {
478 my $identifier = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'OIDmetadata'});
479 if (defined $identifier && $identifier ne "") {
480 $full_id = $identifier;
481 $full_id = &util::tidy_up_oid($full_id);
482 }
483 }
484 $doc_obj->set_OID($full_id);
485}
486
487sub xml_text {
488 my $self = shift(@_);
489 my ($expat) = @_;
490
491 my $text = $_;
492 my $escaped_text = $self->escape_text($_);
493
494 # protect against & in raw text file
495 $text =~ s/&/&amp;/g; # can't have & in raw form, even in 'raw' xml text
496
497 ## store the text of a marc code, for exapmle 520a=>A poem about....
498 if ($self->{'current_element'} eq "subfield" and $self->{'current_code'} ne "" and $_ ne "" ){
499 ##stored it in the marc_mapping
500
501 my $current_tag = $self->{'current_tag'};
502 my $current_code = $self->{'current_code'};
503
504 $self->{'marc_mapping'}->{$current_tag}->{$current_code} .= $_;
505
506 $self->{'current_code'} = "";
507 }
508
509 $self->{'content'} .= $escaped_text;
510 $self->{'xmlcontent'} .= $text;
511
512}
513
514sub calculate_indent{
515 my ($self,$num) = @_;
516
517 my $indent ="";
518
519 for (my $i=0; $i<$num;$i++){
520 $indent .= "&nbsp;&nbsp;&nbsp;&nbsp;";
521 }
522
523 return $indent;
524
525}
526
527sub escape_text {
528 my ($self,$text) = @_;
529 # special characters in the xml encoding
530 $text =~ s/&/&amp;/g; # this has to be first...
531 $text =~ s/</&lt;/g;
532 $text =~ s/>/&gt;/g;
533 $text =~ s/\"/&quot;/g;
534
535 return $text;
536}
537
538
539sub unescape_text {
540 my ($self,$text) = @_;
541 # special characters in the xml encoding
542 $text =~ s/&lt;/</g;
543 $text =~ s/&gt;/>/g;
544 $text =~ s/&quot;/\"/g;
545
546 $text =~ s/&/&amp;/g; # can't have & in raw form, even in unescaped xml!
547
548 return $text;
549}
550
551
5521;
553
554
Note: See TracBrowser for help on using the repository browser.