source: main/trunk/greenstone2/perllib/plugins/MARCXMLPlugin.pm@ 23484

Last change on this file since 23484 was 23352, checked in by davidb, 13 years ago

Modifications to code to support filename encoding issues when tested under Windows

  • Property svn:keywords set to Author Date Id Revision
File size: 14.5 KB
RevLine 
[13486]1###########################################################################
2#
[15872]3# MARCXMLPlugin.pm
[13486]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes MARCXML documents. Note that this plugin does no
27# syntax checking (though the XML::Parser module tests for
28# well-formedness).
29
[15872]30package MARCXMLPlugin;
[13486]31
[15872]32use ReadXMLFile;
[16693]33use ReadTextFile;
[16692]34use marcmapping;
[13486]35
36use strict;
37no strict 'refs'; # allow filehandles to be variables and viceversa
38
39sub BEGIN {
[16693]40 @MARCXMLPlugin::ISA = ('ReadXMLFile', 'ReadTextFile');
[13486]41}
42
43my $arguments = [{'name' => "metadata_mapping_file",
[15872]44 'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
[13486]45 'type' => "string",
[18563]46 'deft' => "marc2dc.txt",
[20609]47 'reqd' => "no" },
48 { 'name' => "process_exp",
49 'desc' => "{BasePlugin.process_exp}",
50 'type' => "regexp",
51 'deft' => &get_default_process_exp(),
52 'reqd' => "no" }];
[13486]53
[15872]54my $options = { 'name' => "MARCXMLPlugin",
55 'desc' => "{MARCXMLPlugin.desc}",
[13486]56 'abstract' => "no",
57 'inherits' => "yes",
58 'args' => $arguments
59 };
60
[20609]61
[13486]62sub new {
63 my ($class) = shift (@_);
64 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
65 push(@$pluginlist, $class);
66
[15872]67 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
68 push(@{$hashArgOptLists->{"OptList"}},$options);
[13486]69
[16693]70 # we want to be able to use the textcat methods from ReadTextFile
71 # to get the language and encoding
72 new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists, 1);
73
[15872]74 my $self = new ReadXMLFile($pluginlist, $inputargs, $hashArgOptLists);
[18900]75
76 # we want to strip namespaces, so have to create a new XML parser
77 my $parser = new XML::Parser('Style' => 'Stream',
78 'Pkg' => 'ReadXMLFile',
79 'PluginObj' => $self,
80 'Namespaces' => 1, # strip out namespaces
81 'Handlers' => {'Char' => \&ReadXMLFile::Char,
82 'XMLDecl' => \&ReadXMLFile::XMLDecl,
83 'Entity' => \&ReadXMLFile::Entity,
84 'Doctype' => \&ReadXMLFile::Doctype,
85 'Default' => \&ReadXMLFile::Default
86 });
87
88 $self->{'parser'} = $parser;
89
[13486]90 $self->{'content'} = "";
[15018]91 $self->{'xmlcontent'} = "";
[13486]92 $self->{'record_count'} = 1;
93 $self->{'language'} = "";
94 $self->{'encoding'} = "";
95 $self->{'marc_mapping'} = {};
96 $self->{'current_code'} = "";
97 $self->{'current_tag'} = "";
98 $self->{'current_element'} = "";
99 $self->{'metadata_mapping'} = undef;
100 $self->{'num_processed'} = 0;
[13496]101 $self->{'indent'} = 0;
102
[13486]103 return bless $self, $class;
104}
105
[16694]106
[20609]107sub get_default_process_exp {
108 my $self = shift (@_);
[16694]109
[20609]110 return q^(?i)\.xml$^;
111}
112
[13486]113sub get_doctype {
114 my $self = shift(@_);
115
116 return "collection";
117}
118
119
120sub init {
121 my $self = shift (@_);
122 my ($verbosity, $outhandle, $failhandle) = @_;
123
124 ## the mapping file has already been loaded
125 if (defined $self->{'metadata_mapping'} ){
126 $self->SUPER::init(@_);
127 return;
128 }
129
[19619]130 # read in the metadata mapping file
131 my $mm_file = &util::locate_config_file($self->{'metadata_mapping_file'});
[13486]132
[19619]133 if (! defined $mm_file)
[13486]134 {
[15872]135 my $msg = "MARCXMLPlugin ERROR: Can't locate mapping file \"" .
[15018]136 $self->{'metadata_mapping_file'} . "\".\n " .
[16697]137 " No metadata will be extracted from MARCXML files.\n";
[13486]138
139 print $outhandle $msg;
140 print $failhandle $msg;
141 $self->{'metadata_mapping'} = undef;
142 # We pick up the error in process() if there is no $mm_file
143 # If we exit here, then pluginfo.pl will exit too!
144 }
[15018]145 else {
[19619]146 $self->{'metadata_mapping'} = &marcmapping::parse_marc_metadata_mapping($mm_file, $outhandle);
[13486]147 }
148
149
[16695]150 ##map { print STDERR $_."=>".$self->{'metadata_mapping'}->{$_}."\n"; } keys %{$self->{'metadata_mapping'}};
[13486]151
152 $self->SUPER::init(@_);
153}
154
155# Called for DOCTYPE declarations - use die to bail out if this doctype
156# is not meant for this plugin
157sub xml_doctype {
158 my $self = shift(@_);
159
160 my ($expat, $name, $sysid, $pubid, $internal) = @_;
161 return;
162
163}
164
165
166sub xml_start_document {
167 my $self = shift(@_);
168
169 my ($expat, $name, $sysid, $pubid, $internal) = @_;
170
171
172 my $file = $self->{'file'};
173 my $filename = $self->{'filename'};
[13496]174
[13486]175 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
176
177 $self->{'language'} = $language;
178 $self->{'encoding'} = $encoding;
179 $self->{'element_count'} = 1;
[13496]180 $self->{'indent'} = 0;
[13486]181 my $outhandle = $self->{'outhandle'};
[15872]182 print $outhandle "MARCXMLPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
183 print STDERR "<Processing n='$self->{'file'}' p='MARCXMLPlugin'>\n" if $self->{'gli'};
[17026]184
185 # reset the base id
186 $self->{'base_oid'} = undef;
[13486]187
188}
189
190sub xml_end_document {
191
192}
193
194sub xml_start_tag {
195 my $self = shift;
196 my $expat = shift;
197 my $element = shift;
[13496]198
[15018]199 my $text = $_;
200 my $escaped_text = $self->escape_text($_);
[13486]201
202 $self->{'current_element'} = $element;
203
204 ##get all atributes of this element and store it in a map name=>value
205 my %attr_map = ();
206 my $attrstring = $_;
207 while ($attrstring =~ /(\w+)=\"(\w+)\"/){
208 $attr_map{$1}=$2;
[13496]209 $attrstring = $'; #'
[13486]210 }
211
[15018]212
[13486]213 my $processor = $self->{'processor'};
[23349]214 my $metadata = $self->{'metadata'};
215
[13486]216 ##create a new document for each record
217 if ($element eq "record") {
[13496]218 my $filename = $self->{'filename'};
[13486]219 my $language = $self->{'language'};
220 my $encoding = $self->{'encoding'};
221 my $file = $self->{'file'};
[18327]222 my $doc_obj = new doc($filename, undef, $self->{'file_rename_method'});
[13486]223 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
224 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
[23349]225
[13486]226 my ($filemeta) = $file =~ /([^\\\/]+)$/;
[23349]227 my $plugin_filename_encoding = $self->{'filename_encoding'};
228 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
[23352]229 $self->set_Source_metadata($doc_obj, $filename, $filename_encoding);
[23349]230
[13486]231 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$self->{'record_count'}");
232 if ($self->{'cover_image'}) {
233 $self->associate_cover_image($doc_obj, $filename);
234 }
235 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
[16692]236 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "MARCXML");
[13486]237
238 my $outhandle = $self->{'outhandle'};
[16697]239 print $outhandle "Record $self->{'record_count'}\n" if $self->{'verbosity'} > 1;
[13486]240
241 $self->{'record_count'}++;
242 $self->{'doc_obj'} = $doc_obj;
243 $self->{'num_processed'}++;
[17026]244 if (!defined $self->{'base_oid'}) {
245 $self->SUPER::add_OID($doc_obj);
246 $self->{'base_oid'} = $doc_obj->get_OID();
247 }
248
[13486]249
250 }
251
252 ## get the marc code, for example 520
253 if ($element eq "datafield") {
[13496]254 if (defined $attr_map{'tag'} and $attr_map{'tag'} ne ""){
[13486]255 $self->{'current_tag'} = $attr_map{tag};
256 }
257 }
258
259
260 ## append the subcode to the marc code for example 520a or 520b
261 if ($element eq "subfield"){
[13496]262 if (defined $attr_map{'code'} and $attr_map{'code'} ne "" and $self->{'current_tag'} ne ""){
[13486]263 $self->{'current_code'} = $attr_map{'code'};
264 }
265 }
[13496]266
267 if ($element eq "record"){
268 $self->{'indent'} = 0;
[15018]269 $self->{'content'} = "";
270 $self->{'xmlcontent'} = "";
[13496]271 }
272 else {
273 if ($element ne "subfield"){
274 $self->{'indent'} = 1;
275 }
276 else{
277 $self->{'indent'} = 2;
278 }
279 }
280
[15018]281
282 if ($element eq "collection") {
283 # remember the full start tag for <collection ...>
284 # This is needed to wrap around each <record> when generating its associate MARCXML file
285
286 $self->{'xmlcollectiontag'} = $text;
287 }
288 else {
289 $self->{'content'} .= "<br/>" if ($element ne "record");
290 $self->{'content'} .= $self->calculate_indent($self->{'indent'}).$escaped_text;
291 $self->{'xmlcontent'} .= $text;
292 }
[13496]293
[13486]294}
295
296
297
298sub xml_end_tag {
299 my $self = shift(@_);
300 my ($expat, $element) = @_;
[15018]301
302 my $text = $_;
303 my $escaped_text = $self->escape_text($_);
[13496]304
[13486]305 if ($element eq "record" and defined $self->{'doc_obj'}) {
306 # process the document
307 my $processor = $self->{'processor'};
308 my $doc_obj = $self->{'doc_obj'};
[15018]309 $self->{'content'} .= "<br/>".$escaped_text;
310 $self->{'xmlcontent'} .= $text;
[13496]311
[15018]312
313 my $top_section = $doc_obj->get_top_section();
314
[16521]315 my $tmp_marcxml_filename = &util::get_tmp_filename("xml");
[15018]316 if (open (XMLOUT,">$tmp_marcxml_filename")) {
317
318 print XMLOUT "<?xml-stylesheet type=\"text/xsl\" href=\"MARC21slim2English.xsl\"?>\n";
319 my $xml_content = $self->{'xmlcontent'};
320
321 $xml_content = $self->{'xmlcollectiontag'}.$xml_content."</collection>";
322
323 print XMLOUT $xml_content;
324
325 close(XMLOUT);
326
327 $doc_obj->associate_file($tmp_marcxml_filename,"marcxml.xml","text/xml", $top_section);
328
329 # assicate xsl style file for presentation as HTML
330 my $xsl_filename = &util::filename_cat($ENV{'GSDLHOME'},"etc","MARC21slim2English.xsl");
331 $doc_obj->associate_file($xsl_filename,"MARC21slim2English.xsl","text/xml", $top_section);
332
333 }
334 else {
335 my $outhandle = $self->{'outhandle'};
336 print $outhandle "Warning: Unable for write out associated MARCXML file $tmp_marcxml_filename\n";
337 }
[15178]338
339 # include any metadata passed in from previous plugins
340 # note that this metadata is associated with the top level section
341
342 $self->extra_metadata ($doc_obj,
343 $doc_obj->get_top_section(),
344 $self->{'metadata'});
345
[15018]346
[17026]347 $self->add_OID($doc_obj, $self->{'base_oid'}, $self->{'record_count'});
[15018]348
[13496]349 $doc_obj->add_utf8_text($doc_obj->get_top_section(),$self->{'content'});
[13486]350 $processor->process($doc_obj);
351
352 ##clean up
353 $self->{'content'} = "";
[15018]354 $self->{'xmlcontent'} = "";
[13486]355 $self->{'doc_obj'} = undef;
[13496]356 return;
[13486]357 }
358
359 ## map the xmlmarc to gsdl metadata
[16697]360 if ($element eq "datafield" and defined $self->{'doc_obj'} and defined $self->{'marc_mapping'} and defined $self->{'metadata_mapping'}){
[13486]361 my $metadata_mapping = $self->{'metadata_mapping'};
362 my $marc_mapping = $self->{'marc_mapping'};
363 my $doc_obj = $self->{'doc_obj'};
364
[19672]365 ##print STDERR "**** Marc Record\n";
366 ##map { print STDERR $_."=>".$marc_mapping->{$_}."\n"; } keys %$marc_mapping;
367 ##print STDERR "**** Metadata Mapping\n";
368 ##map { print STDERR $_."=>".$metadata_mapping->{$_}."\n"; } keys %$metadata_mapping;
[13486]369
[15018]370
[13486]371 foreach my $marc_field (keys %$metadata_mapping){
[15018]372
373 ## test whether this field has subfield
[13486]374 my $subfield = undef;
[15018]375 if ($marc_field =~ /(\d\d\d)(?:\$|\^)?(\w)/){
376 $marc_field = $1;
377 $subfield = $2;
378 }
[13486]379
[15018]380 my $matched_field = $marc_mapping->{$marc_field};
381
382 if (defined $matched_field) {
383
384 my $meta_name = undef;
385 my $meta_value = undef;
386
[13486]387 if (defined $subfield){
[15018]388 $meta_name = $metadata_mapping->{$marc_field."\$".$subfield};
389
390 $meta_value = $matched_field->{$subfield};
391
392 if (!defined $meta_value) {
393 # record read in does not have the specified subfield
394 next;
395 }
[13486]396 }
[15018]397 else {
398 $meta_name = $metadata_mapping->{$marc_field};
399
400 # no subfield => get all the values
[19672]401 my $first = 1;
[15018]402 foreach my $value (sort keys %{$matched_field}) {
[19672]403 if ($first) {
404 $meta_value = $matched_field->{$value};
405 $first = 0;
406 } else {
407 $meta_value .= " " . $matched_field->{$value};
408 }
[13486]409 }
[15018]410
[13486]411 }
412
413 ## escape [ and ]
414 $meta_value =~ s/\[/\\\[/g;
415 $meta_value =~ s/\]/\\\]/g;
416 ##print STDERR "$meta_name=$meta_value\n";
417 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(),$meta_name, $meta_value);
418
419 }
420
421 }
422
423 ##clean up
424 $self->{'marc_mapping'} = undef;
425 $self->{'current_tag'} = "";
426 }
[13496]427
428 if ($element eq "datafield"){
429 $self->{'indent'} = 1;
[15018]430 $self->{'content'} .= "<br/>".$self->calculate_indent($self->{'indent'}).$escaped_text;
431 $self->{'xmlcontent'} .= $text;
[13496]432 }
433 else{
[15018]434 $self->{'content'} .= $escaped_text;
435 $self->{'xmlcontent'} .= $text;
[13496]436 }
437
[13486]438}
439
[17026]440sub add_OID {
[13486]441 my $self = shift (@_);
[17026]442 my ($doc_obj, $id, $record_number) = @_;
[15018]443
[17026]444 my $full_id = $id . "r" . $record_number;
445 if ($self->{'OIDtype'} eq "assigned") {
446 my $identifier = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'OIDmetadata'});
447 if (defined $identifier && $identifier ne "") {
[17033]448 $full_id = $identifier;
[19619]449 $full_id = &util::tidy_up_oid($full_id);
[17026]450 }
451 }
452 $doc_obj->set_OID($full_id);
[13486]453}
454
455sub xml_text {
456 my $self = shift(@_);
457 my ($expat) = @_;
458
[15018]459 my $text = $_;
460 my $escaped_text = $self->escape_text($_);
[13486]461
[15018]462 # protect against & in raw text file
463 $text =~ s/&/&amp;/g; # can't have & in raw form, even in 'raw' xml text
464
[13486]465 ## store the text of a marc code, for exapmle 520a=>A poem about....
466 if ($self->{'current_element'} eq "subfield" and $self->{'current_code'} ne "" and $_ ne "" ){
467 ##stored it in the marc_mapping
[15018]468
469 my $current_tag = $self->{'current_tag'};
470 my $current_code = $self->{'current_code'};
471
472 $self->{'marc_mapping'}->{$current_tag}->{$current_code} .= $_;
473
[13486]474 $self->{'current_code'} = "";
475 }
476
[15018]477 $self->{'content'} .= $escaped_text;
478 $self->{'xmlcontent'} .= $text;
[13486]479
480}
481
[13496]482sub calculate_indent{
483 my ($self,$num) = @_;
[13486]484
[13496]485 my $indent ="";
486
487 for (my $i=0; $i<$num;$i++){
488 $indent .= "&nbsp;&nbsp;&nbsp;&nbsp;";
489 }
490
491 return $indent;
492
493}
494
495sub escape_text {
496 my ($self,$text) = @_;
497 # special characters in the xml encoding
498 $text =~ s/&/&amp;/g; # this has to be first...
499 $text =~ s/</&lt;/g;
500 $text =~ s/>/&gt;/g;
501 $text =~ s/\"/&quot;/g;
502
503 return $text;
504}
505
506
[15018]507sub unescape_text {
508 my ($self,$text) = @_;
509 # special characters in the xml encoding
510 $text =~ s/&lt;/</g;
511 $text =~ s/&gt;/>/g;
512 $text =~ s/&quot;/\"/g;
513
514 $text =~ s/&/&amp;/g; # can't have & in raw form, even in unescaped xml!
515
516 return $text;
517}
518
519
[13486]5201;
521
522
Note: See TracBrowser for help on using the repository browser.