source: main/trunk/greenstone2/perllib/plugins/MARCXMLPlugin.pm

Last change on this file was 31492, checked in by kjdon, 7 years ago

renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes

  • Property svn:keywords set to Author Date Id Revision
File size: 15.5 KB
RevLine 
[13486]1###########################################################################
2#
[15872]3# MARCXMLPlugin.pm
[13486]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes MARCXML documents. Note that this plugin does no
27# syntax checking (though the XML::Parser module tests for
28# well-formedness).
29
[15872]30package MARCXMLPlugin;
[13486]31
[15872]32use ReadXMLFile;
[16693]33use ReadTextFile;
[24547]34use MetadataRead;
[16692]35use marcmapping;
[13486]36
37use strict;
38no strict 'refs'; # allow filehandles to be variables and viceversa
39
[24547]40# methods with identical signatures take precedence in the order given in the ISA list.
[13486]41sub BEGIN {
[24547]42 @MARCXMLPlugin::ISA = ('MetadataRead', 'ReadXMLFile', 'ReadTextFile');
[13486]43}
44
45my $arguments = [{'name' => "metadata_mapping_file",
[15872]46 'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
[13486]47 'type' => "string",
[18563]48 'deft' => "marc2dc.txt",
[20609]49 'reqd' => "no" },
50 { 'name' => "process_exp",
[31492]51 'desc' => "{BaseImporter.process_exp}",
[20609]52 'type' => "regexp",
53 'deft' => &get_default_process_exp(),
54 'reqd' => "no" }];
[13486]55
[15872]56my $options = { 'name' => "MARCXMLPlugin",
57 'desc' => "{MARCXMLPlugin.desc}",
[13486]58 'abstract' => "no",
59 'inherits' => "yes",
60 'args' => $arguments
61 };
62
[20609]63
[13486]64sub new {
65 my ($class) = shift (@_);
66 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
67 push(@$pluginlist, $class);
68
[15872]69 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
70 push(@{$hashArgOptLists->{"OptList"}},$options);
[13486]71
[16693]72 # we want to be able to use the textcat methods from ReadTextFile
73 # to get the language and encoding
74 new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists, 1);
75
[15872]76 my $self = new ReadXMLFile($pluginlist, $inputargs, $hashArgOptLists);
[18900]77
78 # we want to strip namespaces, so have to create a new XML parser
79 my $parser = new XML::Parser('Style' => 'Stream',
80 'Pkg' => 'ReadXMLFile',
81 'PluginObj' => $self,
82 'Namespaces' => 1, # strip out namespaces
[28803]83 'Handlers' => {'Char' => \&Char,
[18900]84 'XMLDecl' => \&ReadXMLFile::XMLDecl,
85 'Entity' => \&ReadXMLFile::Entity,
86 'Doctype' => \&ReadXMLFile::Doctype,
87 'Default' => \&ReadXMLFile::Default
88 });
89
90 $self->{'parser'} = $parser;
91
[13486]92 $self->{'content'} = "";
[15018]93 $self->{'xmlcontent'} = "";
[13486]94 $self->{'record_count'} = 1;
95 $self->{'language'} = "";
96 $self->{'encoding'} = "";
97 $self->{'marc_mapping'} = {};
98 $self->{'current_code'} = "";
99 $self->{'current_tag'} = "";
100 $self->{'current_element'} = "";
101 $self->{'metadata_mapping'} = undef;
102 $self->{'num_processed'} = 0;
[13496]103 $self->{'indent'} = 0;
104
[27503]105 # in case we have individual records without a collection tag
106 $self->{'xmlcollectiontag'} = "<collection>";
[13486]107 return bless $self, $class;
108}
109
[16694]110
[20609]111sub get_default_process_exp {
112 my $self = shift (@_);
[16694]113
[20609]114 return q^(?i)\.xml$^;
115}
116
[13486]117sub get_doctype {
118 my $self = shift(@_);
119
[27503]120 return "(collection|record)";
[13486]121}
122
123
124sub init {
125 my $self = shift (@_);
126 my ($verbosity, $outhandle, $failhandle) = @_;
127
128 ## the mapping file has already been loaded
129 if (defined $self->{'metadata_mapping'} ){
130 $self->SUPER::init(@_);
131 return;
132 }
133
[19619]134 # read in the metadata mapping file
135 my $mm_file = &util::locate_config_file($self->{'metadata_mapping_file'});
[13486]136
[19619]137 if (! defined $mm_file)
[13486]138 {
[15872]139 my $msg = "MARCXMLPlugin ERROR: Can't locate mapping file \"" .
[15018]140 $self->{'metadata_mapping_file'} . "\".\n " .
[16697]141 " No metadata will be extracted from MARCXML files.\n";
[13486]142
143 print $outhandle $msg;
144 print $failhandle $msg;
145 $self->{'metadata_mapping'} = undef;
146 # We pick up the error in process() if there is no $mm_file
147 # If we exit here, then pluginfo.pl will exit too!
148 }
[15018]149 else {
[19619]150 $self->{'metadata_mapping'} = &marcmapping::parse_marc_metadata_mapping($mm_file, $outhandle);
[13486]151 }
152
153
[16695]154 ##map { print STDERR $_."=>".$self->{'metadata_mapping'}->{$_}."\n"; } keys %{$self->{'metadata_mapping'}};
[13486]155
156 $self->SUPER::init(@_);
157}
158
[28803]159
160sub Char {
161 # ReadXMLPlugin currently has 'use bytes' here, apparently to sort out
162 # an encoding issue. Possible that the time that 'use bytes' was
163 # added in (to fix a problem) our understanding of Unicode in Perl
164 # wasn't completely correct
165
166 # Trialing out this new version (without 'use bytes') here for MarcXML data
167
168 $_[0]->{'Text'} .= $_[1];
169 return undef;
170}
171
[13486]172# Called for DOCTYPE declarations - use die to bail out if this doctype
173# is not meant for this plugin
174sub xml_doctype {
175 my $self = shift(@_);
176
177 my ($expat, $name, $sysid, $pubid, $internal) = @_;
178 return;
179
180}
181
182
183sub xml_start_document {
184 my $self = shift(@_);
185
186 my ($expat, $name, $sysid, $pubid, $internal) = @_;
187
188
189 my $file = $self->{'file'};
190 my $filename = $self->{'filename'};
[13496]191
[13486]192 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
193
194 $self->{'language'} = $language;
195 $self->{'encoding'} = $encoding;
196 $self->{'element_count'} = 1;
[13496]197 $self->{'indent'} = 0;
[13486]198 my $outhandle = $self->{'outhandle'};
[15872]199 print $outhandle "MARCXMLPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
200 print STDERR "<Processing n='$self->{'file'}' p='MARCXMLPlugin'>\n" if $self->{'gli'};
[17026]201
202 # reset the base id
203 $self->{'base_oid'} = undef;
[13486]204
205}
206
207sub xml_end_document {
208
209}
210
211sub xml_start_tag {
212 my $self = shift;
213 my $expat = shift;
214 my $element = shift;
[13496]215
[15018]216 my $text = $_;
217 my $escaped_text = $self->escape_text($_);
[13486]218
219 $self->{'current_element'} = $element;
220
221 ##get all atributes of this element and store it in a map name=>value
222 my %attr_map = ();
223 my $attrstring = $_;
224 while ($attrstring =~ /(\w+)=\"(\w+)\"/){
225 $attr_map{$1}=$2;
[13496]226 $attrstring = $'; #'
[13486]227 }
228
[15018]229
[13486]230 my $processor = $self->{'processor'};
[23349]231 my $metadata = $self->{'metadata'};
232
[13486]233 ##create a new document for each record
234 if ($element eq "record") {
[13496]235 my $filename = $self->{'filename'};
[13486]236 my $language = $self->{'language'};
237 my $encoding = $self->{'encoding'};
238 my $file = $self->{'file'};
[18327]239 my $doc_obj = new doc($filename, undef, $self->{'file_rename_method'});
[13486]240 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
241 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
[23349]242
[13486]243 my ($filemeta) = $file =~ /([^\\\/]+)$/;
[23349]244 my $plugin_filename_encoding = $self->{'filename_encoding'};
245 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
[23352]246 $self->set_Source_metadata($doc_obj, $filename, $filename_encoding);
[23349]247
[13486]248 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$self->{'record_count'}");
249 if ($self->{'cover_image'}) {
250 $self->associate_cover_image($doc_obj, $filename);
251 }
252 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
[16692]253 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "MARCXML");
[13486]254
255 my $outhandle = $self->{'outhandle'};
[16697]256 print $outhandle "Record $self->{'record_count'}\n" if $self->{'verbosity'} > 1;
[13486]257
258 $self->{'record_count'}++;
259 $self->{'doc_obj'} = $doc_obj;
260 $self->{'num_processed'}++;
[17026]261 if (!defined $self->{'base_oid'}) {
262 $self->SUPER::add_OID($doc_obj);
263 $self->{'base_oid'} = $doc_obj->get_OID();
264 }
265
[13486]266
267 }
268
269 ## get the marc code, for example 520
270 if ($element eq "datafield") {
[13496]271 if (defined $attr_map{'tag'} and $attr_map{'tag'} ne ""){
[13486]272 $self->{'current_tag'} = $attr_map{tag};
273 }
274 }
275
276
277 ## append the subcode to the marc code for example 520a or 520b
278 if ($element eq "subfield"){
[13496]279 if (defined $attr_map{'code'} and $attr_map{'code'} ne "" and $self->{'current_tag'} ne ""){
[13486]280 $self->{'current_code'} = $attr_map{'code'};
281 }
282 }
[13496]283
284 if ($element eq "record"){
285 $self->{'indent'} = 0;
[15018]286 $self->{'content'} = "";
287 $self->{'xmlcontent'} = "";
[13496]288 }
289 else {
290 if ($element ne "subfield"){
291 $self->{'indent'} = 1;
292 }
293 else{
294 $self->{'indent'} = 2;
295 }
296 }
297
[15018]298
299 if ($element eq "collection") {
300 # remember the full start tag for <collection ...>
301 # This is needed to wrap around each <record> when generating its associate MARCXML file
302
303 $self->{'xmlcollectiontag'} = $text;
304 }
305 else {
306 $self->{'content'} .= "<br/>" if ($element ne "record");
307 $self->{'content'} .= $self->calculate_indent($self->{'indent'}).$escaped_text;
308 $self->{'xmlcontent'} .= $text;
309 }
[13496]310
[13486]311}
312
313
314
315sub xml_end_tag {
316 my $self = shift(@_);
317 my ($expat, $element) = @_;
[15018]318
319 my $text = $_;
320 my $escaped_text = $self->escape_text($_);
[13496]321
[13486]322 if ($element eq "record" and defined $self->{'doc_obj'}) {
323 # process the document
324 my $processor = $self->{'processor'};
325 my $doc_obj = $self->{'doc_obj'};
[15018]326 $self->{'content'} .= "<br/>".$escaped_text;
327 $self->{'xmlcontent'} .= $text;
[13496]328
[15018]329
330 my $top_section = $doc_obj->get_top_section();
331
[16521]332 my $tmp_marcxml_filename = &util::get_tmp_filename("xml");
[15018]333 if (open (XMLOUT,">$tmp_marcxml_filename")) {
[28803]334 binmode(XMLOUT,":utf8");
[15018]335
336 print XMLOUT "<?xml-stylesheet type=\"text/xsl\" href=\"MARC21slim2English.xsl\"?>\n";
337 my $xml_content = $self->{'xmlcontent'};
338
339 $xml_content = $self->{'xmlcollectiontag'}.$xml_content."</collection>";
340
341 print XMLOUT $xml_content;
342
343 close(XMLOUT);
344
345 $doc_obj->associate_file($tmp_marcxml_filename,"marcxml.xml","text/xml", $top_section);
346
347 # assicate xsl style file for presentation as HTML
348 my $xsl_filename = &util::filename_cat($ENV{'GSDLHOME'},"etc","MARC21slim2English.xsl");
349 $doc_obj->associate_file($xsl_filename,"MARC21slim2English.xsl","text/xml", $top_section);
350
351 }
352 else {
353 my $outhandle = $self->{'outhandle'};
354 print $outhandle "Warning: Unable for write out associated MARCXML file $tmp_marcxml_filename\n";
355 }
[15178]356
357 # include any metadata passed in from previous plugins
358 # note that this metadata is associated with the top level section
359
360 $self->extra_metadata ($doc_obj,
361 $doc_obj->get_top_section(),
362 $self->{'metadata'});
363
[15018]364
[17026]365 $self->add_OID($doc_obj, $self->{'base_oid'}, $self->{'record_count'});
[15018]366
[13496]367 $doc_obj->add_utf8_text($doc_obj->get_top_section(),$self->{'content'});
[13486]368 $processor->process($doc_obj);
369
370 ##clean up
371 $self->{'content'} = "";
[15018]372 $self->{'xmlcontent'} = "";
[13486]373 $self->{'doc_obj'} = undef;
[13496]374 return;
[13486]375 }
376
377 ## map the xmlmarc to gsdl metadata
[16697]378 if ($element eq "datafield" and defined $self->{'doc_obj'} and defined $self->{'marc_mapping'} and defined $self->{'metadata_mapping'}){
[13486]379 my $metadata_mapping = $self->{'metadata_mapping'};
380 my $marc_mapping = $self->{'marc_mapping'};
381 my $doc_obj = $self->{'doc_obj'};
382
[19672]383 ##print STDERR "**** Marc Record\n";
384 ##map { print STDERR $_."=>".$marc_mapping->{$_}."\n"; } keys %$marc_mapping;
385 ##print STDERR "**** Metadata Mapping\n";
386 ##map { print STDERR $_."=>".$metadata_mapping->{$_}."\n"; } keys %$metadata_mapping;
[13486]387
[15018]388
[13486]389 foreach my $marc_field (keys %$metadata_mapping){
[15018]390
391 ## test whether this field has subfield
[13486]392 my $subfield = undef;
[15018]393 if ($marc_field =~ /(\d\d\d)(?:\$|\^)?(\w)/){
394 $marc_field = $1;
395 $subfield = $2;
396 }
[13486]397
[15018]398 my $matched_field = $marc_mapping->{$marc_field};
399
400 if (defined $matched_field) {
401
402 my $meta_name = undef;
403 my $meta_value = undef;
404
[13486]405 if (defined $subfield){
[15018]406 $meta_name = $metadata_mapping->{$marc_field."\$".$subfield};
407
408 $meta_value = $matched_field->{$subfield};
409
410 if (!defined $meta_value) {
411 # record read in does not have the specified subfield
412 next;
413 }
[13486]414 }
[15018]415 else {
416 $meta_name = $metadata_mapping->{$marc_field};
417
418 # no subfield => get all the values
[19672]419 my $first = 1;
[15018]420 foreach my $value (sort keys %{$matched_field}) {
[19672]421 if ($first) {
422 $meta_value = $matched_field->{$value};
423 $first = 0;
424 } else {
425 $meta_value .= " " . $matched_field->{$value};
426 }
[13486]427 }
[15018]428
[13486]429 }
430
[28783]431 my $gs_mode = ($ENV{'GSDL3SRCHOME'}) ? "gs3" : "gs2";
432
433 if ($gs_mode eq "gs2") {
434 ## escape [ and ]
435 $meta_value =~ s/\[/\\\[/g;
436 $meta_value =~ s/\]/\\\]/g;
437
438 # The following is how MARCPlug does this
439 # If this is really OK for Greenstone2, then consider
440 # to switching to this, as this would mean we
441 # wouldn't need a special gs2/gs3 test here!
442
443# $meta_value =~ s/\[/&\#091;/g;
444# $meta_value =~ s/\]/&\#093;/g;
445
446 ##print STDERR "$meta_name=$meta_value\n";
447 }
448
[13486]449 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(),$meta_name, $meta_value);
450
451 }
452
453 }
454
455 ##clean up
456 $self->{'marc_mapping'} = undef;
457 $self->{'current_tag'} = "";
458 }
[13496]459
460 if ($element eq "datafield"){
461 $self->{'indent'} = 1;
[15018]462 $self->{'content'} .= "<br/>".$self->calculate_indent($self->{'indent'}).$escaped_text;
463 $self->{'xmlcontent'} .= $text;
[13496]464 }
465 else{
[15018]466 $self->{'content'} .= $escaped_text;
467 $self->{'xmlcontent'} .= $text;
[13496]468 }
469
[13486]470}
471
[17026]472sub add_OID {
[13486]473 my $self = shift (@_);
[17026]474 my ($doc_obj, $id, $record_number) = @_;
[15018]475
[17026]476 my $full_id = $id . "r" . $record_number;
477 if ($self->{'OIDtype'} eq "assigned") {
478 my $identifier = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'OIDmetadata'});
479 if (defined $identifier && $identifier ne "") {
[17033]480 $full_id = $identifier;
[19619]481 $full_id = &util::tidy_up_oid($full_id);
[17026]482 }
483 }
484 $doc_obj->set_OID($full_id);
[13486]485}
486
487sub xml_text {
488 my $self = shift(@_);
489 my ($expat) = @_;
490
[15018]491 my $text = $_;
492 my $escaped_text = $self->escape_text($_);
[13486]493
[15018]494 # protect against & in raw text file
495 $text =~ s/&/&amp;/g; # can't have & in raw form, even in 'raw' xml text
496
[13486]497 ## store the text of a marc code, for exapmle 520a=>A poem about....
498 if ($self->{'current_element'} eq "subfield" and $self->{'current_code'} ne "" and $_ ne "" ){
499 ##stored it in the marc_mapping
[15018]500
501 my $current_tag = $self->{'current_tag'};
502 my $current_code = $self->{'current_code'};
503
504 $self->{'marc_mapping'}->{$current_tag}->{$current_code} .= $_;
505
[13486]506 $self->{'current_code'} = "";
507 }
508
[15018]509 $self->{'content'} .= $escaped_text;
510 $self->{'xmlcontent'} .= $text;
[13486]511
512}
513
[13496]514sub calculate_indent{
515 my ($self,$num) = @_;
[13486]516
[13496]517 my $indent ="";
518
519 for (my $i=0; $i<$num;$i++){
520 $indent .= "&nbsp;&nbsp;&nbsp;&nbsp;";
521 }
522
523 return $indent;
524
525}
526
527sub escape_text {
528 my ($self,$text) = @_;
529 # special characters in the xml encoding
530 $text =~ s/&/&amp;/g; # this has to be first...
531 $text =~ s/</&lt;/g;
532 $text =~ s/>/&gt;/g;
533 $text =~ s/\"/&quot;/g;
534
535 return $text;
536}
537
538
[15018]539sub unescape_text {
540 my ($self,$text) = @_;
541 # special characters in the xml encoding
542 $text =~ s/&lt;/</g;
543 $text =~ s/&gt;/>/g;
544 $text =~ s/&quot;/\"/g;
545
546 $text =~ s/&/&amp;/g; # can't have & in raw form, even in unescaped xml!
547
548 return $text;
549}
550
551
[13486]5521;
553
554
Note: See TracBrowser for help on using the repository browser.