source: main/trunk/greenstone2/perllib/plugins/MARCXMLPlugin.pm@ 31492

Last change on this file since 31492 was 31492, checked in by kjdon, 7 years ago

renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes

  • Property svn:keywords set to Author Date Id Revision
File size: 15.5 KB
Line 
1###########################################################################
2#
3# MARCXMLPlugin.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes MARCXML documents. Note that this plugin does no
27# syntax checking (though the XML::Parser module tests for
28# well-formedness).
29
30package MARCXMLPlugin;
31
32use ReadXMLFile;
33use ReadTextFile;
34use MetadataRead;
35use marcmapping;
36
37use strict;
38no strict 'refs'; # allow filehandles to be variables and viceversa
39
40# methods with identical signatures take precedence in the order given in the ISA list.
41sub BEGIN {
42 @MARCXMLPlugin::ISA = ('MetadataRead', 'ReadXMLFile', 'ReadTextFile');
43}
44
45my $arguments = [{'name' => "metadata_mapping_file",
46 'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
47 'type' => "string",
48 'deft' => "marc2dc.txt",
49 'reqd' => "no" },
50 { 'name' => "process_exp",
51 'desc' => "{BaseImporter.process_exp}",
52 'type' => "regexp",
53 'deft' => &get_default_process_exp(),
54 'reqd' => "no" }];
55
56my $options = { 'name' => "MARCXMLPlugin",
57 'desc' => "{MARCXMLPlugin.desc}",
58 'abstract' => "no",
59 'inherits' => "yes",
60 'args' => $arguments
61 };
62
63
64sub new {
65 my ($class) = shift (@_);
66 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
67 push(@$pluginlist, $class);
68
69 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
70 push(@{$hashArgOptLists->{"OptList"}},$options);
71
72 # we want to be able to use the textcat methods from ReadTextFile
73 # to get the language and encoding
74 new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists, 1);
75
76 my $self = new ReadXMLFile($pluginlist, $inputargs, $hashArgOptLists);
77
78 # we want to strip namespaces, so have to create a new XML parser
79 my $parser = new XML::Parser('Style' => 'Stream',
80 'Pkg' => 'ReadXMLFile',
81 'PluginObj' => $self,
82 'Namespaces' => 1, # strip out namespaces
83 'Handlers' => {'Char' => \&Char,
84 'XMLDecl' => \&ReadXMLFile::XMLDecl,
85 'Entity' => \&ReadXMLFile::Entity,
86 'Doctype' => \&ReadXMLFile::Doctype,
87 'Default' => \&ReadXMLFile::Default
88 });
89
90 $self->{'parser'} = $parser;
91
92 $self->{'content'} = "";
93 $self->{'xmlcontent'} = "";
94 $self->{'record_count'} = 1;
95 $self->{'language'} = "";
96 $self->{'encoding'} = "";
97 $self->{'marc_mapping'} = {};
98 $self->{'current_code'} = "";
99 $self->{'current_tag'} = "";
100 $self->{'current_element'} = "";
101 $self->{'metadata_mapping'} = undef;
102 $self->{'num_processed'} = 0;
103 $self->{'indent'} = 0;
104
105 # in case we have individual records without a collection tag
106 $self->{'xmlcollectiontag'} = "<collection>";
107 return bless $self, $class;
108}
109
110
111sub get_default_process_exp {
112 my $self = shift (@_);
113
114 return q^(?i)\.xml$^;
115}
116
117sub get_doctype {
118 my $self = shift(@_);
119
120 return "(collection|record)";
121}
122
123
124sub init {
125 my $self = shift (@_);
126 my ($verbosity, $outhandle, $failhandle) = @_;
127
128 ## the mapping file has already been loaded
129 if (defined $self->{'metadata_mapping'} ){
130 $self->SUPER::init(@_);
131 return;
132 }
133
134 # read in the metadata mapping file
135 my $mm_file = &util::locate_config_file($self->{'metadata_mapping_file'});
136
137 if (! defined $mm_file)
138 {
139 my $msg = "MARCXMLPlugin ERROR: Can't locate mapping file \"" .
140 $self->{'metadata_mapping_file'} . "\".\n " .
141 " No metadata will be extracted from MARCXML files.\n";
142
143 print $outhandle $msg;
144 print $failhandle $msg;
145 $self->{'metadata_mapping'} = undef;
146 # We pick up the error in process() if there is no $mm_file
147 # If we exit here, then pluginfo.pl will exit too!
148 }
149 else {
150 $self->{'metadata_mapping'} = &marcmapping::parse_marc_metadata_mapping($mm_file, $outhandle);
151 }
152
153
154 ##map { print STDERR $_."=>".$self->{'metadata_mapping'}->{$_}."\n"; } keys %{$self->{'metadata_mapping'}};
155
156 $self->SUPER::init(@_);
157}
158
159
160sub Char {
161 # ReadXMLPlugin currently has 'use bytes' here, apparently to sort out
162 # an encoding issue. Possible that the time that 'use bytes' was
163 # added in (to fix a problem) our understanding of Unicode in Perl
164 # wasn't completely correct
165
166 # Trialing out this new version (without 'use bytes') here for MarcXML data
167
168 $_[0]->{'Text'} .= $_[1];
169 return undef;
170}
171
172# Called for DOCTYPE declarations - use die to bail out if this doctype
173# is not meant for this plugin
174sub xml_doctype {
175 my $self = shift(@_);
176
177 my ($expat, $name, $sysid, $pubid, $internal) = @_;
178 return;
179
180}
181
182
183sub xml_start_document {
184 my $self = shift(@_);
185
186 my ($expat, $name, $sysid, $pubid, $internal) = @_;
187
188
189 my $file = $self->{'file'};
190 my $filename = $self->{'filename'};
191
192 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
193
194 $self->{'language'} = $language;
195 $self->{'encoding'} = $encoding;
196 $self->{'element_count'} = 1;
197 $self->{'indent'} = 0;
198 my $outhandle = $self->{'outhandle'};
199 print $outhandle "MARCXMLPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
200 print STDERR "<Processing n='$self->{'file'}' p='MARCXMLPlugin'>\n" if $self->{'gli'};
201
202 # reset the base id
203 $self->{'base_oid'} = undef;
204
205}
206
207sub xml_end_document {
208
209}
210
211sub xml_start_tag {
212 my $self = shift;
213 my $expat = shift;
214 my $element = shift;
215
216 my $text = $_;
217 my $escaped_text = $self->escape_text($_);
218
219 $self->{'current_element'} = $element;
220
221 ##get all atributes of this element and store it in a map name=>value
222 my %attr_map = ();
223 my $attrstring = $_;
224 while ($attrstring =~ /(\w+)=\"(\w+)\"/){
225 $attr_map{$1}=$2;
226 $attrstring = $'; #'
227 }
228
229
230 my $processor = $self->{'processor'};
231 my $metadata = $self->{'metadata'};
232
233 ##create a new document for each record
234 if ($element eq "record") {
235 my $filename = $self->{'filename'};
236 my $language = $self->{'language'};
237 my $encoding = $self->{'encoding'};
238 my $file = $self->{'file'};
239 my $doc_obj = new doc($filename, undef, $self->{'file_rename_method'});
240 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
241 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
242
243 my ($filemeta) = $file =~ /([^\\\/]+)$/;
244 my $plugin_filename_encoding = $self->{'filename_encoding'};
245 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
246 $self->set_Source_metadata($doc_obj, $filename, $filename_encoding);
247
248 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$self->{'record_count'}");
249 if ($self->{'cover_image'}) {
250 $self->associate_cover_image($doc_obj, $filename);
251 }
252 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
253 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "MARCXML");
254
255 my $outhandle = $self->{'outhandle'};
256 print $outhandle "Record $self->{'record_count'}\n" if $self->{'verbosity'} > 1;
257
258 $self->{'record_count'}++;
259 $self->{'doc_obj'} = $doc_obj;
260 $self->{'num_processed'}++;
261 if (!defined $self->{'base_oid'}) {
262 $self->SUPER::add_OID($doc_obj);
263 $self->{'base_oid'} = $doc_obj->get_OID();
264 }
265
266
267 }
268
269 ## get the marc code, for example 520
270 if ($element eq "datafield") {
271 if (defined $attr_map{'tag'} and $attr_map{'tag'} ne ""){
272 $self->{'current_tag'} = $attr_map{tag};
273 }
274 }
275
276
277 ## append the subcode to the marc code for example 520a or 520b
278 if ($element eq "subfield"){
279 if (defined $attr_map{'code'} and $attr_map{'code'} ne "" and $self->{'current_tag'} ne ""){
280 $self->{'current_code'} = $attr_map{'code'};
281 }
282 }
283
284 if ($element eq "record"){
285 $self->{'indent'} = 0;
286 $self->{'content'} = "";
287 $self->{'xmlcontent'} = "";
288 }
289 else {
290 if ($element ne "subfield"){
291 $self->{'indent'} = 1;
292 }
293 else{
294 $self->{'indent'} = 2;
295 }
296 }
297
298
299 if ($element eq "collection") {
300 # remember the full start tag for <collection ...>
301 # This is needed to wrap around each <record> when generating its associate MARCXML file
302
303 $self->{'xmlcollectiontag'} = $text;
304 }
305 else {
306 $self->{'content'} .= "<br/>" if ($element ne "record");
307 $self->{'content'} .= $self->calculate_indent($self->{'indent'}).$escaped_text;
308 $self->{'xmlcontent'} .= $text;
309 }
310
311}
312
313
314
315sub xml_end_tag {
316 my $self = shift(@_);
317 my ($expat, $element) = @_;
318
319 my $text = $_;
320 my $escaped_text = $self->escape_text($_);
321
322 if ($element eq "record" and defined $self->{'doc_obj'}) {
323 # process the document
324 my $processor = $self->{'processor'};
325 my $doc_obj = $self->{'doc_obj'};
326 $self->{'content'} .= "<br/>".$escaped_text;
327 $self->{'xmlcontent'} .= $text;
328
329
330 my $top_section = $doc_obj->get_top_section();
331
332 my $tmp_marcxml_filename = &util::get_tmp_filename("xml");
333 if (open (XMLOUT,">$tmp_marcxml_filename")) {
334 binmode(XMLOUT,":utf8");
335
336 print XMLOUT "<?xml-stylesheet type=\"text/xsl\" href=\"MARC21slim2English.xsl\"?>\n";
337 my $xml_content = $self->{'xmlcontent'};
338
339 $xml_content = $self->{'xmlcollectiontag'}.$xml_content."</collection>";
340
341 print XMLOUT $xml_content;
342
343 close(XMLOUT);
344
345 $doc_obj->associate_file($tmp_marcxml_filename,"marcxml.xml","text/xml", $top_section);
346
347 # assicate xsl style file for presentation as HTML
348 my $xsl_filename = &util::filename_cat($ENV{'GSDLHOME'},"etc","MARC21slim2English.xsl");
349 $doc_obj->associate_file($xsl_filename,"MARC21slim2English.xsl","text/xml", $top_section);
350
351 }
352 else {
353 my $outhandle = $self->{'outhandle'};
354 print $outhandle "Warning: Unable for write out associated MARCXML file $tmp_marcxml_filename\n";
355 }
356
357 # include any metadata passed in from previous plugins
358 # note that this metadata is associated with the top level section
359
360 $self->extra_metadata ($doc_obj,
361 $doc_obj->get_top_section(),
362 $self->{'metadata'});
363
364
365 $self->add_OID($doc_obj, $self->{'base_oid'}, $self->{'record_count'});
366
367 $doc_obj->add_utf8_text($doc_obj->get_top_section(),$self->{'content'});
368 $processor->process($doc_obj);
369
370 ##clean up
371 $self->{'content'} = "";
372 $self->{'xmlcontent'} = "";
373 $self->{'doc_obj'} = undef;
374 return;
375 }
376
377 ## map the xmlmarc to gsdl metadata
378 if ($element eq "datafield" and defined $self->{'doc_obj'} and defined $self->{'marc_mapping'} and defined $self->{'metadata_mapping'}){
379 my $metadata_mapping = $self->{'metadata_mapping'};
380 my $marc_mapping = $self->{'marc_mapping'};
381 my $doc_obj = $self->{'doc_obj'};
382
383 ##print STDERR "**** Marc Record\n";
384 ##map { print STDERR $_."=>".$marc_mapping->{$_}."\n"; } keys %$marc_mapping;
385 ##print STDERR "**** Metadata Mapping\n";
386 ##map { print STDERR $_."=>".$metadata_mapping->{$_}."\n"; } keys %$metadata_mapping;
387
388
389 foreach my $marc_field (keys %$metadata_mapping){
390
391 ## test whether this field has subfield
392 my $subfield = undef;
393 if ($marc_field =~ /(\d\d\d)(?:\$|\^)?(\w)/){
394 $marc_field = $1;
395 $subfield = $2;
396 }
397
398 my $matched_field = $marc_mapping->{$marc_field};
399
400 if (defined $matched_field) {
401
402 my $meta_name = undef;
403 my $meta_value = undef;
404
405 if (defined $subfield){
406 $meta_name = $metadata_mapping->{$marc_field."\$".$subfield};
407
408 $meta_value = $matched_field->{$subfield};
409
410 if (!defined $meta_value) {
411 # record read in does not have the specified subfield
412 next;
413 }
414 }
415 else {
416 $meta_name = $metadata_mapping->{$marc_field};
417
418 # no subfield => get all the values
419 my $first = 1;
420 foreach my $value (sort keys %{$matched_field}) {
421 if ($first) {
422 $meta_value = $matched_field->{$value};
423 $first = 0;
424 } else {
425 $meta_value .= " " . $matched_field->{$value};
426 }
427 }
428
429 }
430
431 my $gs_mode = ($ENV{'GSDL3SRCHOME'}) ? "gs3" : "gs2";
432
433 if ($gs_mode eq "gs2") {
434 ## escape [ and ]
435 $meta_value =~ s/\[/\\\[/g;
436 $meta_value =~ s/\]/\\\]/g;
437
438 # The following is how MARCPlug does this
439 # If this is really OK for Greenstone2, then consider
440 # to switching to this, as this would mean we
441 # wouldn't need a special gs2/gs3 test here!
442
443# $meta_value =~ s/\[/&\#091;/g;
444# $meta_value =~ s/\]/&\#093;/g;
445
446 ##print STDERR "$meta_name=$meta_value\n";
447 }
448
449 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(),$meta_name, $meta_value);
450
451 }
452
453 }
454
455 ##clean up
456 $self->{'marc_mapping'} = undef;
457 $self->{'current_tag'} = "";
458 }
459
460 if ($element eq "datafield"){
461 $self->{'indent'} = 1;
462 $self->{'content'} .= "<br/>".$self->calculate_indent($self->{'indent'}).$escaped_text;
463 $self->{'xmlcontent'} .= $text;
464 }
465 else{
466 $self->{'content'} .= $escaped_text;
467 $self->{'xmlcontent'} .= $text;
468 }
469
470}
471
472sub add_OID {
473 my $self = shift (@_);
474 my ($doc_obj, $id, $record_number) = @_;
475
476 my $full_id = $id . "r" . $record_number;
477 if ($self->{'OIDtype'} eq "assigned") {
478 my $identifier = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'OIDmetadata'});
479 if (defined $identifier && $identifier ne "") {
480 $full_id = $identifier;
481 $full_id = &util::tidy_up_oid($full_id);
482 }
483 }
484 $doc_obj->set_OID($full_id);
485}
486
487sub xml_text {
488 my $self = shift(@_);
489 my ($expat) = @_;
490
491 my $text = $_;
492 my $escaped_text = $self->escape_text($_);
493
494 # protect against & in raw text file
495 $text =~ s/&/&amp;/g; # can't have & in raw form, even in 'raw' xml text
496
497 ## store the text of a marc code, for exapmle 520a=>A poem about....
498 if ($self->{'current_element'} eq "subfield" and $self->{'current_code'} ne "" and $_ ne "" ){
499 ##stored it in the marc_mapping
500
501 my $current_tag = $self->{'current_tag'};
502 my $current_code = $self->{'current_code'};
503
504 $self->{'marc_mapping'}->{$current_tag}->{$current_code} .= $_;
505
506 $self->{'current_code'} = "";
507 }
508
509 $self->{'content'} .= $escaped_text;
510 $self->{'xmlcontent'} .= $text;
511
512}
513
514sub calculate_indent{
515 my ($self,$num) = @_;
516
517 my $indent ="";
518
519 for (my $i=0; $i<$num;$i++){
520 $indent .= "&nbsp;&nbsp;&nbsp;&nbsp;";
521 }
522
523 return $indent;
524
525}
526
527sub escape_text {
528 my ($self,$text) = @_;
529 # special characters in the xml encoding
530 $text =~ s/&/&amp;/g; # this has to be first...
531 $text =~ s/</&lt;/g;
532 $text =~ s/>/&gt;/g;
533 $text =~ s/\"/&quot;/g;
534
535 return $text;
536}
537
538
539sub unescape_text {
540 my ($self,$text) = @_;
541 # special characters in the xml encoding
542 $text =~ s/&lt;/</g;
543 $text =~ s/&gt;/>/g;
544 $text =~ s/&quot;/\"/g;
545
546 $text =~ s/&/&amp;/g; # can't have & in raw form, even in unescaped xml!
547
548 return $text;
549}
550
551
5521;
553
554
Note: See TracBrowser for help on using the repository browser.