source: main/trunk/greenstone2/perllib/plugins/MARCXMLPlugin.pm@ 22597

Last change on this file since 22597 was 20609, checked in by kjdon, 15 years ago

Added the process_exp argument explicitly to this plugin. Otherwise, because of the order in which arguments are inherited from both ReadTextFile and ReadXMLFile, GLI thinks the plugin's process_exp is empty rather than matching .xml, and so it does not suggest this plugin when you add a MARCXML file.

  • Property svn:keywords set to Author Date Id Revision
File size: 14.3 KB
Line 
1###########################################################################
2#
3# MARCXMLPlugin.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes MARCXML documents. Note that this plugin does no
27# syntax checking (though the XML::Parser module tests for
28# well-formedness).
29
30package MARCXMLPlugin;
31
32use ReadXMLFile;
33use ReadTextFile;
34use marcmapping;
35
36use strict;
37no strict 'refs'; # allow filehandles to be variables and viceversa
38
# Set up inheritance at compile time. ReadXMLFile comes first in the
# method resolution order; ReadTextFile is listed second so that its
# textcat language/encoding methods are also available.
BEGIN {
    @MARCXMLPlugin::ISA = qw(ReadXMLFile ReadTextFile);
}
42
# Arguments specific to this plugin.
#   metadata_mapping_file - name of the MARC -> metadata mapping file
#   process_exp           - declared explicitly here, defaulting to the
#                           value of get_default_process_exp()
my $arguments = [
    {
        'name' => "metadata_mapping_file",
        'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
        'type' => "string",
        'deft' => "marc2dc.txt",
        'reqd' => "no",
    },
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
];

# Plugin description record pushed onto the shared option list in new().
my $options = {
    'name'     => "MARCXMLPlugin",
    'desc'     => "{MARCXMLPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
60
61
# Constructor.
#
# Parameters:
#   $class           - class name being constructed
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the parent constructors
#   $hashArgOptLists - hash ref whose "ArgList"/"OptList" arrays receive
#                      this plugin's arguments and options
#
# Returns the new plugin object, blessed into $class.
sub new {
    my $class = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@{$pluginlist}, $class);
    push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments});
    push(@{ $hashArgOptLists->{"OptList"} }, $options);

    # we want to be able to use the textcat methods from ReadTextFile
    # to get the language and encoding; the object it returns is not kept
    ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);

    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # we want to strip namespaces, so have to create a new XML parser
    my %handlers = (
        'Char'    => \&ReadXMLFile::Char,
        'XMLDecl' => \&ReadXMLFile::XMLDecl,
        'Entity'  => \&ReadXMLFile::Entity,
        'Doctype' => \&ReadXMLFile::Doctype,
        'Default' => \&ReadXMLFile::Default,
    );
    my $parser = XML::Parser->new(
        'Style'      => 'Stream',
        'Pkg'        => 'ReadXMLFile',
        'PluginObj'  => $self,
        'Namespaces' => 1,    # strip out namespaces
        'Handlers'   => \%handlers,
    );

    # per-object state used by the xml_* callbacks
    my %initial_state = (
        'parser'           => $parser,
        'content'          => "",
        'xmlcontent'       => "",
        'record_count'     => 1,
        'language'         => "",
        'encoding'         => "",
        'marc_mapping'     => {},
        'current_code'     => "",
        'current_tag'      => "",
        'current_element'  => "",
        'metadata_mapping' => undef,
        'num_processed'    => 0,
        'indent'           => 0,
    );
    @{$self}{ keys %initial_state } = values %initial_state;

    return bless $self, $class;
}
105
106
# Returns the default regular expression (as a string) selecting the
# files this plugin processes: names ending in ".xml", case-insensitively.
sub get_default_process_exp {
    my ($self) = @_;

    return '(?i)\.xml$';
}
112
# Returns the document type name handled by this plugin: "collection".
sub get_doctype {
    my ($self) = @_;

    my $doctype = "collection";
    return $doctype;
}
118
119
# Initialises the plugin, loading the MARC metadata mapping file the
# first time it is called, then delegates to the parent init().
#
# Parameters:
#   $verbosity  - passed through to the parent init()
#   $outhandle  - handle that receives the error message (and is passed
#                 to the mapping parser)
#   $failhandle - handle that also receives the error message
sub init {
    my $self = shift(@_);
    my ($verbosity, $outhandle, $failhandle) = @_;

    ## the mapping file has already been loaded
    if (defined $self->{'metadata_mapping'}) {
        $self->SUPER::init(@_);
        return;
    }

    # read in the metadata mapping file
    my $mapping_name = $self->{'metadata_mapping_file'};
    my $mapping_path = &util::locate_config_file($mapping_name);

    if (defined $mapping_path) {
        $self->{'metadata_mapping'} =
            &marcmapping::parse_marc_metadata_mapping($mapping_path, $outhandle);
    }
    else {
        my $msg = "MARCXMLPlugin ERROR: Can't locate mapping file \"$mapping_name\".\n  No metadata will be extracted from MARCXML files.\n";

        print {$outhandle} $msg;
        print {$failhandle} $msg;

        # We pick up the error in process() if there is no mapping file.
        # If we exit here, then pluginfo.pl will exit too!
        $self->{'metadata_mapping'} = undef;
    }

    return $self->SUPER::init(@_);
}
154
# Called for DOCTYPE declarations - a plugin may use die here to bail out
# if the doctype is not meant for it. This plugin accepts every doctype:
# the arguments ($expat, $name, $sysid, $pubid, $internal) are ignored
# and nothing is returned.
sub xml_doctype {
    my $self = shift(@_);

    return;
}
164
165
# Called at the start of each XML document. Detects the language and
# encoding of the file via textcat, resets the per-document counters and
# announces the file on the output handle (and on STDERR for GLI).
sub xml_start_document {
    my $self = shift(@_);

    my ($lang, $enc) =
        $self->textcat_get_language_encoding($self->{'filename'});

    $self->{'language'}      = $lang;
    $self->{'encoding'}      = $enc;
    $self->{'element_count'} = 1;
    $self->{'indent'}        = 0;

    if ($self->{'verbosity'} > 1) {
        my $out = $self->{'outhandle'};
        print {$out} "MARCXMLPlugin: processing $self->{'file'}\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Processing n='$self->{'file'}' p='MARCXMLPlugin'>\n";
    }

    # reset the base id
    $self->{'base_oid'} = undef;
}
189
# Called at the end of each XML document. Nothing to do: every record is
# handed to the processor as soon as its end tag is seen.
sub xml_end_document {
    return;
}
193
# Start-tag callback.
#
# Parameters:
#   $expat   - the parser object (unused here)
#   $element - name of the element being opened
# The text of the start tag itself is read from $_.
#
# Effects:
#   * <record>     starts a new document object and resets the
#                  accumulated display/XML content
#   * <datafield>  remembers the MARC tag (e.g. 520) in current_tag
#   * <subfield>   remembers the subfield code (e.g. a) in current_code
#   * <collection> remembers the full start tag so that it can be wrapped
#                  around each record's associated MARCXML file
#   * every element except <collection> is appended to 'content'
#     (escaped, indented) and to 'xmlcontent' (verbatim)
sub xml_start_tag {
    my $self    = shift;
    my $expat   = shift;
    my $element = shift;

    # Copy $_ straight away: it is a global that any callee may clobber.
    my $text         = $_;
    my $escaped_text = $self->escape_text($text);

    $self->{'current_element'} = $element;

    ## get all attributes of this element and store them in a map
    ## name=>value.  Iterating with /g avoids $' (which slows down every
    ## regex in the program on older perls) and yields the same pairs.
    my %attr_map = ();
    while ($text =~ /(\w+)=\"(\w+)\"/g) {
        $attr_map{$1} = $2;
    }

    if ($element eq "record") {
        ## create a new document for each record
        my $filename = $self->{'filename'};
        my $language = $self->{'language'};
        my $encoding = $self->{'encoding'};
        my $file     = $self->{'file'};

        my $doc_obj = 'doc'->new($filename, undef, $self->{'file_rename_method'});
        my $top_section = $doc_obj->get_top_section();

        $doc_obj->add_utf8_metadata($top_section, "Language", $language);
        $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);

        # Source metadata is the file name without any directory part
        my ($filemeta) = $file =~ /([^\\\/]+)$/;
        $self->set_Source_metadata($doc_obj, $filemeta, $encoding);

        $doc_obj->add_utf8_metadata($top_section, "SourceSegment", "$self->{'record_count'}");
        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename);
        }
        $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
        $doc_obj->add_metadata($top_section, "FileFormat", "MARCXML");

        if ($self->{'verbosity'} > 1) {
            my $outhandle = $self->{'outhandle'};
            print {$outhandle} "Record $self->{'record_count'}\n";
        }

        $self->{'record_count'}++;
        $self->{'doc_obj'} = $doc_obj;
        $self->{'num_processed'}++;

        # The first record of a file supplies the base OID that all of
        # the file's records share (see add_OID).
        if (!defined $self->{'base_oid'}) {
            $self->SUPER::add_OID($doc_obj);
            $self->{'base_oid'} = $doc_obj->get_OID();
        }

        # a new record starts with empty content at indent level 0
        $self->{'indent'}     = 0;
        $self->{'content'}    = "";
        $self->{'xmlcontent'} = "";
    }
    elsif ($element eq "subfield") {
        ## remember the subcode so the text can be filed under, for
        ## example, 520a or 520b
        if (    defined $attr_map{'code'}
            and $attr_map{'code'} ne ""
            and $self->{'current_tag'} ne "")
        {
            $self->{'current_code'} = $attr_map{'code'};
        }
        $self->{'indent'} = 2;
    }
    else {
        ## get the marc code, for example 520
        if ($element eq "datafield") {
            if (defined $attr_map{'tag'} and $attr_map{'tag'} ne "") {
                $self->{'current_tag'} = $attr_map{'tag'};
            }
        }
        $self->{'indent'} = 1;
    }

    if ($element eq "collection") {
        # remember the full start tag for <collection ...>
        # This is needed to wrap around each <record> when generating its
        # associated MARCXML file
        $self->{'xmlcollectiontag'} = $text;
    }
    else {
        $self->{'content'} .= "<br/>" if ($element ne "record");
        $self->{'content'} .= $self->calculate_indent($self->{'indent'}) . $escaped_text;
        $self->{'xmlcontent'} .= $text;
    }

    return;
}
290
291
292
# End-tag callback.
#
# Parameters:
#   $expat   - the parser object (unused here)
#   $element - name of the element being closed
# The text of the end tag itself is read from $_.
#
# Effects:
#   * </record>    writes the record out as a stand-alone MARCXML file
#                  associated with the document, assigns the OID, adds
#                  the accumulated text and hands the document to the
#                  processor
#   * </datafield> converts the subfield values collected for this field
#                  into metadata, according to the metadata mapping
#   * every end tag is appended to 'content' (escaped) and 'xmlcontent'
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    # Copy $_ straight away: it is a global that any callee may clobber.
    my $text         = $_;
    my $escaped_text = $self->escape_text($text);

    if ($element eq "record" and defined $self->{'doc_obj'}) {
        # process the document
        my $processor   = $self->{'processor'};
        my $doc_obj     = $self->{'doc_obj'};
        my $outhandle   = $self->{'outhandle'};
        my $top_section = $doc_obj->get_top_section();

        $self->{'content'}    .= "<br/>" . $escaped_text;
        $self->{'xmlcontent'} .= $text;

        my $tmp_marcxml_filename = &util::get_tmp_filename("xml");

        # three-argument open with a lexical handle: the file name can
        # never be misread as an open mode, and the handle cannot clash
        # with another XMLOUT elsewhere in the program
        if (open(my $xml_out, '>', $tmp_marcxml_filename)) {

            print {$xml_out} "<?xml-stylesheet type=\"text/xsl\" href=\"MARC21slim2English.xsl\"?>\n";

            # wrap the record in the remembered <collection ...> start tag
            my $xml_content = $self->{'xmlcollectiontag'} . $self->{'xmlcontent'} . "</collection>";
            print {$xml_out} $xml_content;

            # buffered write errors only surface at close
            if (!close($xml_out)) {
                print {$outhandle} "Warning: Error closing associated MARCXML file $tmp_marcxml_filename: $!\n";
            }

            $doc_obj->associate_file($tmp_marcxml_filename, "marcxml.xml", "text/xml", $top_section);

            # associate xsl style file for presentation as HTML
            my $xsl_filename = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "MARC21slim2English.xsl");
            $doc_obj->associate_file($xsl_filename, "MARC21slim2English.xsl", "text/xml", $top_section);
        }
        else {
            print {$outhandle} "Warning: Unable to write out associated MARCXML file $tmp_marcxml_filename: $!\n";
        }

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata($doc_obj, $top_section, $self->{'metadata'});

        # NOTE: record_count was already incremented in xml_start_tag, so
        # the OID suffix is one greater than this record's SourceSegment.
        # Left as is, because changing it would alter existing OIDs.
        $self->add_OID($doc_obj, $self->{'base_oid'}, $self->{'record_count'});

        $doc_obj->add_utf8_text($top_section, $self->{'content'});
        $processor->process($doc_obj);

        ## clean up
        $self->{'content'}    = "";
        $self->{'xmlcontent'} = "";
        $self->{'doc_obj'}    = undef;
        return;
    }

    ## map the xmlmarc to gsdl metadata
    if (    $element eq "datafield"
        and defined $self->{'doc_obj'}
        and defined $self->{'marc_mapping'}
        and defined $self->{'metadata_mapping'})
    {
        my $metadata_mapping = $self->{'metadata_mapping'};
        my $marc_mapping     = $self->{'marc_mapping'};
        my $doc_obj          = $self->{'doc_obj'};
        my $top_section      = $doc_obj->get_top_section();

        foreach my $mapping_key (keys %{$metadata_mapping}) {

            ## test whether this mapping entry names a subfield
            my $marc_field = $mapping_key;
            my $subfield   = undef;
            if ($mapping_key =~ /(\d\d\d)(?:\$|\^)?(\w)/) {
                $marc_field = $1;
                $subfield   = $2;
            }

            my $matched_field = $marc_mapping->{$marc_field};
            next unless defined $matched_field;

            # Look the name up under the key exactly as it appears in the
            # mapping.  Rebuilding it as "field$subfield" lost entries
            # written with '^' or with no separator, both of which the
            # pattern above accepts.
            my $meta_name = $metadata_mapping->{$mapping_key};
            next unless defined $meta_name;

            my $meta_value = undef;
            if (defined $subfield) {
                # undefined when the record read in does not have the
                # specified subfield
                $meta_value = $matched_field->{$subfield};
            }
            else {
                # no subfield => get all the values, in subfield-code order
                my @codes = sort keys %{$matched_field};
                if (@codes) {
                    $meta_value = join(" ", map { $matched_field->{$_} } @codes);
                }
            }
            next unless defined $meta_value;

            ## escape [ and ]
            $meta_value =~ s/([\[\]])/\\$1/g;

            $doc_obj->add_utf8_metadata($top_section, $meta_name, $meta_value);
        }

        ## clean up
        $self->{'marc_mapping'} = undef;
        $self->{'current_tag'}  = "";
    }

    if ($element eq "datafield") {
        # the closing datafield tag goes on its own line at indent level 1
        $self->{'indent'} = 1;
        $self->{'content'} .= "<br/>" . $self->calculate_indent($self->{'indent'}) . $escaped_text;
    }
    else {
        $self->{'content'} .= $escaped_text;
    }
    $self->{'xmlcontent'} .= $text;

    return;
}
434
# Sets the OID of a record's document.
#
# Parameters:
#   $doc_obj       - the document to receive the OID
#   $id            - base OID shared by all records of the file
#   $record_number - number appended to the base as "r<number>"
#
# When OIDtype is "assigned" and the document has a non-empty value for
# the OIDmetadata element, that value (tidied up) is used instead.
sub add_OID {
    my ($self, $doc_obj, $id, $record_number) = @_;

    my $oid = "${id}r${record_number}";

    if ($self->{'OIDtype'} eq "assigned") {
        my $assigned = $doc_obj->get_metadata_element(
            $doc_obj->get_top_section(),
            $self->{'OIDmetadata'}
        );
        unless (!defined $assigned || $assigned eq "") {
            $oid = &util::tidy_up_oid($assigned);
        }
    }

    $doc_obj->set_OID($oid);
}
449
# Text callback.
#
# Parameters:
#   $expat - the parser object (unused here)
# The text itself is read from $_.
#
# Effects:
#   * the first text seen after a <subfield code="..."> start tag is
#     stored in marc_mapping under the current tag and code
#   * the text is appended, escaped, to 'content' and 'xmlcontent'
sub xml_text {
    my $self = shift(@_);
    my ($expat) = @_;

    # Copy $_ straight away: it is a global that any callee may clobber.
    my $raw_text     = $_;
    my $escaped_text = $self->escape_text($raw_text);

    # 'xmlcontent' is later written out as an XML file, so markup
    # characters in the text must be written as entities again: a raw '<'
    # would make that file malformed just as a raw '&' would.
    # NOTE(review): assumes the parser hands over decoded character data,
    # as the original '&' escaping implies - confirm against
    # ReadXMLFile::Char.
    my $xml_text = $raw_text;
    $xml_text =~ s/&/&amp;/g;    # this has to be first...
    $xml_text =~ s/</&lt;/g;
    $xml_text =~ s/>/&gt;/g;

    ## store the text of a marc code, for example 520a=>A poem about....
    if (    $self->{'current_element'} eq "subfield"
        and $self->{'current_code'} ne ""
        and $raw_text ne "")
    {
        my $current_tag  = $self->{'current_tag'};
        my $current_code = $self->{'current_code'};

        $self->{'marc_mapping'}->{$current_tag}->{$current_code} .= $raw_text;

        # current_element is still "subfield" after the end tag, so clear
        # the code to keep later text out of this subfield's value
        $self->{'current_code'} = "";
    }

    $self->{'content'}    .= $escaped_text;
    $self->{'xmlcontent'} .= $xml_text;

    return;
}
476
# Returns the HTML indentation string for the given level: four
# non-breaking-space entities per level.
sub calculate_indent {
    my ($self, $num) = @_;

    my $padding = "";
    my $level   = 0;
    while ($level < $num) {
        $padding .= "&nbsp;&nbsp;&nbsp;&nbsp;";
        $level++;
    }

    return $padding;
}
489
# Returns a copy of $text with the characters & < > " replaced by their
# XML entities.
sub escape_text {
    my ($self, $text) = @_;

    # special characters in the xml encoding; one pass over the string,
    # so an '&' introduced by a replacement is never escaped again
    my %entity_for = (
        '&'  => '&amp;',
        '<'  => '&lt;',
        '>'  => '&gt;',
        '"'  => '&quot;',
    );
    $text =~ s/([&<>"])/$entity_for{$1}/g;

    return $text;
}
500
501
# Returns a copy of $text with the entities &lt; &gt; &quot; turned back
# into < > " and every other '&' written as &amp;.
sub unescape_text {
    my ($self, $text) = @_;

    # special characters in the xml encoding
    my %plain_for = (
        '&lt;'   => '<',
        '&gt;'   => '>',
        '&quot;' => '"',
        '&'      => '&amp;',    # can't have & in raw form, even in unescaped xml!
    );
    $text =~ s/(&lt;|&gt;|&quot;|&)/$plain_for{$1}/g;

    return $text;
}
513
514
5151;
516
517
Note: See TracBrowser for help on using the repository browser.