source: gsdl/trunk/perllib/plugins/MARCXMLPlugin.pm@ 16694

Last change on this file since 16694 was 16694, checked in by kjdon, 16 years ago

MARCXMLPlugin uses textcat_language_and_encoding method from ReadTextFile so made it inherit from this as well.

  • Property svn:keywords set to Author Date Id Revision
File size: 13.1 KB
RevLine 
[13486]1###########################################################################
2#
[15872]3# MARCXMLPlugin.pm
[13486]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes MARCXML documents. Note that this plugin does no
27# syntax checking (though the XML::Parser module tests for
28# well-formedness).
29
[15872]30package MARCXMLPlugin;
[13486]31
[15872]32use ReadXMLFile;
[16693]33use ReadTextFile;
[16692]34use marcmapping;
[13486]35
36use strict;
37no strict 'refs'; # allow filehandles to be variables and viceversa
38
# Set up inheritance at compile time.  ReadXMLFile is listed first;
# ReadTextFile is included as well so that its textcat language/encoding
# methods can be called on this plugin.
BEGIN {
    @MARCXMLPlugin::ISA = qw(ReadXMLFile ReadTextFile);
}
42
# Arguments specific to this plugin.  The only one is the name of the
# MARC-to-metadata mapping file, which defaults to "marctodc.txt".
my $arguments = [
    {
        'name' => "metadata_mapping_file",
        'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
        'type' => "string",
        'deft' => "marctodc.txt",
        'reqd' => "no",
    },
];

# Description of the plugin itself; 'args' points at the list above.
my $options = {
    'name'     => "MARCXMLPlugin",
    'desc'     => "{MARCXMLPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
55
# Constructor.
#
# Parameters:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - handed unchanged to the parent constructors
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays
#                      receive this plugin's argument/option descriptions
#
# Returns the object built by ReadXMLFile's constructor, blessed into
# $class, with the per-file and per-record parsing state initialised.
sub new {
    my ($class) = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # We want to be able to use the textcat methods from ReadTextFile to
    # get the language and encoding.  The object returned here is
    # deliberately discarded.
    # NOTE(review): the trailing 1 presumably tells ReadTextFile it is only
    # an auxiliary parent - confirm against ReadTextFile::new.
    ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);

    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # Text accumulated for the record currently being parsed.
    $self->{'content'}    = "";    # escaped, indented display text
    $self->{'xmlcontent'} = "";    # raw MARCXML of the record

    $self->{'record_count'}  = 1;
    $self->{'num_processed'} = 0;
    $self->{'indent'}        = 0;

    # Detected once per file in xml_start_document.
    $self->{'language'} = "";
    $self->{'encoding'} = "";

    # Parser position: values seen so far in the current datafield, plus
    # the tag / subfield code / element currently open.
    $self->{'marc_mapping'}    = {};
    $self->{'current_code'}    = "";
    $self->{'current_tag'}     = "";
    $self->{'current_element'} = "";

    # Loaded lazily by init().
    $self->{'metadata_mapping'} = undef;

    # Diagnostic only.  This used to be printed on every construction; it
    # is now limited to high verbosity so that ordinary runs (and tools
    # that read this process's STDERR) are not polluted by it.
    if (($self->{'verbosity'} || 0) > 2) {
        my $mapping_file = $self->{'metadata_mapping_file'};
        $mapping_file = "" unless defined $mapping_file;
        print STDERR "metadata mapping file = $mapping_file\n";
    }

    return bless $self, $class;
}
86
[16694]87
88
# Returns the document type name for this plugin, which is always the
# string "collection".
sub get_doctype {
    my ($self) = @_;

    return "collection";
}
94
95
# Loads the MARC-to-metadata mapping (once) and then delegates to the
# parent class's init().
#
# Parameters:
#   $verbosity  - numeric verbosity level
#   $outhandle  - file handle for normal messages
#   $failhandle - file handle for failure messages
#
# When no mapping file can be located, the error is reported on both
# handles and $self->{'metadata_mapping'} is left undefined; we do not
# exit here because, per the original note, pluginfo.pl would exit too.
sub init {
    my $self = shift(@_);
    my ($verbosity, $outhandle, $failhandle) = @_;

    # The mapping file has already been loaded.
    if (defined $self->{'metadata_mapping'}) {
        $self->SUPER::init(@_);
        return;
    }

    # Read in the metadata mapping files.
    my $mm_files = &util::locate_config_files($self->{'metadata_mapping_file'});

    if (!defined $mm_files || scalar(@$mm_files) == 0) {
        my $msg = "MARCXMLPlugin ERROR: Can't locate mapping file \"" .
            $self->{'metadata_mapping_file'} . "\".\n " .
            " No marc files can be processed.\n";

        print $outhandle $msg;
        print $failhandle $msg;
        $self->{'metadata_mapping'} = undef;
    }
    else {
        $self->{'metadata_mapping'} =
            &marcmapping::parse_marc_metadata_mapping($mm_files, $outhandle);
    }

    # Optional dump of the loaded mapping.  It must never dereference an
    # undefined mapping: "keys %{ undef }" would autovivify an empty hash
    # into $self->{'metadata_mapping'}, which would hide the "not found"
    # state and make the early-return test above succeed next time.
    my $metadata_mapping = $self->{'metadata_mapping'};
    if (defined $metadata_mapping && defined $verbosity && $verbosity > 2) {
        foreach my $marc_field (sort keys %$metadata_mapping) {
            print $outhandle $marc_field . "=>" . $metadata_mapping->{$marc_field} . "\n";
        }
    }

    $self->SUPER::init(@_);
}
131
# Called for DOCTYPE declarations.  A plugin can die here to bail out when
# the doctype is not meant for it; this implementation ignores all of its
# arguments and simply returns, so no doctype is rejected.
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    return;
}
141
142
# Called at the start of each XML document.  Detects the language and
# encoding of $self->{'filename'} via textcat_get_language_encoding,
# stores them on the object, resets the element counter and indent level,
# and reports progress (to the out handle when verbosity > 1, and as a
# <Processing> line on STDERR when the 'gli' flag is set).
sub xml_start_document {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    my @detected = $self->textcat_get_language_encoding($self->{'filename'});
    @{$self}{'language', 'encoding'} = @detected[0, 1];

    $self->{'element_count'} = 1;
    $self->{'indent'}        = 0;

    my $out = $self->{'outhandle'};
    if ($self->{'verbosity'} > 1) {
        print $out "MARCXMLPlugin: processing $self->{'file'}\n";
    }
    print STDERR "<Processing n='$self->{'file'}' p='MARCXMLPlugin'>\n" if $self->{'gli'};

}
163
# Called at the end of each XML document.  Nothing needs to be done here;
# each record is finished and processed in xml_end_tag.
sub xml_end_document {
    return;
}
167
# Handler for every XML start tag.
#
# Parameters (positional): $self, $expat, $element (the element name).
# The raw text of the tag is read from $_.
# NOTE(review): $_ is presumably set by the XML parsing layer before this
# handler is invoked - confirm against ReadXMLFile.
#
# Effects:
#   * <record>     - creates a new document object, attaches the standard
#                    per-record metadata and resets the accumulated text.
#   * <datafield>  - remembers its "tag" attribute as the current MARC tag.
#   * <subfield>   - remembers its "code" attribute (only while a tag is
#                    current).
#   * <collection> - remembers the complete start tag so that it can later
#                    be wrapped around each record's associated XML file.
#   * all except <collection> - the tag is appended, escaped and indented,
#     to 'content' and, verbatim, to 'xmlcontent'.
sub xml_start_tag {
    my $self    = shift;
    my $expat   = shift;
    my $element = shift;

    my $text         = $_;
    my $escaped_text = $self->escape_text($_);

    $self->{'current_element'} = $element;

    # Collect the attributes of this element as name => value.  Only
    # attributes whose value is double-quoted and consists solely of word
    # characters are captured (so e.g. ind2=" " is skipped).  A /g scan is
    # used rather than repeatedly re-binding $', which slows down every
    # regex in the program on older perls.
    my %attr_map   = ();
    my $attrstring = $text;
    while ($attrstring =~ /(\w+)=\"(\w+)\"/g) {
        $attr_map{$1} = $2;
    }

    my $processor = $self->{'processor'};

    # Create a new document for each record.
    if ($element eq "record") {
        my $filename = $self->{'filename'};
        my $language = $self->{'language'};
        my $encoding = $self->{'encoding'};
        my $file     = $self->{'file'};

        # Explicit class-method call: the indirect form "new doc(...)" is
        # ambiguous in a package that defines its own new().
        my $doc_obj = doc->new($filename);
        $doc_obj->set_OIDtype($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);

        # Source metadata uses the file name without any directory part.
        my ($filemeta) = $file =~ /([^\\\/]+)$/;
        $self->set_Source_metadata($doc_obj, $filemeta, $encoding);
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$self->{'record_count'}");
        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename);
        }
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
        $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "MARCXML");

        my $outhandle = $self->{'outhandle'};
        print $outhandle "Record $self->{'record_count'} - MARCXMLPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;

        $self->{'record_count'}++;
        $self->{'doc_obj'} = $doc_obj;
        $self->{'num_processed'}++;
    }

    # Get the MARC code, for example 520.
    if ($element eq "datafield") {
        if (defined $attr_map{'tag'} and $attr_map{'tag'} ne "") {
            $self->{'current_tag'} = $attr_map{'tag'};
        }
    }

    # Remember the subfield code, giving for example 520a or 520b.
    if ($element eq "subfield") {
        if (defined $attr_map{'code'} and $attr_map{'code'} ne "" and $self->{'current_tag'} ne "") {
            $self->{'current_code'} = $attr_map{'code'};
        }
    }

    # Indent level for the display text: 0 for a record (which also starts
    # with empty text buffers), 2 for a subfield, 1 for everything else.
    if ($element eq "record") {
        $self->{'indent'}     = 0;
        $self->{'content'}    = "";
        $self->{'xmlcontent'} = "";
    }
    elsif ($element ne "subfield") {
        $self->{'indent'} = 1;
    }
    else {
        $self->{'indent'} = 2;
    }

    if ($element eq "collection") {
        # Remember the full start tag for <collection ...>.  It is needed
        # to wrap around each <record> when generating its associated
        # MARCXML file.
        $self->{'xmlcollectiontag'} = $text;
    }
    else {
        $self->{'content'} .= "<br/>" if ($element ne "record");
        $self->{'content'} .= $self->calculate_indent($self->{'indent'}) . $escaped_text;
        $self->{'xmlcontent'} .= $text;
    }

}
260
261
262
# Handler for every XML end tag.
#
# Parameters (positional): $self, $expat, $element (the element name).
# The raw text of the end tag is read from $_.
#
# </record>:    finishes the current document - writes the record, wrapped
#               in the remembered <collection> start tag, to a temporary
#               file that is associated with the document together with
#               the XSL stylesheet, adds extra metadata, the OID and the
#               display text, then hands the document to the processor and
#               resets the per-record state.
# </datafield>: converts the subfield values collected for this datafield
#               into metadata according to the metadata mapping, then
#               clears the collected values and the current tag.
# Any end tag that is not handled by the </record> branch is also appended
# to the accumulated 'content' (escaped) and 'xmlcontent' (raw) text.
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my $text         = $_;
    my $escaped_text = $self->escape_text($_);

    if ($element eq "record" and defined $self->{'doc_obj'}) {
        # Process the document.
        my $processor = $self->{'processor'};
        my $doc_obj   = $self->{'doc_obj'};
        my $outhandle = $self->{'outhandle'};
        $self->{'content'} .= "<br/>" . $escaped_text;
        $self->{'xmlcontent'} .= $text;

        my $top_section = $doc_obj->get_top_section();

        my $tmp_marcxml_filename = &util::get_tmp_filename("xml");

        # Three-argument open on a lexical handle: the file name can never
        # be interpreted as part of the open mode, and the handle cannot
        # clash with any other XMLOUT in the program.
        if (open(my $xmlout, '>', $tmp_marcxml_filename)) {

            print $xmlout "<?xml-stylesheet type=\"text/xsl\" href=\"MARC21slim2English.xsl\"?>\n";

            my $xml_content = $self->{'xmlcollectiontag'} . $self->{'xmlcontent'} . "</collection>";
            print $xmlout $xml_content;

            # Buffered write errors only surface when the handle is closed.
            if (!close($xmlout)) {
                print $outhandle "Warning: Unable for write out associated MARCXML file $tmp_marcxml_filename\n";
            }

            $doc_obj->associate_file($tmp_marcxml_filename, "marcxml.xml", "text/xml", $top_section);

            # Associate the XSL style file for presentation as HTML.
            my $xsl_filename = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "MARC21slim2English.xsl");
            $doc_obj->associate_file($xsl_filename, "MARC21slim2English.xsl", "text/xml", $top_section);

        }
        else {
            print $outhandle "Warning: Unable for write out associated MARCXML file $tmp_marcxml_filename\n";
        }

        # Include any metadata passed in from previous plugins.  Note that
        # this metadata is associated with the top level section.
        $self->extra_metadata($doc_obj,
                              $doc_obj->get_top_section(),
                              $self->{'metadata'});

        # NOTE(review): record_count was already incremented in
        # xml_start_tag, so the number passed here is one higher than the
        # SourceSegment stored for this record - confirm this is intended.
        $self->add_OID($doc_obj, $self->{'record_count'});

        $doc_obj->add_utf8_text($doc_obj->get_top_section(), $self->{'content'});
        $processor->process($doc_obj);

        # Clean up.
        $self->{'content'}    = "";
        $self->{'xmlcontent'} = "";
        $self->{'doc_obj'}    = undef;
        return;
    }

    # Map the MARCXML values to gsdl metadata.
    if ($element eq "datafield" and defined $self->{'doc_obj'} and defined $self->{'marc_mapping'}) {
        # A missing mapping (mapping file not found) means "map nothing".
        my $metadata_mapping = $self->{'metadata_mapping'} || {};
        my $marc_mapping     = $self->{'marc_mapping'};
        my $doc_obj          = $self->{'doc_obj'};

        foreach my $mapping_key (keys %$metadata_mapping) {

            # Test whether this mapping entry names a subfield: three
            # digits, an optional "$" or "^", then the subfield code.
            my $marc_field = $mapping_key;
            my $subfield   = undef;
            if ($mapping_key =~ /(\d\d\d)(?:\$|\^)?(\w)/) {
                $marc_field = $1;
                $subfield   = $2;
            }

            my $matched_field = $marc_mapping->{$marc_field};
            next unless defined $matched_field;

            # Look the metadata name up under the key exactly as it appears
            # in the mapping.  Rebuilding it as "<field>$<subfield>" would
            # miss entries written with "^" or with no separator, both of
            # which the pattern above accepts.
            my $meta_name  = $metadata_mapping->{$mapping_key};
            my $meta_value = undef;

            if (defined $subfield) {
                # undef here means the record read in does not have the
                # specified subfield; it is skipped below.
                $meta_value = $matched_field->{$subfield};
            }
            else {
                # No subfield => get all the values, in subfield-code
                # order, each followed by a single space.
                $meta_value = join("", map { $matched_field->{$_} . " " } sort keys %{$matched_field});
            }

            # Never store metadata with an undefined name or value.
            next unless defined $meta_name and defined $meta_value;

            # Escape [ and ].
            $meta_value =~ s/([\[\]])/\\$1/g;

            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $meta_name, $meta_value);
        }

        # Clean up.
        $self->{'marc_mapping'} = undef;
        $self->{'current_tag'}  = "";
    }

    if ($element eq "datafield") {
        $self->{'indent'} = 1;
        $self->{'content'} .= "<br/>" . $self->calculate_indent($self->{'indent'}) . $escaped_text;
        $self->{'xmlcontent'} .= $text;
    }
    else {
        $self->{'content'} .= $escaped_text;
        $self->{'xmlcontent'} .= $text;
    }

}
398
399
# Gives a record's document an identifier of the form <generated id>r<n>.
#
# Parameters:
#   $doc_obj       - the document object to identify
#   $record_number - number appended after the "r"
#
# NOTE(review): xml_end_tag in this file calls add_OID, not set_OID -
# confirm whether this method is still reached.
sub set_OID {
    my ($self, $doc_obj, $record_number) = @_;

    # Let the document generate its own identifier first ...
    $doc_obj->set_OID();

    # ... then extend that identifier with "r" and the record number.
    my $generated_id = $doc_obj->get_OID();
    $doc_obj->set_OID("${generated_id}r${record_number}");
}
411
# Handler for character data.  The text itself is read from $_.
#
# While inside a <subfield> with a pending subfield code, non-empty text is
# appended to marc_mapping->{current tag}->{current code} (for example
# 520 / a => "A poem about ....") and the pending code is then cleared.
# In every case the text is also appended to the record's 'content'
# (escaped) and 'xmlcontent' (with & written as &amp;).
sub xml_text {
    my ($self, $expat) = @_;

    my $raw          = $_;
    my $escaped_text = $self->escape_text($raw);

    # Protect against & in the raw text: it cannot appear in raw form,
    # even in 'raw' xml text.
    (my $text = $raw) =~ s/&/&amp;/g;

    my $code_pending = ($self->{'current_element'} eq "subfield"
                        && $self->{'current_code'} ne "");

    if ($code_pending && $raw ne "") {
        my ($tag, $code) = @{$self}{'current_tag', 'current_code'};

        # Store the text of this MARC code in the marc_mapping.
        $self->{'marc_mapping'}->{$tag}->{$code} .= $raw;

        $self->{'current_code'} = "";
    }

    $self->{'content'}    .= $escaped_text;
    $self->{'xmlcontent'} .= $text;

}
438
# Builds the HTML indentation for a given nesting level: four
# non-breaking-space entities per level.
#
# Parameters:
#   $num - the indent level
# Returns the indentation string (empty for level 0).
sub calculate_indent {
    my ($self, $num) = @_;

    my $one_level = "&nbsp;&nbsp;&nbsp;&nbsp;";
    my $padding   = "";

    # Count down instead of up; one level is added per pass.
    my $remaining = $num;
    while ($remaining > 0) {
        $padding .= $one_level;
        $remaining--;
    }

    return $padding;
}
451
# Escapes the characters that are special in XML: & < > and ".
#
# Parameters:
#   $text - the string to escape
# Returns the escaped copy; the caller's string is not modified.
sub escape_text {
    my ($self, $text) = @_;

    my %entity_for = (
        '&' => '&amp;',
        '<' => '&lt;',
        '>' => '&gt;',
        '"' => '&quot;',
    );

    # A single pass is enough: every special character is replaced exactly
    # once, so the & of a freshly inserted entity is never escaped again.
    $text =~ s/([&<>\"])/$entity_for{$1}/g;

    return $text;
}
462
463
# Turns the entities &lt; &gt; and &quot; back into < > and ", and then
# rewrites every remaining & as &amp;, because & cannot appear in raw form
# even in unescaped xml.  Note that this last step also applies to the &
# of an entity that is already present, such as &amp;.
#
# Parameters:
#   $text - the string to convert
# Returns the converted copy.
sub unescape_text {
    my ($self, $text) = @_;

    # Entities that are turned back into their literal character, in the
    # order in which they are applied.
    my @literal_for = (
        [ '&lt;',   '<' ],
        [ '&gt;',   '>' ],
        [ '&quot;', '"' ],
    );
    foreach my $pair (@literal_for) {
        my ($entity, $literal) = @$pair;
        $text =~ s/\Q$entity\E/$literal/g;
    }

    # This has to come last.
    $text =~ s/&/&amp;/g;

    return $text;
}
475
476
[13486]4771;
478
479
Note: See TracBrowser for help on using the repository browser.