source: gsdl/trunk/perllib/plugins/MARCXMLPlugin.pm@ 18327

Last change on this file since 18327 was 18327, checked in by ak19, 15 years ago

Extra parameter to new doc(): the renaming method to be used on the file (base64 or URL encoding).

  • Property svn:keywords set to Author Date Id Revision
File size: 13.5 KB
RevLine 
[13486]1###########################################################################
2#
[15872]3# MARCXMLPlugin.pm
[13486]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes MARCXML documents. Note that this plugin does no
27# syntax checking (though the XML::Parser module tests for
28# well-formedness).
29
[15872]30package MARCXMLPlugin;
[13486]31
[15872]32use ReadXMLFile;
[16693]33use ReadTextFile;
[16692]34use marcmapping;
[13486]35
36use strict;
37no strict 'refs'; # allow filehandles to be variables and viceversa
38
# Set up inheritance at compile time: ReadXMLFile is searched first,
# then ReadTextFile.
BEGIN {
    @MARCXMLPlugin::ISA = qw(ReadXMLFile ReadTextFile);
}
42
# Plugin-specific argument descriptions. The only argument names the
# mapping file that init() looks up through $self->{'metadata_mapping_file'}.
my $arguments = [
    {
        'name' => "metadata_mapping_file",
        'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
        'type' => "string",
        'deft' => "marctodc.txt",
        'reqd' => "no",
    },
];

# Plugin option block; new() appends it to the shared "OptList".
my $options = {
    'name'     => "MARCXMLPlugin",
    'desc'     => "{MARCXMLPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
55
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - handed unchanged to the parent constructors
#   $hashArgOptLists - hash ref; this plugin's argument and option
#                      descriptions are appended to its "ArgList" and
#                      "OptList" entries
#
# Returns the object built by ReadXMLFile, blessed into $class, with the
# parsing state fields set to their starting values.
sub new {
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    # we want to be able to use the textcat methods from ReadTextFile
    # to get the language and encoding; the object returned here is
    # discarded, the call is made for its effect on the shared lists.
    # NOTE(review): the meaning of the trailing 1 is defined by
    # ReadTextFile - confirm there.
    ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);

    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # Starting values for the per-parse state kept on the object.
    my %initial_state = (
        'content'          => "",
        'xmlcontent'       => "",
        'record_count'     => 1,
        'language'         => "",
        'encoding'         => "",
        'marc_mapping'     => {},
        'current_code'     => "",
        'current_tag'      => "",
        'current_element'  => "",
        'metadata_mapping' => undef,
        'num_processed'    => 0,
        'indent'           => 0,
    );
    foreach my $field (keys %initial_state) {
        $self->{$field} = $initial_state{$field};
    }

    return bless $self, $class;
}
85
[16694]86
87
# Returns the document type string for this plugin, which is always
# "collection".
sub get_doctype {
    my ($self) = @_;

    return "collection";
}
93
94
# Loads the MARC metadata mapping (once) and then delegates to the parent
# init().
#
# Arguments: ($verbosity, $outhandle, $failhandle); all three are passed
# on to SUPER::init unchanged.
#
# If no mapping file can be located, an error is printed to both handles
# and 'metadata_mapping' is left undefined, so that no metadata mapping is
# applied later; processing is deliberately not aborted here.
sub init {
    my $self = shift;
    my ($verbosity, $outhandle, $failhandle) = @_;

    ## the mapping file has already been loaded
    if (defined $self->{'metadata_mapping'}) {
        $self->SUPER::init(@_);
        return;
    }

    # read in the metadata mapping files
    my $mapping_name = $self->{'metadata_mapping_file'};
    my $mm_files = &util::locate_config_files($mapping_name);

    if (@{$mm_files}) {
        $self->{'metadata_mapping'} =
            &marcmapping::parse_marc_metadata_mapping($mm_files, $outhandle);
    }
    else {
        my $msg = join("",
            "MARCXMLPlugin ERROR: Can't locate mapping file \"",
            $mapping_name,
            "\".\n ",
            " No metadata will be extracted from MARCXML files.\n");

        print {$outhandle} $msg;
        print {$failhandle} $msg;

        # We pick up the error in process() if there is no $mm_file
        # If we exit here, then pluginfo.pl will exit too!
        $self->{'metadata_mapping'} = undef;
    }

    return $self->SUPER::init(@_);
}
130
# Called for DOCTYPE declarations. A plugin could die here to reject a
# doctype that is not meant for it; this one accepts every DOCTYPE and
# returns without inspecting its arguments.
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    return;
}
140
141
# Called at the start of each XML document. Detects the language and
# encoding of the file named by $self->{'filename'}, resets the
# per-document state and reports progress.
sub xml_start_document {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    # Language and encoding come from the textcat helper (see new()).
    @{$self}{ 'language', 'encoding' } =
        $self->textcat_get_language_encoding($self->{'filename'});

    $self->{'element_count'} = 1;
    $self->{'indent'}        = 0;

    my $outhandle = $self->{'outhandle'};
    if ($self->{'verbosity'} > 1) {
        print {$outhandle} "MARCXMLPlugin: processing $self->{'file'}\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Processing n='$self->{'file'}' p='MARCXMLPlugin'>\n";
    }

    # reset the base id
    $self->{'base_oid'} = undef;
}
165
# Called at the end of each XML document; nothing needs to be done here.
sub xml_end_document {
    return;
}
169
# Called for every start tag.
#
# Arguments: ($expat, $element) where $element is the element name.
# The text of the tag itself is read from $_.
# NOTE(review): $_ is presumably filled in by the stream-style XML parser
# dispatch in ReadXMLFile - confirm there.
#
# Effects on $self:
#   - <record>     creates a new doc object (with Language, Encoding,
#                  Source, SourceSegment, Plugin and FileFormat metadata),
#                  bumps the record counters and clears the accumulated
#                  'content' / 'xmlcontent'
#   - <datafield>  remembers its tag attribute in 'current_tag'
#   - <subfield>   remembers its code attribute in 'current_code'
#   - <collection> stores the whole start tag in 'xmlcollectiontag'
#   - every other tag text is appended to 'content' (escaped, indented)
#     and 'xmlcontent' (raw)
sub xml_start_tag {
    my $self = shift;
    my $expat = shift;
    my $element = shift;

    my $text = $_;
    my $escaped_text = $self->escape_text($_);

    $self->{'current_element'} = $element;

    ## get all attributes of this element and store them in a map name=>value
    ## (only name="value" pairs whose value consists of word characters are
    ## recognised). A /g scan is used so that $' is never touched.
    my %attr_map = ();
    my $attrstring = $text;
    while ($attrstring =~ /(\w+)=\"(\w+)\"/g) {
        $attr_map{$1} = $2;
    }

    if ($element eq "record") {
        ## create a new document for each record
        my $filename = $self->{'filename'};
        my $language = $self->{'language'};
        my $encoding = $self->{'encoding'};
        my $file = $self->{'file'};

        my $doc_obj = doc->new($filename, undef, $self->{'file_rename_method'});
        my $top_section = $doc_obj->get_top_section();
        $doc_obj->add_utf8_metadata($top_section, "Language", $language);
        $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);

        # Source metadata is the file name without any directory part
        my ($filemeta) = $file =~ /([^\\\/]+)$/;
        $self->set_Source_metadata($doc_obj, $filemeta, $encoding);
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$self->{'record_count'}");
        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename);
        }
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
        $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "MARCXML");

        my $outhandle = $self->{'outhandle'};
        print {$outhandle} "Record $self->{'record_count'}\n" if $self->{'verbosity'} > 1;

        $self->{'record_count'}++;
        $self->{'doc_obj'} = $doc_obj;
        $self->{'num_processed'}++;

        # The parent add_OID is used once per file to obtain the base id;
        # xml_start_document() clears it again for the next file.
        if (!defined $self->{'base_oid'}) {
            $self->SUPER::add_OID($doc_obj);
            $self->{'base_oid'} = $doc_obj->get_OID();
        }

        # a new record starts with empty accumulated text
        $self->{'indent'} = 0;
        $self->{'content'} = "";
        $self->{'xmlcontent'} = "";
    }
    elsif ($element eq "datafield") {
        ## get the marc code, for example 520
        if (defined $attr_map{'tag'} and $attr_map{'tag'} ne "") {
            $self->{'current_tag'} = $attr_map{'tag'};
        }
        $self->{'indent'} = 1;
    }
    elsif ($element eq "subfield") {
        ## remember the subcode, to be combined with the marc code,
        ## for example 520a or 520b
        if (defined $attr_map{'code'} and $attr_map{'code'} ne "" and $self->{'current_tag'} ne "") {
            $self->{'current_code'} = $attr_map{'code'};
        }
        $self->{'indent'} = 2;
    }
    else {
        $self->{'indent'} = 1;
    }

    if ($element eq "collection") {
        # remember the full start tag for <collection ...>
        # This is needed to wrap around each <record> when generating
        # its associated MARCXML file
        $self->{'xmlcollectiontag'} = $text;
    }
    else {
        $self->{'content'} .= "<br/>" if ($element ne "record");
        $self->{'content'} .= $self->calculate_indent($self->{'indent'}) . $escaped_text;
        $self->{'xmlcontent'} .= $text;
    }
}
266
267
268
# Called for every end tag.
#
# Arguments: ($expat, $element) where $element is the element name.
# The text of the tag itself is read from $_.
#
# </record>:    finishes the current document - writes the record (wrapped
#               in the remembered <collection ...> tag) to a temporary
#               MARCXML file, associates that file and the XSL stylesheet
#               with the document, adds extra metadata, sets the OID, adds
#               the accumulated display text and hands the document to the
#               processor. Returns immediately afterwards.
# </datafield>: maps the subfield values collected in 'marc_mapping' onto
#               metadata names using 'metadata_mapping'.
# Any end tag not handled by the </record> branch is appended to
# 'content' (escaped) and 'xmlcontent' (raw).
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my $text = $_;
    my $escaped_text = $self->escape_text($_);

    if ($element eq "record" and defined $self->{'doc_obj'}) {
        # process the document
        my $processor = $self->{'processor'};
        my $doc_obj = $self->{'doc_obj'};
        $self->{'content'} .= "<br/>" . $escaped_text;
        $self->{'xmlcontent'} .= $text;

        my $top_section = $doc_obj->get_top_section();

        my $tmp_marcxml_filename = &util::get_tmp_filename("xml");

        # three-argument open with a lexical handle: the file name can
        # never be misread as an open mode and the handle cannot clash
        # with a global bareword
        if (open(my $xml_out, '>', $tmp_marcxml_filename)) {

            print {$xml_out} "<?xml-stylesheet type=\"text/xsl\" href=\"MARC21slim2English.xsl\"?>\n";

            # wrap the single record in the original <collection ...> tag
            my $xml_content = $self->{'xmlcollectiontag'} . $self->{'xmlcontent'} . "</collection>";
            print {$xml_out} $xml_content;

            close($xml_out);

            $doc_obj->associate_file($tmp_marcxml_filename, "marcxml.xml", "text/xml", $top_section);

            # associate xsl style file for presentation as HTML
            my $xsl_filename = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "MARC21slim2English.xsl");
            $doc_obj->associate_file($xsl_filename, "MARC21slim2English.xsl", "text/xml", $top_section);
        }
        else {
            my $outhandle = $self->{'outhandle'};
            print {$outhandle} "Warning: Unable for write out associated MARCXML file $tmp_marcxml_filename\n";
        }

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata($doc_obj,
                              $doc_obj->get_top_section(),
                              $self->{'metadata'});

        # record_count was already incremented in xml_start_tag, so the
        # number used here is one higher than the SourceSegment value
        $self->add_OID($doc_obj, $self->{'base_oid'}, $self->{'record_count'});

        $doc_obj->add_utf8_text($doc_obj->get_top_section(), $self->{'content'});
        $processor->process($doc_obj);

        ## clean up
        $self->{'content'} = "";
        $self->{'xmlcontent'} = "";
        $self->{'doc_obj'} = undef;
        return;
    }

    ## map the xmlmarc to gsdl metadata
    if ($element eq "datafield" and defined $self->{'doc_obj'} and defined $self->{'marc_mapping'} and defined $self->{'metadata_mapping'}) {
        my $metadata_mapping = $self->{'metadata_mapping'};
        my $marc_mapping = $self->{'marc_mapping'};
        my $doc_obj = $self->{'doc_obj'};

        foreach my $mapping_key (keys %$metadata_mapping) {

            ## test whether this field has subfield
            ## (accepted spellings: 245$a, 245^a and 245a)
            my $marc_field = $mapping_key;
            my $subfield = undef;
            if ($mapping_key =~ /(\d\d\d)(?:\$|\^)?(\w)/) {
                $marc_field = $1;
                $subfield = $2;
            }

            my $matched_field = $marc_mapping->{$marc_field};
            next unless defined $matched_field;

            my $meta_name = undef;
            my $meta_value = undef;

            if (defined $subfield) {
                # Prefer the canonical "field$subfield" key; fall back to
                # the key exactly as written in the mapping so that the
                # "^" and bare spellings accepted above also resolve.
                $meta_name = $metadata_mapping->{$marc_field . "\$" . $subfield};
                $meta_name = $metadata_mapping->{$mapping_key} unless defined $meta_name;

                # undefined when the record read in does not have the
                # specified subfield
                $meta_value = $matched_field->{$subfield};
            }
            else {
                $meta_name = $metadata_mapping->{$marc_field};

                # no subfield => get all the values, in subfield code order
                foreach my $code (sort keys %{$matched_field}) {
                    $meta_value .= $matched_field->{$code} . " ";
                }
            }

            # nothing sensible can be stored without both a name and a value
            next unless defined $meta_name and defined $meta_value;

            ## escape [ and ]
            $meta_value =~ s/\[/\\\[/g;
            $meta_value =~ s/\]/\\\]/g;
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $meta_name, $meta_value);
        }

        ## clean up
        $self->{'marc_mapping'} = undef;
        $self->{'current_tag'} = "";
    }

    if ($element eq "datafield") {
        $self->{'indent'} = 1;
        $self->{'content'} .= "<br/>" . $self->calculate_indent($self->{'indent'}) . $escaped_text;
    }
    else {
        $self->{'content'} .= $escaped_text;
    }
    $self->{'xmlcontent'} .= $text;
}
404
# Sets the OID of a record document.
#
# Arguments:
#   $doc_obj       - the document to label
#   $id            - base identifier shared by the records of one file
#   $record_number - number appended after an "r" separator
#
# The default OID is "<id>r<record_number>". When $self->{'OIDtype'} is
# "assigned" and the document has a non-empty value for the metadata
# element named by $self->{'OIDmetadata'}, that value is used instead,
# with periods removed and a "D" prefix added if only digits remain.
sub add_OID {
    my ($self, $doc_obj, $id, $record_number) = @_;

    my $oid = $id . "r" . $record_number;

    if ($self->{'OIDtype'} eq "assigned") {
        my $assigned = $doc_obj->get_metadata_element(
            $doc_obj->get_top_section(),
            $self->{'OIDmetadata'}
        );

        if (defined $assigned && $assigned ne "") {
            $oid = $assigned;
            $oid =~ s/\.//g; #remove any periods

            if ($oid =~ /^[\d]*$/) {
                $oid = "D" . $oid;
                print STDERR "OID only contains numbers, adding a D\n";
            }
        }
    }

    $doc_obj->set_OID($oid);
}
423
# Called for character data; the text itself is read from $_.
#
# - Text directly following a <subfield> start tag is stored under
#   marc_mapping->{current_tag}->{current_code}; 'current_code' is then
#   cleared so that later text is not attributed to the same subfield.
# - The text is appended to 'content' (escaped for display) and to
#   'xmlcontent' (escaped for re-serialisation as XML).
sub xml_text {
    my $self = shift(@_);
    my ($expat) = @_;

    my $raw_text = $_;
    my $escaped_text = $self->escape_text($raw_text);

    # 'xmlcontent' is written out as an XML file in xml_end_tag, so the
    # characters that cannot appear raw in XML text have to be protected.
    # & must be handled first so the entities added next are not escaped
    # a second time.
    my $xml_text = $raw_text;
    $xml_text =~ s/&/&amp;/g;
    $xml_text =~ s/</&lt;/g;
    $xml_text =~ s/>/&gt;/g;

    ## store the text of a marc code, for example 520a=>A poem about....
    if ($self->{'current_element'} eq "subfield" and $self->{'current_code'} ne "" and $raw_text ne "") {
        my $current_tag = $self->{'current_tag'};
        my $current_code = $self->{'current_code'};

        $self->{'marc_mapping'}->{$current_tag}->{$current_code} .= $raw_text;

        $self->{'current_code'} = "";
    }

    $self->{'content'} .= $escaped_text;
    $self->{'xmlcontent'} .= $xml_text;
}
450
# Returns the HTML indentation string for nesting level $num: four
# non-breaking spaces per level. An undefined or non-positive level gives
# the empty string; a fractional level is truncated by the repetition
# operator.
sub calculate_indent {
    my ($self, $num) = @_;

    return "" unless defined $num && $num > 0;

    return "&nbsp;&nbsp;&nbsp;&nbsp;" x $num;
}
463
# Returns a copy of $text with the XML special characters & < > "
# replaced by their entities.
sub escape_text {
    my ($self, $text) = @_;

    for ($text) {
        s/&/&amp;/g; # this has to be first, or the entities added below
                     # would have their own & escaped again
        s/</&lt;/g;
        s/>/&gt;/g;
        s/\"/&quot;/g;
    }

    return $text;
}
474
475
# Returns a copy of $text with the entities &lt; &gt; and &quot; turned
# back into < > and ". Every & that remains afterwards - including one
# that begins another entity - is then written as &amp;.
sub unescape_text {
    my ($self, $text) = @_;

    for ($text) {
        s/&lt;/</g;
        s/&gt;/>/g;
        s/&quot;/\"/g;

        s/&/&amp;/g; # can't have & in raw form, even in unescaped xml!
    }

    return $text;
}
487
488
[13486]4891;
490
491
Note: See TracBrowser for help on using the repository browser.