source: main/trunk/greenstone2/perllib/plugins/MARCXMLPlugin.pm@ 24547

Last change on this file since 24547 was 24547, checked in by ak19, 13 years ago

Added new abstract plugin MetadataRead that defines can_process_this_file_for_metadata that MetadataPlugin subclasses can inherit (if MetadataRead is listed first in the ISA inheritance list) and which will then override the one defined in BasePlugin. For now committing MARC, ISIS and OAIPlugins which now additionally inherit from MetadataRead. Other metadataPlugins also need to be committed.

  • Property svn:keywords set to Author Date Id Revision
File size: 14.6 KB
Line 
1###########################################################################
2#
3# MARCXMLPlugin.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes MARCXML documents. Note that this plugin does no
27# syntax checking (though the XML::Parser module tests for
28# well-formedness).
29
30package MARCXMLPlugin;
31
32use ReadXMLFile;
33use ReadTextFile;
34use MetadataRead;
35use marcmapping;
36
37use strict;
38no strict 'refs'; # allow filehandles to be variables and viceversa
39
# Set up the inheritance chain at compile time. Perl searches @ISA from left
# to right, so when two parents define a method with the same name the one
# from the class listed first wins (MetadataRead before ReadXMLFile before
# ReadTextFile).
BEGIN {
    @MARCXMLPlugin::ISA = qw(MetadataRead ReadXMLFile ReadTextFile);
}
44
# Argument descriptions contributed by this plugin. Each entry is a hash of
# name / description key / type / default / required-flag. The 'desc' values
# in curly braces look like resource keys rather than literal text
# (presumably resolved elsewhere -- confirm).
my $arguments = [
    {
        'name' => "metadata_mapping_file",
        'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
        'type' => "string",
        'deft' => "marc2dc.txt",
        'reqd' => "no",
    },
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        # called as a plain function (no object) while the file is loading
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
];

# Plugin-level option record; it carries the argument list above and is
# pushed onto the shared "OptList" by new().
my $options = {
    'name'     => "MARCXMLPlugin",
    'desc'     => "{MARCXMLPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
62
63
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - handed unchanged to the parent constructors
#   $hashArgOptLists - hash ref; its "ArgList" and "OptList" arrays receive
#                      this plugin's argument and option descriptions
#
# Returns the plugin object, re-blessed into $class.
#
# All constructors are invoked with explicit Class->new(...) syntax. The
# indirect form ("new Class(...)") is resolved by a parser heuristic that
# depends on which packages and subs are known at compile time, and this
# package defines its own sub named "new".
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # we want to be able to use the textcat methods from ReadTextFile
    # to get the language and encoding.
    # The object returned here is deliberately discarded; presumably the call
    # is made only for its effect on the shared lists -- confirm against
    # ReadTextFile (the meaning of the trailing 1 is defined there).
    ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);

    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # we want to strip namespaces, so have to create a new XML parser
    my $parser = XML::Parser->new('Style' => 'Stream',
                                  'Pkg' => 'ReadXMLFile',
                                  'PluginObj' => $self,
                                  'Namespaces' => 1, # strip out namespaces
                                  'Handlers' => {'Char' => \&ReadXMLFile::Char,
                                                 'XMLDecl' => \&ReadXMLFile::XMLDecl,
                                                 'Entity' => \&ReadXMLFile::Entity,
                                                 'Doctype' => \&ReadXMLFile::Doctype,
                                                 'Default' => \&ReadXMLFile::Default
                                  });

    $self->{'parser'} = $parser;

    # per-file / per-record parsing state used by the xml_* handlers
    $self->{'content'} = "";          # escaped HTML view of the current record
    $self->{'xmlcontent'} = "";       # raw XML of the current record
    $self->{'record_count'} = 1;
    $self->{'language'} = "";
    $self->{'encoding'} = "";
    $self->{'marc_mapping'} = {};     # tag => { subfield code => text }
    $self->{'current_code'} = "";
    $self->{'current_tag'} = "";
    $self->{'current_element'} = "";
    $self->{'metadata_mapping'} = undef;  # loaded lazily by init()
    $self->{'num_processed'} = 0;
    $self->{'indent'} = 0;

    return bless $self, $class;
}
107
108
# Returns, as a string, the default regular expression for selecting files:
# a case-insensitive match on names ending in ".xml".
sub get_default_process_exp {
    # $self is unused; it is undef when this is called as a plain function,
    # as the argument table at the top of this file does.
    my ($self) = @_;

    return q{(?i)\.xml$};
}
114
# Returns the doctype name for this plugin, the fixed string "collection".
# (Presumably the parent class compares this with the document's root
# element -- confirm against ReadXMLFile.)
sub get_doctype {
    my ($self) = @_;

    return "collection";
}
120
121
# Initialise the plugin: load the MARC-to-metadata mapping file (once) and
# then delegate to the parent class's init with the same arguments.
#
#   $verbosity  - unused here, passed on to the parent
#   $outhandle  - handle for normal messages
#   $failhandle - handle for failure messages
#
# If the mapping file cannot be located, a message is written to both handles
# and 'metadata_mapping' is left undefined; no exception is raised.
sub init {
    my $self = shift (@_);
    my ($verbosity, $outhandle, $failhandle) = @_;

    ## the mapping file has already been loaded
    if (defined $self->{'metadata_mapping'}) {
        $self->SUPER::init(@_);
        return;
    }

    # read in the metadata mapping file
    my $mapping_path = &util::locate_config_file($self->{'metadata_mapping_file'});

    if (defined $mapping_path) {
        $self->{'metadata_mapping'} = &marcmapping::parse_marc_metadata_mapping($mapping_path, $outhandle);
    }
    else {
        my $msg = "MARCXMLPlugin ERROR: Can't locate mapping file \"" .
            $self->{'metadata_mapping_file'} . "\".\n " .
            " No metadata will be extracted from MARCXML files.\n";

        for my $handle ($outhandle, $failhandle) {
            print $handle $msg;
        }
        $self->{'metadata_mapping'} = undef;
        # We pick up the error in process() if there is no mapping file.
        # If we exit here, then pluginfo.pl will exit too!
    }

    ##map { print STDERR $_."=>".$self->{'metadata_mapping'}->{$_}."\n"; } keys %{$self->{'metadata_mapping'}};

    $self->SUPER::init(@_);
}
156
# Called for DOCTYPE declarations. A plugin can use die here to bail out when
# the doctype is not meant for it; this plugin accepts whatever it is given,
# so the handler does nothing and returns nothing.
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    return;
}
166
167
# Called at the start of each XML document. Works out the language and
# encoding of the file (via textcat_get_language_encoding), resets the
# per-document counters, reports progress, and clears the base OID so that it
# is recomputed for the first record of this file.
sub xml_start_document {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    my ($language, $encoding)
        = $self->textcat_get_language_encoding($self->{'filename'});

    @{$self}{'language', 'encoding'} = ($language, $encoding);
    $self->{'element_count'} = 1;
    $self->{'indent'} = 0;

    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "MARCXMLPlugin: processing $self->{'file'}\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Processing n='$self->{'file'}' p='MARCXMLPlugin'>\n";
    }

    # reset the base id
    $self->{'base_oid'} = undef;
}
191
# Called at the end of each XML document. Intentionally a no-op: records are
# passed to the processor in xml_end_tag when their closing tag is seen.
sub xml_end_document {
    return;
}
195
# Handler for every XML start tag.
#
#   $expat   - the parser object (unused)
#   $element - name of the element being opened
#
# Relies on $_ holding the raw text of the start tag (it is stored verbatim
# as the tag text below; presumably set by the XML::Parser Stream style --
# confirm).
#
# Per element:
#   collection - remember the raw start tag so it can be wrapped around each
#                record's associated MARCXML file
#   record     - create a new doc object and reset the record buffers
#   datafield  - remember its "tag" attribute (the MARC code, e.g. 520)
#   subfield   - remember its "code" attribute (e.g. a, b)
# Every tag except <collection> is also appended to the escaped 'content'
# view and the raw 'xmlcontent' buffer.
sub xml_start_tag {
    my $self = shift;
    my $expat = shift;
    my $element = shift;

    my $text = $_;
    my $escaped_text = $self->escape_text($_);

    $self->{'current_element'} = $element;

    ## get all attributes of this element and store them in a map name=>value.
    ## Only values made up entirely of word characters are picked up, which
    ## covers the "tag" and "code" attributes used below.
    ## A /g match walks the string without the $' postmatch variable.
    my %attr_map = ();
    my $attrstring = $text;
    while ($attrstring =~ /(\w+)=\"(\w+)\"/g) {
        $attr_map{$1} = $2;
    }

    ## create a new document for each record
    if ($element eq "record") {
        my $metadata = $self->{'metadata'};
        my $filename = $self->{'filename'};
        my $language = $self->{'language'};
        my $encoding = $self->{'encoding'};
        my $file = $self->{'file'};

        # explicit class-method call rather than indirect "new doc(...)"
        my $doc_obj = doc->new($filename, undef, $self->{'file_rename_method'});
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);

        my $plugin_filename_encoding = $self->{'filename_encoding'};
        my $filename_encoding = $self->deduce_filename_encoding($file, $metadata, $plugin_filename_encoding);
        $self->set_Source_metadata($doc_obj, $filename, $filename_encoding);

        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$self->{'record_count'}");
        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename);
        }
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
        $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "MARCXML");

        my $outhandle = $self->{'outhandle'};
        print $outhandle "Record $self->{'record_count'}\n" if $self->{'verbosity'} > 1;

        $self->{'record_count'}++;
        $self->{'doc_obj'} = $doc_obj;
        $self->{'num_processed'}++;

        # The first record of a file gets its OID from the parent class; that
        # OID becomes the base for the per-record ids built in add_OID.
        if (!defined $self->{'base_oid'}) {
            $self->SUPER::add_OID($doc_obj);
            $self->{'base_oid'} = $doc_obj->get_OID();
        }
    }

    ## get the marc code, for example 520
    if ($element eq "datafield") {
        if (defined $attr_map{'tag'} and $attr_map{'tag'} ne "") {
            $self->{'current_tag'} = $attr_map{'tag'};
        }
    }

    ## remember the subcode; with the marc code it forms e.g. 520a or 520b
    if ($element eq "subfield") {
        if (defined $attr_map{'code'} and $attr_map{'code'} ne "" and $self->{'current_tag'} ne "") {
            $self->{'current_code'} = $attr_map{'code'};
        }
    }

    ## indentation level for the HTML view; a new record also clears buffers
    if ($element eq "record") {
        $self->{'indent'} = 0;
        $self->{'content'} = "";
        $self->{'xmlcontent'} = "";
    }
    elsif ($element ne "subfield") {
        $self->{'indent'} = 1;
    }
    else {
        $self->{'indent'} = 2;
    }

    if ($element eq "collection") {
        # remember the full start tag for <collection ...>
        # This is needed to wrap around each <record> when generating its
        # associated MARCXML file
        $self->{'xmlcollectiontag'} = $text;
    }
    else {
        $self->{'content'} .= "<br/>" if ($element ne "record");
        $self->{'content'} .= $self->calculate_indent($self->{'indent'}).$escaped_text;
        $self->{'xmlcontent'} .= $text;
    }

}
297
298
299
# Handler for every XML end tag.
#
#   $expat   - the parser object (unused)
#   $element - name of the element being closed
#
# Relies on $_ holding the raw text of the end tag (it is appended verbatim
# to the raw XML buffer below).
#
# </record>:    writes the record's raw XML (wrapped in the remembered
#               <collection> tag) to a temporary file, associates that file
#               and the XSL stylesheet with the document, adds inherited
#               metadata, sets the OID, stores the escaped text and hands the
#               document to the processor. Returns immediately afterwards.
# </datafield>: converts the subfield values collected for this datafield
#               into metadata according to 'metadata_mapping', then clears
#               the collected values.
# Every end tag that does not complete a record is appended to the 'content'
# and 'xmlcontent' buffers.
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my $text = $_;
    my $escaped_text = $self->escape_text($_);

    if ($element eq "record" and defined $self->{'doc_obj'}) {
        # process the document
        my $processor = $self->{'processor'};
        my $doc_obj = $self->{'doc_obj'};
        $self->{'content'} .= "<br/>".$escaped_text;
        $self->{'xmlcontent'} .= $text;

        my $top_section = $doc_obj->get_top_section();

        my $tmp_marcxml_filename = &util::get_tmp_filename("xml");

        # Three-argument open with a lexical handle: the file name is never
        # interpreted as part of the open mode, and the handle cannot clash
        # with a global bareword handle of the same name.
        if (open(my $xml_out, '>', $tmp_marcxml_filename)) {

            print $xml_out "<?xml-stylesheet type=\"text/xsl\" href=\"MARC21slim2English.xsl\"?>\n";

            my $xml_content = $self->{'xmlcollectiontag'}.$self->{'xmlcontent'}."</collection>";
            print $xml_out $xml_content;

            close($xml_out);

            $doc_obj->associate_file($tmp_marcxml_filename, "marcxml.xml", "text/xml", $top_section);

            # associate xsl style file for presentation as HTML
            my $xsl_filename = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "MARC21slim2English.xsl");
            $doc_obj->associate_file($xsl_filename, "MARC21slim2English.xsl", "text/xml", $top_section);

        }
        else {
            my $outhandle = $self->{'outhandle'};
            print $outhandle "Warning: Unable for write out associated MARCXML file $tmp_marcxml_filename\n";
        }

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata($doc_obj,
                              $doc_obj->get_top_section(),
                              $self->{'metadata'});

        $self->add_OID($doc_obj, $self->{'base_oid'}, $self->{'record_count'});

        $doc_obj->add_utf8_text($doc_obj->get_top_section(), $self->{'content'});
        $processor->process($doc_obj);

        ## clean up
        $self->{'content'} = "";
        $self->{'xmlcontent'} = "";
        $self->{'doc_obj'} = undef;
        return;
    }

    ## map the xmlmarc to gsdl metadata
    if ($element eq "datafield" and defined $self->{'doc_obj'} and defined $self->{'marc_mapping'} and defined $self->{'metadata_mapping'}) {
        my $metadata_mapping = $self->{'metadata_mapping'};
        my $marc_mapping = $self->{'marc_mapping'};
        my $doc_obj = $self->{'doc_obj'};

        foreach my $mapping_key (keys %$metadata_mapping) {

            ## test whether this mapping entry names a subfield; the
            ## separator may be "$", "^" or absent (245$a, 245^a, 245a)
            my $marc_field = $mapping_key;
            my $subfield = undef;
            if ($mapping_key =~ /(\d\d\d)(?:\$|\^)?(\w)/) {
                $marc_field = $1;
                $subfield = $2;
            }

            my $matched_field = $marc_mapping->{$marc_field};
            next unless defined $matched_field;

            # Look the name up under the key exactly as it is spelt in the
            # mapping. Rebuilding the key as "<field>$<subfield>" would miss
            # entries written with "^" or with no separator, although the
            # pattern above accepts them.
            my $meta_name = $metadata_mapping->{$mapping_key};
            my $meta_value = undef;

            if (defined $subfield) {
                # undefined when the record has no such subfield
                $meta_value = $matched_field->{$subfield};
            }
            else {
                # no subfield => join all the values, ordered by subfield code
                my @subfield_values = map { $matched_field->{$_} } sort keys %{$matched_field};
                $meta_value = join(" ", @subfield_values) if @subfield_values;
            }

            # nothing was read for this mapping entry
            next unless defined $meta_value;

            ## escape [ and ]
            $meta_value =~ s/\[/\\\[/g;
            $meta_value =~ s/\]/\\\]/g;
            ##print STDERR "$meta_name=$meta_value\n";
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $meta_name, $meta_value);
        }

        ## clean up; xml_text recreates the mapping when the next subfield
        ## text arrives
        $self->{'marc_mapping'} = undef;
        $self->{'current_tag'} = "";
    }

    if ($element eq "datafield") {
        $self->{'indent'} = 1;
        $self->{'content'} .= "<br/>".$self->calculate_indent($self->{'indent'}).$escaped_text;
        $self->{'xmlcontent'} .= $text;
    }
    else {
        $self->{'content'} .= $escaped_text;
        $self->{'xmlcontent'} .= $text;
    }

}
441
# Give a record's document its object identifier.
#
#   $doc_obj       - the document to label
#   $id            - base identifier shared by the records of one file
#   $record_number - number appended after an "r" separator
#
# The default identifier is "<id>r<record_number>". When 'OIDtype' is
# "assigned" and the document has a non-empty value for the metadata element
# named by 'OIDmetadata', that value (passed through util::tidy_up_oid) is
# used instead.
sub add_OID {
    my ($self, $doc_obj, $id, $record_number) = @_;

    my $oid = $id . "r" . $record_number;

    if ($self->{'OIDtype'} eq "assigned") {
        my $top_section = $doc_obj->get_top_section();
        my $identifier = $doc_obj->get_metadata_element($top_section, $self->{'OIDmetadata'});
        my $has_identifier = (defined $identifier && $identifier ne "");
        if ($has_identifier) {
            $oid = $identifier;
            $oid = &util::tidy_up_oid($oid);
        }
    }

    $doc_obj->set_OID($oid);
}
456
# Handler for character data. Relies on $_ holding the text.
#
# If the text belongs to a subfield whose code has been recorded, it is
# stored under marc_mapping->{tag}->{code} (e.g. 520 / a => "A poem
# about...") and the recorded code is cleared, so only the first non-empty
# text seen after a <subfield> start tag is captured. The text is always
# appended to the escaped 'content' view and, with "&" re-escaped, to the raw
# 'xmlcontent' buffer.
sub xml_text {
    my ($self, $expat) = @_;

    my $xml_safe_text = $_;
    my $escaped_text = $self->escape_text($_);

    # protect against & in raw text file
    $xml_safe_text =~ s/&/&amp;/g; # can't have & in raw form, even in 'raw' xml text

    my $inside_subfield = ($self->{'current_element'} eq "subfield");
    if ($inside_subfield and $self->{'current_code'} ne "" and $_ ne "") {
        my $tag = $self->{'current_tag'};
        my $code = $self->{'current_code'};

        ## store it in the marc_mapping
        $self->{'marc_mapping'}->{$tag}->{$code} .= $_;

        $self->{'current_code'} = "";
    }

    $self->{'content'} .= $escaped_text;
    $self->{'xmlcontent'} .= $xml_safe_text;

}
483
# Returns the HTML indentation string for nesting level $num: four
# non-breaking-space entities per level. Levels of zero or less give an
# empty string. Uses the string repetition operator instead of a counting
# loop; $num is expected to be a whole number (this file passes 0, 1 or 2).
sub calculate_indent {
    my ($self, $num) = @_;

    return "&nbsp;&nbsp;&nbsp;&nbsp;" x $num;
}
496
# Returns a copy of $text with the XML special characters &, <, > and "
# replaced by their entities. Done in a single pass, so the "&" of an entity
# that has just been inserted is never escaped a second time.
sub escape_text {
    my ($self, $text) = @_;

    # special characters in the xml encoding
    my %entity_for = (
        '&' => '&amp;',
        '<' => '&lt;',
        '>' => '&gt;',
        '"' => '&quot;',
    );
    $text =~ s/([&<>\"])/$entity_for{$1}/g;

    return $text;
}
507
508
# Returns a copy of $text with the entities &lt;, &gt; and &quot; turned back
# into <, > and ". Every "&" that remains afterwards is then written as
# &amp; -- including the one at the start of an existing "&amp;" -- because a
# bare & is not allowed even in unescaped XML.
sub unescape_text {
    my ($self, $text) = @_;

    # special characters in the xml encoding
    my %char_for = (
        'lt'   => '<',
        'gt'   => '>',
        'quot' => '"',
    );
    $text =~ s/&(lt|gt|quot);/$char_for{$1}/g;

    $text =~ s/&/&amp;/g; # can't have & in raw form, even in unescaped xml!

    return $text;
}
520
521
5221;
523
524
Note: See TracBrowser for help on using the repository browser.