root/gsdl/trunk/perllib/plugins/MARCXMLPlugin.pm @ 19619

Revision 19619, 13.9 KB (checked in by kjdon, 10 years ago)

just look for a single mappign file, not for a list of them. collection version should override the main one. and use util::tidy_up_oid for assigned ids

  • Property svn:keywords set to Author Date Id Revision
RevLine 
[13486]1###########################################################################
2#
[15872]3# MARCXMLPlugin.pm
[13486]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes MARCXML documents. Note that this plugin does no
27# syntax checking (though the XML::Parser module tests for
28# well-formedness).
29
[15872]30package MARCXMLPlugin;
[13486]31
[15872]32use ReadXMLFile;
[16693]33use ReadTextFile;
[16692]34use marcmapping;
[13486]35
36use strict;
37no strict 'refs'; # allow filehandles to be variables and viceversa
38
39sub BEGIN {
[16693]40    @MARCXMLPlugin::ISA = ('ReadXMLFile', 'ReadTextFile');
[13486]41}
42
43my $arguments = [{'name' => "metadata_mapping_file",
[15872]44          'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
[13486]45          'type' => "string",
[18563]46          'deft' => "marc2dc.txt",
[13486]47          'reqd' => "no" }];
48
[15872]49my $options = { 'name'     => "MARCXMLPlugin",
50        'desc'     => "{MARCXMLPlugin.desc}",
[13486]51        'abstract' => "no",
52        'inherits' => "yes",
53        'args'     => $arguments
54        };
55
56sub new {
57    my ($class) = shift (@_);
58    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
59    push(@$pluginlist, $class);
60
[15872]61    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
62    push(@{$hashArgOptLists->{"OptList"}},$options);
[13486]63   
[16693]64    # we want to be able to use the textcat methods from ReadTextFile
65    # to get the language and encoding
66    new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists, 1);
67
[15872]68    my $self = new ReadXMLFile($pluginlist, $inputargs, $hashArgOptLists);
[18900]69       
70    # we want to strip namespaces, so have to create a new XML parser
71    my $parser = new XML::Parser('Style' => 'Stream',
72                                 'Pkg' => 'ReadXMLFile',
73                                 'PluginObj' => $self,
74                 'Namespaces' => 1, # strip out namespaces
75                 'Handlers' => {'Char' => \&ReadXMLFile::Char,
76                        'XMLDecl' => \&ReadXMLFile::XMLDecl,
77                        'Entity'  => \&ReadXMLFile::Entity,
78                        'Doctype' => \&ReadXMLFile::Doctype,
79                        'Default' => \&ReadXMLFile::Default
80                                 });
81
82    $self->{'parser'} = $parser;
83
[13486]84    $self->{'content'} = "";
[15018]85    $self->{'xmlcontent'} = "";
[13486]86    $self->{'record_count'} = 1;
87    $self->{'language'} = "";
88    $self->{'encoding'} = "";
89    $self->{'marc_mapping'} = {};
90    $self->{'current_code'} = "";
91    $self->{'current_tag'} = "";
92    $self->{'current_element'} = "";
93    $self->{'metadata_mapping'} = undef;
94    $self->{'num_processed'} = 0;
[13496]95    $self->{'indent'} = 0;
96
[13486]97    return bless $self, $class;
98}
99
[16694]100
101
[13486]102sub get_doctype {
103    my $self = shift(@_);
104   
105    return "collection";
106}
107
108
109sub init {
110    my $self = shift (@_);
111    my ($verbosity, $outhandle, $failhandle) = @_;
112   
113    ## the mapping file has already been loaded
114    if (defined $self->{'metadata_mapping'} ){
115    $self->SUPER::init(@_);
116    return;
117    }
118
[19619]119    # read in the metadata mapping file
120    my $mm_file = &util::locate_config_file($self->{'metadata_mapping_file'});
[13486]121
[19619]122    if (! defined $mm_file)
[13486]123    {
[15872]124    my $msg = "MARCXMLPlugin ERROR: Can't locate mapping file \"" .
[15018]125        $self->{'metadata_mapping_file'} . "\".\n " .
[16697]126        "    No metadata will be extracted from MARCXML files.\n";
[13486]127
128    print $outhandle $msg;
129    print $failhandle $msg;
130    $self->{'metadata_mapping'} = undef;
131    # We pick up the error in process() if there is no $mm_file
132    # If we exit here, then pluginfo.pl will exit too!
133    }
[15018]134    else {
[19619]135    $self->{'metadata_mapping'} = &marcmapping::parse_marc_metadata_mapping($mm_file, $outhandle);
[13486]136    }
137
138
[16695]139    ##map { print STDERR $_."=>".$self->{'metadata_mapping'}->{$_}."\n"; } keys %{$self->{'metadata_mapping'}};
[13486]140
141    $self->SUPER::init(@_);
142}
143
144# Called for DOCTYPE declarations - use die to bail out if this doctype
145# is not meant for this plugin
146sub xml_doctype {
147    my $self = shift(@_);
148
149    my ($expat, $name, $sysid, $pubid, $internal) = @_;
150   return;
151
152}
153
154
155sub xml_start_document {
156    my $self = shift(@_);
157
158    my ($expat, $name, $sysid, $pubid, $internal) = @_;
159
160     
161    my $file = $self->{'file'};
162    my $filename = $self->{'filename'};
[13496]163       
[13486]164    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
165
166    $self->{'language'} = $language;
167    $self->{'encoding'} = $encoding;
168    $self->{'element_count'} = 1;
[13496]169    $self->{'indent'} = 0;
[13486]170    my $outhandle = $self->{'outhandle'};
[15872]171    print $outhandle "MARCXMLPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
172    print STDERR "<Processing n='$self->{'file'}' p='MARCXMLPlugin'>\n" if $self->{'gli'};
[17026]173
174    # reset the base id
175    $self->{'base_oid'} = undef;
[13486]176 
177}
178
179sub xml_end_document {
180
181}
182
183sub xml_start_tag {
184    my $self = shift;
185    my $expat = shift;
186    my $element = shift; 
[13496]187
[15018]188    my $text = $_;
189    my $escaped_text =  $self->escape_text($_);
[13486]190 
191    $self->{'current_element'} = $element;
192
193    ##get all atributes of this element and store it in a map name=>value   
194    my %attr_map = ();
195    my $attrstring = $_;
196    while ($attrstring =~ /(\w+)=\"(\w+)\"/){
197    $attr_map{$1}=$2;
[13496]198    $attrstring = $'; #'
[13486]199    }
200
[15018]201
[13486]202    my $processor = $self->{'processor'};
203 
204    ##create a new document for each record
205    if ($element eq "record") {
[13496]206        my $filename = $self->{'filename'};
[13486]207    my $language = $self->{'language'};
208        my $encoding = $self->{'encoding'};
209    my $file = $self->{'file'};
[18327]210    my $doc_obj = new doc($filename, undef, $self->{'file_rename_method'});
[13486]211    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
212    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
213    my ($filemeta) = $file =~ /([^\\\/]+)$/;
[15872]214    $self->set_Source_metadata($doc_obj, $filemeta, $encoding);
[13486]215    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$self->{'record_count'}");
216        if ($self->{'cover_image'}) {
217        $self->associate_cover_image($doc_obj, $filename);
218    }
219    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
[16692]220    $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "MARCXML");
[13486]221
222    my $outhandle = $self->{'outhandle'};
[16697]223    print $outhandle "Record $self->{'record_count'}\n" if $self->{'verbosity'} > 1;
[13486]224
225        $self->{'record_count'}++;
226        $self->{'doc_obj'} = $doc_obj;       
227    $self->{'num_processed'}++;
[17026]228    if (!defined $self->{'base_oid'}) {
229        $self->SUPER::add_OID($doc_obj);
230        $self->{'base_oid'} = $doc_obj->get_OID();
231    }
232   
[13486]233
234    }
235   
236    ## get the marc code, for example 520
237     if ($element eq "datafield") {
[13496]238         if (defined $attr_map{'tag'} and $attr_map{'tag'} ne ""){
[13486]239         $self->{'current_tag'} = $attr_map{tag}; 
240     }
241     }
242
243
244    ## append the subcode to the marc code for example 520a or 520b
245    if ($element eq "subfield"){
[13496]246    if (defined $attr_map{'code'} and $attr_map{'code'} ne "" and $self->{'current_tag'} ne ""){
[13486]247        $self->{'current_code'} = $attr_map{'code'};
248    }
249    }
[13496]250
251   if ($element eq "record"){
252        $self->{'indent'} = 0;
[15018]253        $self->{'content'} = "";
254        $self->{'xmlcontent'} = "";
[13496]255    }
256    else {
257         if ($element ne "subfield"){
258              $self->{'indent'} = 1;
259         }
260         else{
261           $self->{'indent'} = 2;
262         }
263    }
264   
[15018]265
266    if ($element eq "collection") {
267    # remember the full start tag for <collection ...>
268    # This is needed to wrap around each <record> when generating its associate MARCXML file
269
270        $self->{'xmlcollectiontag'} = $text;
271    }
272    else {
273        $self->{'content'} .= "<br/>" if ($element ne "record");
274        $self->{'content'} .= $self->calculate_indent($self->{'indent'}).$escaped_text;
275        $self->{'xmlcontent'} .= $text;
276   }
[13496]277   
[13486]278}
279
280
281
282sub xml_end_tag {
283    my $self = shift(@_);
284    my ($expat, $element) = @_;
[15018]285
286    my $text = $_;
287    my $escaped_text =  $self->escape_text($_);
[13496]288 
[13486]289    if ($element eq "record" and defined $self->{'doc_obj'}) {
290    # process the document
291    my $processor = $self->{'processor'};
292    my $doc_obj = $self->{'doc_obj'};
[15018]293        $self->{'content'} .= "<br/>".$escaped_text;
294        $self->{'xmlcontent'} .= $text;
[13496]295     
[15018]296
297    my $top_section = $doc_obj->get_top_section();
298
[16521]299    my $tmp_marcxml_filename = &util::get_tmp_filename("xml");
[15018]300    if (open (XMLOUT,">$tmp_marcxml_filename")) {
301
302        print XMLOUT "<?xml-stylesheet type=\"text/xsl\" href=\"MARC21slim2English.xsl\"?>\n";     
303        my $xml_content = $self->{'xmlcontent'};
304
305        $xml_content = $self->{'xmlcollectiontag'}.$xml_content."</collection>";
306
307        print XMLOUT $xml_content;
308
309        close(XMLOUT);
310
311        $doc_obj->associate_file($tmp_marcxml_filename,"marcxml.xml","text/xml", $top_section);
312       
313        # assicate xsl style file for presentation as HTML
314        my $xsl_filename = &util::filename_cat($ENV{'GSDLHOME'},"etc","MARC21slim2English.xsl");
315        $doc_obj->associate_file($xsl_filename,"MARC21slim2English.xsl","text/xml", $top_section);
316
317    }
318    else {
319        my $outhandle = $self->{'outhandle'};
320        print $outhandle "Warning: Unable for write out associated MARCXML file $tmp_marcxml_filename\n";
321    }
[15178]322   
323    # include any metadata passed in from previous plugins
324    # note that this metadata is associated with the top level section
325   
326    $self->extra_metadata ($doc_obj,
327                   $doc_obj->get_top_section(),
328                   $self->{'metadata'});
329   
[15018]330
[17026]331    $self->add_OID($doc_obj, $self->{'base_oid'}, $self->{'record_count'});
[15018]332
[13496]333    $doc_obj->add_utf8_text($doc_obj->get_top_section(),$self->{'content'});
[13486]334        $processor->process($doc_obj);
335
336        ##clean up
337    $self->{'content'} = ""; 
[15018]338    $self->{'xmlcontent'} = ""; 
[13486]339    $self->{'doc_obj'} = undef;
[13496]340        return;
[13486]341    }
342
343    ## map the xmlmarc to gsdl metadata
[16697]344    if ($element eq "datafield" and defined $self->{'doc_obj'} and defined $self->{'marc_mapping'} and defined $self->{'metadata_mapping'}){
[13486]345    my $metadata_mapping = $self->{'metadata_mapping'};
346    my $marc_mapping = $self->{'marc_mapping'};
347    my $doc_obj = $self->{'doc_obj'};
348
[15018]349##  print STDERR "**** Marc Record\n";
350##      map { print STDERR $_."=>".$marc_mapping->{$_}."\n"; } keys %$marc_mapping;
351##  print STDERR "**** Metadata Mapping\n";
352##      map { print STDERR $_."=>".$metadata_mapping->{$_}."\n"; } keys %$metadata_mapping;
[13486]353
[15018]354
[13486]355    foreach my $marc_field (keys %$metadata_mapping){
[15018]356
357        ## test whether this field has subfield
[13486]358        my $subfield = undef;
[15018]359        if ($marc_field =~ /(\d\d\d)(?:\$|\^)?(\w)/){
360        $marc_field = $1;
361        $subfield = $2;
362        }
[13486]363
[15018]364        my $matched_field = $marc_mapping->{$marc_field};
365
366        if (defined $matched_field) {
367
368        my $meta_name  = undef;
369        my $meta_value = undef;
370
[13486]371        if (defined $subfield){
[15018]372            $meta_name = $metadata_mapping->{$marc_field."\$".$subfield};
373
374            $meta_value = $matched_field->{$subfield};
375           
376            if (!defined $meta_value) {
377            # record read in does not have the specified subfield
378            next;
379            }
[13486]380        }
[15018]381        else {
382            $meta_name = $metadata_mapping->{$marc_field};
383           
384            # no subfield => get all the values
385            foreach my $value (sort keys %{$matched_field}) {
386            $meta_value .= $matched_field->{$value} ." ";
[13486]387            }
[15018]388
[13486]389        }
390       
391        ## escape [ and ]
392        $meta_value =~ s/\[/\\\[/g;
393        $meta_value =~ s/\]/\\\]/g;
394        ##print STDERR  "$meta_name=$meta_value\n";
395        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(),$meta_name, $meta_value);
396       
397        }           
398           
399    }
400
401    ##clean up
402    $self->{'marc_mapping'} = undef;
403    $self->{'current_tag'} = "";
404    }
[13496]405 
406   if ($element eq "datafield"){
407       $self->{'indent'} = 1;
[15018]408       $self->{'content'} .= "<br/>".$self->calculate_indent($self->{'indent'}).$escaped_text;
409       $self->{'xmlcontent'} .= $text;
[13496]410   }
411    else{
[15018]412    $self->{'content'} .= $escaped_text;   
413    $self->{'xmlcontent'} .= $text;   
[13496]414    }
415     
[13486]416}
417
[17026]418sub add_OID {
[13486]419    my $self = shift (@_);
[17026]420    my ($doc_obj, $id, $record_number) = @_;
[15018]421
[17026]422    my $full_id = $id . "r" . $record_number;
423    if ($self->{'OIDtype'} eq "assigned") {
424    my $identifier = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'OIDmetadata'});
425    if (defined $identifier && $identifier ne "") {
[17033]426        $full_id = $identifier;
[19619]427        $full_id = &util::tidy_up_oid($full_id);
[17026]428    }
429    }
430    $doc_obj->set_OID($full_id);
[13486]431}
432
433sub xml_text {
434    my $self = shift(@_);
435    my ($expat) = @_;
436
[15018]437    my $text = $_;
438    my $escaped_text = $self->escape_text($_);
[13486]439
[15018]440    # protect against & in raw text file
441    $text =~ s/&/&amp;/g; # can't have & in raw form, even in 'raw' xml text
442
[13486]443    ## store the text of a marc code, for exapmle 520a=>A poem about....
444    if ($self->{'current_element'} eq "subfield" and $self->{'current_code'} ne "" and $_ ne "" ){
445    ##stored it in the marc_mapping
[15018]446
447    my $current_tag  = $self->{'current_tag'};
448    my $current_code = $self->{'current_code'};
449
450        $self->{'marc_mapping'}->{$current_tag}->{$current_code} .= $_;
451
[13486]452    $self->{'current_code'} = "";
453    }
454   
[15018]455    $self->{'content'} .= $escaped_text;
456    $self->{'xmlcontent'} .= $text;
[13486]457   
458}
459
[13496]460sub calculate_indent{
461   my ($self,$num) = @_;
[13486]462
[13496]463   my $indent ="";
464 
465   for (my $i=0; $i<$num;$i++){
466       $indent .= "&nbsp;&nbsp;&nbsp;&nbsp;";
467    }
468 
469   return $indent;
470
471}
472
473sub escape_text {
474    my ($self,$text) = @_;
475    # special characters in the xml encoding
476    $text =~ s/&/&amp;/g; # this has to be first...
477    $text =~ s/</&lt;/g;
478    $text =~ s/>/&gt;/g;
479    $text =~ s/\"/&quot;/g;
480
481    return $text;
482}
483
484
[15018]485sub unescape_text {
486    my ($self,$text) = @_;
487    # special characters in the xml encoding
488    $text =~ s/&lt;/</g;
489    $text =~ s/&gt;/>/g;
490    $text =~ s/&quot;/\"/g;
491
492    $text =~ s/&/&amp;/g; # can't have & in raw form, even in unescaped xml!
493
494    return $text;
495}
496
497
[13486]4981;
499
500
Note: See TracBrowser for help on using the browser.