root/gsdl/trunk/perllib/plugins/MARCXMLPlugin.pm @ 16694

Revision 16694, 13.1 KB (checked in by kjdon, 12 years ago)

MARCXMLPlugin uses the textcat_get_language_encoding method from ReadTextFile, so it was made to inherit from ReadTextFile as well.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# MARCXMLPlugin.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes MARCXML documents. Note that this plugin does no
27# syntax checking (though the XML::Parser module tests for
28# well-formedness).
29
30package MARCXMLPlugin;
31
32use ReadXMLFile;
33use ReadTextFile;
34use marcmapping;
35
36use strict;
37no strict 'refs'; # allow filehandles to be variables and viceversa
38
# Establish the inheritance chain at compile time. ReadXMLFile is listed
# first, so it is consulted before ReadTextFile during method lookup.
BEGIN {
    @MARCXMLPlugin::ISA = qw(ReadXMLFile ReadTextFile);
}
42
# Arguments specific to this plugin. The only one names the file that maps
# MARC fields to metadata names; it is optional and defaults to marctodc.txt.
my $arguments = [
    {
        'name' => "metadata_mapping_file",
        'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
        'type' => "string",
        'deft' => "marctodc.txt",
        'reqd' => "no",
    },
];

# Description of this plugin; new() pushes it onto the caller's "OptList".
my $options = {
    'name'     => "MARCXMLPlugin",
    'desc'     => "{MARCXMLPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
55
# Constructor.
#
# Parameters:
#   $class           - the class to bless the new object into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to the parent constructors
#   $hashArgOptLists - hash ref; this plugin's argument and option
#                      descriptions are appended to its "ArgList" and
#                      "OptList" entries
#
# Returns the object built by ReadXMLFile's constructor, re-blessed into
# $class, with this plugin's per-run state initialised.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # We want to be able to use the textcat methods from ReadTextFile to get
    # the language and encoding. The object returned here is discarded.
    # NOTE(review): the trailing 1 presumably marks this as an auxiliary
    # construction - confirm against ReadTextFile::new.
    #
    # Explicit Class->new calls are used rather than "new Class(...)": the
    # indirect form is ambiguous inside a package that defines its own new().
    ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);

    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # Text accumulated for the record currently being read: the escaped
    # display form and the raw XML form.
    $self->{'content'} = "";
    $self->{'xmlcontent'} = "";

    # Number given to the next record (written as SourceSegment metadata).
    $self->{'record_count'} = 1;

    $self->{'language'} = "";
    $self->{'encoding'} = "";

    # tag => { subfield code => text } for the datafield being read.
    $self->{'marc_mapping'} = {};
    $self->{'current_code'} = "";
    $self->{'current_tag'} = "";
    $self->{'current_element'} = "";

    # Filled in by init() from the metadata mapping file.
    $self->{'metadata_mapping'} = undef;

    $self->{'num_processed'} = 0;
    $self->{'indent'} = 0;

    print STDERR "metadata mapping file = $self->{'metadata_mapping_file'}\n";
    return bless $self, $class;
}
86
87
88
# Returns the string "collection".
# NOTE(review): presumably the root element name the XML framework expects
# for files handled by this plugin - confirm against ReadXMLFile.
sub get_doctype {
    my ($self) = @_;

    return "collection";
}
94
95
# Loads the MARC-to-metadata mapping (once) and then delegates to the
# parent class's init().
#
# Parameters:
#   $verbosity  - not used here; passed on to SUPER::init
#   $outhandle  - filehandle that receives the "can't locate" error message
#   $failhandle - filehandle that receives the same error message
#
# If the mapping file cannot be located, $self->{'metadata_mapping'} is left
# undefined and the error is reported on both handles; this sub does not die.
sub init {
    my $self = shift (@_);
    my ($verbosity, $outhandle, $failhandle) = @_;

    # The mapping file has already been loaded.
    if (defined $self->{'metadata_mapping'}) {
        $self->SUPER::init(@_);
        return;
    }

    # Read in the metadata mapping files.
    my $mm_files = util::locate_config_files($self->{'metadata_mapping_file'});

    if (!defined $mm_files || scalar(@$mm_files) == 0) {
        my $msg = "MARCXMLPlugin ERROR: Can't locate mapping file \"" .
            $self->{'metadata_mapping_file'} . "\".\n " .
            "    No marc files can be processed.\n";

        print $outhandle $msg;
        print $failhandle $msg;
        $self->{'metadata_mapping'} = undef;
        # We pick up the error in process() if there is no $mm_file
        # If we exit here, then pluginfo.pl will exit too!
    }
    else {
        $self->{'metadata_mapping'} =
            marcmapping::parse_marc_metadata_mapping($mm_files, $outhandle);
    }

    # Dump the loaded mapping to STDERR. This is only done when a mapping
    # exists: applying keys() to the undefined slot would autovivify it into
    # an empty hash and so erase the "no mapping loaded" marker set above.
    my $metadata_mapping = $self->{'metadata_mapping'};
    if (defined $metadata_mapping) {
        foreach my $marc_field (keys %$metadata_mapping) {
            print STDERR $marc_field."=>".$metadata_mapping->{$marc_field}."\n";
        }
    }

    $self->SUPER::init(@_);
}
131
# Called for DOCTYPE declarations. A plugin may use die here to bail out
# when the doctype is not meant for it; this one accepts every doctype and
# simply returns nothing.
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    return;
}
141
142
# Called at the start of each XML document. Determines the language and
# encoding of the file via textcat_get_language_encoding (not defined in
# this file), stores them on the object, resets the element counter and
# indent level, and announces the file being processed.
sub xml_start_document {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    my ($language, $encoding) =
        $self->textcat_get_language_encoding($self->{'filename'});

    @{$self}{'language', 'encoding'} = ($language, $encoding);
    $self->{'element_count'} = 1;
    $self->{'indent'} = 0;

    # Progress message for the log, only at higher verbosity.
    my $outhandle = $self->{'outhandle'};
    if ($self->{'verbosity'} > 1) {
        print $outhandle "MARCXMLPlugin: processing $self->{'file'}\n";
    }

    # Machine-readable progress line, emitted when the 'gli' flag is set.
    print STDERR "<Processing n='$self->{'file'}' p='MARCXMLPlugin'>\n" if $self->{'gli'};
}
163
# Called at the end of each XML document. Nothing needs to be done here:
# each record is handed to the processor as soon as its end tag is seen.
sub xml_end_document {
    return;
}
167
# Called for every start tag. The text of the tag is read from $_.
#
# Parameters:
#   $expat   - the parser object (unused)
#   $element - the element name
#
# Effects, by element:
#   record     - creates a new doc object with its standard metadata and
#                resets the accumulated content for the record
#   datafield  - remembers the MARC tag (e.g. 520) from the "tag" attribute
#   subfield   - remembers the subfield code (e.g. a) from "code"
#   collection - remembers the full start tag so that it can later be
#                wrapped around each record's own MARCXML file
# Every tag except <collection> is also appended to the record's display
# content (escaped and indented) and to its raw XML content.
sub xml_start_tag {
    my $self = shift;
    my $expat = shift;
    my $element = shift;

    # Take a copy of $_ straight away, so that nothing called below can
    # change the tag text we work with.
    my $text = $_;
    my $escaped_text = $self->escape_text($text);

    $self->{'current_element'} = $element;

    # Collect the attributes of this element into a name => value map.
    # Both quoting styles are accepted and the value may contain any
    # characters other than its own quote character.
    my %attr_map = ();
    if (defined $text) {
        while ($text =~ /([\w:.\-]+)\s*=\s*(["'])(.*?)\2/gs) {
            $attr_map{$1} = $3;
        }
    }

    my $processor = $self->{'processor'};

    # Create a new document for each record.
    if ($element eq "record") {
        my $filename = $self->{'filename'};
        my $language = $self->{'language'};
        my $encoding = $self->{'encoding'};
        my $file = $self->{'file'};

        my $doc_obj = doc->new($filename);
        my $top_section = $doc_obj->get_top_section();

        $doc_obj->set_OIDtype($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
        $doc_obj->add_utf8_metadata($top_section, "Language", $language);
        $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);

        # Source metadata uses the file name without its directory part.
        my ($filemeta) = $file =~ /([^\\\/]+)$/;
        $self->set_Source_metadata($doc_obj, $filemeta, $encoding);

        $doc_obj->add_utf8_metadata($top_section, "SourceSegment", "$self->{'record_count'}");
        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename);
        }
        $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
        $doc_obj->add_metadata($top_section, "FileFormat", "MARCXML");

        my $outhandle = $self->{'outhandle'};
        print $outhandle "Record $self->{'record_count'} - MARCXMLPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;

        $self->{'record_count'}++;
        $self->{'doc_obj'} = $doc_obj;
        $self->{'num_processed'}++;
    }

    # Get the MARC code, for example 520.
    if ($element eq "datafield") {
        if (defined $attr_map{'tag'} and $attr_map{'tag'} ne "") {
            $self->{'current_tag'} = $attr_map{'tag'};
        }
    }

    # Remember the subcode that goes with the MARC code, for example the
    # a of 520a. It is only recorded while a datafield tag is known.
    if ($element eq "subfield") {
        if (defined $attr_map{'code'} and $attr_map{'code'} ne "" and $self->{'current_tag'} ne "") {
            $self->{'current_code'} = $attr_map{'code'};
        }
    }

    # Indent level for display: record 0, subfield 2, anything else 1.
    # A new record also discards whatever content was accumulated before.
    if ($element eq "record") {
        $self->{'indent'} = 0;
        $self->{'content'} = "";
        $self->{'xmlcontent'} = "";
    }
    elsif ($element eq "subfield") {
        $self->{'indent'} = 2;
    }
    else {
        $self->{'indent'} = 1;
    }

    if ($element eq "collection") {
        # Remember the full start tag for <collection ...>.
        # This is needed to wrap around each <record> when generating its
        # associated MARCXML file.
        $self->{'xmlcollectiontag'} = $text;
    }
    else {
        $self->{'content'} .= "<br/>" if ($element ne "record");
        $self->{'content'} .= $self->calculate_indent($self->{'indent'}).$escaped_text;
        $self->{'xmlcontent'} .= $text;
    }
}
260
261
262
# Called for every end tag. The text of the tag is read from $_.
#
# Parameters:
#   $expat   - the parser object (unused)
#   $element - the element name
#
# </record> (while a document is open): writes the record, wrapped in the
#   remembered <collection> tag, to a temporary MARCXML file, associates
#   that file and its XSL stylesheet with the document, adds the
#   accumulated text and hands the document to the processor.
# </datafield>: converts the subfield values collected for this datafield
#   into metadata according to the metadata mapping.
# Any end tag not consumed by the record branch is appended to the
#   accumulated display and XML content.
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    # Take a copy of $_ straight away, so that nothing called below can
    # change the tag text we work with.
    my $text = $_;
    my $escaped_text = $self->escape_text($text);

    if ($element eq "record" and defined $self->{'doc_obj'}) {
        # Process the document.
        my $processor = $self->{'processor'};
        my $doc_obj = $self->{'doc_obj'};
        $self->{'content'} .= "<br/>".$escaped_text;
        $self->{'xmlcontent'} .= $text;

        my $top_section = $doc_obj->get_top_section();

        # Write this record out as a MARCXML file of its own. Failures of
        # print and close count too, since buffered write errors may only
        # surface when the handle is closed.
        my $tmp_marcxml_filename = util::get_tmp_filename("xml");
        my $wrote_xml = 0;
        if (open(my $xml_out, '>', $tmp_marcxml_filename)) {
            my $xml_content =
                $self->{'xmlcollectiontag'}.$self->{'xmlcontent'}."</collection>";

            my $printed = print {$xml_out}
                "<?xml-stylesheet type=\"text/xsl\" href=\"MARC21slim2English.xsl\"?>\n",
                $xml_content;
            my $closed = close($xml_out);
            $wrote_xml = ($printed and $closed) ? 1 : 0;
        }

        if ($wrote_xml) {
            $doc_obj->associate_file($tmp_marcxml_filename, "marcxml.xml", "text/xml", $top_section);

            # Associate the XSL style file for presentation as HTML.
            my $xsl_filename = util::filename_cat($ENV{'GSDLHOME'}, "etc", "MARC21slim2English.xsl");
            $doc_obj->associate_file($xsl_filename, "MARC21slim2English.xsl", "text/xml", $top_section);
        }
        else {
            my $outhandle = $self->{'outhandle'};
            print $outhandle "Warning: Unable for write out associated MARCXML file $tmp_marcxml_filename\n";
        }

        # Include any metadata passed in from previous plugins.
        # Note that this metadata is associated with the top level section.
        $self->extra_metadata($doc_obj,
                              $doc_obj->get_top_section(),
                              $self->{'metadata'});

        # NOTE(review): record_count was already incremented when this
        # record's start tag was seen, so the number passed here is one
        # higher than the record's SourceSegment value - confirm that this
        # is intended.
        $self->add_OID($doc_obj, $self->{'record_count'});

        $doc_obj->add_utf8_text($doc_obj->get_top_section(), $self->{'content'});
        $processor->process($doc_obj);

        # Clean up.
        $self->{'content'} = "";
        $self->{'xmlcontent'} = "";
        $self->{'doc_obj'} = undef;
        return;
    }

    # Map the collected MARC values to metadata.
    if ($element eq "datafield" and defined $self->{'doc_obj'} and defined $self->{'marc_mapping'}) {
        my $metadata_mapping = $self->{'metadata_mapping'};
        my $marc_mapping = $self->{'marc_mapping'};
        my $doc_obj = $self->{'doc_obj'};

        # No mapping was loaded => there are no rules to apply.
        my @mapping_keys = defined $metadata_mapping ? keys %$metadata_mapping : ();

        foreach my $mapping_key (@mapping_keys) {

            # Split a key such as 245$a, 245^a or 245a into the field and
            # the subfield; a plain 245 has no subfield.
            my $marc_field = $mapping_key;
            my $subfield = undef;
            if ($mapping_key =~ /(\d\d\d)(?:\$|\^)?(\w)/) {
                $marc_field = $1;
                $subfield = $2;
            }

            my $matched_field = $marc_mapping->{$marc_field};
            next unless defined $matched_field;

            # Look the name up under the key exactly as it appears in the
            # mapping, so that every accepted separator style resolves.
            my $meta_name = $metadata_mapping->{$mapping_key};
            my $meta_value = undef;

            if (defined $subfield) {
                $meta_value = $matched_field->{$subfield};

                # The record read in does not have the specified subfield.
                next unless defined $meta_value;
            }
            else {
                # No subfield => get all the values, in subfield-code order.
                foreach my $code (sort keys %{$matched_field}) {
                    $meta_value .= $matched_field->{$code} ." ";
                }
            }

            # Escape [ and ].
            $meta_value =~ s/\[/\\\[/g;
            $meta_value =~ s/\]/\\\]/g;
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $meta_name, $meta_value);
        }

        # Clean up.
        $self->{'marc_mapping'} = undef;
        $self->{'current_tag'} = "";
    }

    if ($element eq "datafield") {
        $self->{'indent'} = 1;
        $self->{'content'} .= "<br/>".$self->calculate_indent($self->{'indent'}).$escaped_text;
        $self->{'xmlcontent'} .= $text;
    }
    else {
        $self->{'content'} .= $escaped_text;
        $self->{'xmlcontent'} .= $text;
    }
}
398
399
# Gives $doc_obj an identifier made of the identifier the document
# generates for itself, followed by "r" and the record number.
#
# Parameters:
#   $doc_obj       - the document to label
#   $record_number - appended after the "r"
sub set_OID {
    my ($self, $doc_obj, $record_number) = @_;

    # Calling set_OID with no argument lets the document generate its own
    # identifier first ...
    $doc_obj->set_OID();
    my $generated_id = $doc_obj->get_OID();

    # ... which is then extended with the per-record suffix.
    my $record_suffix = "r" . $record_number;
    $doc_obj->set_OID($generated_id . $record_suffix);
}
411
# Called for character data. The text is read from $_.
#
# Parameters:
#   $expat - the parser object (unused)
#
# When the text belongs to a subfield whose code has been recorded, it is
# stored in $self->{'marc_mapping'} under the current tag and code (for
# example 520 / a => "A poem about....") and the code is cleared so that
# text seen later is not attributed to the same subfield. In every case the
# text is appended to the record's display content and raw XML content.
sub xml_text {
    my $self = shift(@_);
    my ($expat) = @_;

    # Take a copy of $_ straight away, so that nothing called below can
    # change the text we work with.
    my $raw_text = $_;
    my $escaped_text = $self->escape_text($raw_text);

    # Re-escape the markup characters for the XML copy of the record. The
    # existing escaping of & already presumes that $_ holds decoded
    # character data, so a literal < or > can occur here as well and would
    # otherwise make the per-record MARCXML file malformed.
    my $xml_text = $raw_text;
    $xml_text =~ s/&/&amp;/g; # this has to be first...
    $xml_text =~ s/</&lt;/g;
    $xml_text =~ s/>/&gt;/g;

    # Store the text of a MARC code.
    if ($self->{'current_element'} eq "subfield" and $self->{'current_code'} ne "" and $raw_text ne "") {
        my $current_tag  = $self->{'current_tag'};
        my $current_code = $self->{'current_code'};

        $self->{'marc_mapping'}->{$current_tag}->{$current_code} .= $raw_text;

        $self->{'current_code'} = "";
    }

    $self->{'content'} .= $escaped_text;
    $self->{'xmlcontent'} .= $xml_text;
}
438
# Returns the HTML used to indent a line by $num levels: four non-breaking
# space entities per level, or the empty string when $num is not positive.
sub calculate_indent {
    my ($self, $num) = @_;

    my $one_level = "&nbsp;&nbsp;&nbsp;&nbsp;";

    # Count down from $num, collecting one unit per level.
    my @levels = ();
    for (my $remaining = $num; $remaining > 0; $remaining--) {
        push(@levels, $one_level);
    }

    return join("", @levels);
}
451
# Returns a copy of $text with the characters that are special in XML
# (& < > ") replaced by their entities.
sub escape_text {
    my ($self, $text) = @_;

    my %entity_for = (
        '&' => '&amp;',
        '<' => '&lt;',
        '>' => '&gt;',
        '"' => '&quot;',
    );

    # Replacing every special character in a single pass means that the &
    # of an entity just inserted is never escaped a second time.
    $text =~ s/([&<>"])/$entity_for{$1}/g;

    return $text;
}
462
463
# Returns a copy of $text with the entities &lt; &gt; and &quot; turned
# back into < > and ". Every & still present afterwards is written as
# &amp;, because a raw & is not allowed even in unescaped XML.
sub unescape_text {
    my ($self, $text) = @_;

    my %char_for = (
        '&lt;'   => '<',
        '&gt;'   => '>',
        '&quot;' => '"',
    );

    $text =~ s/(&lt;|&gt;|&quot;)/$char_for{$1}/g;

    # can't have & in raw form, even in unescaped xml!
    $text =~ s/&/&amp;/g;

    return $text;
}
475
476
4771;
478
479
Note: See TracBrowser for help on using the browser.