root/gsdl/trunk/perllib/plugins/MARCXMLPlugin.pm @ 16693

Revision 16693, 13.0 KB (checked in by kjdon, 12 years ago)

MARCXMLPlugin uses the textcat_get_language_encoding method from ReadTextFile, so it now inherits from ReadTextFile as well.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# MARCXMLPlugin.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes MARCXML documents. Note that this plugin does no
27# syntax checking (though the XML::Parser module tests for
28# well-formedness).
29
30package MARCXMLPlugin;
31
32use ReadXMLFile;
33use ReadTextFile;
34use marcmapping;
35
36use strict;
37no strict 'refs'; # allow filehandles to be variables and viceversa
38
# Set up the inheritance chain at compile time. ReadTextFile is listed so the
# textcat language/encoding methods are reachable from this plugin.
BEGIN {
    @MARCXMLPlugin::ISA = qw(ReadXMLFile ReadTextFile);
}
42
# Plugin-specific arguments: the name of the MARC metadata mapping file.
my $arguments = [
    {
        'name' => "metadata_mapping_file",
        'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
        'type' => "string",
        'deft' => "marctodc.txt",
        'reqd' => "no",
    },
];

# Option record describing this plugin; pushed onto the OptList in new().
my $options = {
    'name'     => "MARCXMLPlugin",
    'desc'     => "{MARCXMLPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
55
# Constructor.
#
# Registers this class in $pluginlist, appends this plugin's argument and
# option descriptions to $hashArgOptLists, builds the object through
# ReadXMLFile and initialises the per-parse state fields.
sub new {
    my $class = shift @_;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    # We want to be able to use the textcat methods from ReadTextFile to get
    # the language and encoding; the object returned here is discarded.
    ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);

    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # Initial values for the state this plugin tracks while parsing.
    my %initial_state = (
        'content'          => "",
        'xmlcontent'       => "",
        'record_count'     => 1,
        'language'         => "",
        'encoding'         => "",
        'marc_mapping'     => {},
        'current_code'     => "",
        'current_tag'      => "",
        'current_element'  => "",
        'metadata_mapping' => undef,
        'num_processed'    => 0,
        'indent'           => 0,
    );
    foreach my $field (keys %initial_state) {
        $self->{$field} = $initial_state{$field};
    }

    return bless $self, $class;
}
85
# Returns the doctype string for this plugin, which is always "collection".
sub get_doctype {
    my ($self) = @_;

    return "collection";
}
91
92
# Loads the MARC metadata mapping (once) and then chains to the parent init.
#
# Arguments after $self are ($verbosity, $outhandle, $failhandle); they are
# passed through unchanged to SUPER::init.
sub init {
    my ($self, @init_args) = @_;
    my ($outhandle, $failhandle) = @init_args[1, 2];

    # The mapping file has already been loaded: only the parent init is needed.
    if (defined $self->{'metadata_mapping'}) {
        $self->SUPER::init(@init_args);
        return;
    }

    # Read in the metadata mapping files.
    my $mm_files = &util::locate_config_files($self->{'metadata_mapping_file'});

    if (scalar(@$mm_files) != 0) {
        $self->{'metadata_mapping'} =
            &marcmapping::parse_marc_metadata_mapping($mm_files, $outhandle);
    }
    else {
        my $msg = "MARCXMLPlugin ERROR: Can't locate mapping file \"" .
            $self->{'metadata_mapping_file'} . "\".\n " .
            "    No marc files can be processed.\n";

        print $outhandle $msg;
        print $failhandle $msg;

        # Deliberately do not exit here: per the original note, exiting would
        # make pluginfo.pl exit too. The mapping is simply left undefined.
        $self->{'metadata_mapping'} = undef;
    }

    return $self->SUPER::init(@init_args);
}
128
129# Called for DOCTYPE declarations - use die to bail out if this doctype
130# is not meant for this plugin
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    # No doctype is rejected here, so there is nothing to check.
    return;
}
138
139
# Called at the start of a document: detects language and encoding for the
# file via textcat, resets the per-document counters and reports progress.
sub xml_start_document {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    my ($language, $encoding) =
        $self->textcat_get_language_encoding($self->{'filename'});

    $self->{'language'}      = $language;
    $self->{'encoding'}      = $encoding;
    $self->{'element_count'} = 1;
    $self->{'indent'}        = 0;

    my $outhandle = $self->{'outhandle'};
    if ($self->{'verbosity'} > 1) {
        print $outhandle "MARCXMLPlugin: processing $self->{'file'}\n";
    }

    # Progress line on STDERR, emitted only when the 'gli' flag is set.
    print STDERR "<Processing n='$self->{'file'}' p='MARCXMLPlugin'>\n" if $self->{'gli'};
}
160
# Nothing needs to be done at the end of a document.
sub xml_end_document {
    return;
}
164
# Handles an opening tag.
#
# Arguments: the plugin object, the expat parser object and the element name.
# The text of the start tag itself is read from $_.
#
# Effects:
#   - <record>     starts a new doc object and resets the accumulated content
#   - <datafield>  remembers the MARC tag (for example 520)
#   - <subfield>   remembers the subfield code (for example a or b)
#   - <collection> remembers the full start tag so each record can later be
#                  wrapped in it when its associated MARCXML file is written
# Every tag other than <collection> is also appended to the HTML-escaped
# 'content' and the raw 'xmlcontent' buffers.
sub xml_start_tag {
    my $self = shift;
    my $expat = shift;
    my $element = shift;

    my $text = $_;
    my $escaped_text = $self->escape_text($_);

    $self->{'current_element'} = $element;

    # Collect the attributes of this element into a name => value map.
    # Both single- and double-quoted values are accepted, and the value may
    # contain any characters other than its own quote character.
    my %attr_map = ();
    while ($text =~ /(\w+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g) {
        $attr_map{$1} = defined $2 ? $2 : $3;
    }

    if ($element eq "record") {
        # Create a new document for each record.
        my $processor = $self->{'processor'};
        my $filename = $self->{'filename'};
        my $language = $self->{'language'};
        my $encoding = $self->{'encoding'};
        my $file = $self->{'file'};

        my $doc_obj = doc->new($filename);
        my $top_section = $doc_obj->get_top_section();
        $doc_obj->set_OIDtype($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
        $doc_obj->add_utf8_metadata($top_section, "Language", $language);
        $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);

        # Source metadata uses the file name without its directory part.
        my ($filemeta) = $file =~ /([^\\\/]+)$/;
        $self->set_Source_metadata($doc_obj, $filemeta, $encoding);
        $doc_obj->add_utf8_metadata($top_section, "SourceSegment", "$self->{'record_count'}");
        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename);
        }
        $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
        $doc_obj->add_metadata($top_section, "FileFormat", "MARCXML");

        my $outhandle = $self->{'outhandle'};
        print $outhandle "Record $self->{'record_count'} - MARCXMLPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;

        $self->{'record_count'}++;
        $self->{'doc_obj'} = $doc_obj;
        $self->{'num_processed'}++;

        # A new record starts with empty buffers and no indentation.
        $self->{'indent'} = 0;
        $self->{'content'} = "";
        $self->{'xmlcontent'} = "";
    }
    elsif ($element eq "subfield") {
        $self->{'indent'} = 2;
    }
    else {
        $self->{'indent'} = 1;
    }

    # Get the MARC code, for example 520.
    if ($element eq "datafield") {
        if (defined $attr_map{'tag'} and $attr_map{'tag'} ne "") {
            $self->{'current_tag'} = $attr_map{'tag'};
        }
    }

    # Remember the subfield code (the a in 520a); only meaningful inside a
    # datafield whose tag is known.
    if ($element eq "subfield") {
        if (defined $attr_map{'code'} and $attr_map{'code'} ne "" and $self->{'current_tag'} ne "") {
            $self->{'current_code'} = $attr_map{'code'};
        }
    }

    if ($element eq "collection") {
        # Remember the full start tag for <collection ...>.
        # This is needed to wrap around each <record> when generating its
        # associated MARCXML file.
        $self->{'xmlcollectiontag'} = $text;
    }
    else {
        $self->{'content'} .= "<br/>" if ($element ne "record");
        $self->{'content'} .= $self->calculate_indent($self->{'indent'}).$escaped_text;
        $self->{'xmlcontent'} .= $text;
    }

    return;
}
257
258
259
# Handles a closing tag.
#
# Arguments: the plugin object, the expat parser object and the element name.
# The text of the end tag itself is read from $_.
#
#   - </record>    writes the record out as an associated MARCXML file,
#                  attaches the XSL stylesheet, adds inherited metadata and
#                  the OID, and hands the document to the processor
#   - </datafield> maps the subfield values collected for this field onto
#                  metadata using the loaded metadata mapping
# All other end tags are simply appended to the content buffers.
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my $text = $_;
    my $escaped_text = $self->escape_text($_);

    if ($element eq "record" and defined $self->{'doc_obj'}) {
        # Process the document.
        my $processor = $self->{'processor'};
        my $doc_obj = $self->{'doc_obj'};
        $self->{'content'} .= "<br/>".$escaped_text;
        $self->{'xmlcontent'} .= $text;

        my $top_section = $doc_obj->get_top_section();

        my $tmp_marcxml_filename = &util::get_tmp_filename("xml");
        # Three-argument open with a lexical handle: the mode cannot be
        # altered by the file name and the handle is not a package global.
        if (open(my $xml_out, ">", $tmp_marcxml_filename)) {

            print $xml_out "<?xml-stylesheet type=\"text/xsl\" href=\"MARC21slim2English.xsl\"?>\n";

            # Wrap the record in the remembered <collection ...> start tag.
            my $xml_content = $self->{'xmlcollectiontag'}.$self->{'xmlcontent'}."</collection>";
            print $xml_out $xml_content;

            close($xml_out);

            $doc_obj->associate_file($tmp_marcxml_filename,"marcxml.xml","text/xml", $top_section);

            # Associate the XSL style file for presentation as HTML.
            my $xsl_filename = &util::filename_cat($ENV{'GSDLHOME'},"etc","MARC21slim2English.xsl");
            $doc_obj->associate_file($xsl_filename,"MARC21slim2English.xsl","text/xml", $top_section);
        }
        else {
            my $outhandle = $self->{'outhandle'};
            print $outhandle "Warning: Unable to write out associated MARCXML file $tmp_marcxml_filename\n";
        }

        # Include any metadata passed in from previous plugins.
        # Note that this metadata is associated with the top level section.
        $self->extra_metadata($doc_obj,
                              $doc_obj->get_top_section(),
                              $self->{'metadata'});

        $self->add_OID($doc_obj, $self->{'record_count'});

        $doc_obj->add_utf8_text($doc_obj->get_top_section(), $self->{'content'});
        $processor->process($doc_obj);

        # Clean up.
        $self->{'content'} = "";
        $self->{'xmlcontent'} = "";
        $self->{'doc_obj'} = undef;
        return;
    }

    # Map the MARCXML field onto gsdl metadata.
    if ($element eq "datafield" and defined $self->{'doc_obj'} and defined $self->{'marc_mapping'}) {
        my $metadata_mapping = $self->{'metadata_mapping'};
        my $marc_mapping = $self->{'marc_mapping'};
        my $doc_obj = $self->{'doc_obj'};

        # No mapping loaded (init could not find the file): nothing to map.
        my @mapping_keys = defined $metadata_mapping ? keys %$metadata_mapping : ();

        foreach my $mapping_key (@mapping_keys) {

            # Test whether this mapping entry names a subfield.
            my $marc_field = $mapping_key;
            my $subfield = undef;
            if ($mapping_key =~ /(\d\d\d)(?:\$|\^)?(\w)/) {
                $marc_field = $1;
                $subfield = $2;
            }

            my $matched_field = $marc_mapping->{$marc_field};
            next unless defined $matched_field;

            # Look the name up under the key exactly as it appears in the
            # mapping, so that "245$a", "245^a" and "245a" all resolve.
            my $meta_name = $metadata_mapping->{$mapping_key};
            my $meta_value = undef;

            if (defined $subfield) {
                # Undefined when the record read in lacks this subfield.
                $meta_value = $matched_field->{$subfield};
            }
            else {
                # No subfield => get all the values, in subfield-code order.
                foreach my $code (sort keys %{$matched_field}) {
                    $meta_value .= $matched_field->{$code} ." ";
                }
            }

            next unless defined $meta_name and defined $meta_value;

            # Escape [ and ].
            $meta_value =~ s/\[/\\\[/g;
            $meta_value =~ s/\]/\\\]/g;
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $meta_name, $meta_value);
        }

        # Clean up ready for the next datafield.
        $self->{'marc_mapping'} = undef;
        $self->{'current_tag'} = "";
    }

    if ($element eq "datafield") {
        $self->{'indent'} = 1;
        $self->{'content'} .= "<br/>".$self->calculate_indent($self->{'indent'}).$escaped_text;
        $self->{'xmlcontent'} .= $text;
    }
    else {
        $self->{'content'} .= $escaped_text;
        $self->{'xmlcontent'} .= $text;
    }

    return;
}
395
396
# Gives the document an OID made of its generated identifier followed by
# "r" and the record number.
sub set_OID {
    my ($self, $doc_obj, $record_number) = @_;

    # Let the document generate its base identifier first ...
    $doc_obj->set_OID();
    my $base_id = $doc_obj->get_OID();

    # ... then top it up with the "r" + record-number suffix.
    $doc_obj->set_OID("${base_id}r${record_number}");
}
408
# Handles character data between tags; the text is read from $_.
#
# When the text belongs to a subfield whose code is known, it is stored in
# marc_mapping under {tag}{code} (for example 520 a => "A poem about...").
# The text is always appended to both content buffers: HTML-escaped for
# 'content', XML-escaped for 'xmlcontent'.
sub xml_text {
    my $self = shift(@_);
    my ($expat) = @_;

    my $text = $_;
    my $escaped_text = $self->escape_text($_);

    # Re-escape the markup characters for the raw XML buffer. A literal &
    # or < cannot appear in XML character data, so leaving either unescaped
    # would make the associated MARCXML file malformed.
    $text =~ s/&/&amp;/g; # this has to be first
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;

    # Store the text of a marc code, for example 520a => A poem about....
    if ($self->{'current_element'} eq "subfield" and $self->{'current_code'} ne "" and $_ ne "") {
        my $current_tag  = $self->{'current_tag'};
        my $current_code = $self->{'current_code'};

        $self->{'marc_mapping'}->{$current_tag}->{$current_code} .= $_;

        # The code is consumed: text after the subfield must not be stored.
        $self->{'current_code'} = "";
    }

    $self->{'content'} .= $escaped_text;
    $self->{'xmlcontent'} .= $text;

    return;
}
435
# Returns the HTML indentation for the given level: four non-breaking
# spaces per level.
sub calculate_indent {
    my ($self, $num) = @_;

    my $padding = "";
    my $levels_left = $num;
    while ($levels_left > 0) {
        $padding .= "&nbsp;&nbsp;&nbsp;&nbsp;";
        $levels_left--;
    }

    return $padding;
}
448
# Returns $text with the characters & < > " replaced by their entities.
sub escape_text {
    my ($self, $text) = @_;

    # Special characters in the xml encoding. A single pass means an &
    # introduced by one replacement is never escaped a second time.
    my %entity_for = (
        '&' => '&amp;',
        '<' => '&lt;',
        '>' => '&gt;',
        '"' => '&quot;',
    );
    $text =~ s/([&<>"])/$entity_for{$1}/g;

    return $text;
}
459
460
# Returns $text with &lt; &gt; and &quot; turned back into < > and ", and
# every remaining & written as &amp;.
# NOTE: because the & step runs last, an existing "&amp;" comes out as
# "&amp;amp;".
sub unescape_text {
    my ($self, $text) = @_;

    # Special characters in the xml encoding, undone in this order.
    my @replacements = (
        [ '&lt;',   '<' ],
        [ '&gt;',   '>' ],
        [ '&quot;', '"' ],
    );
    foreach my $pair (@replacements) {
        my ($entity, $char) = @{$pair};
        $text =~ s/\Q$entity\E/$char/g;
    }

    # Can't have & in raw form, even in unescaped xml!
    $text =~ s/&/&amp;/g;

    return $text;
}
472
473
4741;
475
476
Note: See TracBrowser for help on using the browser.