root/gsdl/trunk/perllib/plugins/MARCXMLPlugin.pm @ 18327

Revision 18327, 13.5 KB (checked in by ak19, 11 years ago)

Extra parameter to new doc(): the renaming method to be used on the file (base64 or URL encoding).

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# MARCXMLPlugin.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes MARCXML documents. Note that this plugin does no
27# syntax checking (though the XML::Parser module tests for
28# well-formedness).
29
30package MARCXMLPlugin;
31
32use ReadXMLFile;
33use ReadTextFile;
34use marcmapping;
35
36use strict;
37no strict 'refs'; # allow filehandles to be variables and viceversa
38
# Declare the two parent classes at compile time.  ReadXMLFile is listed
# first, so it is searched first during method resolution.
BEGIN {
    @MARCXMLPlugin::ISA = qw(ReadXMLFile ReadTextFile);
}
42
# Plugin-specific arguments.  The only one is the name of the MARC-to-metadata
# mapping file, which init() looks up via util::locate_config_files().
my $arguments = [
    {
        'name' => 'metadata_mapping_file',
        'desc' => '{MARCXMLPlugin.metadata_mapping_file}',
        'type' => 'string',
        'deft' => 'marctodc.txt',
        'reqd' => 'no',
    },
];

# Option block describing this plugin; new() appends it to the shared
# "OptList" and the arguments above to the shared "ArgList".
my $options = {
    'name'     => 'MARCXMLPlugin',
    'desc'     => '{MARCXMLPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
55
# Constructor.
#
# Parameters:
#   $class           - class name being instantiated
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the parent constructors
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's arguments and option block
#
# Returns the object created by ReadXMLFile's constructor, re-blessed into
# $class, with this plugin's per-run state fields initialised.
sub new {
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    # We want to be able to use the textcat methods from ReadTextFile
    # to get the language and encoding.  The object it returns is not kept.
    ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);

    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # Per-run state, built fresh on every call so that the 'marc_mapping'
    # hash is never shared between plugin instances.
    my %initial_state = (
        'content'          => "",
        'xmlcontent'       => "",
        'record_count'     => 1,
        'language'         => "",
        'encoding'         => "",
        'marc_mapping'     => {},
        'current_code'     => "",
        'current_tag'      => "",
        'current_element'  => "",
        'metadata_mapping' => undef,
        'num_processed'    => 0,
        'indent'           => 0,
    );
    foreach my $field (keys %initial_state) {
        $self->{$field} = $initial_state{$field};
    }

    return bless $self, $class;
}
85
86
87
# Returns the fixed string "collection".
# NOTE(review): presumably the root element name the XML base class expects
# for files handled by this plugin -- confirm against ReadXMLFile.
sub get_doctype {
    my ($self) = @_;

    return 'collection';
}
93
94
# Loads the MARC-to-metadata mapping (once) and then delegates to the
# parent class's init().
#
# Parameters (after $self): $verbosity, $outhandle, $failhandle.  All of them
# are forwarded untouched to SUPER::init().
#
# If the mapping file named by $self->{'metadata_mapping_file'} cannot be
# located, an error is printed to both handles and 'metadata_mapping' is left
# undefined; xml_end_tag() then skips metadata mapping.
sub init {
    my $self = shift;
    my ($verbosity, $outhandle, $failhandle) = @_;

    # The mapping file has already been loaded: nothing more to do here.
    if (defined $self->{'metadata_mapping'}) {
        $self->SUPER::init(@_);
        return;
    }

    # Read in the metadata mapping files.
    my $mapping_name = $self->{'metadata_mapping_file'};
    my $mm_files     = util::locate_config_files($mapping_name);

    if (@{$mm_files}) {
        $self->{'metadata_mapping'} =
            marcmapping::parse_marc_metadata_mapping($mm_files, $outhandle);
    }
    else {
        my $msg = join("",
            "MARCXMLPlugin ERROR: Can't locate mapping file \"",
            $mapping_name,
            "\".\n ",
            "    No metadata will be extracted from MARCXML files.\n");

        print $outhandle $msg;
        print $failhandle $msg;

        # We pick up the error later if there is no mapping.
        # If we exit here, then pluginfo.pl will exit too!
        $self->{'metadata_mapping'} = undef;
    }

    $self->SUPER::init(@_);
}
130
# Called for DOCTYPE declarations.  A plugin can die here to bail out when
# the doctype is not meant for it; this plugin accepts every doctype and
# simply returns nothing.
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    return;
}
140
141
# Called at the start of each XML document.
#
# Detects the language and encoding of $self->{'filename'} through
# textcat_get_language_encoding(), stores them on the object, resets the
# per-document counters and the base OID, and emits progress messages
# (to the out handle when verbosity > 1, and to STDERR when 'gli' is set).
sub xml_start_document {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    my $source_path = $self->{'filename'};
    my ($language, $encoding) = $self->textcat_get_language_encoding($source_path);

    @{$self}{'language', 'encoding'} = ($language, $encoding);
    $self->{'element_count'} = 1;
    $self->{'indent'}        = 0;

    my $outhandle = $self->{'outhandle'};
    if ($self->{'verbosity'} > 1) {
        print $outhandle "MARCXMLPlugin: processing $self->{'file'}\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Processing n='$self->{'file'}' p='MARCXMLPlugin'>\n";
    }

    # Reset the base id so the first record of this file assigns a new one.
    $self->{'base_oid'} = undef;
}
165
# Called at the end of each XML document.  Nothing needs to be done here:
# every record is handed to the processor as soon as its end tag is seen.
sub xml_end_document {
    return;
}
169
# Called for every start tag.
#
# Parameters:
#   $self    - the plugin object
#   $expat   - the parser object (unused here)
#   $element - the element name
#
# The raw source text of the tag is read from $_ (the code stores it verbatim
# as the <collection ...> wrapper and appends it to 'xmlcontent').
#
# Behaviour per element:
#   record     - creates a new doc object, attaches the standard metadata
#                (Language, Encoding, Source, SourceSegment, Plugin,
#                FileFormat), bumps the record counters and, for the first
#                record of a file, establishes the base OID.
#   datafield  - remembers its "tag" attribute (e.g. 520) as the current tag.
#   subfield   - remembers its "code" attribute (e.g. a) as the current code.
#   collection - remembers the full start tag so that each record can later
#                be wrapped in it when its MARCXML file is written out.
# Every tag other than <collection> is also appended to the display text
# ('content') and to the raw XML text ('xmlcontent') of the current record.
sub xml_start_tag {
    my $self    = shift;
    my $expat   = shift;
    my $element = shift;

    my $text         = $_;
    my $escaped_text = $self->escape_text($text);

    $self->{'current_element'} = $element;

    # Collect all attributes of this element in a name => value map.
    # Values may be wrapped in either double or single quotes (both are legal
    # XML) and may contain any characters.  A /g loop is used instead of $'
    # because any use of $' slows down every regex in the program on older
    # perls.
    my %attr_map = ();
    while ($text =~ /(\w+)\s*=\s*(["'])(.*?)\2/gs) {
        $attr_map{$1} = $3;
    }

    # Create a new document for each record.
    if ($element eq "record") {
        my $filename = $self->{'filename'};
        my $file     = $self->{'file'};
        my $language = $self->{'language'};
        my $encoding = $self->{'encoding'};

        # Direct method call: "new doc(...)" is ambiguous in a package that
        # defines its own new().
        my $doc_obj     = doc->new($filename, undef, $self->{'file_rename_method'});
        my $top_section = $doc_obj->get_top_section();

        $doc_obj->add_utf8_metadata($top_section, "Language", $language);
        $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);

        # Source metadata is the file name without its directory part.
        my ($filemeta) = $file =~ /([^\\\/]+)$/;
        $self->set_Source_metadata($doc_obj, $filemeta, $encoding);

        $doc_obj->add_utf8_metadata($top_section, "SourceSegment", "$self->{'record_count'}");
        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename);
        }
        $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
        $doc_obj->add_metadata($top_section, "FileFormat", "MARCXML");

        my $outhandle = $self->{'outhandle'};
        print $outhandle "Record $self->{'record_count'}\n" if $self->{'verbosity'} > 1;

        $self->{'record_count'}++;
        $self->{'doc_obj'} = $doc_obj;
        $self->{'num_processed'}++;

        # The first record of a file gets its OID from the parent class; that
        # OID then serves as the base for the per-record OIDs of this file.
        if (!defined $self->{'base_oid'}) {
            $self->SUPER::add_OID($doc_obj);
            $self->{'base_oid'} = $doc_obj->get_OID();
        }
    }

    # Get the marc code, for example 520.
    if ($element eq "datafield") {
        if (defined $attr_map{'tag'} and $attr_map{'tag'} ne "") {
            $self->{'current_tag'} = $attr_map{'tag'};
        }
    }

    # Remember the subfield code, for example the a in 520a.
    if ($element eq "subfield") {
        if (    defined $attr_map{'code'}
            and $attr_map{'code'} ne ""
            and $self->{'current_tag'} ne "")
        {
            $self->{'current_code'} = $attr_map{'code'};
        }
    }

    # A new record restarts the accumulated text; other elements only set the
    # indentation level used for the display text.
    if ($element eq "record") {
        $self->{'indent'}     = 0;
        $self->{'content'}    = "";
        $self->{'xmlcontent'} = "";
    }
    elsif ($element eq "subfield") {
        $self->{'indent'} = 2;
    }
    else {
        $self->{'indent'} = 1;
    }

    if ($element eq "collection") {
        # Remember the full start tag for <collection ...>.  This is needed to
        # wrap around each <record> when generating its associated MARCXML file.
        $self->{'xmlcollectiontag'} = $text;
    }
    else {
        $self->{'content'} .= "<br/>" if ($element ne "record");
        $self->{'content'} .= $self->calculate_indent($self->{'indent'}) . $escaped_text;
        $self->{'xmlcontent'} .= $text;
    }
}
266
267
268
# Called for every end tag.
#
# Parameters:
#   $self    - the plugin object
#   $expat   - the parser object (unused here)
#   $element - the element name
#
# The raw source text of the tag is read from $_.
#
# Behaviour per element:
#   record    - finishes the current document: writes the record, wrapped in
#               the remembered <collection ...> tag, to a temporary MARCXML
#               file and associates it (plus the XSL stylesheet) with the
#               document, adds inherited metadata, assigns the OID, stores
#               the display text and hands the document to the processor.
#   datafield - maps the subfield values gathered for this datafield to
#               metadata using 'metadata_mapping', then clears the gathered
#               values and the current tag.
# Every other end tag is simply appended to the accumulated text.
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my $text         = $_;
    my $escaped_text = $self->escape_text($text);

    if ($element eq "record" and defined $self->{'doc_obj'}) {
        # Process the document.
        my $processor = $self->{'processor'};
        my $doc_obj   = $self->{'doc_obj'};
        $self->{'content'}    .= "<br/>" . $escaped_text;
        $self->{'xmlcontent'} .= $text;

        my $top_section = $doc_obj->get_top_section();

        my $tmp_marcxml_filename = &util::get_tmp_filename("xml");

        # Three-argument open with a lexical handle: the file name can never
        # be mistaken for an open mode, and no global bareword handle is used.
        my $xml_out;
        if (open($xml_out, ">", $tmp_marcxml_filename)) {
            print $xml_out "<?xml-stylesheet type=\"text/xsl\" href=\"MARC21slim2English.xsl\"?>\n";

            # Wrap the single record in the collection element it came from.
            my $xml_content = $self->{'xmlcollectiontag'} . $self->{'xmlcontent'} . "</collection>";
            print $xml_out $xml_content;

            close($xml_out);

            $doc_obj->associate_file($tmp_marcxml_filename, "marcxml.xml", "text/xml", $top_section);

            # Associate the xsl style file for presentation as HTML.
            my $xsl_filename = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "MARC21slim2English.xsl");
            $doc_obj->associate_file($xsl_filename, "MARC21slim2English.xsl", "text/xml", $top_section);
        }
        else {
            my $outhandle = $self->{'outhandle'};
            print $outhandle "Warning: Unable for write out associated MARCXML file $tmp_marcxml_filename\n";
        }

        # Include any metadata passed in from previous plugins.
        # Note that this metadata is associated with the top level section.
        $self->extra_metadata($doc_obj,
                              $doc_obj->get_top_section(),
                              $self->{'metadata'});

        # NOTE(review): 'record_count' was already incremented when the
        # record's start tag was seen, so the suffix used here is one higher
        # than the SourceSegment value.  Left unchanged so existing OIDs
        # stay stable.
        $self->add_OID($doc_obj, $self->{'base_oid'}, $self->{'record_count'});

        $doc_obj->add_utf8_text($doc_obj->get_top_section(), $self->{'content'});
        $processor->process($doc_obj);

        # Clean up.
        $self->{'content'}    = "";
        $self->{'xmlcontent'} = "";
        $self->{'doc_obj'}    = undef;
        return;
    }

    # Map the xmlmarc to gsdl metadata.
    if (    $element eq "datafield"
        and defined $self->{'doc_obj'}
        and defined $self->{'marc_mapping'}
        and defined $self->{'metadata_mapping'})
    {
        my $metadata_mapping = $self->{'metadata_mapping'};
        my $marc_mapping     = $self->{'marc_mapping'};
        my $doc_obj          = $self->{'doc_obj'};
        my $top_section      = $doc_obj->get_top_section();

        foreach my $mapping_key (keys %$metadata_mapping) {

            # Test whether this mapping entry names a subfield.  The field
            # and subfield may be separated by "$", "^" or nothing at all.
            my $marc_field = $mapping_key;
            my $subfield   = undef;
            if ($mapping_key =~ /(\d\d\d)(?:\$|\^)?(\w)/) {
                $marc_field = $1;
                $subfield   = $2;
            }

            my $matched_field = $marc_mapping->{$marc_field};
            next unless defined $matched_field;

            # Look the name up under the key exactly as it appears in the
            # mapping.  Rebuilding the key with a fixed "$" separator loses
            # the name for entries written as "245^a" or "245a".
            my $meta_name  = $metadata_mapping->{$mapping_key};
            my $meta_value = undef;

            if (defined $subfield) {
                $meta_value = $matched_field->{$subfield};

                # The record read in does not have the specified subfield.
                next unless defined $meta_value;
            }
            else {
                # No subfield => concatenate all the values, in subfield-code
                # order, each followed by a space.
                $meta_value = "";
                foreach my $code (sort keys %{$matched_field}) {
                    $meta_value .= $matched_field->{$code} . " ";
                }
            }

            # Escape [ and ].
            $meta_value =~ s/\[/\\\[/g;
            $meta_value =~ s/\]/\\\]/g;

            $doc_obj->add_utf8_metadata($top_section, $meta_name, $meta_value);
        }

        # Clean up.
        $self->{'marc_mapping'} = undef;
        $self->{'current_tag'}  = "";
    }

    if ($element eq "datafield") {
        $self->{'indent'} = 1;
        $self->{'content'} .= "<br/>" . $self->calculate_indent($self->{'indent'}) . $escaped_text;
        $self->{'xmlcontent'} .= $text;
    }
    else {
        $self->{'content'}    .= $escaped_text;
        $self->{'xmlcontent'} .= $text;
    }
}
404
# Assigns an OID to a record's document.
#
# Parameters:
#   $doc_obj       - the document to receive the OID
#   $id            - base identifier shared by all records of the file
#   $record_number - number appended to the base after an "r"
#
# The default OID is "<id>r<record_number>".  When the plugin's OIDtype is
# "assigned" and the document carries a non-empty value for the metadata
# element named by 'OIDmetadata', that value (with all periods removed) is
# used instead; a value consisting only of digits gets a "D" prefix.
sub add_OID {
    my ($self, $doc_obj, $id, $record_number) = @_;

    my $oid = $id . "r" . $record_number;

    if ($self->{'OIDtype'} eq "assigned") {
        my $assigned = $doc_obj->get_metadata_element($doc_obj->get_top_section(),
                                                      $self->{'OIDmetadata'});
        if (defined $assigned && $assigned ne "") {
            # Remove any periods from the assigned identifier.
            ($oid = $assigned) =~ s/\.//g;

            if ($oid =~ /^[\d]*$/) {
                $oid = "D" . $oid;
                print STDERR "OID only contains numbers, adding a D\n";
            }
        }
    }

    $doc_obj->set_OID($oid);
}
423
# Called for character data; the text itself is read from $_.
#
# When the most recently opened element is a subfield whose code has not
# been consumed yet, non-empty text is stored as that subfield's value in
# 'marc_mapping' (for example 520 => { a => "A poem about..." }) and the
# current code is cleared.  The text is always appended to the display text
# (escaped via escape_text) and to the raw XML text (with & escaped).
sub xml_text {
    my ($self, $expat) = @_;

    my $raw_text     = $_;
    my $display_text = $self->escape_text($raw_text);

    # Protect against & in raw text file: can't have & in raw form, even in
    # 'raw' xml text.
    (my $xml_safe_text = $raw_text) =~ s/&/&amp;/g;

    my $in_subfield = ($self->{'current_element'} eq "subfield");
    if ($in_subfield && $self->{'current_code'} ne "" && $raw_text ne "") {
        my ($tag, $code) = @{$self}{'current_tag', 'current_code'};

        $self->{'marc_mapping'}->{$tag}->{$code} .= $raw_text;

        # Clearing the code keeps later text from being attributed to this
        # subfield.
        $self->{'current_code'} = "";
    }

    $self->{'content'}    .= $display_text;
    $self->{'xmlcontent'} .= $xml_safe_text;
}
450
# Returns the HTML indentation for the given nesting level: four
# non-breaking-space entities per level, or an empty string for level 0.
sub calculate_indent {
    my ($self, $num) = @_;

    my $one_level = "&nbsp;&nbsp;&nbsp;&nbsp;";
    my $padding   = "";

    # Count down instead of up; one unit is added per remaining level.
    my $levels_left = $num;
    while ($levels_left > 0) {
        $padding .= $one_level;
        $levels_left--;
    }

    return $padding;
}
463
# Returns a copy of $text with the XML special characters &, <, > and "
# replaced by their entities (&amp;, &lt;, &gt;, &quot;).
sub escape_text {
    my ($self, $text) = @_;

    # Special characters in the xml encoding.  Each character is rewritten
    # exactly once in a single pass, so the & of a freshly produced entity is
    # never escaped a second time.
    my %entity_for = (
        '&' => '&amp;',
        '<' => '&lt;',
        '>' => '&gt;',
        '"' => '&quot;',
    );
    $text =~ s/([&<>"])/$entity_for{$1}/g;

    return $text;
}
474
475
# Returns a copy of $text with the entities &lt;, &gt; and &quot; turned back
# into <, > and ", after which every remaining & is written as &amp;.
# Because of that last step an &amp; already present in the input comes out
# as &amp;amp;.
sub unescape_text {
    my ($self, $text) = @_;

    # Special characters in the xml encoding.
    my %char_for = (
        'lt'   => '<',
        'gt'   => '>',
        'quot' => '"',
    );
    $text =~ s/&(lt|gt|quot);/$char_for{$1}/g;

    # Can't have & in raw form, even in unescaped xml!
    $text =~ s/&/&amp;/g;

    return $text;
}
487
488
4891;
490
491
Note: See TracBrowser for help on using the browser.