source: trunk/gsdl/perllib/plugins/MARCXMLPlug.pm@ 13496

Last change on this file since 13496 was 13496, checked in by shaoqun, 17 years ago

make the marc xml record disply nicely

  • Property svn:keywords set to Author Date Id Revision
File size: 10.9 KB
Line 
1###########################################################################
2#
3# MARCXMLPlug.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes MARCXML documents. Note that this plugin does no
27# syntax checking (though the XML::Parser module tests for
28# well-formedness).
29
30package MARCXMLPlug;
31
32use XMLPlug;
33
34use strict;
35no strict 'refs'; # allow filehandles to be variables and viceversa
36
# Set up inheritance at compile time: MARCXMLPlug derives from XMLPlug.
BEGIN {
    @MARCXMLPlug::ISA = qw(XMLPlug);
}
40
# Plugin-specific arguments: a single optional string naming the
# MARC-to-metadata mapping file.
my $arguments = [
    {
        'name' => "metadata_mapping_file",
        'desc' => "{MARCXMLPlug.metadata_mapping_file}",
        'type' => "string",
        'reqd' => "no",
    },
];

# Option record describing this plugin; new() pushes it onto the
# "OptList" entry of the hash it is handed.
my $options = {
    'name'     => "MARCXMLPlug",
    'desc'     => "{MARCXMLPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
52
# Constructor.
#
# Registers this class on the plugin list, appends this plugin's argument
# and option descriptions to the shared ArgList/OptList, delegates object
# construction to XMLPlug, then seeds the per-parse state used by the
# xml_* callbacks below.
sub new {
    my ($class, @ctor_args) = @_;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @ctor_args;

    push(@$pluginlist, $class);

    push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments}) if defined $arguments;
    push(@{ $hashArgOptLists->{"OptList"} }, $options)      if defined $options;

    my $self = XMLPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    # Initial parser state; built per call so each object gets its own
    # fresh 'marc_mapping' hash.
    my %initial_state = (
        'content'          => "",
        'record_count'     => 1,
        'language'         => "",
        'encoding'         => "",
        'marc_mapping'     => {},
        'current_code'     => "",
        'current_tag'      => "",
        'current_element'  => "",
        'metadata_mapping' => undef,
        'num_processed'    => 0,
        'indent'           => 0,
    );
    foreach my $field (keys %initial_state) {
        $self->{$field} = $initial_state{$field};
    }

    return bless $self, $class;
}
77
# Returns the name of the document type handled by this plugin, which is
# always the string "collection".
sub get_doctype {
    my ($self) = @_;

    my $doctype = "collection";
    return $doctype;
}
83
84
# Loads the MARC -> Greenstone metadata mapping file (once) and then chains
# to the parent class's init().
#
# Arguments (after $self): $verbosity, $outhandle, $failhandle. All three
# are forwarded unchanged to SUPER::init.
#
# The mapping file is taken from $self->{'metadata_mapping_file'}; when that
# is unset or empty it defaults to $GSDLHOME/etc/marctodc.txt. Each useful
# line has the form "<marc field> -> <metadata name>", e.g. "245a -> Title".
# Lines starting with '#' and blank lines are ignored; anything else is
# reported as a parse error on $outhandle.
#
# On a missing or unreadable file the error is reported and an empty mapping
# is stored, so the plugin keeps running but maps no metadata.
sub init {
    my $self = shift(@_);
    my ($verbosity, $outhandle, $failhandle) = @_;

    # The mapping file has already been loaded by an earlier call.
    if (defined $self->{'metadata_mapping'}) {
        $self->SUPER::init(@_);
        return;
    }

    my $metadata_mapping = {};

    # Work out which mapping file to read.
    my $mm_file = $self->{'metadata_mapping_file'};
    if (!defined $mm_file or $mm_file eq "") {
        $mm_file = util::filename_cat($ENV{'GSDLHOME'}, "etc", "marctodc.txt");
        $self->{'metadata_mapping_file'} = $mm_file;
    }

    if (!-e $mm_file) {
        # Report the file name itself; the mapping hash is still undefined
        # at this point and must not be interpolated into the message.
        my $msg = "MARCXMLPlug ERROR: Can't locate mapping file \"" .
            $mm_file . "\".\n" .
            " No marc files can be processed.\n";

        print $outhandle $msg;
        print $failhandle $msg;
        # Do not exit here: that would take pluginfo.pl down as well.
    }
    elsif (open(my $mm_handle, '<', $mm_file)) {
        my $line_number = 1;
        while (defined(my $line = <$mm_handle>)) {
            chomp $line;
            if ($line =~ m/^(\d+\w?)\s*->\s*([\w\^]+)$/) {
                my ($marc_info, $gsdl_info) = ($1, $2);
                $metadata_mapping->{$marc_info} = $gsdl_info;
            }
            elsif ($line !~ m/^\#/        # allow comments (# in first column)
                   && $line !~ m/^\s*$/)  # allow blank lines
            {
                print $outhandle "Parse error on line $line_number of $mm_file:\n";
                print $outhandle " \"$line\"\n";
            }
            $line_number++;
        }
        close($mm_handle);
    }
    else {
        print STDERR "Unable to open $mm_file: $!\n";
    }

    $self->{'metadata_mapping'} = $metadata_mapping;

    $self->SUPER::init(@_);
}
151
# DOCTYPE callback. A plugin may die() here to reject a document whose
# doctype it does not handle; this plugin accepts every doctype and so
# does nothing.
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    return;
}
161
162
# Start-of-document callback.
#
# Detects the language and encoding of the current file via
# textcat_get_language_encoding(), resets the per-document counters, and
# announces the file on the output handle (verbosity > 1) and, when running
# under GLI, on STDERR.
sub xml_start_document {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    my $source_path = $self->{'filename'};
    my ($language, $encoding) = $self->textcat_get_language_encoding($source_path);

    $self->{'language'}      = $language;
    $self->{'encoding'}      = $encoding;
    $self->{'element_count'} = 1;
    $self->{'indent'}        = 0;

    my $outhandle = $self->{'outhandle'};
    if ($self->{'verbosity'} > 1) {
        print $outhandle "MARCXMLPlug: processing $self->{'file'}\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Processing n='$self->{'file'}' p='MARCXMLPlug'>\n";
    }
}
183
# End-of-document callback. Nothing to do here: each record is handed to
# the processor as soon as its closing tag is seen (see xml_end_tag).
sub xml_end_document {
    return;
}
187
# Start-tag callback. $_ holds the raw text of the opening tag.
#
# - <record>    : creates a fresh doc object, stamps it with the standard
#                 metadata and an OID derived from the record number.
# - <datafield> : remembers the MARC tag (e.g. 520) as 'current_tag'.
# - <subfield>  : remembers the subfield code (e.g. a) as 'current_code',
#                 but only while inside a datafield with a known tag.
#
# Every tag except <collection> is also appended, escaped and indented, to
# the displayable 'content' of the record.
sub xml_start_tag {
    my ($self, $expat, $element) = @_;

    my $raw_tag = $_;
    my $text    = $self->escape_text($raw_tag);

    $self->{'current_element'} = $element;

    # Collect the attributes of this element as name => value. Only values
    # made up entirely of word characters are recognised.
    my %attr_map;
    while ($raw_tag =~ /(\w+)=\"(\w+)\"/g) {
        $attr_map{$1} = $2;
    }

    my $processor = $self->{'processor'};

    if ($element eq "record") {
        # Each record becomes its own document.
        my $filename = $self->{'filename'};
        my $file     = $self->{'file'};

        my $doc_obj = doc->new($filename);
        $doc_obj->set_OIDtype($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $self->{'language'});
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $self->{'encoding'});

        # Source is the file name without any leading directory part.
        my ($filemeta) = $file =~ /([^\\\/]+)$/;
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$self->{'record_count'}");

        $self->associate_cover_image($doc_obj, $filename) if $self->{'cover_image'};

        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
        $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML");

        # Let the doc compute its base OID, then make it unique per record.
        $doc_obj->set_OID();
        $self->set_OID($doc_obj, $doc_obj->get_OID(), $self->{'record_count'});

        if ($self->{'verbosity'} > 1) {
            my $outhandle = $self->{'outhandle'};
            print $outhandle "Record $self->{'record_count'} - MARCXMLPlug: processing $self->{'file'}\n";
        }

        $self->{'record_count'}++;
        $self->{'doc_obj'} = $doc_obj;
        $self->{'num_processed'}++;
    }
    elsif ($element eq "datafield") {
        # The MARC code, for example 520.
        my $tag = $attr_map{'tag'};
        $self->{'current_tag'} = $tag if defined $tag and $tag ne "";
    }
    elsif ($element eq "subfield") {
        # The subfield code, for example the "a" of 520a.
        my $code = $attr_map{'code'};
        if (defined $code and $code ne "" and $self->{'current_tag'} ne "") {
            $self->{'current_code'} = $code;
        }
    }

    # Display depth: record = 0, subfield = 2, everything else = 1.
    $self->{'indent'} = $element eq "record"   ? 0
                      : $element eq "subfield" ? 2
                      :                          1;

    unless ($element eq "collection") {
        $self->{'content'} .= "<br/>" . $self->calculate_indent($self->{'indent'}) . $text;
    }
}
271
272
273
# End-tag callback. $_ holds the raw text of the closing tag.
#
# - </record>    : stores the accumulated display text in the document,
#                  hands the document to the processor and resets state.
# - </datafield> : converts the subfield values collected by xml_text()
#                  for this datafield into Greenstone metadata according
#                  to the mapping loaded by init().
#
# Mapping keys are either a bare MARC tag ("520": all subfield values are
# joined with spaces) or a tag plus a one-character subfield code ("245a":
# only that subfield's value is used). 'marc_mapping' is keyed by the bare
# tag and holds a flat (code, value, code, value, ...) list.
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;
    my $text = $self->escape_text($_);

    if ($element eq "record" and defined $self->{'doc_obj'}) {
        # Process the finished document.
        my $processor = $self->{'processor'};
        my $doc_obj   = $self->{'doc_obj'};
        $self->{'content'} .= "<br/>" . $text;

        $doc_obj->add_utf8_text($doc_obj->get_top_section(), $self->{'content'});
        $processor->process($doc_obj);

        # Clean up ready for the next record.
        $self->{'content'} = "";
        $self->{'doc_obj'} = undef;
        return;
    }

    # Map the MARC XML values of this datafield to Greenstone metadata.
    if ($element eq "datafield" and defined $self->{'doc_obj'} and defined $self->{'marc_mapping'}) {
        # Tolerate init() not having been run: treat as "no mappings".
        my $metadata_mapping = $self->{'metadata_mapping'} || {};
        my $marc_mapping     = $self->{'marc_mapping'};
        my $doc_obj          = $self->{'doc_obj'};

        foreach my $mapping_key (keys %$metadata_mapping) {
            # Split a key such as "245a" into tag "245" and subfield "a";
            # the collected values are stored under the tag alone.
            my ($marc_tag, $subfield) = ($mapping_key, undef);
            if ($mapping_key =~ /^(\d\d\d)(\w)$/) {
                ($marc_tag, $subfield) = ($1, $2);
            }

            my $matched_field = $marc_mapping->{$marc_tag};
            next unless defined $matched_field;

            my $meta_name = $metadata_mapping->{$mapping_key};
            my $meta_value;

            if (defined $subfield) {
                # Turn the flat (code, value, ...) list into code => value.
                my %value_of_code = (@$matched_field);
                $meta_value = $value_of_code{$subfield};
            }
            else {
                # Whole-field mapping: concatenate every value (the odd
                # positions of the list), each followed by a space.
                my $position = 0;
                foreach my $item (@$matched_field) {
                    $meta_value .= $item . " " if $position % 2 != 0;
                    $position++;
                }
            }

            # This datafield does not carry the requested subfield.
            next unless defined $meta_value;

            # Escape [ and ].
            $meta_value =~ s/\[/\\\[/g;
            $meta_value =~ s/\]/\\\]/g;
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $meta_name, $meta_value);
        }

        # Clean up ready for the next datafield.
        $self->{'marc_mapping'} = undef;
        $self->{'current_tag'}  = "";
    }

    if ($element eq "datafield") {
        $self->{'indent'} = 1;
        $self->{'content'} .= "<br/>" . $self->calculate_indent($self->{'indent'}) . $text;
    }
    else {
        $self->{'content'} .= $text;
    }
}
352
353
# Gives a record's document a unique identifier by appending "r" and the
# record number to the supplied base id.
sub set_OID {
    my ($self, $doc_obj, $id, $record_number) = @_;

    my $record_oid = join("r", $id, $record_number);
    $doc_obj->set_OID($record_oid);
}
360
# Character-data callback. $_ holds the text just read.
#
# While inside a subfield whose code has not yet been consumed, the text is
# recorded against the current MARC tag as a (code, value) pair, e.g. tag
# 520 gets ('a', 'A poem about....'). The code is cleared afterwards, so
# only the first non-empty text chunk after a subfield start tag is kept.
# All text is also appended, escaped, to the record's display content.
sub xml_text {
    my ($self, $expat) = @_;

    my $in_subfield = $self->{'current_element'} eq "subfield";
    my $code        = $self->{'current_code'};

    if ($in_subfield and $code ne "" and $_ ne "") {
        my $tag = $self->{'current_tag'};
        push(@{ $self->{'marc_mapping'}->{$tag} }, $code, $_);
        $self->{'current_code'} = "";
    }

    $self->{'content'} .= $self->escape_text($_);
}
377
# Builds the HTML indentation prefix for a nesting depth: four
# non-breaking spaces per level, or the empty string for depth 0.
sub calculate_indent {
    my ($self, $num) = @_;

    my $indent      = "";
    my $levels_left = $num;

    while ($levels_left > 0) {
        $indent .= "&nbsp;&nbsp;&nbsp;&nbsp;";
        $levels_left--;
    }

    return $indent;
}
390
# Returns a copy of $text with the characters that are special in XML
# (&, <, > and the double quote) replaced by their entity references.
sub escape_text {
    my ($self, $text) = @_;

    # A single pass never re-escapes the '&' of an entity it has just
    # produced, which is what doing '&' first achieves in a sequential form.
    my %entity_for = (
        '&' => '&amp;',
        '<' => '&lt;',
        '>' => '&gt;',
        '"' => '&quot;',
    );
    $text =~ s/([&<>\"])/$entity_for{$1}/g;

    return $text;
}
401
402
4031;
404
405
Note: See TracBrowser for help on using the repository browser.