source: trunk/gsdl/perllib/plugins/MARCXMLPlug.pm@ 13486

Last change on this file since 13486 was 13486, checked in by kjdon, 17 years ago

XMLMARCPlug renamed to MARCXMLPlug

  • Property svn:keywords set to Author Date Id Revision
File size: 9.8 KB
Line 
1###########################################################################
2#
3# MARCXMLPlug.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes MARCXML documents. Note that this plugin does no
27# syntax checking (though the XML::Parser module tests for
28# well-formedness).
29
30package MARCXMLPlug;
31
32use XMLPlug;
33
34use strict;
35no strict 'refs'; # allow filehandles to be variables and viceversa
36
# Set up inheritance at compile time: MARCXMLPlug derives from XMLPlug.
BEGIN {
    @MARCXMLPlug::ISA = qw(XMLPlug);
}
40
# Argument descriptors contributed by this plugin: a single optional string
# argument, metadata_mapping_file.
my $arguments = [
    {
        'name' => 'metadata_mapping_file',
        'desc' => '{MARCXMLPlug.metadata_mapping_file}',
        'type' => 'string',
        'reqd' => 'no',
    },
];

# Description record for this plugin; its 'args' entry refers to the
# argument list defined above.
my $options = {
    'name'     => 'MARCXMLPlug',
    'desc'     => '{MARCXMLPlug.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
52
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to XMLPlug->new
#   $hashArgOptLists - hash ref; this plugin's argument descriptors are
#                      appended to its "ArgList" entry and its description
#                      record to its "OptList" entry
#
# Returns the object built by XMLPlug->new, with this plugin's per-file
# parsing state initialised, blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}) if defined $arguments;
    push(@{$hashArgOptLists->{"OptList"}}, $options) if defined $options;

    # Explicit method-call syntax.  The indirect-object form
    # "new XMLPlug(...)" is parsed by heuristics and is especially fragile
    # inside a package that defines its own new().
    my $self = XMLPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    $self->{'content'} = "";            # raw XML text of the current record
    $self->{'record_count'} = 1;        # number given to the next record
    $self->{'language'} = "";
    $self->{'encoding'} = "";
    $self->{'marc_mapping'} = {};       # tag => [code, value, ...] for the current datafield
    $self->{'current_code'} = "";
    $self->{'current_tag'} = "";
    $self->{'current_element'} = "";
    $self->{'metadata_mapping'} = undef; # filled in by init()
    $self->{'num_processed'} = 0;

    return bless $self, $class;
}
76
# Returns the fixed string "collection".
# NOTE(review): presumably the root element name the parent plugin expects
# for documents handled here -- confirm against XMLPlug.
sub get_doctype {
    my ($self) = @_;

    return 'collection';
}
82
83
# Loads the MARC-to-metadata mapping file (on the first call only) and then
# delegates to the parent class's init().
#
# Arguments (after $self): $verbosity, $outhandle, $failhandle.  All
# arguments are passed on unchanged to SUPER::init().
#
# The mapping file is taken from $self->{'metadata_mapping_file'}; when that
# is unset or empty it defaults to $GSDLHOME/etc/marctodc.txt.  Each entry is
# a line of the form "<digits><optional word char> -> <name>", e.g.
# "245a -> Title".  Lines starting with '#' and blank lines are ignored; any
# other line is reported on $outhandle as a parse error and skipped.
#
# If the file is missing or cannot be opened, an error is printed and the
# mapping is left empty, so no MARC fields will be mapped to metadata.
sub init {
    my $self = shift (@_);
    my ($verbosity, $outhandle, $failhandle) = @_;

    # The mapping has already been loaded by an earlier call.
    if (defined $self->{'metadata_mapping'}) {
        $self->SUPER::init(@_);
        return;
    }

    my $metadata_mapping = {};

    # Work out which mapping file to read.
    my $mm_file = $self->{'metadata_mapping_file'};
    if (!defined $mm_file or $mm_file eq "") {
        $mm_file = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "marctodc.txt");
        $self->{'metadata_mapping_file'} = $mm_file;
    }

    if (!-e $mm_file) {
        # Report the file name that was actually looked for.
        my $msg = "MARCXMLPlug ERROR: Can't locate mapping file \"" .
            $mm_file . "\".\n This file should be at $mm_file\n" .
            " No marc files can be processed.\n";

        print $outhandle $msg;
        print $failhandle $msg;
        # Deliberately no exit/die here: exiting would take the calling
        # script down as well.
    }
    elsif (open(my $mm_in, '<', $mm_file)) {
        my $line_number = 1;
        while (defined(my $line = <$mm_in>)) {
            # Strip the line ending, including the CR of a CRLF file.
            $line =~ s/[\r\n]+\z//;

            if ($line =~ m/^(\d+\w?)\s*->\s*([\w\^]+)$/) {
                my ($marc_info, $gsdl_info) = ($1, $2);
                $metadata_mapping->{$marc_info} = $gsdl_info;
            }
            elsif ($line !~ m/^\#/       # allow comments (# in first column)
                   && $line !~ m/^\s*$/) # allow blank lines
            {
                print $outhandle "Parse error on line $line_number of $mm_file:\n";
                print $outhandle " \"$line\"\n";
            }
            $line_number++;
        }
        close($mm_in);
    }
    else {
        print STDERR "Unable to open $mm_file: $!\n";
    }

    $self->{'metadata_mapping'} = $metadata_mapping;

    $self->SUPER::init(@_);
}
150
# Callback for DOCTYPE declarations.  A plugin can die here to reject a
# doctype that is not meant for it; this plugin rejects nothing and simply
# returns.
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    return;
}
160
161
# Callback run at the start of each XML file.  Records the language and
# encoding reported by textcat_get_language_encoding() for the file, resets
# the element counter to 1 and emits progress messages.
sub xml_start_document {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    # Language first, then encoding, exactly as returned by the helper.
    @{$self}{'language', 'encoding'} =
        $self->textcat_get_language_encoding($self->{'filename'});
    $self->{'element_count'} = 1;

    if ($self->{'verbosity'} > 1) {
        my $out = $self->{'outhandle'};
        print $out "MARCXMLPlug: processing $self->{'file'}\n";
    }

    # Machine-readable progress line, only when the 'gli' flag is set.
    print STDERR "<Processing n='$self->{'file'}' p='MARCXMLPlug'>\n" if $self->{'gli'};

}
182
# Callback run at the end of each XML file.  Nothing to do: every record is
# handed to the processor as soon as its closing tag is seen.
sub xml_end_document {
    return;
}
186
# Start-tag callback: records parser state for the element being opened and,
# for a <record> element, creates the document object that will hold it.
#
# Arguments (after $self): $expat (unused here) and $element, the element
# name.  The raw text of the tag is read from $_.
# NOTE(review): that matches the handler contract of XML::Parser's Stream
# style -- confirm that XMLPlug drives the parser that way.
#
# State written:
#   current_element - always set to $element
#   content         - raw tag text appended, except for <collection>
#   current_tag     - set from the tag="..." attribute of a <datafield>
#   current_code    - set from the code="..." attribute of a <subfield>
#                     (only while a current_tag is known)
#   doc_obj, record_count, num_processed - updated for each <record>
sub xml_start_tag {
    my $self = shift;
    my $expat = shift;
    my $element = shift;

    $self->{'current_element'} = $element;

    # Keep the raw markup of everything except the <collection> wrapper.
    if ($element ne "collection") {
        $self->{'content'} .= $_;
    }

    # Collect the attributes of this element as name => value.  Only values
    # made up entirely of word characters are recognised.  The scan uses /g
    # on a copy of the tag text instead of re-matching against $'.
    my %attr_map = ();
    my $attrstring = $_;
    while ($attrstring =~ /(\w+)="(\w+)"/g) {
        $attr_map{$1} = $2;
    }

    # Create a new document for each record.
    if ($element eq "record") {
        my $processor = $self->{'processor'};
        my $filename = $self->{'filename'};
        my $language = $self->{'language'};
        my $encoding = $self->{'encoding'};
        my $file = $self->{'file'};

        # Explicit method call rather than the indirect-object form
        # "new doc(...)", whose parse depends on heuristics.
        my $doc_obj = doc->new($filename);
        $doc_obj->set_OIDtype($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);

        # Source is the file name without any leading directory part.
        my ($filemeta) = $file =~ /([^\\\/]+)$/;
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$self->{'record_count'}");
        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename);
        }
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
        $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML");

        # Let the document compute its identifier, then extend it with the
        # record number so each record in the file gets its own identifier.
        $doc_obj->set_OID();
        $self->set_OID($doc_obj, $doc_obj->get_OID(), $self->{'record_count'});

        my $outhandle = $self->{'outhandle'};
        print $outhandle "Record $self->{'record_count'} - MARCXMLPlug: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;

        $self->{'record_count'}++;
        $self->{'doc_obj'} = $doc_obj;
        $self->{'num_processed'}++;
    }

    # Remember the MARC tag of the datafield, for example 520.
    if ($element eq "datafield") {
        if (defined $attr_map{'tag'} and $attr_map{'tag'} ne "") {
            $self->{'current_tag'} = $attr_map{'tag'};
        }
    }

    # Remember the subfield code, for example the "a" of 520a.
    if ($element eq "subfield") {
        if (defined $attr_map{'code'} and $attr_map{'code'} ne "" and $self->{'current_tag'} ne "") {
            $self->{'current_code'} = $attr_map{'code'};
        }
    }
}
255
256
257
# End-tag callback.
#
# Arguments (after $self): $expat (unused here) and $element, the element
# name.  The raw text of the closing tag is read from $_ and appended to the
# accumulated content.
#
# On </record>:    the accumulated content becomes the text of the current
#                  document, which is handed to the processor; content and
#                  doc_obj are then cleared.
# On </datafield>: the subfield values captured for the field are turned
#                  into document metadata according to metadata_mapping;
#                  marc_mapping and current_tag are then cleared.
#
# A mapping key is either a bare tag ("245": all subfield values of the
# field, each followed by a space) or a three-digit tag plus one subfield
# code ("245a": the value of that subfield only; if the code occurs more
# than once in the field, the last occurrence is used).
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    $self->{'content'} .= $_;

    if ($element eq "record" and defined $self->{'doc_obj'}) {
        # process the document
        my $processor = $self->{'processor'};
        my $doc_obj = $self->{'doc_obj'};
        $doc_obj->add_utf8_text($doc_obj->get_top_section(), $self->{'content'});
        $processor->process($doc_obj);

        # clean up
        $self->{'content'} = "";
        $self->{'doc_obj'} = undef;
    }

    # Map the captured MARC subfields to metadata.
    if ($element eq "datafield" and defined $self->{'doc_obj'} and defined $self->{'marc_mapping'}) {
        my $metadata_mapping = $self->{'metadata_mapping'} || {};
        my $marc_mapping = $self->{'marc_mapping'};
        my $doc_obj = $self->{'doc_obj'};

        foreach my $marc_field (keys %$metadata_mapping) {
            # Split a key such as "245a" into tag and subfield code.  The
            # captured data is stored under the bare tag, so the lookup has
            # to use the tag alone.
            my ($tag, $subfield) = ($marc_field, undef);
            if ($marc_field =~ /^(\d\d\d)(\w)$/) {
                ($tag, $subfield) = ($1, $2);
            }

            # Flat list of (code, value, code, value, ...) for this tag.
            my $code_value_pairs = $marc_mapping->{$tag};
            next unless defined $code_value_pairs;

            my $meta_name = $metadata_mapping->{$marc_field};
            my $meta_value;

            if (defined $subfield) {
                # The pair list itself initialises the hash; wrapping it in
                # braces would assign a single hash reference instead.
                my %value_of_code = @$code_value_pairs;
                $meta_value = $value_of_code{$subfield};

                # This field does not carry the requested subfield.
                next unless defined $meta_value;
            }
            else {
                # All values: the odd positions of the pair list, each
                # followed by a single space.
                $meta_value = join("",
                                   map { $code_value_pairs->[$_] . " " }
                                   grep { $_ % 2 } 0 .. $#$code_value_pairs);
            }

            # escape [ and ]
            $meta_value =~ s/\[/\\\[/g;
            $meta_value =~ s/\]/\\\]/g;
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $meta_name, $meta_value);
        }

        # clean up
        $self->{'marc_mapping'} = undef;
        $self->{'current_tag'} = "";
    }
}
325
326
# Gives a record's document its own identifier: the supplied $id, the
# letter "r" and the record number, joined together.
sub set_OID {
    my ($self, $doc_obj, $id, $record_number) = @_;

    my $record_oid = join("r", $id, $record_number);
    $doc_obj->set_OID($record_oid);
}
333
# Text callback.  The text itself is read from $_.
#
# When the text belongs to a subfield whose code is known, the pair
# (code, text) is appended to the list kept for the current tag in
# marc_mapping -- for example 520 => ['a', 'A poem about....'] -- and the
# code is cleared so it is not reused.  In every case the text is appended
# to the accumulated content.
sub xml_text {
    my ($self, $expat) = @_;

    my $is_coded_subfield_text = ($self->{'current_element'} eq "subfield"
                                  && $self->{'current_code'} ne ""
                                  && $_ ne "");

    if ($is_coded_subfield_text) {
        my $tag = $self->{'current_tag'};
        push(@{$self->{'marc_mapping'}->{$tag}}, $self->{'current_code'}, $_);
        $self->{'current_code'} = "";
    }

    $self->{'content'} .= $_;

}
350
351
3521;
353
354
Note: See TracBrowser for help on using the repository browser.