source: gsdl/trunk/perllib/plugins/MARCPlugin.pm@ 18402

Last change on this file since 18402 was 18402, checked in by mdewsnip, 15 years ago

Fixed inconsistent and incorrect escaping of square brackets in metadata values, and also tidied up some ghastly code while I was there.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.9 KB
Line 
1###########################################################################
2#
3# MARCPlugin.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlugin;
28
29use SplitTextFile;
30
31use unicode;
32use util;
33use marcmapping;
34
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38sub BEGIN {
39 @MARCPlugin::ISA = ('SplitTextFile');
40 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
41}
42
43my $arguments =
44 [ { 'name' => "metadata_mapping",
45 'desc' => "{common.deprecated} {MARCPlugin.metadata_mapping}",
46 'type' => "string",
47 'deft' => "",
48 'hiddengli' => "yes", # deprecated in favour of 'metadata_mapping_file'
49 'reqd' => "no" },
50 { 'name' => "metadata_mapping_file",
51 'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
52 'type' => "string",
53 'deft' => "marctodc.txt",
54 'reqd' => "no" },
55 { 'name' => "process_exp",
56 'desc' => "{BasePlugin.process_exp}",
57 'type' => "regexp",
58 'reqd' => "no",
59 'deft' => &get_default_process_exp() },
60 { 'name' => "split_exp",
61 'desc' => "{SplitTextFile.split_exp}",
62 'type' => "regexp",
63 'reqd' => "no",
64 'deft' => &get_default_split_exp() }
65 ];
66
67my $options = { 'name' => "MARCPlugin",
68 'desc' => "{MARCPlugin.desc}",
69 'abstract' => "no",
70 'inherits' => "yes",
71 'explodes' => "yes",
72 'args' => $arguments };
73
74require MARC::Record;
75require MARC::Batch;
76#use MARC::Record;
77#use MARC::Batch;
78
79sub new {
80 my ($class) = shift (@_);
81 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
82 push(@$pluginlist, $class);
83
84 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
85 push(@{$hashArgOptLists->{"OptList"}},$options);
86
87 my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
88
89 if ($self->{'info_only'}) {
90 # don't worry about the options
91 return bless $self, $class;
92 }
93 # 'metadata_mapping' was used in two ways in the plugin: as a plugin
94 # option (filename) and as a datastructure to represent the mapping.
95 # In MARXXMLPlug (written later) the two are separated: filename is
96 # represented through 'metadata_mapping_file' and the data-structure
97 # mapping left as 'metadata_mapping'
98 # 'metadata_mapping' still present (but hidden in GLI) for
99 # backwards compatibility, but 'metadata_mapping_file' is used by
100 # preference
101
102 if ($self->{'metadata_mapping'} ne "") {
103 print STDERR "MARCPlugin WARNING:: the metadata_mapping option is set but has been deprecated. Please use metadata_mapping_file option instead\n";
104 # If the old version is set, use it.
105 $self->{'metadata_mapping_file'} = $self->{'metadata_mapping'};
106 }
107 $self->{'metadata_mapping'} = undef;
108 $self->{'type'} = "";
109 return bless $self, $class;
110}
111
112sub init {
113 my $self = shift (@_);
114 my ($verbosity, $outhandle, $failhandle) = @_;
115
116 ## the mapping file has already been loaded
117 if (defined $self->{'metadata_mapping'} ){
118 $self->SUPER::init(@_);
119 return;
120 }
121
122 # read in the metadata mapping files
123 my $mm_files = &util::locate_config_files($self->{'metadata_mapping_file'});
124 if (scalar(@$mm_files)==0)
125 {
126 my $msg = "MARCPlugin ERROR: Can't locate mapping file \"" .
127 $self->{'metadata_mapping_file'} . "\".\n " .
128 " No metadata will be extracted from MARC files.\n";
129
130 print $outhandle $msg;
131 print $failhandle $msg;
132 $self->{'metadata_mapping'} = undef;
133 # We pick up the error in process() if there is no $mm_file
134 # If we exit here, then pluginfo.pl will exit too!
135 }
136 else {
137 $self->{'metadata_mapping'} = &marcmapping::parse_marc_metadata_mapping($mm_files, $outhandle);
138 }
139
140 ##map { print STDERR $_."=>".$self->{'metadata_mapping'}->{$_}."\n"; } keys %{$self->{'metadata_mapping'}};
141
142 $self->SUPER::init(@_);
143}
144
145
146
147sub get_default_process_exp {
148 my $self = shift (@_);
149
150 return q^(?i)(\.marc)$^;
151}
152
153
154sub get_default_split_exp {
155 # \r\n for msdos eol, \n for unix
156 return q^\r?\n\s*\r?\n|\[\w+\]Record type: USmarc^;
157}
158
159
160
161# The bulk of this function is based on read_line in multiread.pm
162# Unable to use read_line original because it expects to get its input
163# from a file. Here the line to be converted is passed in as a string
164
165sub to_utf8
166{
167 my $self = shift (@_);
168 my ($encoding, $line) = @_;
169
170 if ($encoding eq "utf8") {
171 # nothing needs to be done
172 return $line;
173 }
174
175 if ($encoding eq "iso_8859_1") {
176 # we'll use ascii2utf8() for this as it's faster than going
177 # through convert2unicode()
178 return &unicode::ascii2utf8 (\$line);
179 }
180
181 # everything else uses unicode::convert2unicode
182 return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
183}
184
185
186sub read_file {
187 my $self = shift (@_);
188 my ($filename, $encoding, $language, $textref) = @_;
189
190 my $outhandle = $self->{'outhandle'};
191
192 if (! defined($self->{'metadata_mapping'}))
193 {
194 # print a warning
195 print $outhandle "MARCPlugin: no metadata mapping file! Can't extract metadata from $filename\n";
196 }
197
198 $self->{'readfile_encoding'}->{$filename} = $encoding;
199
200
201 if (!-r $filename)
202 {
203 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
204 return;
205 }
206
207 ##handle ascii marc
208 #test whether this is ascii marc file
209 if (open (FILE, $filename)) {
210 while (defined (my $line = <FILE>)) {
211 $$textref .= $line;
212 if ($line =~ /\[\w+\]Record type:/){
213 undef $/;
214 $$textref .= <FILE>;
215 $/ = "\n";
216 $self->{'type'} = "ascii";
217 close FILE;
218 return;
219 }
220 }
221 close FILE;
222 }
223
224
225 $$textref = "";
226 my @marc_entries = ();
227
228 my $batch = new MARC::Batch( 'USMARC', $filename );
229 while ( my $marc = $batch->next )
230 {
231 push(@marc_entries,$marc);
232 $$textref .= $marc->as_formatted();
233 $$textref .= "\n\n"; # for SplitTextFile - see default_split_exp above...
234 }
235
236 $self->{'marc_entries'}->{$filename} = \@marc_entries;
237}
238
239
240
241# do plugin specific processing of doc_obj
242# This gets done for each record found by SplitTextFile in marc files.
243sub process {
244 my $self = shift (@_);
245 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
246
247 my $outhandle = $self->{'outhandle'};
248 my $filename = &util::filename_cat($base_dir, $file);
249
250 my $cursection = $doc_obj->get_top_section();
251
252 # Add fileFormat as the metadata
253 $doc_obj->add_metadata($cursection, "FileFormat", "MARC");
254
255 my $marc_entries = $self->{'marc_entries'}->{$filename};
256 my $marc = shift(@$marc_entries);
257
258 my $encoding = $self->{'readfile_encoding'}->{$filename};
259
260 if (defined ($self->{'metadata_mapping'}) ) {
261 if ($self->{'type'} ne "ascii" ){
262 $self->extract_metadata ($marc, $metadata, $encoding, $doc_obj, $cursection);
263 }
264 else{
265 $self->extract_ascii_metadata ($$textref,$metadata,$doc_obj, $cursection);
266 }
267 }
268
269 # add spaces after the sub-field markers, for word boundaries
270 $$textref =~ s/^(.{6} _\w)/$1 /gm;
271
272 # add text to document object
273 $$textref =~ s/</&lt;/g;
274 $$textref =~ s/>/&gt;/g;
275
276 $$textref = $self->to_utf8($encoding,$$textref);
277
278 print $outhandle " Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
279 if $self->{'verbosity'} > 2;
280
281 # line wrapping
282 $$textref = &wrap_text_in_columns($$textref, 64);
283 $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
284
285 $doc_obj->add_utf8_text($cursection, $$textref);
286
287 return 1;
288}
289
290sub wrap_text_in_columns
291{
292 my ($text, $columnwidth) = @_;
293 my $newtext = "";
294 my $linelength = 0;
295
296 # Break the text into words, and display one at a time
297 my @words = split(/ /, $text);
298
299 foreach my $word (@words) {
300 # If printing this word would exceed the column end, start a new line
301 if (($linelength + length($word)) >= $columnwidth) {
302 $newtext .= "\n";
303 $linelength = 0;
304 }
305
306 # Write the word
307 $newtext .= " $word";
308 if ($word =~ /\n/) {
309 $linelength = 0;
310 } else {
311 $linelength = $linelength + length(" $word");
312 }
313 }
314
315 $newtext .= "\n";
316 return $newtext;
317}
318
319
320sub extract_metadata
321{
322 my $self = shift (@_);
323
324 my ($marc, $metadata, $encoding, $doc_obj, $section) = @_;
325 my $outhandle = $self->{'outhandle'};
326
327 if (!defined $marc){
328 return;
329 }
330
331 my $metadata_mapping = $self->{'metadata_mapping'};;
332
333 foreach my $marc_field ( keys %$metadata_mapping )
334 {
335 my $gsdl_field = $metadata_mapping->{$marc_field};
336
337 # have we got a subfield?
338 my $subfield = undef;
339 if ($marc_field =~ /(\d\d\d)(?:\$|\^)?(\w)/){
340 $marc_field = $1;
341 $subfield = $2;
342 }
343 if (defined $subfield) {
344 my $meta_value = $marc->subfield($marc_field, $subfield);
345 if (defined $meta_value) {
346 # Square brackets in metadata values need to be escaped so they don't confuse Greenstone/GLI
347 $meta_value =~ s/\[/&\#091;/g;
348 $meta_value =~ s/\]/&\#093;/g;
349 my $metavalue_str = $self->to_utf8($encoding, $meta_value);
350 $doc_obj->add_utf8_metadata ($section, $gsdl_field, $metavalue_str);
351 }
352 }
353 else
354 {
355 foreach my $meta_value_obj ($marc->field($marc_field))
356 {
357 my $meta_value = $meta_value_obj->as_string();
358
359 # Square brackets in metadata values need to be escaped so they don't confuse Greenstone/GLI
360 $meta_value =~ s/\[/&\#091;/g;
361 $meta_value =~ s/\]/&\#093;/g;
362 my $metavalue_str = $self->to_utf8($encoding, $meta_value);
363 $doc_obj->add_utf8_metadata ($section, $gsdl_field, $metavalue_str);
364 }
365 }
366 }
367}
368
369
370sub extract_ascii_metadata
371{
372 my $self = shift (@_);
373
374 my ($text, $metadata,$doc_obj, $section) = @_;
375 my $outhandle = $self->{'outhandle'};
376 my $metadata_mapping = $self->{'metadata_mapping'};
377 ## get fields
378 my @fields = split(/[\n\r]+/,$text);
379 my $marc_mapping ={};
380
381 foreach my $field (@fields){
382 if ($field ne ""){
383 $field =~ /^(\d\d\d)\s/;
384 my $code = $1;
385 $field = $'; #'
386 ##get subfields
387 my @subfields = split(/\$/,$field);
388 my $i=0;
389 $marc_mapping->{$code} = [];
390 foreach my $subfield (@subfields){
391 if ($i == 0){
392 ##print STDERR $subfield."\n";
393 push(@{$marc_mapping->{$code}},"info");
394 push(@{$marc_mapping->{$code}},$subfield);
395
396 $i++;
397 }
398 else{
399 $subfield =~ /(\w)\s/;
400 ##print STDERR "$1=>$'\n";
401 push(@{$marc_mapping->{$code}},$1);
402 push(@{$marc_mapping->{$code}},$'); #'
403 }
404 }
405 }
406 }
407
408
409 foreach my $marc_field ( keys %$metadata_mapping )
410 {
411
412 my $matched_field = $marc_mapping->{$marc_field};
413 my $subfield = undef;
414
415 if (defined $matched_field){
416 ## test whether this field has subfield
417 if ($marc_field =~ /\d\d\d(\w)/){
418 $subfield = $1;
419 }
420 my $metaname = $metadata_mapping->{$marc_field};
421
422 my $metavalue;
423 if (defined $subfield){
424 my %mapped_subfield = {@$matched_field};
425 $metavalue = $mapped_subfield{$subfield};
426 }
427 else{ ## get all values except info
428 my $i =0;
429 foreach my $value (@$matched_field){
430 if ($i%2 != 0 and $i != 1){
431 $metavalue .= $value." ";
432 }
433 $i++;
434 }
435 }
436
437 ## escape [ and ]
438 $metavalue =~ s/\[/\\\[/g;
439 $metavalue =~ s/\]/\\\]/g;
440 ##print STDERR "$metaname=$metavalue\n";
441 $doc_obj->add_metadata ($section, $metaname, $metavalue) ;
442 }
443
444 }
445
446}
447
448
4491;
Note: See TracBrowser for help on using the repository browser.