source: gsdl/trunk/perllib/plugins/MARCPlugin.pm@ 16970

Last change on this file since 16970 was 16970, checked in by kjdon, 13 years ago

changed an output statement

  • Property svn:keywords set to Author Date Id Revision
File size: 11.8 KB
Line 
1###########################################################################
2#
3# MARCPlugin.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlugin;
28
29use SplitTextFile;
30
31use unicode;
32use util;
33use marcmapping;
34
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38sub BEGIN {
39 @MARCPlugin::ISA = ('SplitTextFile');
40 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
41}
42
43my $arguments =
44 [ { 'name' => "metadata_mapping",
45 'desc' => "{common.deprecated} {MARCPlugin.metadata_mapping}",
46 'type' => "string",
47 'deft' => "",
48 'hiddengli' => "yes", # deprecated in favour of 'metadata_mapping_file'
49 'reqd' => "no" },
50 { 'name' => "metadata_mapping_file",
51 'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
52 'type' => "string",
53 'deft' => "marctodc.txt",
54 'reqd' => "no" },
55 { 'name' => "process_exp",
56 'desc' => "{BasePlugin.process_exp}",
57 'type' => "regexp",
58 'reqd' => "no",
59 'deft' => &get_default_process_exp() },
60 { 'name' => "split_exp",
61 'desc' => "{SplitTextFile.split_exp}",
62 'type' => "regexp",
63 'reqd' => "no",
64 'deft' => &get_default_split_exp() }
65 ];
66
67my $options = { 'name' => "MARCPlugin",
68 'desc' => "{MARCPlugin.desc}",
69 'abstract' => "no",
70 'inherits' => "yes",
71 'explodes' => "yes",
72 'args' => $arguments };
73
74require MARC::Record;
75require MARC::Batch;
76#use MARC::Record;
77#use MARC::Batch;
78
79sub new {
80 my ($class) = shift (@_);
81 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
82 push(@$pluginlist, $class);
83
84 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
85 push(@{$hashArgOptLists->{"OptList"}},$options);
86
87 my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
88
89 if ($self->{'info_only'}) {
90 # don't worry about the options
91 return bless $self, $class;
92 }
93 # 'metadata_mapping' was used in two ways in the plugin: as a plugin
94 # option (filename) and as a datastructure to represent the mapping.
95 # In MARXXMLPlug (written later) the two are separated: filename is
96 # represented through 'metadata_mapping_file' and the data-structure
97 # mapping left as 'metadata_mapping'
98 # 'metadata_mapping' still present (but hidden in GLI) for
99 # backwards compatibility, but 'metadata_mapping_file' is used by
100 # preference
101
102 if ($self->{'metadata_mapping'} ne "") {
103 print STDERR "MARCPlugin WARNING:: the metadata_mapping option is set but has been deprecated. Please use metadata_mapping_file option instead\n";
104 # If the old version is set, use it.
105 $self->{'metadata_mapping_file'} = $self->{'metadata_mapping'};
106 }
107 $self->{'metadata_mapping'} = undef;
108 $self->{'type'} = "";
109 return bless $self, $class;
110}
111
112sub init {
113 my $self = shift (@_);
114 my ($verbosity, $outhandle, $failhandle) = @_;
115
116 ## the mapping file has already been loaded
117 if (defined $self->{'metadata_mapping'} ){
118 $self->SUPER::init(@_);
119 return;
120 }
121
122 # read in the metadata mapping files
123 my $mm_files = &util::locate_config_files($self->{'metadata_mapping_file'});
124 if (scalar(@$mm_files)==0)
125 {
126 my $msg = "MARCPlugin ERROR: Can't locate mapping file \"" .
127 $self->{'metadata_mapping_file'} . "\".\n " .
128 " No metadata will be extracted from MARC files.\n";
129
130 print $outhandle $msg;
131 print $failhandle $msg;
132 $self->{'metadata_mapping'} = undef;
133 # We pick up the error in process() if there is no $mm_file
134 # If we exit here, then pluginfo.pl will exit too!
135 }
136 else {
137 $self->{'metadata_mapping'} = &marcmapping::parse_marc_metadata_mapping($mm_files, $outhandle);
138 }
139
140 ##map { print STDERR $_."=>".$self->{'metadata_mapping'}->{$_}."\n"; } keys %{$self->{'metadata_mapping'}};
141
142 $self->SUPER::init(@_);
143}
144
145
146
147sub get_default_process_exp {
148 my $self = shift (@_);
149
150 return q^(?i)(\.marc)$^;
151}
152
153
154sub get_default_split_exp {
155 # \r\n for msdos eol, \n for unix
156 return q^\r?\n\s*\r?\n|\[\w+\]Record type: USmarc^;
157}
158
159
160
161# The bulk of this function is based on read_line in multiread.pm
162# Unable to use read_line original because it expects to get its input
163# from a file. Here the line to be converted is passed in as a string
164
165sub to_utf8
166{
167 my $self = shift (@_);
168 my ($encoding, $line) = @_;
169
170 if ($encoding eq "utf8") {
171 # nothing needs to be done
172 return $line;
173 }
174
175 if ($encoding eq "iso_8859_1") {
176 # we'll use ascii2utf8() for this as it's faster than going
177 # through convert2unicode()
178 return &unicode::ascii2utf8 (\$line);
179 }
180
181 # everything else uses unicode::convert2unicode
182 return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
183}
184
185
186sub read_file {
187 my $self = shift (@_);
188 my ($filename, $encoding, $language, $textref) = @_;
189
190 my $outhandle = $self->{'outhandle'};
191
192 if (! defined($self->{'metadata_mapping'}))
193 {
194 # print a warning
195 print $outhandle "MARCPlugin: no metadata mapping file! Can't extract metadata from $filename\n";
196 }
197
198 $self->{'readfile_encoding'}->{$filename} = $encoding;
199
200
201 if (!-r $filename)
202 {
203 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
204 return;
205 }
206
207 ##handle ascii marc
208 #test whether this is ascii marc file
209 if (open (FILE, $filename)) {
210 while (defined (my $line = <FILE>)) {
211 $$textref .= $line;
212 if ($line =~ /\[\w+\]Record type:/){
213 undef $/;
214 $$textref .= <FILE>;
215 $/ = "\n";
216 $self->{'type'} = "ascii";
217 close FILE;
218 return;
219 }
220 }
221 close FILE;
222 }
223
224
225 $$textref = "";
226 my @marc_entries = ();
227
228 my $batch = new MARC::Batch( 'USMARC', $filename );
229 while ( my $marc = $batch->next )
230 {
231 push(@marc_entries,$marc);
232 $$textref .= $marc->as_formatted();
233 $$textref .= "\n\n"; # for SplitTextFile - see default_split_exp above...
234 }
235
236 $self->{'marc_entries'}->{$filename} = \@marc_entries;
237}
238
239
240
241# do plugin specific processing of doc_obj
242# This gets done for each record found by SplitTextFile in marc files.
243sub process {
244 my $self = shift (@_);
245 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
246
247 my $outhandle = $self->{'outhandle'};
248 my $filename = &util::filename_cat($base_dir, $file);
249
250 my $cursection = $doc_obj->get_top_section();
251
252 # Add fileFormat as the metadata
253 $doc_obj->add_metadata($cursection, "FileFormat", "MARC");
254
255 my $marc_entries = $self->{'marc_entries'}->{$filename};
256 my $marc = shift(@$marc_entries);
257
258 my $encoding = $self->{'readfile_encoding'}->{$filename};
259
260 if (defined ($self->{'metadata_mapping'}) ) {
261 if ($self->{'type'} ne "ascii" ){
262 $self->extract_metadata ($marc, $metadata, $encoding, $doc_obj, $cursection);
263 }
264 else{
265 $self->extract_ascii_metadata ($$textref,$metadata,$doc_obj, $cursection);
266 }
267 }
268
269 # add spaces after the sub-field markers, for word boundaries
270 $$textref =~ s/^(.{6} _\w)/$1 /gm;
271
272 # add text to document object
273 $$textref =~ s/</&lt;/g;
274 $$textref =~ s/>/&gt;/g;
275
276 $$textref = $self->to_utf8($encoding,$$textref);
277
278 print $outhandle " Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
279 if $self->{'verbosity'} > 2;
280
281 # line wrapping
282 $$textref = &wrap_text_in_columns($$textref, 64);
283 $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
284
285 $doc_obj->add_utf8_text($cursection, $$textref);
286
287 return 1;
288}
289
290sub wrap_text_in_columns
291{
292 my ($text, $columnwidth) = @_;
293 my $newtext = "";
294 my $linelength = 0;
295
296 # Break the text into words, and display one at a time
297 my @words = split(/ /, $text);
298
299 foreach my $word (@words) {
300 # If printing this word would exceed the column end, start a new line
301 if (($linelength + length($word)) >= $columnwidth) {
302 $newtext .= "\n";
303 $linelength = 0;
304 }
305
306 # Write the word
307 $newtext .= " $word";
308 if ($word =~ /\n/) {
309 $linelength = 0;
310 } else {
311 $linelength = $linelength + length(" $word");
312 }
313 }
314
315 $newtext .= "\n";
316 return $newtext;
317}
318
319
320sub extract_metadata
321{
322 my $self = shift (@_);
323
324 my ($marc, $metadata, $encoding, $doc_obj, $section) = @_;
325 my $outhandle = $self->{'outhandle'};
326
327 if (!defined $marc){
328 return;
329 }
330
331 my $metadata_mapping = $self->{'metadata_mapping'};;
332
333 foreach my $marc_field ( keys %$metadata_mapping )
334 {
335 my $gsdl_field = $metadata_mapping->{$marc_field};
336 my $meta_value = undef;
337
338 # have we got a subfield?
339 my $subfield = undef;
340 if ($marc_field =~ /(\d\d\d)(?:\$|\^)?(\w)/){
341 $marc_field = $1;
342 $subfield = $2;
343 }
344 if (defined $subfield) {
345 $meta_value = $marc->subfield($marc_field, $subfield);
346 if (defined $meta_value) {
347 ## escape [ and ]
348 $meta_value =~ s/\[/\\\[/g;
349 $meta_value =~ s/\]/\\\]/g;
350 my $metavalue_str = $self->to_utf8($encoding, $meta_value);
351 $doc_obj->add_utf8_metadata ($section, $gsdl_field, $metavalue_str);
352 }
353 } else {
354
355 my @metavalues = $marc->field($marc_field);
356
357 if (scalar(@metavalues)>0)
358 {
359 my $metavalue = undef;
360 foreach $metavalue ( @metavalues )
361 {
362 my $metavalue_str = $self->to_utf8($encoding,$metavalue->as_string());
363 $doc_obj->add_utf8_metadata ($section, $gsdl_field, $metavalue_str);
364 }
365 }
366 }
367 }
368}
369
370
371sub extract_ascii_metadata
372{
373 my $self = shift (@_);
374
375 my ($text, $metadata,$doc_obj, $section) = @_;
376 my $outhandle = $self->{'outhandle'};
377 my $metadata_mapping = $self->{'metadata_mapping'};
378 ## get fields
379 my @fields = split(/[\n\r]+/,$text);
380 my $marc_mapping ={};
381
382 foreach my $field (@fields){
383 if ($field ne ""){
384 $field =~ /^(\d\d\d)\s/;
385 my $code = $1;
386 $field = $'; #'
387 ##get subfields
388 my @subfields = split(/\$/,$field);
389 my $i=0;
390 $marc_mapping->{$code} = [];
391 foreach my $subfield (@subfields){
392 if ($i == 0){
393 ##print STDERR $subfield."\n";
394 push(@{$marc_mapping->{$code}},"info");
395 push(@{$marc_mapping->{$code}},$subfield);
396
397 $i++;
398 }
399 else{
400 $subfield =~ /(\w)\s/;
401 ##print STDERR "$1=>$'\n";
402 push(@{$marc_mapping->{$code}},$1);
403 push(@{$marc_mapping->{$code}},$'); #'
404 }
405 }
406 }
407 }
408
409
410 foreach my $marc_field ( keys %$metadata_mapping )
411 {
412
413 my $matched_field = $marc_mapping->{$marc_field};
414 my $subfield = undef;
415
416 if (defined $matched_field){
417 ## test whether this field has subfield
418 if ($marc_field =~ /\d\d\d(\w)/){
419 $subfield = $1;
420 }
421 my $metaname = $metadata_mapping->{$marc_field};
422
423 my $metavalue;
424 if (defined $subfield){
425 my %mapped_subfield = {@$matched_field};
426 $metavalue = $mapped_subfield{$subfield};
427 }
428 else{ ## get all values except info
429 my $i =0;
430 foreach my $value (@$matched_field){
431 if ($i%2 != 0 and $i != 1){
432 $metavalue .= $value." ";
433 }
434 $i++;
435 }
436 }
437
438 ## escape [ and ]
439 $metavalue =~ s/\[/\\\[/g;
440 $metavalue =~ s/\]/\\\]/g;
441 ##print STDERR "$metaname=$metavalue\n";
442 $doc_obj->add_metadata ($section, $metaname, $metavalue) ;
443 }
444
445 }
446
447}
448
449
4501;
Note: See TracBrowser for help on using the repository browser.