source: gsdl/trunk/perllib/plugins/MARCPlugin.pm@ 16697

Last change on this file since 16697 was 16697, checked in by kjdon, 16 years ago

if marc mapping file cannot be located, print a warning about can't extract metadata but continue to process the file - get the record in the text so at least get something in the collection

  • Property svn:keywords set to Author Date Id Revision
File size: 11.8 KB
Line 
1###########################################################################
2#
3# MARCPlugin.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlugin;
28
29use SplitTextFile;
30
31use unicode;
32use util;
33use marcmapping;
34
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38sub BEGIN {
39 @MARCPlugin::ISA = ('SplitTextFile');
40 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
41}
42
43my $arguments =
44 [ { 'name' => "metadata_mapping",
45 'desc' => "{common.deprecated} {MARCPlugin.metadata_mapping}",
46 'type' => "string",
47 'deft' => "",
48 'hiddengli' => "yes", # deprecated in favour of 'metadata_mapping_file'
49 'reqd' => "no" },
50 { 'name' => "metadata_mapping_file",
51 'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
52 'type' => "string",
53 'deft' => "marctodc.txt",
54 'reqd' => "no" },
55 { 'name' => "process_exp",
56 'desc' => "{BasePlugin.process_exp}",
57 'type' => "regexp",
58 'reqd' => "no",
59 'deft' => &get_default_process_exp() },
60 { 'name' => "split_exp",
61 'desc' => "{SplitTextFile.split_exp}",
62 'type' => "regexp",
63 'reqd' => "no",
64 'deft' => &get_default_split_exp() }
65 ];
66
67my $options = { 'name' => "MARCPlugin",
68 'desc' => "{MARCPlugin.desc}",
69 'abstract' => "no",
70 'inherits' => "yes",
71 'explodes' => "yes",
72 'args' => $arguments };
73
74require MARC::Record;
75require MARC::Batch;
76#use MARC::Record;
77#use MARC::Batch;
78
79sub new {
80 my ($class) = shift (@_);
81 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
82 push(@$pluginlist, $class);
83
84 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
85 push(@{$hashArgOptLists->{"OptList"}},$options);
86
87 my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
88
89 if ($self->{'info_only'}) {
90 # don't worry about the options
91 return bless $self, $class;
92 }
93 # 'metadata_mapping' was used in two ways in the plugin: as a plugin
94 # option (filename) and as a datastructure to represent the mapping.
95 # In MARXXMLPlug (written later) the two are separated: filename is
96 # represented through 'metadata_mapping_file' and the data-structure
97 # mapping left as 'metadata_mapping'
98 # 'metadata_mapping' still present (but hidden in GLI) for
99 # backwards compatibility, but 'metadata_mapping_file' is used by
100 # preference
101
102 if ($self->{'metadata_mapping'} ne "") {
103 print STDERR "MARCPlugin WARNING:: the metadata_mapping option is set but has been deprecated. Please use metadata_mapping_file option instead\n";
104 # If the old version is set, use it.
105 $self->{'metadata_mapping_file'} = $self->{'metadata_mapping'};
106 }
107 $self->{'metadata_mapping'} = undef;
108 $self->{'type'} = "";
109
110 return bless $self, $class;
111}
112
113sub init {
114 my $self = shift (@_);
115 my ($verbosity, $outhandle, $failhandle) = @_;
116
117 ## the mapping file has already been loaded
118 if (defined $self->{'metadata_mapping'} ){
119 $self->SUPER::init(@_);
120 return;
121 }
122
123 # read in the metadata mapping files
124 my $mm_files = &util::locate_config_files($self->{'metadata_mapping_file'});
125
126 if (scalar(@$mm_files)==0)
127 {
128 my $msg = "MARCPlugin ERROR: Can't locate mapping file \"" .
129 $self->{'metadata_mapping_file'} . "\".\n " .
130 " No metadata will be extracted from MARC files.\n";
131
132 print $outhandle $msg;
133 print $failhandle $msg;
134 $self->{'metadata_mapping'} = undef;
135 # We pick up the error in process() if there is no $mm_file
136 # If we exit here, then pluginfo.pl will exit too!
137 }
138 else {
139 $self->{'metadata_mapping'} = &marcmapping::parse_marc_metadata_mapping($mm_files, $outhandle);
140 }
141
142 ##map { print STDERR $_."=>".$self->{'metadata_mapping'}->{$_}."\n"; } keys %{$self->{'metadata_mapping'}};
143
144 $self->SUPER::init(@_);
145}
146
147
148
149sub get_default_process_exp {
150 my $self = shift (@_);
151
152 return q^(?i)(\.marc)$^;
153}
154
155
156sub get_default_split_exp {
157 # \r\n for msdos eol, \n for unix
158 return q^\r?\n\s*\r?\n|\[\w+\]Record type: USmarc^;
159}
160
161
162
163# The bulk of this function is based on read_line in multiread.pm
164# Unable to use read_line original because it expects to get its input
165# from a file. Here the line to be converted is passed in as a string
166
167sub to_utf8
168{
169 my $self = shift (@_);
170 my ($encoding, $line) = @_;
171
172 if ($encoding eq "utf8") {
173 # nothing needs to be done
174 return $line;
175 }
176
177 if ($encoding eq "iso_8859_1") {
178 # we'll use ascii2utf8() for this as it's faster than going
179 # through convert2unicode()
180 return &unicode::ascii2utf8 (\$line);
181 }
182
183 # everything else uses unicode::convert2unicode
184 return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
185}
186
187
188sub read_file {
189 my $self = shift (@_);
190 my ($filename, $encoding, $language, $textref) = @_;
191
192 my $outhandle = $self->{'outhandle'};
193
194 if (! defined($self->{'metadata_mapping'}))
195 {
196 # print a warning
197 print $outhandle "MARCPlugin: no metadata file! Can't extract metadata from $filename\n";
198 }
199
200 $self->{'readfile_encoding'}->{$filename} = $encoding;
201
202
203 if (!-r $filename)
204 {
205 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
206 return;
207 }
208
209 ##handle ascii marc
210 #test whether this is ascii marc file
211 if (open (FILE, $filename)) {
212 while (defined (my $line = <FILE>)) {
213 $$textref .= $line;
214 if ($line =~ /\[\w+\]Record type:/){
215 undef $/;
216 $$textref .= <FILE>;
217 $/ = "\n";
218 $self->{'type'} = "ascii";
219 close FILE;
220 return;
221 }
222 }
223 close FILE;
224 }
225
226
227 $$textref = "";
228 my @marc_entries = ();
229
230 my $batch = new MARC::Batch( 'USMARC', $filename );
231 while ( my $marc = $batch->next )
232 {
233 push(@marc_entries,$marc);
234 $$textref .= $marc->as_formatted();
235 $$textref .= "\n\n"; # for SplitTextFile - see default_split_exp above...
236 }
237
238 $self->{'marc_entries'}->{$filename} = \@marc_entries;
239}
240
241
242
243# do plugin specific processing of doc_obj
244# This gets done for each record found by SplitTextFile in marc files.
245sub process {
246 my $self = shift (@_);
247 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
248
249 my $outhandle = $self->{'outhandle'};
250 my $filename = &util::filename_cat($base_dir, $file);
251
252 my $cursection = $doc_obj->get_top_section();
253
254 # Add fileFormat as the metadata
255 $doc_obj->add_metadata($cursection, "FileFormat", "MARC");
256
257 my $marc_entries = $self->{'marc_entries'}->{$filename};
258 my $marc = shift(@$marc_entries);
259
260 my $encoding = $self->{'readfile_encoding'}->{$filename};
261
262 if (defined ($self->{'metadata_mapping'}) ) {
263 if ($self->{'type'} ne "ascii" ){
264 $self->extract_metadata ($marc, $metadata, $encoding, $doc_obj, $cursection);
265 }
266 else{
267 $self->extract_ascii_metadata ($$textref,$metadata,$doc_obj, $cursection);
268 }
269 }
270
271 # add spaces after the sub-field markers, for word boundaries
272 $$textref =~ s/^(.{6} _\w)/$1 /gm;
273
274 # add text to document object
275 $$textref =~ s/</&lt;/g;
276 $$textref =~ s/>/&gt;/g;
277
278 $$textref = $self->to_utf8($encoding,$$textref);
279
280 print $outhandle " Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
281 if $self->{'verbosity'} > 2;
282
283 # line wrapping
284 $$textref = &wrap_text_in_columns($$textref, 64);
285 $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
286
287 $doc_obj->add_utf8_text($cursection, $$textref);
288
289 return 1;
290}
291
292sub wrap_text_in_columns
293{
294 my ($text, $columnwidth) = @_;
295 my $newtext = "";
296 my $linelength = 0;
297
298 # Break the text into words, and display one at a time
299 my @words = split(/ /, $text);
300
301 foreach my $word (@words) {
302 # If printing this word would exceed the column end, start a new line
303 if (($linelength + length($word)) >= $columnwidth) {
304 $newtext .= "\n";
305 $linelength = 0;
306 }
307
308 # Write the word
309 $newtext .= " $word";
310 if ($word =~ /\n/) {
311 $linelength = 0;
312 } else {
313 $linelength = $linelength + length(" $word");
314 }
315 }
316
317 $newtext .= "\n";
318 return $newtext;
319}
320
321
322sub extract_metadata
323{
324 my $self = shift (@_);
325
326 my ($marc, $metadata, $encoding, $doc_obj, $section) = @_;
327 my $outhandle = $self->{'outhandle'};
328
329 if (!defined $marc){
330 return;
331 }
332
333 my $metadata_mapping = $self->{'metadata_mapping'};;
334
335 foreach my $marc_field ( keys %$metadata_mapping )
336 {
337 my $gsdl_field = $metadata_mapping->{$marc_field};
338 my $meta_value = undef;
339
340 # have we got a subfield?
341 my $subfield = undef;
342 if ($marc_field =~ /(\d\d\d)(?:\$|\^)?(\w)/){
343 $marc_field = $1;
344 $subfield = $2;
345 }
346 if (defined $subfield) {
347 $meta_value = $marc->subfield($marc_field, $subfield);
348 if (defined $meta_value) {
349 ## escape [ and ]
350 $meta_value =~ s/\[/\\\[/g;
351 $meta_value =~ s/\]/\\\]/g;
352 my $metavalue_str = $self->to_utf8($encoding, $meta_value);
353 $doc_obj->add_utf8_metadata ($section, $gsdl_field, $metavalue_str);
354 }
355 } else {
356
357 my @metavalues = $marc->field($marc_field);
358
359 if (scalar(@metavalues)>0)
360 {
361 my $metavalue = undef;
362 foreach $metavalue ( @metavalues )
363 {
364 my $metavalue_str = $self->to_utf8($encoding,$metavalue->as_string());
365 $doc_obj->add_utf8_metadata ($section, $gsdl_field, $metavalue_str);
366 }
367 }
368 }
369 }
370}
371
372
373sub extract_ascii_metadata
374{
375 my $self = shift (@_);
376
377 my ($text, $metadata,$doc_obj, $section) = @_;
378 my $outhandle = $self->{'outhandle'};
379 my $metadata_mapping = $self->{'metadata_mapping'};
380 ## get fields
381 my @fields = split(/[\n\r]+/,$text);
382 my $marc_mapping ={};
383
384 foreach my $field (@fields){
385 if ($field ne ""){
386 $field =~ /^(\d\d\d)\s/;
387 my $code = $1;
388 $field = $'; #'
389 ##get subfields
390 my @subfields = split(/\$/,$field);
391 my $i=0;
392 $marc_mapping->{$code} = [];
393 foreach my $subfield (@subfields){
394 if ($i == 0){
395 ##print STDERR $subfield."\n";
396 push(@{$marc_mapping->{$code}},"info");
397 push(@{$marc_mapping->{$code}},$subfield);
398
399 $i++;
400 }
401 else{
402 $subfield =~ /(\w)\s/;
403 ##print STDERR "$1=>$'\n";
404 push(@{$marc_mapping->{$code}},$1);
405 push(@{$marc_mapping->{$code}},$'); #'
406 }
407 }
408 }
409 }
410
411
412 foreach my $marc_field ( keys %$metadata_mapping )
413 {
414
415 my $matched_field = $marc_mapping->{$marc_field};
416 my $subfield = undef;
417
418 if (defined $matched_field){
419 ## test whether this field has subfield
420 if ($marc_field =~ /\d\d\d(\w)/){
421 $subfield = $1;
422 }
423 my $metaname = $metadata_mapping->{$marc_field};
424
425 my $metavalue;
426 if (defined $subfield){
427 my %mapped_subfield = {@$matched_field};
428 $metavalue = $mapped_subfield{$subfield};
429 }
430 else{ ## get all values except info
431 my $i =0;
432 foreach my $value (@$matched_field){
433 if ($i%2 != 0 and $i != 1){
434 $metavalue .= $value." ";
435 }
436 $i++;
437 }
438 }
439
440 ## escape [ and ]
441 $metavalue =~ s/\[/\\\[/g;
442 $metavalue =~ s/\]/\\\]/g;
443 ##print STDERR "$metaname=$metavalue\n";
444 $doc_obj->add_metadata ($section, $metaname, $metavalue) ;
445 }
446
447 }
448
449}
450
451
4521;
Note: See TracBrowser for help on using the repository browser.