source: gsdl/trunk/perllib/plugins/MARCPlugin.pm@ 15962

Last change on this file since 15962 was 15962, checked in by kjdon, 16 years ago

added a check for info_only before lookign at the arguments

  • Property svn:keywords set to Author Date Id Revision
File size: 11.5 KB
Line 
1###########################################################################
2#
3# MARCPlugin.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlugin;
28
29use SplitTextFile;
30
31use unicode;
32use util;
33
34use strict;
35no strict 'refs'; # allow filehandles to be variables and viceversa
36
37sub BEGIN {
38 @MARCPlugin::ISA = ('SplitTextFile');
39 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
40}
41
42my $arguments =
43 [ { 'name' => "metadata_mapping",
44 'desc' => "{MARCPlugin.metadata_mapping}",
45 'type' => "string",
46 'deft' => "marctodc.txt",
47 'hiddengli' => "yes", # deprecated in favour of 'metadata_mapping_file'
48 'reqd' => "no" },
49 { 'name' => "metadata_mapping_file",
50 'desc' => "{MARCXMLPlug.metadata_mapping_file}",
51 'type' => "string",
52 'deft' => "",
53 'reqd' => "no" },
54 { 'name' => "process_exp",
55 'desc' => "{BasePlugin.process_exp}",
56 'type' => "regexp",
57 'reqd' => "no",
58 'deft' => &get_default_process_exp() },
59 { 'name' => "split_exp",
60 'desc' => "{SplitTextFile.split_exp}",
61 'type' => "regexp",
62 'reqd' => "no",
63 'deft' => &get_default_split_exp() }
64 ];
65
66my $options = { 'name' => "MARCPlugin",
67 'desc' => "{MARCPlugin.desc}",
68 'abstract' => "no",
69 'inherits' => "yes",
70 'explodes' => "yes",
71 'args' => $arguments };
72
73require MARC::Record;
74require MARC::Batch;
75#use MARC::Record;
76#use MARC::Batch;
77
78sub new {
79 my ($class) = shift (@_);
80 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
81 push(@$pluginlist, $class);
82
83 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
84 push(@{$hashArgOptLists->{"OptList"}},$options);
85
86 my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
87
88 if ($self->{'info_only'}) {
89 # don't worry about the options
90 return bless $self, $class;
91 }
92 # 'metadata_mapping' was used in two ways in the plugin: as a plugin
93 # option (filename) and as a datastructure to represent the mapping.
94 # In MARXXMLPlug (written later) the two are separated: filename is
95 # represented through 'metadata_mapping_file' and the data-structure
96 # mapping left as 'metadata_mapping'
97 # 'metadata_mapping' still present (but hidden in GLI) for
98 # backwards compatibility, but 'metadata_mapping_file' is used by
99 # preference
100
101 if ($self->{'metadata_mapping_file'} eq "") {
102 # If nothing set in the new version, use the old version
103 # that defaults to 'marctodc.txt'
104 $self->{'metadata_mapping_file'} = $self->{'metadata_mapping'};
105 }
106
107 $self->{'type'} = "";
108
109 return bless $self, $class;
110}
111
112sub init {
113 my $self = shift (@_);
114 my ($verbosity, $outhandle, $failhandle) = @_;
115
116 my @metadata_mapping = ();
117
118 # read in the metadata mapping file
119
120 my $mm_file = &util::locate_config_file($self->{'metadata_mapping_file'});
121
122 if (!defined $mm_file)
123 {
124
125 my $msg = "MARCPlugin ERROR: Can't locate mapping file \"" .
126 $self->{'metadata_mapping_file'} . "\".\n" .
127 " No marc files can be processed.\n";
128
129 print $outhandle $msg;
130 print $failhandle $msg;
131 $self->{'metadata_mapping'} = undef;
132 # We pick up the error in process() if there is no $mm_file
133 # If we exit here, then pluginfo.pl will exit too!
134 }
135 elsif (open(MMIN, "<$mm_file"))
136 {
137 my $l=1;
138 my $line;
139 while (defined($line=<MMIN>))
140 {
141 chomp $line;
142 if ($line =~ m/^(\d+)\s*->\s*([\w\^]+)$/)
143 {
144 my $marc_info = $1;
145 my $gsdl_info = $2;
146 my $mapping = { 'marc' => $marc_info, 'gsdl' => $gsdl_info };
147 push(@metadata_mapping,$mapping);
148 }
149 elsif ($line !~ m/^\#/ # allow comments (# in first column)
150 && $line !~ m/^\s*$/) # allow blank lines
151 {
152 print $outhandle "Parse error on line $l of $mm_file:\n";
153 print $outhandle " \"$line\"\n";
154 }
155 $l++
156 }
157 close(MMIN);
158
159 $self->{'metadata_mapping'} = \@metadata_mapping;
160 }
161 else
162 {
163 print STDERR "Unable to open $mm_file: $!\n";
164 }
165
166
167
168 $self->SUPER::init(@_);
169}
170
171
172sub get_default_process_exp {
173 my $self = shift (@_);
174
175 return q^(?i)(\.marc)$^;
176}
177
178
179sub get_default_split_exp {
180 # \r\n for msdos eol, \n for unix
181 return q^\r?\n\s*\r?\n|\[\w+\]Record type: USmarc^;
182}
183
184
185
186# The bulk of this function is based on read_line in multiread.pm
187# Unable to use read_line original because it expects to get its input
188# from a file. Here the line to be converted is passed in as a string
189
190sub to_utf8
191{
192 my $self = shift (@_);
193 my ($encoding, $line) = @_;
194
195 if ($encoding eq "utf8") {
196 # nothing needs to be done
197 return $line;
198 }
199
200 if ($encoding eq "iso_8859_1") {
201 # we'll use ascii2utf8() for this as it's faster than going
202 # through convert2unicode()
203 return &unicode::ascii2utf8 (\$line);
204 }
205
206 # everything else uses unicode::convert2unicode
207 return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
208}
209
210
211sub read_file {
212 my $self = shift (@_);
213 my ($filename, $encoding, $language, $textref) = @_;
214
215 $self->{'readfile_encoding'}->{$filename} = $encoding;
216
217
218 if (!-r $filename)
219 {
220 my $outhandle = $self->{'outhandle'};
221 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
222 return;
223 }
224
225 ##handle ascii marc
226 #test whether this is ascii marc file
227 if (open (FILE, $filename)) {
228 while (defined (my $line = <FILE>)) {
229 $$textref .= $line;
230 if ($line =~ /\[\w+\]Record type:/){
231 undef $/;
232 $$textref .= <FILE>;
233 $/ = "\n";
234 $self->{'type'} = "ascii";
235 close FILE;
236 return;
237 }
238 }
239 close FILE;
240 }
241
242
243 $$textref = "";
244 my @marc_entries = ();
245
246 my $batch = new MARC::Batch( 'USMARC', $filename );
247 while ( my $marc = $batch->next )
248 {
249 push(@marc_entries,$marc);
250 $$textref .= $marc->as_formatted();
251 $$textref .= "\n\n"; # for SplitTextFile - see default_split_exp above...
252 }
253
254 $self->{'marc_entries'}->{$filename} = \@marc_entries;
255}
256
257
258
259# do plugin specific processing of doc_obj
260# This gets done for each record found by SplitTextFile in marc files.
261sub process {
262 my $self = shift (@_);
263 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
264
265 my $outhandle = $self->{'outhandle'};
266 my $filename = &util::filename_cat($base_dir, $file);
267
268 if (! defined($self->{'metadata_mapping'}))
269 {
270 print $outhandle "MARCPlugin: no metadata file! Can't process $file\n";
271 return undef;
272 }
273
274 print STDERR "<Processing n='$file' p='MARCPlugin'>\n" if ($gli);
275 print $outhandle "MARCPlugin: processing $file\n"
276 if $self->{'verbosity'} > 1;
277
278 my $cursection = $doc_obj->get_top_section();
279
280 # Add fileFormat as the metadata
281 $doc_obj->add_metadata($cursection, "FileFormat", "MARC");
282
283 my $marc_entries = $self->{'marc_entries'}->{$filename};
284 my $marc = shift(@$marc_entries);
285
286 my $encoding = $self->{'readfile_encoding'}->{$filename};
287
288 if ($self->{'type'} ne "ascii" ){
289 $self->extract_metadata ($marc, $metadata, $encoding, $doc_obj, $cursection);
290 }
291 else{
292 $self->extract_ascii_metadata ($$textref,$metadata,$doc_obj, $cursection);
293 }
294
295 # add spaces after the sub-field markers, for word boundaries
296 $$textref =~ s/^(.{6} _\w)/$1 /gm;
297
298 # add text to document object
299 $$textref =~ s/</&lt;/g;
300 $$textref =~ s/>/&gt;/g;
301
302 $$textref = $self->to_utf8($encoding,$$textref);
303
304 print $outhandle " Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
305 if $self->{'verbosity'} > 2;
306
307 # line wrapping
308 $$textref = &wrap_text_in_columns($$textref, 64);
309 $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
310
311 $doc_obj->add_utf8_text($cursection, $$textref);
312
313 return 1;
314}
315
316sub wrap_text_in_columns
317{
318 my ($text, $columnwidth) = @_;
319 my $newtext = "";
320 my $linelength = 0;
321
322 # Break the text into words, and display one at a time
323 my @words = split(/ /, $text);
324
325 foreach my $word (@words) {
326 # If printing this word would exceed the column end, start a new line
327 if (($linelength + length($word)) >= $columnwidth) {
328 $newtext .= "\n";
329 $linelength = 0;
330 }
331
332 # Write the word
333 $newtext .= " $word";
334 if ($word =~ /\n/) {
335 $linelength = 0;
336 } else {
337 $linelength = $linelength + length(" $word");
338 }
339 }
340
341 $newtext .= "\n";
342 return $newtext;
343}
344
345
346sub extract_metadata
347{
348 my $self = shift (@_);
349 my ($marc, $metadata, $encoding, $doc_obj, $section) = @_;
350 my $outhandle = $self->{'outhandle'};
351
352 if (!defined $marc){
353 return;
354 }
355
356 my $metadata_mapping = $self->{'metadata_mapping'};
357 my $mm;
358
359 foreach $mm ( @$metadata_mapping )
360 {
361 my $marc_field = $mm->{'marc'};
362
363 my @metavalues = $marc->field($marc_field);
364
365 if (scalar(@metavalues)>0)
366 {
367 my $metaname = $mm->{'gsdl'};
368 my $metavalue;
369 foreach $metavalue ( @metavalues )
370 {
371 my $metavalue_str = $self->to_utf8($encoding,$metavalue->as_string());
372 $doc_obj->add_utf8_metadata ($section, $metaname, $metavalue_str);
373 }
374 }
375 }
376}
377
378sub extract_ascii_metadata
379{
380 my $self = shift (@_);
381 my ($text, $metadata,$doc_obj, $section) = @_;
382 my $outhandle = $self->{'outhandle'};
383 my $metadata_mapping = $self->{'metadata_mapping'};
384 ## get fields
385 my @fields = split(/[\n\r]+/,$text);
386 my $marc_mapping ={};
387
388 foreach my $field (@fields){
389 if ($field ne ""){
390 $field =~ /^(\d\d\d)\s/;
391 my $code = $1;
392 $field = $';
393 ##get subfields
394 my @subfields = split(/\$/,$field);
395 my $i=0;
396 $marc_mapping->{$code} = [];
397 foreach my $subfield (@subfields){
398 if ($i == 0){
399 ##print STDERR $subfield."\n";
400 push(@{$marc_mapping->{$code}},"info");
401 push(@{$marc_mapping->{$code}},$subfield);
402
403 $i++;
404 }
405 else{
406 $subfield =~ /(\w)\s/;
407 ##print STDERR "$1=>$'\n";
408 push(@{$marc_mapping->{$code}},$1);
409 push(@{$marc_mapping->{$code}},$');
410 }
411 }
412 }
413 }
414
415
416 foreach my $mm ( @$metadata_mapping )
417 {
418 my $marc_field = $mm->{'marc'};
419
420 my $matched_field = $marc_mapping->{$marc_field};
421 my $subfield = undef;
422
423 if (defined $matched_field){
424 ## test whether this field has subfield
425 if ($marc_field =~ /\d\d\d(\w)/){
426 $subfield = $1;
427 }
428 my $metaname = $mm->{'gsdl'};
429
430 my $metavalue;
431 if (defined $subfield){
432 my %mapped_subfield = {@$matched_field};
433 $metavalue = $mapped_subfield{$subfield};
434 }
435 else{ ## get all values except info
436 my $i =0;
437 foreach my $value (@$matched_field){
438 if ($i%2 != 0 and $i != 1){
439 $metavalue .= $value." ";
440 }
441 $i++;
442 }
443 }
444
445 ## escape [ and ]
446 $metavalue =~ s/\[/\\\[/g;
447 $metavalue =~ s/\]/\\\]/g;
448 ##print STDERR "$metaname=$metavalue\n";
449 $doc_obj->add_metadata ($section, $metaname, $metavalue) ;
450 }
451
452 }
453
454}
455
456
4571;
Note: See TracBrowser for help on using the repository browser.