source: gsdl/trunk/perllib/plugins/MARCPlug.pm@ 14964

Last change on this file since 14964 was 14964, checked in by davidb, 16 years ago

Minor tweak to MARCPlug so the plugin does not try to convert utf8 data into ... utf8svn diff MARCPlug.pm

  • Property svn:keywords set to Author Date Id Revision
File size: 10.5 KB
Line 
1###########################################################################
2#
3# MARCPlug.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlug;
28
29use SplitPlug;
30
31use unicode;
32use util;
33
34use strict;
35no strict 'refs'; # allow filehandles to be variables and viceversa
36
37sub BEGIN {
38 @MARCPlug::ISA = ('SplitPlug');
39 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
40}
41
42my $arguments =
43 [ { 'name' => "metadata_mapping",
44 'desc' => "{MARCPlug.metadata_mapping}",
45 'type' => "string",
46 'deft' => "marctodc.txt",
47 'reqd' => "no" },
48 { 'name' => "process_exp",
49 'desc' => "{BasPlug.process_exp}",
50 'type' => "regexp",
51 'reqd' => "no",
52 'deft' => &get_default_process_exp() },
53 { 'name' => "split_exp",
54 'desc' => "{SplitPlug.split_exp}",
55 'type' => "regexp",
56 'reqd' => "no",
57 'deft' => &get_default_split_exp() }
58 ];
59
60my $options = { 'name' => "MARCPlug",
61 'desc' => "{MARCPlug.desc}",
62 'abstract' => "no",
63 'inherits' => "yes",
64 'explodes' => "yes",
65 'args' => $arguments };
66
67require MARC::Record;
68require MARC::Batch;
69#use MARC::Record;
70#use MARC::Batch;
71
72sub new {
73 my ($class) = shift (@_);
74 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
75 push(@$pluginlist, $class);
76
77 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
78 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
79
80 my $self = new SplitPlug($pluginlist, $inputargs, $hashArgOptLists);
81
82 $self->{'type'} = "";
83
84 return bless $self, $class;
85}
86
87sub init {
88 my $self = shift (@_);
89 my ($verbosity, $outhandle, $failhandle) = @_;
90
91 my @metadata_mapping = ();
92
93 # read in the metadata mapping file
94 my $mm_file =
95 &util::filename_cat( $ENV{'GSDLHOME'}, "etc", $self->{'metadata_mapping'} );
96
97 if (!-e $mm_file)
98 {
99
100 my $msg = "MARCPlug ERROR: Can't locate mapping file \"" .
101 $self->{'metadata_mapping'} . "\".\n This file should be at $mm_file\n" .
102 " No marc files can be processed.\n";
103
104 print $outhandle $msg;
105 print $failhandle $msg;
106 $self->{'metadata_mapping'} = undef;
107 # We pick up the error in process() if there is no $mm_file
108 # If we exit here, then pluginfo.pl will exit too!
109 }
110 elsif (open(MMIN, "<$mm_file"))
111 {
112 my $l=1;
113 my $line;
114 while (defined($line=<MMIN>))
115 {
116 chomp $line;
117 if ($line =~ m/^(\d+)\s*->\s*([\w\^]+)$/)
118 {
119 my $marc_info = $1;
120 my $gsdl_info = $2;
121 my $mapping = { 'marc' => $marc_info, 'gsdl' => $gsdl_info };
122 push(@metadata_mapping,$mapping);
123 }
124 elsif ($line !~ m/^\#/ # allow comments (# in first column)
125 && $line !~ m/^\s*$/) # allow blank lines
126 {
127 print $outhandle "Parse error on line $l of $mm_file:\n";
128 print $outhandle " \"$line\"\n";
129 }
130 $l++
131 }
132 close(MMIN);
133 }
134 else
135 {
136 print STDERR "Unable to open $mm_file: $!\n";
137 }
138
139 $self->{'metadata_mapping'} = \@metadata_mapping;
140
141 $self->SUPER::init(@_);
142}
143
144
145sub get_default_process_exp {
146 my $self = shift (@_);
147
148 return q^(?i)(\.marc)$^;
149}
150
151
152sub get_default_split_exp {
153 # \r\n for msdos eol, \n for unix
154 return q^\r?\n\s*\r?\n|\[\w+\]Record type: USmarc^;
155}
156
157
158
159# The bulk of this function is based on read_line in multiread.pm
160# Unable to use read_line original because it expects to get its input
161# from a file. Here the line to be converted is passed in as a string
162
163sub to_utf8
164{
165 my $self = shift (@_);
166 my ($encoding, $line) = @_;
167
168 if ($encoding eq "utf8") {
169 # nothing needs to be done
170 return $line;
171 }
172
173 if ($encoding eq "iso_8859_1") {
174 # we'll use ascii2utf8() for this as it's faster than going
175 # through convert2unicode()
176 return &unicode::ascii2utf8 (\$line);
177 }
178
179 # everything else uses unicode::convert2unicode
180 return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
181}
182
183
184sub read_file {
185 my $self = shift (@_);
186 my ($filename, $encoding, $language, $textref) = @_;
187
188 $self->{'readfile_encoding'}->{$filename} = $encoding;
189
190
191 if (!-r $filename)
192 {
193 my $outhandle = $self->{'outhandle'};
194 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
195 return;
196 }
197
198 ##handle ascii marc
199 #test whether this is ascii marc file
200 if (open (FILE, $filename)) {
201 while (defined (my $line = <FILE>)) {
202 $$textref .= $line;
203 if ($line =~ /\[\w+\]Record type:/){
204 undef $/;
205 $$textref .= <FILE>;
206 $/ = "\n";
207 $self->{'type'} = "ascii";
208 close FILE;
209 return;
210 }
211 }
212 close FILE;
213 }
214
215
216 $$textref = "";
217 my @marc_entries = ();
218
219 my $batch = new MARC::Batch( 'USMARC', $filename );
220 while ( my $marc = $batch->next )
221 {
222 push(@marc_entries,$marc);
223 $$textref .= $marc->as_formatted();
224 $$textref .= "\n\n"; # for SplitPlug - see default_split_exp above...
225 }
226
227 $self->{'marc_entries'}->{$filename} = \@marc_entries;
228}
229
230
231
232# do plugin specific processing of doc_obj
233# This gets done for each record found by SplitPlug in marc files.
234sub process {
235 my $self = shift (@_);
236 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
237
238 my $outhandle = $self->{'outhandle'};
239 my $filename = &util::filename_cat($base_dir, $file);
240
241 if (! defined($self->{'metadata_mapping'}))
242 {
243 print $outhandle "MARCPlug: no metadata file! Can't process $file\n";
244 return undef;
245 }
246
247 print STDERR "<Processing n='$file' p='MARCPlug'>\n" if ($gli);
248 print $outhandle "MARCPlug: processing $file\n"
249 if $self->{'verbosity'} > 1;
250
251 my $cursection = $doc_obj->get_top_section();
252
253 # Add fileFormat as the metadata
254 $doc_obj->add_metadata($cursection, "FileFormat", "MARC");
255
256 my $marc_entries = $self->{'marc_entries'}->{$filename};
257 my $marc = shift(@$marc_entries);
258
259 my $encoding = $self->{'readfile_encoding'}->{$filename};
260
261 if ($self->{'type'} ne "ascii" ){
262 $self->extract_metadata ($marc, $metadata, $encoding, $doc_obj, $cursection);
263 }
264 else{
265 $self->extract_ascii_metadata ($$textref,$metadata,$doc_obj, $cursection);
266 }
267
268 # add spaces after the sub-field markers, for word boundaries
269 $$textref =~ s/^(.{6} _\w)/$1 /gm;
270
271 # add text to document object
272 $$textref =~ s/</&lt;/g;
273 $$textref =~ s/>/&gt;/g;
274
275 $$textref = $self->to_utf8($encoding,$$textref);
276
277 print $outhandle " Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
278 if $self->{'verbosity'} > 2;
279
280 # line wrapping
281 $$textref = &wrap_text_in_columns($$textref, 64);
282 $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
283
284 $doc_obj->add_utf8_text($cursection, $$textref);
285
286 return 1;
287}
288
289sub wrap_text_in_columns
290{
291 my ($text, $columnwidth) = @_;
292 my $newtext = "";
293 my $linelength = 0;
294
295 # Break the text into words, and display one at a time
296 my @words = split(/ /, $text);
297
298 foreach my $word (@words) {
299 # If printing this word would exceed the column end, start a new line
300 if (($linelength + length($word)) >= $columnwidth) {
301 $newtext .= "\n";
302 $linelength = 0;
303 }
304
305 # Write the word
306 $newtext .= " $word";
307 if ($word =~ /\n/) {
308 $linelength = 0;
309 } else {
310 $linelength = $linelength + length(" $word");
311 }
312 }
313
314 $newtext .= "\n";
315 return $newtext;
316}
317
318
319sub extract_metadata
320{
321 my $self = shift (@_);
322 my ($marc, $metadata, $encoding, $doc_obj, $section) = @_;
323 my $outhandle = $self->{'outhandle'};
324
325 if (!defined $marc){
326 return;
327 }
328
329 my $metadata_mapping = $self->{'metadata_mapping'};
330 my $mm;
331
332 foreach $mm ( @$metadata_mapping )
333 {
334 my $marc_field = $mm->{'marc'};
335
336 my @metavalues = $marc->field($marc_field);
337
338 if (scalar(@metavalues)>0)
339 {
340 my $metaname = $mm->{'gsdl'};
341 my $metavalue;
342 foreach $metavalue ( @metavalues )
343 {
344 my $metavalue_str = $self->to_utf8($encoding,$metavalue->as_string());
345 $doc_obj->add_utf8_metadata ($section, $metaname, $metavalue_str);
346 }
347 }
348 }
349}
350
351sub extract_ascii_metadata
352{
353 my $self = shift (@_);
354 my ($text, $metadata,$doc_obj, $section) = @_;
355 my $outhandle = $self->{'outhandle'};
356 my $metadata_mapping = $self->{'metadata_mapping'};
357 ## get fields
358 my @fields = split(/[\n\r]+/,$text);
359 my $marc_mapping ={};
360
361 foreach my $field (@fields){
362 if ($field ne ""){
363 $field =~ /^(\d\d\d)\s/;
364 my $code = $1;
365 $field = $';
366 ##get subfields
367 my @subfields = split(/\$/,$field);
368 my $i=0;
369 $marc_mapping->{$code} = [];
370 foreach my $subfield (@subfields){
371 if ($i == 0){
372 ##print STDERR $subfield."\n";
373 push(@{$marc_mapping->{$code}},"info");
374 push(@{$marc_mapping->{$code}},$subfield);
375
376 $i++;
377 }
378 else{
379 $subfield =~ /(\w)\s/;
380 ##print STDERR "$1=>$'\n";
381 push(@{$marc_mapping->{$code}},$1);
382 push(@{$marc_mapping->{$code}},$');
383 }
384 }
385 }
386 }
387
388
389 foreach my $mm ( @$metadata_mapping )
390 {
391 my $marc_field = $mm->{'marc'};
392
393 my $matched_field = $marc_mapping->{$marc_field};
394 my $subfield = undef;
395
396 if (defined $matched_field){
397 ## test whether this field has subfield
398 if ($marc_field =~ /\d\d\d(\w)/){
399 $subfield = $1;
400 }
401 my $metaname = $mm->{'gsdl'};
402
403 my $metavalue;
404 if (defined $subfield){
405 my %mapped_subfield = {@$matched_field};
406 $metavalue = $mapped_subfield{$subfield};
407 }
408 else{ ## get all values except info
409 my $i =0;
410 foreach my $value (@$matched_field){
411 if ($i%2 != 0 and $i != 1){
412 $metavalue .= $value." ";
413 }
414 $i++;
415 }
416 }
417
418 ## escape [ and ]
419 $metavalue =~ s/\[/\\\[/g;
420 $metavalue =~ s/\]/\\\]/g;
421 ##print STDERR "$metaname=$metavalue\n";
422 $doc_obj->add_metadata ($section, $metaname, $metavalue) ;
423 }
424
425 }
426
427}
428
429
4301;
Note: See TracBrowser for help on using the repository browser.