source: gsdl/trunk/perllib/plugins/MARCPlugin.pm@ 16104

Last change on this file since 16104 was 16104, checked in by kjdon, 16 years ago

tried to make the 'xxxplugin processing file' print statements more consistent. They are now done in read (or read_into_doc_obj) and not process

  • Property svn:keywords set to Author Date Id Revision
File size: 11.3 KB
Line 
1###########################################################################
2#
3# MARCPlugin.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlugin;
28
29use SplitTextFile;
30
31use unicode;
32use util;
33
34use strict;
35no strict 'refs'; # allow filehandles to be variables and viceversa
36
37sub BEGIN {
38 @MARCPlugin::ISA = ('SplitTextFile');
39 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
40}
41
42my $arguments =
43 [ { 'name' => "metadata_mapping",
44 'desc' => "{MARCPlugin.metadata_mapping}",
45 'type' => "string",
46 'deft' => "marctodc.txt",
47 'hiddengli' => "yes", # deprecated in favour of 'metadata_mapping_file'
48 'reqd' => "no" },
49 { 'name' => "metadata_mapping_file",
50 'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
51 'type' => "string",
52 'deft' => "",
53 'reqd' => "no" },
54 { 'name' => "process_exp",
55 'desc' => "{BasePlugin.process_exp}",
56 'type' => "regexp",
57 'reqd' => "no",
58 'deft' => &get_default_process_exp() },
59 { 'name' => "split_exp",
60 'desc' => "{SplitTextFile.split_exp}",
61 'type' => "regexp",
62 'reqd' => "no",
63 'deft' => &get_default_split_exp() }
64 ];
65
66my $options = { 'name' => "MARCPlugin",
67 'desc' => "{MARCPlugin.desc}",
68 'abstract' => "no",
69 'inherits' => "yes",
70 'explodes' => "yes",
71 'args' => $arguments };
72
73require MARC::Record;
74require MARC::Batch;
75#use MARC::Record;
76#use MARC::Batch;
77
78sub new {
79 my ($class) = shift (@_);
80 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
81 push(@$pluginlist, $class);
82
83 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
84 push(@{$hashArgOptLists->{"OptList"}},$options);
85
86 my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
87
88 if ($self->{'info_only'}) {
89 # don't worry about the options
90 return bless $self, $class;
91 }
92 # 'metadata_mapping' was used in two ways in the plugin: as a plugin
93 # option (filename) and as a datastructure to represent the mapping.
94 # In MARXXMLPlug (written later) the two are separated: filename is
95 # represented through 'metadata_mapping_file' and the data-structure
96 # mapping left as 'metadata_mapping'
97 # 'metadata_mapping' still present (but hidden in GLI) for
98 # backwards compatibility, but 'metadata_mapping_file' is used by
99 # preference
100
101 if ($self->{'metadata_mapping_file'} eq "") {
102 # If nothing set in the new version, use the old version
103 # that defaults to 'marctodc.txt'
104 $self->{'metadata_mapping_file'} = $self->{'metadata_mapping'};
105 }
106
107 $self->{'type'} = "";
108
109 return bless $self, $class;
110}
111
112sub init {
113 my $self = shift (@_);
114 my ($verbosity, $outhandle, $failhandle) = @_;
115
116 my @metadata_mapping = ();
117
118 # read in the metadata mapping file
119
120 my $mm_file = &util::locate_config_file($self->{'metadata_mapping_file'});
121
122 if (!defined $mm_file)
123 {
124
125 my $msg = "MARCPlugin ERROR: Can't locate mapping file \"" .
126 $self->{'metadata_mapping_file'} . "\".\n" .
127 " No marc files can be processed.\n";
128
129 print $outhandle $msg;
130 print $failhandle $msg;
131 $self->{'metadata_mapping'} = undef;
132 # We pick up the error in process() if there is no $mm_file
133 # If we exit here, then pluginfo.pl will exit too!
134 }
135 elsif (open(MMIN, "<$mm_file"))
136 {
137 my $l=1;
138 my $line;
139 while (defined($line=<MMIN>))
140 {
141 chomp $line;
142 if ($line =~ m/^(\d+)\s*->\s*([\w\^]+)$/)
143 {
144 my $marc_info = $1;
145 my $gsdl_info = $2;
146 my $mapping = { 'marc' => $marc_info, 'gsdl' => $gsdl_info };
147 push(@metadata_mapping,$mapping);
148 }
149 elsif ($line !~ m/^\#/ # allow comments (# in first column)
150 && $line !~ m/^\s*$/) # allow blank lines
151 {
152 print $outhandle "Parse error on line $l of $mm_file:\n";
153 print $outhandle " \"$line\"\n";
154 }
155 $l++
156 }
157 close(MMIN);
158
159 $self->{'metadata_mapping'} = \@metadata_mapping;
160 }
161 else
162 {
163 print STDERR "Unable to open $mm_file: $!\n";
164 }
165
166
167
168 $self->SUPER::init(@_);
169}
170
171
172sub get_default_process_exp {
173 my $self = shift (@_);
174
175 return q^(?i)(\.marc)$^;
176}
177
178
179sub get_default_split_exp {
180 # \r\n for msdos eol, \n for unix
181 return q^\r?\n\s*\r?\n|\[\w+\]Record type: USmarc^;
182}
183
184
185
186# The bulk of this function is based on read_line in multiread.pm
187# Unable to use read_line original because it expects to get its input
188# from a file. Here the line to be converted is passed in as a string
189
190sub to_utf8
191{
192 my $self = shift (@_);
193 my ($encoding, $line) = @_;
194
195 if ($encoding eq "utf8") {
196 # nothing needs to be done
197 return $line;
198 }
199
200 if ($encoding eq "iso_8859_1") {
201 # we'll use ascii2utf8() for this as it's faster than going
202 # through convert2unicode()
203 return &unicode::ascii2utf8 (\$line);
204 }
205
206 # everything else uses unicode::convert2unicode
207 return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
208}
209
210
211sub read_file {
212 my $self = shift (@_);
213 my ($filename, $encoding, $language, $textref) = @_;
214
215 $self->{'readfile_encoding'}->{$filename} = $encoding;
216
217
218 if (!-r $filename)
219 {
220 my $outhandle = $self->{'outhandle'};
221 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
222 return;
223 }
224
225 ##handle ascii marc
226 #test whether this is ascii marc file
227 if (open (FILE, $filename)) {
228 while (defined (my $line = <FILE>)) {
229 $$textref .= $line;
230 if ($line =~ /\[\w+\]Record type:/){
231 undef $/;
232 $$textref .= <FILE>;
233 $/ = "\n";
234 $self->{'type'} = "ascii";
235 close FILE;
236 return;
237 }
238 }
239 close FILE;
240 }
241
242
243 $$textref = "";
244 my @marc_entries = ();
245
246 my $batch = new MARC::Batch( 'USMARC', $filename );
247 while ( my $marc = $batch->next )
248 {
249 push(@marc_entries,$marc);
250 $$textref .= $marc->as_formatted();
251 $$textref .= "\n\n"; # for SplitTextFile - see default_split_exp above...
252 }
253
254 $self->{'marc_entries'}->{$filename} = \@marc_entries;
255}
256
257
258
259# do plugin specific processing of doc_obj
260# This gets done for each record found by SplitTextFile in marc files.
261sub process {
262 my $self = shift (@_);
263 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
264
265 my $outhandle = $self->{'outhandle'};
266 my $filename = &util::filename_cat($base_dir, $file);
267
268 if (! defined($self->{'metadata_mapping'}))
269 {
270 print $outhandle "MARCPlugin: no metadata file! Can't process $file\n";
271 return undef;
272 }
273
274 my $cursection = $doc_obj->get_top_section();
275
276 # Add fileFormat as the metadata
277 $doc_obj->add_metadata($cursection, "FileFormat", "MARC");
278
279 my $marc_entries = $self->{'marc_entries'}->{$filename};
280 my $marc = shift(@$marc_entries);
281
282 my $encoding = $self->{'readfile_encoding'}->{$filename};
283
284 if ($self->{'type'} ne "ascii" ){
285 $self->extract_metadata ($marc, $metadata, $encoding, $doc_obj, $cursection);
286 }
287 else{
288 $self->extract_ascii_metadata ($$textref,$metadata,$doc_obj, $cursection);
289 }
290
291 # add spaces after the sub-field markers, for word boundaries
292 $$textref =~ s/^(.{6} _\w)/$1 /gm;
293
294 # add text to document object
295 $$textref =~ s/</&lt;/g;
296 $$textref =~ s/>/&gt;/g;
297
298 $$textref = $self->to_utf8($encoding,$$textref);
299
300 print $outhandle " Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
301 if $self->{'verbosity'} > 2;
302
303 # line wrapping
304 $$textref = &wrap_text_in_columns($$textref, 64);
305 $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
306
307 $doc_obj->add_utf8_text($cursection, $$textref);
308
309 return 1;
310}
311
312sub wrap_text_in_columns
313{
314 my ($text, $columnwidth) = @_;
315 my $newtext = "";
316 my $linelength = 0;
317
318 # Break the text into words, and display one at a time
319 my @words = split(/ /, $text);
320
321 foreach my $word (@words) {
322 # If printing this word would exceed the column end, start a new line
323 if (($linelength + length($word)) >= $columnwidth) {
324 $newtext .= "\n";
325 $linelength = 0;
326 }
327
328 # Write the word
329 $newtext .= " $word";
330 if ($word =~ /\n/) {
331 $linelength = 0;
332 } else {
333 $linelength = $linelength + length(" $word");
334 }
335 }
336
337 $newtext .= "\n";
338 return $newtext;
339}
340
341
342sub extract_metadata
343{
344 my $self = shift (@_);
345 my ($marc, $metadata, $encoding, $doc_obj, $section) = @_;
346 my $outhandle = $self->{'outhandle'};
347
348 if (!defined $marc){
349 return;
350 }
351
352 my $metadata_mapping = $self->{'metadata_mapping'};
353 my $mm;
354
355 foreach $mm ( @$metadata_mapping )
356 {
357 my $marc_field = $mm->{'marc'};
358
359 my @metavalues = $marc->field($marc_field);
360
361 if (scalar(@metavalues)>0)
362 {
363 my $metaname = $mm->{'gsdl'};
364 my $metavalue;
365 foreach $metavalue ( @metavalues )
366 {
367 my $metavalue_str = $self->to_utf8($encoding,$metavalue->as_string());
368 $doc_obj->add_utf8_metadata ($section, $metaname, $metavalue_str);
369 }
370 }
371 }
372}
373
374sub extract_ascii_metadata
375{
376 my $self = shift (@_);
377 my ($text, $metadata,$doc_obj, $section) = @_;
378 my $outhandle = $self->{'outhandle'};
379 my $metadata_mapping = $self->{'metadata_mapping'};
380 ## get fields
381 my @fields = split(/[\n\r]+/,$text);
382 my $marc_mapping ={};
383
384 foreach my $field (@fields){
385 if ($field ne ""){
386 $field =~ /^(\d\d\d)\s/;
387 my $code = $1;
388 $field = $';
389 ##get subfields
390 my @subfields = split(/\$/,$field);
391 my $i=0;
392 $marc_mapping->{$code} = [];
393 foreach my $subfield (@subfields){
394 if ($i == 0){
395 ##print STDERR $subfield."\n";
396 push(@{$marc_mapping->{$code}},"info");
397 push(@{$marc_mapping->{$code}},$subfield);
398
399 $i++;
400 }
401 else{
402 $subfield =~ /(\w)\s/;
403 ##print STDERR "$1=>$'\n";
404 push(@{$marc_mapping->{$code}},$1);
405 push(@{$marc_mapping->{$code}},$');
406 }
407 }
408 }
409 }
410
411
412 foreach my $mm ( @$metadata_mapping )
413 {
414 my $marc_field = $mm->{'marc'};
415
416 my $matched_field = $marc_mapping->{$marc_field};
417 my $subfield = undef;
418
419 if (defined $matched_field){
420 ## test whether this field has subfield
421 if ($marc_field =~ /\d\d\d(\w)/){
422 $subfield = $1;
423 }
424 my $metaname = $mm->{'gsdl'};
425
426 my $metavalue;
427 if (defined $subfield){
428 my %mapped_subfield = {@$matched_field};
429 $metavalue = $mapped_subfield{$subfield};
430 }
431 else{ ## get all values except info
432 my $i =0;
433 foreach my $value (@$matched_field){
434 if ($i%2 != 0 and $i != 1){
435 $metavalue .= $value." ";
436 }
437 $i++;
438 }
439 }
440
441 ## escape [ and ]
442 $metavalue =~ s/\[/\\\[/g;
443 $metavalue =~ s/\]/\\\]/g;
444 ##print STDERR "$metaname=$metavalue\n";
445 $doc_obj->add_metadata ($section, $metaname, $metavalue) ;
446 }
447
448 }
449
450}
451
452
4531;
Note: See TracBrowser for help on using the repository browser.