source: gsdl/trunk/perllib/plugins/MARCPlugin.pm@ 16692

Last change on this file since 16692 was 16692, checked in by kjdon, 16 years ago

code to read in marc mapping files moved from MARCXMLPlugin to marcmapping.pm, and its now also used by MARCPlugin.pm so that MARCplugin can use qualified dublin core

  • Property svn:keywords set to Author Date Id Revision
File size: 11.7 KB
Line 
1###########################################################################
2#
3# MARCPlugin.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlugin;
28
29use SplitTextFile;
30
31use unicode;
32use util;
33use marcmapping;
34
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38sub BEGIN {
39 @MARCPlugin::ISA = ('SplitTextFile');
40 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
41}
42
43my $arguments =
44 [ { 'name' => "metadata_mapping",
45 'desc' => "{common.deprecated} {MARCPlugin.metadata_mapping}",
46 'type' => "string",
47 'deft' => "",
48 'hiddengli' => "yes", # deprecated in favour of 'metadata_mapping_file'
49 'reqd' => "no" },
50 { 'name' => "metadata_mapping_file",
51 'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
52 'type' => "string",
53 'deft' => "marctodc.txt",
54 'reqd' => "no" },
55 { 'name' => "process_exp",
56 'desc' => "{BasePlugin.process_exp}",
57 'type' => "regexp",
58 'reqd' => "no",
59 'deft' => &get_default_process_exp() },
60 { 'name' => "split_exp",
61 'desc' => "{SplitTextFile.split_exp}",
62 'type' => "regexp",
63 'reqd' => "no",
64 'deft' => &get_default_split_exp() }
65 ];
66
67my $options = { 'name' => "MARCPlugin",
68 'desc' => "{MARCPlugin.desc}",
69 'abstract' => "no",
70 'inherits' => "yes",
71 'explodes' => "yes",
72 'args' => $arguments };
73
74require MARC::Record;
75require MARC::Batch;
76#use MARC::Record;
77#use MARC::Batch;
78
79sub new {
80 my ($class) = shift (@_);
81 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
82 push(@$pluginlist, $class);
83
84 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
85 push(@{$hashArgOptLists->{"OptList"}},$options);
86
87 my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
88
89 if ($self->{'info_only'}) {
90 # don't worry about the options
91 return bless $self, $class;
92 }
93 # 'metadata_mapping' was used in two ways in the plugin: as a plugin
94 # option (filename) and as a datastructure to represent the mapping.
95 # In MARXXMLPlug (written later) the two are separated: filename is
96 # represented through 'metadata_mapping_file' and the data-structure
97 # mapping left as 'metadata_mapping'
98 # 'metadata_mapping' still present (but hidden in GLI) for
99 # backwards compatibility, but 'metadata_mapping_file' is used by
100 # preference
101
102 if ($self->{'metadata_mapping'} ne "") {
103 print STDERR "MARCPlugin WARNING:: the metadata_mapping option is set but has been deprecated. Please use metadata_mapping_file option instead\n";
104 # If the old version is set, use it.
105 $self->{'metadata_mapping_file'} = $self->{'metadata_mapping'};
106 }
107 $self->{'metadata_mapping'} = undef;
108 $self->{'type'} = "";
109
110 return bless $self, $class;
111}
112
113sub init {
114 my $self = shift (@_);
115 my ($verbosity, $outhandle, $failhandle) = @_;
116
117 ## the mapping file has already been loaded
118 if (defined $self->{'metadata_mapping'} ){
119 $self->SUPER::init(@_);
120 return;
121 }
122
123 # read in the metadata mapping files
124 my $mm_files = &util::locate_config_files($self->{'metadata_mapping_file'});
125
126 if (scalar(@$mm_files)==0)
127 {
128 my $msg = "MARCPlugin ERROR: Can't locate mapping file \"" .
129 $self->{'metadata_mapping_file'} . "\".\n " .
130 " No marc files can be processed.\n";
131
132 print $outhandle $msg;
133 print $failhandle $msg;
134 $self->{'metadata_mapping'} = undef;
135 # We pick up the error in process() if there is no $mm_file
136 # If we exit here, then pluginfo.pl will exit too!
137 }
138 else {
139 $self->{'metadata_mapping'} = &marcmapping::parse_marc_metadata_mapping($mm_files, $outhandle);
140 }
141
142 ##map { print STDERR $_."=>".$self->{'metadata_mapping'}->{$_}."\n"; } keys %{$self->{'metadata_mapping'}};
143
144 $self->SUPER::init(@_);
145}
146
147
148
149sub get_default_process_exp {
150 my $self = shift (@_);
151
152 return q^(?i)(\.marc)$^;
153}
154
155
156sub get_default_split_exp {
157 # \r\n for msdos eol, \n for unix
158 return q^\r?\n\s*\r?\n|\[\w+\]Record type: USmarc^;
159}
160
161
162
163# The bulk of this function is based on read_line in multiread.pm
164# Unable to use read_line original because it expects to get its input
165# from a file. Here the line to be converted is passed in as a string
166
167sub to_utf8
168{
169 my $self = shift (@_);
170 my ($encoding, $line) = @_;
171
172 if ($encoding eq "utf8") {
173 # nothing needs to be done
174 return $line;
175 }
176
177 if ($encoding eq "iso_8859_1") {
178 # we'll use ascii2utf8() for this as it's faster than going
179 # through convert2unicode()
180 return &unicode::ascii2utf8 (\$line);
181 }
182
183 # everything else uses unicode::convert2unicode
184 return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
185}
186
187
188sub read_file {
189 my $self = shift (@_);
190 my ($filename, $encoding, $language, $textref) = @_;
191
192 $self->{'readfile_encoding'}->{$filename} = $encoding;
193
194
195 if (!-r $filename)
196 {
197 my $outhandle = $self->{'outhandle'};
198 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
199 return;
200 }
201
202 ##handle ascii marc
203 #test whether this is ascii marc file
204 if (open (FILE, $filename)) {
205 while (defined (my $line = <FILE>)) {
206 $$textref .= $line;
207 if ($line =~ /\[\w+\]Record type:/){
208 undef $/;
209 $$textref .= <FILE>;
210 $/ = "\n";
211 $self->{'type'} = "ascii";
212 close FILE;
213 return;
214 }
215 }
216 close FILE;
217 }
218
219
220 $$textref = "";
221 my @marc_entries = ();
222
223 my $batch = new MARC::Batch( 'USMARC', $filename );
224 while ( my $marc = $batch->next )
225 {
226 push(@marc_entries,$marc);
227 $$textref .= $marc->as_formatted();
228 $$textref .= "\n\n"; # for SplitTextFile - see default_split_exp above...
229 }
230
231 $self->{'marc_entries'}->{$filename} = \@marc_entries;
232}
233
234
235
236# do plugin specific processing of doc_obj
237# This gets done for each record found by SplitTextFile in marc files.
238sub process {
239 my $self = shift (@_);
240 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
241
242 my $outhandle = $self->{'outhandle'};
243 my $filename = &util::filename_cat($base_dir, $file);
244
245 if (! defined($self->{'metadata_mapping'}))
246 {
247 print $outhandle "MARCPlugin: no metadata file! Can't process $file\n";
248 return undef;
249 }
250
251 my $cursection = $doc_obj->get_top_section();
252
253 # Add fileFormat as the metadata
254 $doc_obj->add_metadata($cursection, "FileFormat", "MARC");
255
256 my $marc_entries = $self->{'marc_entries'}->{$filename};
257 my $marc = shift(@$marc_entries);
258
259 my $encoding = $self->{'readfile_encoding'}->{$filename};
260
261 if ($self->{'type'} ne "ascii" ){
262 $self->extract_metadata ($marc, $metadata, $encoding, $doc_obj, $cursection);
263 }
264 else{
265 $self->extract_ascii_metadata ($$textref,$metadata,$doc_obj, $cursection);
266 }
267
268 # add spaces after the sub-field markers, for word boundaries
269 $$textref =~ s/^(.{6} _\w)/$1 /gm;
270
271 # add text to document object
272 $$textref =~ s/</&lt;/g;
273 $$textref =~ s/>/&gt;/g;
274
275 $$textref = $self->to_utf8($encoding,$$textref);
276
277 print $outhandle " Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
278 if $self->{'verbosity'} > 2;
279
280 # line wrapping
281 $$textref = &wrap_text_in_columns($$textref, 64);
282 $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
283
284 $doc_obj->add_utf8_text($cursection, $$textref);
285
286 return 1;
287}
288
289sub wrap_text_in_columns
290{
291 my ($text, $columnwidth) = @_;
292 my $newtext = "";
293 my $linelength = 0;
294
295 # Break the text into words, and display one at a time
296 my @words = split(/ /, $text);
297
298 foreach my $word (@words) {
299 # If printing this word would exceed the column end, start a new line
300 if (($linelength + length($word)) >= $columnwidth) {
301 $newtext .= "\n";
302 $linelength = 0;
303 }
304
305 # Write the word
306 $newtext .= " $word";
307 if ($word =~ /\n/) {
308 $linelength = 0;
309 } else {
310 $linelength = $linelength + length(" $word");
311 }
312 }
313
314 $newtext .= "\n";
315 return $newtext;
316}
317
318
319sub extract_metadata
320{
321 my $self = shift (@_);
322
323 my ($marc, $metadata, $encoding, $doc_obj, $section) = @_;
324 my $outhandle = $self->{'outhandle'};
325
326 if (!defined $marc){
327 return;
328 }
329
330 my $metadata_mapping = $self->{'metadata_mapping'};;
331
332 foreach my $marc_field ( keys %$metadata_mapping )
333 {
334 my $gsdl_field = $metadata_mapping->{$marc_field};
335 my $meta_value = undef;
336
337 # have we got a subfield?
338 my $subfield = undef;
339 if ($marc_field =~ /(\d\d\d)(?:\$|\^)?(\w)/){
340 $marc_field = $1;
341 $subfield = $2;
342 }
343 if (defined $subfield) {
344 $meta_value = $marc->subfield($marc_field, $subfield);
345 if (defined $meta_value) {
346 ## escape [ and ]
347 $meta_value =~ s/\[/\\\[/g;
348 $meta_value =~ s/\]/\\\]/g;
349 my $metavalue_str = $self->to_utf8($encoding, $meta_value);
350 $doc_obj->add_utf8_metadata ($section, $gsdl_field, $metavalue_str);
351 }
352 } else {
353
354 my @metavalues = $marc->field($marc_field);
355
356 if (scalar(@metavalues)>0)
357 {
358 my $metavalue = undef;
359 foreach $metavalue ( @metavalues )
360 {
361 my $metavalue_str = $self->to_utf8($encoding,$metavalue->as_string());
362 $doc_obj->add_utf8_metadata ($section, $gsdl_field, $metavalue_str);
363 }
364 }
365 }
366 }
367}
368
369
370sub extract_ascii_metadata
371{
372 my $self = shift (@_);
373
374 my ($text, $metadata,$doc_obj, $section) = @_;
375 my $outhandle = $self->{'outhandle'};
376 my $metadata_mapping = $self->{'metadata_mapping'};
377 ## get fields
378 my @fields = split(/[\n\r]+/,$text);
379 my $marc_mapping ={};
380
381 foreach my $field (@fields){
382 if ($field ne ""){
383 $field =~ /^(\d\d\d)\s/;
384 my $code = $1;
385 $field = $'; #'
386 ##get subfields
387 my @subfields = split(/\$/,$field);
388 my $i=0;
389 $marc_mapping->{$code} = [];
390 foreach my $subfield (@subfields){
391 if ($i == 0){
392 ##print STDERR $subfield."\n";
393 push(@{$marc_mapping->{$code}},"info");
394 push(@{$marc_mapping->{$code}},$subfield);
395
396 $i++;
397 }
398 else{
399 $subfield =~ /(\w)\s/;
400 ##print STDERR "$1=>$'\n";
401 push(@{$marc_mapping->{$code}},$1);
402 push(@{$marc_mapping->{$code}},$'); #'
403 }
404 }
405 }
406 }
407
408
409 foreach my $marc_field ( keys %$metadata_mapping )
410 {
411
412 my $matched_field = $marc_mapping->{$marc_field};
413 my $subfield = undef;
414
415 if (defined $matched_field){
416 ## test whether this field has subfield
417 if ($marc_field =~ /\d\d\d(\w)/){
418 $subfield = $1;
419 }
420 my $metaname = $metadata_mapping->{$marc_field};
421
422 my $metavalue;
423 if (defined $subfield){
424 my %mapped_subfield = {@$matched_field};
425 $metavalue = $mapped_subfield{$subfield};
426 }
427 else{ ## get all values except info
428 my $i =0;
429 foreach my $value (@$matched_field){
430 if ($i%2 != 0 and $i != 1){
431 $metavalue .= $value." ";
432 }
433 $i++;
434 }
435 }
436
437 ## escape [ and ]
438 $metavalue =~ s/\[/\\\[/g;
439 $metavalue =~ s/\]/\\\]/g;
440 ##print STDERR "$metaname=$metavalue\n";
441 $doc_obj->add_metadata ($section, $metaname, $metavalue) ;
442 }
443
444 }
445
446}
447
448
4491;
Note: See TracBrowser for help on using the repository browser.