source: main/trunk/greenstone2/perllib/plugins/MARCPlugin.pm@ 24547

Last change on this file since 24547 was 24547, checked in by ak19, 13 years ago

Added new abstract plugin MetadataRead that defines can_process_this_file_for_metadata that MetadataPlugin subclasses can inherit (if MetadataRead is listed first in the ISA inheritance list) and which will then override the one defined in BasePlugin. For now committing MARC, ISIS and OAIPlugins which now additionally inherit from MetadataRead. Other metadataPlugins also need to be committed.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.4 KB
Line 
1###########################################################################
2#
3# MARCPlugin.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlugin;
28
29use SplitTextFile;
30use MetadataRead;
31
32use unicode;
33use util;
34use marcmapping;
35
36use strict;
37no strict 'refs'; # allow filehandles to be variables and viceversa
38
39# methods defined in superclasses that have the same signature take
40# precedence in the order given in the ISA list. We want MetaPlugins to
41# call MetadataRead's can_process_this_file_for_metadata(), rather than
42# calling BasePlugin's version of the same method, so list inherited
43# superclasses in this order.
44sub BEGIN {
45 @MARCPlugin::ISA = ('MetadataRead', 'SplitTextFile');
46 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
47}
48
49my $arguments =
50 [ { 'name' => "metadata_mapping",
51 'desc' => "{common.deprecated} {MARCPlugin.metadata_mapping}",
52 'type' => "string",
53 'deft' => "",
54 'hiddengli' => "yes", # deprecated in favour of 'metadata_mapping_file'
55 'reqd' => "no" },
56 { 'name' => "metadata_mapping_file",
57 'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
58 'type' => "string",
59 'deft' => "marc2dc.txt",
60 'reqd' => "no" },
61 { 'name' => "process_exp",
62 'desc' => "{BasePlugin.process_exp}",
63 'type' => "regexp",
64 'reqd' => "no",
65 'deft' => &get_default_process_exp() },
66 { 'name' => "split_exp",
67 'desc' => "{SplitTextFile.split_exp}",
68 'type' => "regexp",
69 'reqd' => "no",
70 'deft' => &get_default_split_exp() }
71 ];
72
73my $options = { 'name' => "MARCPlugin",
74 'desc' => "{MARCPlugin.desc}",
75 'abstract' => "no",
76 'inherits' => "yes",
77 'explodes' => "yes",
78 'args' => $arguments };
79
80require MARC::Record;
81require MARC::Batch;
82#use MARC::Record;
83#use MARC::Batch;
84
85sub new {
86 my ($class) = shift (@_);
87 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
88 push(@$pluginlist, $class);
89
90 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
91 push(@{$hashArgOptLists->{"OptList"}},$options);
92
93 # this does nothing yet, but if member vars are ever added
94 # to MetadataRead, will need to do this anyway:
95 #new MetadataRead($pluginlist, $inputargs, $hashArgOptLists);
96 my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
97
98 if ($self->{'info_only'}) {
99 # don't worry about the options
100 return bless $self, $class;
101 }
102 # 'metadata_mapping' was used in two ways in the plugin: as a plugin
103 # option (filename) and as a datastructure to represent the mapping.
104 # In MARXXMLPlug (written later) the two are separated: filename is
105 # represented through 'metadata_mapping_file' and the data-structure
106 # mapping left as 'metadata_mapping'
107 # 'metadata_mapping' still present (but hidden in GLI) for
108 # backwards compatibility, but 'metadata_mapping_file' is used by
109 # preference
110
111 if ($self->{'metadata_mapping'} ne "") {
112 print STDERR "MARCPlugin WARNING:: the metadata_mapping option is set but has been deprecated. Please use metadata_mapping_file option instead\n";
113 # If the old version is set, use it.
114 $self->{'metadata_mapping_file'} = $self->{'metadata_mapping'};
115 }
116 $self->{'metadata_mapping'} = undef;
117 $self->{'type'} = "";
118 return bless $self, $class;
119}
120
121sub init {
122 my $self = shift (@_);
123 my ($verbosity, $outhandle, $failhandle) = @_;
124
125 ## the mapping file has already been loaded
126 if (defined $self->{'metadata_mapping'} ){
127 $self->SUPER::init(@_);
128 return;
129 }
130
131 # read in the metadata mapping files
132 my $mm_files = &util::locate_config_files($self->{'metadata_mapping_file'});
133 if (scalar(@$mm_files)==0)
134 {
135 my $msg = "MARCPlugin ERROR: Can't locate mapping file \"" .
136 $self->{'metadata_mapping_file'} . "\".\n " .
137 " No metadata will be extracted from MARC files.\n";
138
139 print $outhandle $msg;
140 print $failhandle $msg;
141 $self->{'metadata_mapping'} = undef;
142 # We pick up the error in process() if there is no $mm_file
143 # If we exit here, then pluginfo.pl will exit too!
144 }
145 else {
146 $self->{'metadata_mapping'} = &marcmapping::parse_marc_metadata_mapping($mm_files, $outhandle);
147 }
148
149 ##map { print STDERR $_."=>".$self->{'metadata_mapping'}->{$_}."\n"; } keys %{$self->{'metadata_mapping'}};
150
151 $self->SUPER::init(@_);
152}
153
154
155
156sub get_default_process_exp {
157 my $self = shift (@_);
158
159 return q^(?i)(\.marc)$^;
160}
161
162
163sub get_default_split_exp {
164 # \r\n for msdos eol, \n for unix
165 return q^\r?\n\s*\r?\n|\[\w+\]Record type: USmarc^;
166}
167
168
169
170# The bulk of this function is based on read_line in multiread.pm
171# Unable to use read_line original because it expects to get its input
172# from a file. Here the line to be converted is passed in as a string
173
174sub to_utf8
175{
176 my $self = shift (@_);
177 my ($encoding, $line) = @_;
178
179 if ($encoding eq "utf8") {
180 # nothing needs to be done
181 return $line;
182 }
183
184 if ($encoding eq "iso_8859_1") {
185 # we'll use ascii2utf8() for this as it's faster than going
186 # through convert2unicode()
187 return &unicode::ascii2utf8 (\$line);
188 }
189
190 # everything else uses unicode::convert2unicode
191 return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
192}
193
194
195sub read_file {
196 my $self = shift (@_);
197 my ($filename, $encoding, $language, $textref) = @_;
198
199 my $outhandle = $self->{'outhandle'};
200
201 if (! defined($self->{'metadata_mapping'}))
202 {
203 # print a warning
204 print $outhandle "MARCPlugin: no metadata mapping file! Can't extract metadata from $filename\n";
205 }
206
207 $self->{'readfile_encoding'}->{$filename} = $encoding;
208
209
210 if (!-r $filename)
211 {
212 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
213 return;
214 }
215
216 ##handle ascii marc
217 #test whether this is ascii marc file
218 if (open (FILE, $filename)) {
219 while (defined (my $line = <FILE>)) {
220 $$textref .= $line;
221 if ($line =~ /\[\w+\]Record type:/){
222 undef $/;
223 $$textref .= <FILE>;
224 $/ = "\n";
225 $self->{'type'} = "ascii";
226 close FILE;
227 return;
228 }
229 }
230 close FILE;
231 }
232
233
234 $$textref = "";
235 my @marc_entries = ();
236
237 my $batch = new MARC::Batch( 'USMARC', $filename );
238 while ( my $marc = $batch->next )
239 {
240 push(@marc_entries,$marc);
241 $$textref .= $marc->as_formatted();
242 $$textref .= "\n\n"; # for SplitTextFile - see default_split_exp above...
243 }
244
245 $self->{'marc_entries'}->{$filename} = \@marc_entries;
246}
247
248
249
250# do plugin specific processing of doc_obj
251# This gets done for each record found by SplitTextFile in marc files.
252sub process {
253 my $self = shift (@_);
254 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
255
256 my $outhandle = $self->{'outhandle'};
257 my $filename = &util::filename_cat($base_dir, $file);
258
259 my $cursection = $doc_obj->get_top_section();
260
261 # Add fileFormat as the metadata
262 $doc_obj->add_metadata($cursection, "FileFormat", "MARC");
263
264 my $marc_entries = $self->{'marc_entries'}->{$filename};
265 my $marc = shift(@$marc_entries);
266
267 my $encoding = $self->{'readfile_encoding'}->{$filename};
268
269 if (defined ($self->{'metadata_mapping'}) ) {
270 if ($self->{'type'} ne "ascii" ){
271 $self->extract_metadata ($marc, $metadata, $encoding, $doc_obj, $cursection);
272 }
273 else{
274 $self->extract_ascii_metadata ($$textref,$metadata,$doc_obj, $cursection);
275 }
276 }
277
278 # add spaces after the sub-field markers, for word boundaries
279 $$textref =~ s/^(.{6} _\w)/$1 /gm;
280
281 # add text to document object
282 $$textref =~ s/</&lt;/g;
283 $$textref =~ s/>/&gt;/g;
284
285 $$textref = $self->to_utf8($encoding,$$textref);
286
287 print $outhandle " Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
288 if $self->{'verbosity'} > 2;
289
290 # line wrapping
291 $$textref = &wrap_text_in_columns($$textref, 64);
292 $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
293
294 $doc_obj->add_utf8_text($cursection, $$textref);
295
296 return 1;
297}
298
299sub wrap_text_in_columns
300{
301 my ($text, $columnwidth) = @_;
302 my $newtext = "";
303 my $linelength = 0;
304
305 # Break the text into words, and display one at a time
306 my @words = split(/ /, $text);
307
308 foreach my $word (@words) {
309 # If printing this word would exceed the column end, start a new line
310 if (($linelength + length($word)) >= $columnwidth) {
311 $newtext .= "\n";
312 $linelength = 0;
313 }
314
315 # Write the word
316 $newtext .= " $word";
317 if ($word =~ /\n/) {
318 $linelength = 0;
319 } else {
320 $linelength = $linelength + length(" $word");
321 }
322 }
323
324 $newtext .= "\n";
325 return $newtext;
326}
327
328
329sub extract_metadata
330{
331 my $self = shift (@_);
332
333 my ($marc, $metadata, $encoding, $doc_obj, $section) = @_;
334 my $outhandle = $self->{'outhandle'};
335
336 if (!defined $marc){
337 return;
338 }
339
340 my $metadata_mapping = $self->{'metadata_mapping'};;
341
342 foreach my $marc_field ( keys %$metadata_mapping )
343 {
344 my $gsdl_field = $metadata_mapping->{$marc_field};
345
346 # have we got a subfield?
347 my $subfield = undef;
348 if ($marc_field =~ /(\d\d\d)(?:\$|\^)?(\w)/){
349 $marc_field = $1;
350 $subfield = $2;
351 }
352 if (defined $subfield) {
353 my $meta_value = $marc->subfield($marc_field, $subfield);
354 if (defined $meta_value) {
355 # Square brackets in metadata values need to be escaped so they don't confuse Greenstone/GLI
356 $meta_value =~ s/\[/&\#091;/g;
357 $meta_value =~ s/\]/&\#093;/g;
358 my $metavalue_str = $self->to_utf8($encoding, $meta_value);
359 $doc_obj->add_utf8_metadata ($section, $gsdl_field, $metavalue_str);
360 }
361 }
362 else
363 {
364 foreach my $meta_value_obj ($marc->field($marc_field))
365 {
366 my $meta_value = $meta_value_obj->as_string();
367
368 # Square brackets in metadata values need to be escaped so they don't confuse Greenstone/GLI
369 $meta_value =~ s/\[/&\#091;/g;
370 $meta_value =~ s/\]/&\#093;/g;
371 my $metavalue_str = $self->to_utf8($encoding, $meta_value);
372 $doc_obj->add_utf8_metadata ($section, $gsdl_field, $metavalue_str);
373 }
374 }
375 }
376}
377
378
379sub extract_ascii_metadata
380{
381 my $self = shift (@_);
382
383 my ($text, $metadata,$doc_obj, $section) = @_;
384 my $outhandle = $self->{'outhandle'};
385 my $metadata_mapping = $self->{'metadata_mapping'};
386 ## get fields
387 my @fields = split(/[\n\r]+/,$text);
388 my $marc_mapping ={};
389
390 foreach my $field (@fields){
391 if ($field ne ""){
392 $field =~ /^(\d\d\d)\s/;
393 my $code = $1;
394 $field = $'; #'
395 ##get subfields
396 my @subfields = split(/\$/,$field);
397 my $i=0;
398 $marc_mapping->{$code} = [];
399 foreach my $subfield (@subfields){
400 if ($i == 0){
401 ##print STDERR $subfield."\n";
402 push(@{$marc_mapping->{$code}},"info");
403 push(@{$marc_mapping->{$code}},$subfield);
404
405 $i++;
406 }
407 else{
408 $subfield =~ /(\w)\s/;
409 ##print STDERR "$1=>$'\n";
410 push(@{$marc_mapping->{$code}},$1);
411 push(@{$marc_mapping->{$code}},$'); #'
412 }
413 }
414 }
415 }
416
417
418 foreach my $marc_field ( keys %$metadata_mapping )
419 {
420
421 my $matched_field = $marc_mapping->{$marc_field};
422 my $subfield = undef;
423
424 if (defined $matched_field){
425 ## test whether this field has subfield
426 if ($marc_field =~ /\d\d\d(\w)/){
427 $subfield = $1;
428 }
429 my $metaname = $metadata_mapping->{$marc_field};
430
431 my $metavalue;
432 if (defined $subfield){
433 my %mapped_subfield = {@$matched_field};
434 $metavalue = $mapped_subfield{$subfield};
435 }
436 else{ ## get all values except info
437 my $i =0;
438 foreach my $value (@$matched_field){
439 if ($i%2 != 0 and $i != 1){
440 $metavalue .= $value." ";
441 }
442 $i++;
443 }
444 }
445
446 ## escape [ and ]
447 $metavalue =~ s/\[/\\\[/g;
448 $metavalue =~ s/\]/\\\]/g;
449 ##print STDERR "$metaname=$metavalue\n";
450 $doc_obj->add_metadata ($section, $metaname, $metavalue) ;
451 }
452
453 }
454
455}
456
457
4581;
Note: See TracBrowser for help on using the repository browser.