source: main/trunk/greenstone2/perllib/plugins/MARCPlugin.pm@ 27141

Last change on this file since 27141 was 27141, checked in by kjdon, 11 years ago

fixed extract_metadata so that it will get all ocurrences of a subfield, not just the first one. NOTE, haven't done the same for extract_ascii_metadata

  • Property svn:keywords set to Author Date Id Revision
File size: 12.4 KB
RevLine 
[3430]1###########################################################################
2#
[15872]3# MARCPlugin.pm -- basic MARC plugin
[3430]4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
[3508]9# Copyright (C) 2002 New Zealand Digital Library Project
[3430]10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
[15872]27package MARCPlugin;
[3430]28
[15872]29use SplitTextFile;
[24547]30use MetadataRead;
[3430]31
[27106]32use Encode;
33
[3430]34use unicode;
35use util;
[16692]36use marcmapping;
[3430]37
[10254]38use strict;
39no strict 'refs'; # allow filehandles to be variables and viceversa
40
[24547]41# methods defined in superclasses that have the same signature take
42# precedence in the order given in the ISA list. We want MetaPlugins to
43# call MetadataRead's can_process_this_file_for_metadata(), rather than
44# calling BasePlugin's version of the same method, so list inherited
45# superclasses in this order.
[3430]46sub BEGIN {
[24547]47 @MARCPlugin::ISA = ('MetadataRead', 'SplitTextFile');
[5878]48 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
[3430]49}
50
[4744]51my $arguments =
52 [ { 'name' => "metadata_mapping",
[16692]53 'desc' => "{common.deprecated} {MARCPlugin.metadata_mapping}",
[4744]54 'type' => "string",
[16692]55 'deft' => "",
[15962]56 'hiddengli' => "yes", # deprecated in favour of 'metadata_mapping_file'
[6408]57 'reqd' => "no" },
[15018]58 { 'name' => "metadata_mapping_file",
[16013]59 'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
[15018]60 'type' => "string",
[18563]61 'deft' => "marc2dc.txt",
[15018]62 'reqd' => "no" },
[6408]63 { 'name' => "process_exp",
[15872]64 'desc' => "{BasePlugin.process_exp}",
[6408]65 'type' => "regexp",
66 'reqd' => "no",
67 'deft' => &get_default_process_exp() },
68 { 'name' => "split_exp",
[15872]69 'desc' => "{SplitTextFile.split_exp}",
[6408]70 'type' => "regexp",
71 'reqd' => "no",
72 'deft' => &get_default_split_exp() }
73 ];
[4744]74
[15872]75my $options = { 'name' => "MARCPlugin",
76 'desc' => "{MARCPlugin.desc}",
[6408]77 'abstract' => "no",
[11676]78 'inherits' => "yes",
[9118]79 'explodes' => "yes",
[4744]80 'args' => $arguments };
81
[5866]82require MARC::Record;
83require MARC::Batch;
[6332]84#use MARC::Record;
85#use MARC::Batch;
[3430]86
87sub new {
[10218]88 my ($class) = shift (@_);
89 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
90 push(@$pluginlist, $class);
[3430]91
[15872]92 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
93 push(@{$hashArgOptLists->{"OptList"}},$options);
[24547]94
95 # this does nothing yet, but if member vars are ever added
96 # to MetadataRead, will need to do this anyway:
97 #new MetadataRead($pluginlist, $inputargs, $hashArgOptLists);
[15872]98 my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
[3430]99
[15962]100 if ($self->{'info_only'}) {
101 # don't worry about the options
102 return bless $self, $class;
103 }
[15018]104 # 'metadata_mapping' was used in two ways in the plugin: as a plugin
105 # option (filename) and as a datastructure to represent the mapping.
106 # In MARXXMLPlug (written later) the two are separated: filename is
107 # represented through 'metadata_mapping_file' and the data-structure
108 # mapping left as 'metadata_mapping'
109 # 'metadata_mapping' still present (but hidden in GLI) for
110 # backwards compatibility, but 'metadata_mapping_file' is used by
111 # preference
112
[16692]113 if ($self->{'metadata_mapping'} ne "") {
114 print STDERR "MARCPlugin WARNING:: the metadata_mapping option is set but has been deprecated. Please use metadata_mapping_file option instead\n";
115 # If the old version is set, use it.
[15018]116 $self->{'metadata_mapping_file'} = $self->{'metadata_mapping'};
117 }
[16692]118 $self->{'metadata_mapping'} = undef;
[13198]119 $self->{'type'} = "";
[3430]120 return bless $self, $class;
121}
122
123sub init {
124 my $self = shift (@_);
125 my ($verbosity, $outhandle, $failhandle) = @_;
126
[16692]127 ## the mapping file has already been loaded
128 if (defined $self->{'metadata_mapping'} ){
129 $self->SUPER::init(@_);
130 return;
131 }
[3430]132
[16692]133 # read in the metadata mapping files
134 my $mm_files = &util::locate_config_files($self->{'metadata_mapping_file'});
135 if (scalar(@$mm_files)==0)
[3430]136 {
[15872]137 my $msg = "MARCPlugin ERROR: Can't locate mapping file \"" .
[16692]138 $self->{'metadata_mapping_file'} . "\".\n " .
[16697]139 " No metadata will be extracted from MARC files.\n";
[3430]140
141 print $outhandle $msg;
142 print $failhandle $msg;
[10218]143 $self->{'metadata_mapping'} = undef;
[3430]144 # We pick up the error in process() if there is no $mm_file
145 # If we exit here, then pluginfo.pl will exit too!
146 }
[16692]147 else {
148 $self->{'metadata_mapping'} = &marcmapping::parse_marc_metadata_mapping($mm_files, $outhandle);
[3430]149 }
150
[16692]151 ##map { print STDERR $_."=>".$self->{'metadata_mapping'}->{$_}."\n"; } keys %{$self->{'metadata_mapping'}};
[3430]152
153 $self->SUPER::init(@_);
154}
155
[16692]156
157
[3430]158sub get_default_process_exp {
159 my $self = shift (@_);
160
161 return q^(?i)(\.marc)$^;
162}
163
164
165sub get_default_split_exp {
166 # \r\n for msdos eol, \n for unix
[13198]167 return q^\r?\n\s*\r?\n|\[\w+\]Record type: USmarc^;
[3430]168}
169
170
171
[4791]172# The bulk of this function is based on read_line in multiread.pm
173# Unable to use read_line original because it expects to get its input
174# from a file. Here the line to be converted is passed in as a string
175
176sub to_utf8
177{
178 my $self = shift (@_);
179 my ($encoding, $line) = @_;
180
[14964]181 if ($encoding eq "utf8") {
182 # nothing needs to be done
[27106]183 #return $line;
184 } elsif ($encoding eq "iso_8859_1") {
[4791]185 # we'll use ascii2utf8() for this as it's faster than going
186 # through convert2unicode()
[27106]187 #return &unicode::ascii2utf8 (\$line);
188 $line = &unicode::ascii2utf8 (\$line);
189 } else {
190
191 # everything else uses unicode::convert2unicode
192 $line = &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
[4791]193 }
[27106]194 # At this point $line is a binary byte string
195 # => turn it into a Unicode aware string, so full
196 # Unicode aware pattern matching can be used.
197 # For instance: 's/\x{0101}//g' or '[[:upper:]]'
[4791]198
[27106]199 return decode ("utf8", $line);
[4791]200}
201
202
[3430]203sub read_file {
204 my $self = shift (@_);
205 my ($filename, $encoding, $language, $textref) = @_;
206
[16697]207 my $outhandle = $self->{'outhandle'};
208
209 if (! defined($self->{'metadata_mapping'}))
210 {
211 # print a warning
[16970]212 print $outhandle "MARCPlugin: no metadata mapping file! Can't extract metadata from $filename\n";
[16697]213 }
214
[9494]215 $self->{'readfile_encoding'}->{$filename} = $encoding;
[4791]216
[13198]217
[3430]218 if (!-r $filename)
219 {
220 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
221 return;
222 }
223
[13198]224 ##handle ascii marc
225 #test whether this is ascii marc file
226 if (open (FILE, $filename)) {
227 while (defined (my $line = <FILE>)) {
228 $$textref .= $line;
229 if ($line =~ /\[\w+\]Record type:/){
230 undef $/;
231 $$textref .= <FILE>;
232 $/ = "\n";
233 $self->{'type'} = "ascii";
234 close FILE;
235 return;
236 }
237 }
238 close FILE;
239 }
240
241
242 $$textref = "";
243 my @marc_entries = ();
244
[3430]245 my $batch = new MARC::Batch( 'USMARC', $filename );
246 while ( my $marc = $batch->next )
247 {
[13198]248 push(@marc_entries,$marc);
[3430]249 $$textref .= $marc->as_formatted();
[15872]250 $$textref .= "\n\n"; # for SplitTextFile - see default_split_exp above...
[3430]251 }
252
[9493]253 $self->{'marc_entries'}->{$filename} = \@marc_entries;
[3430]254}
255
256
257
258# do plugin specific processing of doc_obj
[15872]259# This gets done for each record found by SplitTextFile in marc files.
[3430]260sub process {
261 my $self = shift (@_);
[6332]262 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[9494]263
[3430]264 my $outhandle = $self->{'outhandle'};
[9494]265 my $filename = &util::filename_cat($base_dir, $file);
[3430]266
267 my $cursection = $doc_obj->get_top_section();
268
[8121]269 # Add fileFormat as the metadata
270 $doc_obj->add_metadata($cursection, "FileFormat", "MARC");
271
[9493]272 my $marc_entries = $self->{'marc_entries'}->{$filename};
[3430]273 my $marc = shift(@$marc_entries);
274
[9494]275 my $encoding = $self->{'readfile_encoding'}->{$filename};
[16697]276 if (defined ($self->{'metadata_mapping'}) ) {
277 if ($self->{'type'} ne "ascii" ){
278 $self->extract_metadata ($marc, $metadata, $encoding, $doc_obj, $cursection);
279 }
280 else{
281 $self->extract_ascii_metadata ($$textref,$metadata,$doc_obj, $cursection);
282 }
[13198]283 }
284
[3430]285 # add spaces after the sub-field markers, for word boundaries
286 $$textref =~ s/^(.{6} _\w)/$1 /gm;
287
288 # add text to document object
289 $$textref =~ s/</&lt;/g;
290 $$textref =~ s/>/&gt;/g;
291
[4791]292 $$textref = $self->to_utf8($encoding,$$textref);
293
[3430]294 print $outhandle " Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
295 if $self->{'verbosity'} > 2;
296
[7533]297 # line wrapping
[7553]298 $$textref = &wrap_text_in_columns($$textref, 64);
[3430]299 $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
300
301 $doc_obj->add_utf8_text($cursection, $$textref);
302
303 return 1;
304}
305
[7547]306sub wrap_text_in_columns
307{
308 my ($text, $columnwidth) = @_;
309 my $newtext = "";
310 my $linelength = 0;
311
312 # Break the text into words, and display one at a time
313 my @words = split(/ /, $text);
[3430]314
[10254]315 foreach my $word (@words) {
[7547]316 # If printing this word would exceed the column end, start a new line
317 if (($linelength + length($word)) >= $columnwidth) {
318 $newtext .= "\n";
319 $linelength = 0;
320 }
321
322 # Write the word
323 $newtext .= " $word";
324 if ($word =~ /\n/) {
325 $linelength = 0;
326 } else {
327 $linelength = $linelength + length(" $word");
328 }
329 }
[3430]330
[7547]331 $newtext .= "\n";
332 return $newtext;
333}
334
[16692]335sub extract_metadata
[3430]336{
337 my $self = shift (@_);
[16692]338
[9494]339 my ($marc, $metadata, $encoding, $doc_obj, $section) = @_;
[3430]340 my $outhandle = $self->{'outhandle'};
341
[13198]342 if (!defined $marc){
343 return;
344 }
345
[16692]346 my $metadata_mapping = $self->{'metadata_mapping'};;
[13198]347
[16692]348 foreach my $marc_field ( keys %$metadata_mapping )
[3430]349 {
[16692]350 my $gsdl_field = $metadata_mapping->{$marc_field};
351
352 # have we got a subfield?
353 my $subfield = undef;
354 if ($marc_field =~ /(\d\d\d)(?:\$|\^)?(\w)/){
355 $marc_field = $1;
356 $subfield = $2;
357 }
[27141]358
359 foreach my $meta_value_obj ($marc->field($marc_field)) {
360 my $meta_value;
361 if (defined($subfield)) {
362 $meta_value = $meta_value_obj->subfield($subfield);
363 } else {
364 $meta_value = $meta_value_obj->as_string();
365 }
[16692]366 if (defined $meta_value) {
[18402]367 # Square brackets in metadata values need to be escaped so they don't confuse Greenstone/GLI
368 $meta_value =~ s/\[/&\#091;/g;
369 $meta_value =~ s/\]/&\#093;/g;
[16692]370 my $metavalue_str = $self->to_utf8($encoding, $meta_value);
371 $doc_obj->add_utf8_metadata ($section, $gsdl_field, $metavalue_str);
372 }
[18402]373 }
[3430]374 }
375}
[13198]376
[16692]377
[13198]378sub extract_ascii_metadata
379{
380 my $self = shift (@_);
[16692]381
[13198]382 my ($text, $metadata,$doc_obj, $section) = @_;
383 my $outhandle = $self->{'outhandle'};
384 my $metadata_mapping = $self->{'metadata_mapping'};
385 ## get fields
386 my @fields = split(/[\n\r]+/,$text);
387 my $marc_mapping ={};
388
389 foreach my $field (@fields){
390 if ($field ne ""){
391 $field =~ /^(\d\d\d)\s/;
392 my $code = $1;
[16692]393 $field = $'; #'
[13198]394 ##get subfields
395 my @subfields = split(/\$/,$field);
396 my $i=0;
397 $marc_mapping->{$code} = [];
398 foreach my $subfield (@subfields){
399 if ($i == 0){
400 ##print STDERR $subfield."\n";
401 push(@{$marc_mapping->{$code}},"info");
402 push(@{$marc_mapping->{$code}},$subfield);
403
404 $i++;
405 }
406 else{
407 $subfield =~ /(\w)\s/;
408 ##print STDERR "$1=>$'\n";
409 push(@{$marc_mapping->{$code}},$1);
[16692]410 push(@{$marc_mapping->{$code}},$'); #'
[13198]411 }
412 }
413 }
414 }
415
416
[16692]417 foreach my $marc_field ( keys %$metadata_mapping )
[13198]418 {
419
420 my $matched_field = $marc_mapping->{$marc_field};
421 my $subfield = undef;
422
423 if (defined $matched_field){
424 ## test whether this field has subfield
425 if ($marc_field =~ /\d\d\d(\w)/){
426 $subfield = $1;
427 }
[16692]428 my $metaname = $metadata_mapping->{$marc_field};
[13198]429
430 my $metavalue;
431 if (defined $subfield){
432 my %mapped_subfield = {@$matched_field};
433 $metavalue = $mapped_subfield{$subfield};
434 }
435 else{ ## get all values except info
436 my $i =0;
437 foreach my $value (@$matched_field){
438 if ($i%2 != 0 and $i != 1){
439 $metavalue .= $value." ";
440 }
441 $i++;
442 }
443 }
444
445 ## escape [ and ]
446 $metavalue =~ s/\[/\\\[/g;
447 $metavalue =~ s/\]/\\\]/g;
448 ##print STDERR "$metaname=$metavalue\n";
449 $doc_obj->add_metadata ($section, $metaname, $metavalue) ;
450 }
451
452 }
453
454}
455
456
[3430]4571;
Note: See TracBrowser for help on using the repository browser.