source: main/trunk/greenstone2/perllib/plugins/MARCPlugin.pm@ 27106

Last change on this file since 27106 was 27106, checked in by kjdon, 11 years ago

need to do the same utf8 decode step that is used in ReadTextFile on the text and metadata so that we get proper utf8 strings

  • Property svn:keywords set to Author Date Id Revision
File size: 12.7 KB
Line 
1###########################################################################
2#
3# MARCPlugin.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlugin;
28
29use SplitTextFile;
30use MetadataRead;
31
32use Encode;
33
34use unicode;
35use util;
36use marcmapping;
37
38use strict;
39no strict 'refs'; # allow filehandles to be variables and viceversa
40
41# methods defined in superclasses that have the same signature take
42# precedence in the order given in the ISA list. We want MetaPlugins to
43# call MetadataRead's can_process_this_file_for_metadata(), rather than
44# calling BasePlugin's version of the same method, so list inherited
45# superclasses in this order.
46sub BEGIN {
47 @MARCPlugin::ISA = ('MetadataRead', 'SplitTextFile');
48 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
49}
50
51my $arguments =
52 [ { 'name' => "metadata_mapping",
53 'desc' => "{common.deprecated} {MARCPlugin.metadata_mapping}",
54 'type' => "string",
55 'deft' => "",
56 'hiddengli' => "yes", # deprecated in favour of 'metadata_mapping_file'
57 'reqd' => "no" },
58 { 'name' => "metadata_mapping_file",
59 'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
60 'type' => "string",
61 'deft' => "marc2dc.txt",
62 'reqd' => "no" },
63 { 'name' => "process_exp",
64 'desc' => "{BasePlugin.process_exp}",
65 'type' => "regexp",
66 'reqd' => "no",
67 'deft' => &get_default_process_exp() },
68 { 'name' => "split_exp",
69 'desc' => "{SplitTextFile.split_exp}",
70 'type' => "regexp",
71 'reqd' => "no",
72 'deft' => &get_default_split_exp() }
73 ];
74
75my $options = { 'name' => "MARCPlugin",
76 'desc' => "{MARCPlugin.desc}",
77 'abstract' => "no",
78 'inherits' => "yes",
79 'explodes' => "yes",
80 'args' => $arguments };
81
82require MARC::Record;
83require MARC::Batch;
84#use MARC::Record;
85#use MARC::Batch;
86
87sub new {
88 my ($class) = shift (@_);
89 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
90 push(@$pluginlist, $class);
91
92 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
93 push(@{$hashArgOptLists->{"OptList"}},$options);
94
95 # this does nothing yet, but if member vars are ever added
96 # to MetadataRead, will need to do this anyway:
97 #new MetadataRead($pluginlist, $inputargs, $hashArgOptLists);
98 my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
99
100 if ($self->{'info_only'}) {
101 # don't worry about the options
102 return bless $self, $class;
103 }
104 # 'metadata_mapping' was used in two ways in the plugin: as a plugin
105 # option (filename) and as a datastructure to represent the mapping.
106 # In MARXXMLPlug (written later) the two are separated: filename is
107 # represented through 'metadata_mapping_file' and the data-structure
108 # mapping left as 'metadata_mapping'
109 # 'metadata_mapping' still present (but hidden in GLI) for
110 # backwards compatibility, but 'metadata_mapping_file' is used by
111 # preference
112
113 if ($self->{'metadata_mapping'} ne "") {
114 print STDERR "MARCPlugin WARNING:: the metadata_mapping option is set but has been deprecated. Please use metadata_mapping_file option instead\n";
115 # If the old version is set, use it.
116 $self->{'metadata_mapping_file'} = $self->{'metadata_mapping'};
117 }
118 $self->{'metadata_mapping'} = undef;
119 $self->{'type'} = "";
120 return bless $self, $class;
121}
122
123sub init {
124 my $self = shift (@_);
125 my ($verbosity, $outhandle, $failhandle) = @_;
126
127 ## the mapping file has already been loaded
128 if (defined $self->{'metadata_mapping'} ){
129 $self->SUPER::init(@_);
130 return;
131 }
132
133 # read in the metadata mapping files
134 my $mm_files = &util::locate_config_files($self->{'metadata_mapping_file'});
135 if (scalar(@$mm_files)==0)
136 {
137 my $msg = "MARCPlugin ERROR: Can't locate mapping file \"" .
138 $self->{'metadata_mapping_file'} . "\".\n " .
139 " No metadata will be extracted from MARC files.\n";
140
141 print $outhandle $msg;
142 print $failhandle $msg;
143 $self->{'metadata_mapping'} = undef;
144 # We pick up the error in process() if there is no $mm_file
145 # If we exit here, then pluginfo.pl will exit too!
146 }
147 else {
148 $self->{'metadata_mapping'} = &marcmapping::parse_marc_metadata_mapping($mm_files, $outhandle);
149 }
150
151 ##map { print STDERR $_."=>".$self->{'metadata_mapping'}->{$_}."\n"; } keys %{$self->{'metadata_mapping'}};
152
153 $self->SUPER::init(@_);
154}
155
156
157
158sub get_default_process_exp {
159 my $self = shift (@_);
160
161 return q^(?i)(\.marc)$^;
162}
163
164
165sub get_default_split_exp {
166 # \r\n for msdos eol, \n for unix
167 return q^\r?\n\s*\r?\n|\[\w+\]Record type: USmarc^;
168}
169
170
171
172# The bulk of this function is based on read_line in multiread.pm
173# Unable to use read_line original because it expects to get its input
174# from a file. Here the line to be converted is passed in as a string
175
176sub to_utf8
177{
178 my $self = shift (@_);
179 my ($encoding, $line) = @_;
180
181 if ($encoding eq "utf8") {
182 # nothing needs to be done
183 #return $line;
184 } elsif ($encoding eq "iso_8859_1") {
185 # we'll use ascii2utf8() for this as it's faster than going
186 # through convert2unicode()
187 #return &unicode::ascii2utf8 (\$line);
188 $line = &unicode::ascii2utf8 (\$line);
189 } else {
190
191 # everything else uses unicode::convert2unicode
192 $line = &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
193 }
194 # At this point $line is a binary byte string
195 # => turn it into a Unicode aware string, so full
196 # Unicode aware pattern matching can be used.
197 # For instance: 's/\x{0101}//g' or '[[:upper:]]'
198
199 return decode ("utf8", $line);
200}
201
202
203sub read_file {
204 my $self = shift (@_);
205 my ($filename, $encoding, $language, $textref) = @_;
206
207 my $outhandle = $self->{'outhandle'};
208
209 if (! defined($self->{'metadata_mapping'}))
210 {
211 # print a warning
212 print $outhandle "MARCPlugin: no metadata mapping file! Can't extract metadata from $filename\n";
213 }
214
215 $self->{'readfile_encoding'}->{$filename} = $encoding;
216
217
218 if (!-r $filename)
219 {
220 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
221 return;
222 }
223
224 ##handle ascii marc
225 #test whether this is ascii marc file
226 if (open (FILE, $filename)) {
227 while (defined (my $line = <FILE>)) {
228 $$textref .= $line;
229 if ($line =~ /\[\w+\]Record type:/){
230 undef $/;
231 $$textref .= <FILE>;
232 $/ = "\n";
233 $self->{'type'} = "ascii";
234 close FILE;
235 return;
236 }
237 }
238 close FILE;
239 }
240
241
242 $$textref = "";
243 my @marc_entries = ();
244
245 my $batch = new MARC::Batch( 'USMARC', $filename );
246 while ( my $marc = $batch->next )
247 {
248 push(@marc_entries,$marc);
249 $$textref .= $marc->as_formatted();
250 $$textref .= "\n\n"; # for SplitTextFile - see default_split_exp above...
251 }
252
253 $self->{'marc_entries'}->{$filename} = \@marc_entries;
254}
255
256
257
258# do plugin specific processing of doc_obj
259# This gets done for each record found by SplitTextFile in marc files.
260sub process {
261 my $self = shift (@_);
262 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
263
264 my $outhandle = $self->{'outhandle'};
265 my $filename = &util::filename_cat($base_dir, $file);
266
267 my $cursection = $doc_obj->get_top_section();
268
269 # Add fileFormat as the metadata
270 $doc_obj->add_metadata($cursection, "FileFormat", "MARC");
271
272 my $marc_entries = $self->{'marc_entries'}->{$filename};
273 my $marc = shift(@$marc_entries);
274
275 my $encoding = $self->{'readfile_encoding'}->{$filename};
276 if (defined ($self->{'metadata_mapping'}) ) {
277 if ($self->{'type'} ne "ascii" ){
278 $self->extract_metadata ($marc, $metadata, $encoding, $doc_obj, $cursection);
279 }
280 else{
281 $self->extract_ascii_metadata ($$textref,$metadata,$doc_obj, $cursection);
282 }
283 }
284
285 # add spaces after the sub-field markers, for word boundaries
286 $$textref =~ s/^(.{6} _\w)/$1 /gm;
287
288 # add text to document object
289 $$textref =~ s/</&lt;/g;
290 $$textref =~ s/>/&gt;/g;
291
292 $$textref = $self->to_utf8($encoding,$$textref);
293
294 print $outhandle " Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
295 if $self->{'verbosity'} > 2;
296
297 # line wrapping
298 $$textref = &wrap_text_in_columns($$textref, 64);
299 $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
300
301 $doc_obj->add_utf8_text($cursection, $$textref);
302
303 return 1;
304}
305
306sub wrap_text_in_columns
307{
308 my ($text, $columnwidth) = @_;
309 my $newtext = "";
310 my $linelength = 0;
311
312 # Break the text into words, and display one at a time
313 my @words = split(/ /, $text);
314
315 foreach my $word (@words) {
316 # If printing this word would exceed the column end, start a new line
317 if (($linelength + length($word)) >= $columnwidth) {
318 $newtext .= "\n";
319 $linelength = 0;
320 }
321
322 # Write the word
323 $newtext .= " $word";
324 if ($word =~ /\n/) {
325 $linelength = 0;
326 } else {
327 $linelength = $linelength + length(" $word");
328 }
329 }
330
331 $newtext .= "\n";
332 return $newtext;
333}
334
335
336sub extract_metadata
337{
338 my $self = shift (@_);
339
340 my ($marc, $metadata, $encoding, $doc_obj, $section) = @_;
341 my $outhandle = $self->{'outhandle'};
342
343 if (!defined $marc){
344 return;
345 }
346
347 my $metadata_mapping = $self->{'metadata_mapping'};;
348
349 foreach my $marc_field ( keys %$metadata_mapping )
350 {
351 my $gsdl_field = $metadata_mapping->{$marc_field};
352
353 # have we got a subfield?
354 my $subfield = undef;
355 if ($marc_field =~ /(\d\d\d)(?:\$|\^)?(\w)/){
356 $marc_field = $1;
357 $subfield = $2;
358 }
359 if (defined $subfield) {
360 my $meta_value = $marc->subfield($marc_field, $subfield);
361 if (defined $meta_value) {
362 # Square brackets in metadata values need to be escaped so they don't confuse Greenstone/GLI
363 $meta_value =~ s/\[/&\#091;/g;
364 $meta_value =~ s/\]/&\#093;/g;
365 my $metavalue_str = $self->to_utf8($encoding, $meta_value);
366 $doc_obj->add_utf8_metadata ($section, $gsdl_field, $metavalue_str);
367 }
368 }
369 else
370 {
371 foreach my $meta_value_obj ($marc->field($marc_field))
372 {
373 my $meta_value = $meta_value_obj->as_string();
374
375 # Square brackets in metadata values need to be escaped so they don't confuse Greenstone/GLI
376 $meta_value =~ s/\[/&\#091;/g;
377 $meta_value =~ s/\]/&\#093;/g;
378 my $metavalue_str = $self->to_utf8($encoding, $meta_value);
379 $doc_obj->add_utf8_metadata ($section, $gsdl_field, $metavalue_str);
380 }
381 }
382 }
383}
384
385
386sub extract_ascii_metadata
387{
388 my $self = shift (@_);
389
390 my ($text, $metadata,$doc_obj, $section) = @_;
391 my $outhandle = $self->{'outhandle'};
392 my $metadata_mapping = $self->{'metadata_mapping'};
393 ## get fields
394 my @fields = split(/[\n\r]+/,$text);
395 my $marc_mapping ={};
396
397 foreach my $field (@fields){
398 if ($field ne ""){
399 $field =~ /^(\d\d\d)\s/;
400 my $code = $1;
401 $field = $'; #'
402 ##get subfields
403 my @subfields = split(/\$/,$field);
404 my $i=0;
405 $marc_mapping->{$code} = [];
406 foreach my $subfield (@subfields){
407 if ($i == 0){
408 ##print STDERR $subfield."\n";
409 push(@{$marc_mapping->{$code}},"info");
410 push(@{$marc_mapping->{$code}},$subfield);
411
412 $i++;
413 }
414 else{
415 $subfield =~ /(\w)\s/;
416 ##print STDERR "$1=>$'\n";
417 push(@{$marc_mapping->{$code}},$1);
418 push(@{$marc_mapping->{$code}},$'); #'
419 }
420 }
421 }
422 }
423
424
425 foreach my $marc_field ( keys %$metadata_mapping )
426 {
427
428 my $matched_field = $marc_mapping->{$marc_field};
429 my $subfield = undef;
430
431 if (defined $matched_field){
432 ## test whether this field has subfield
433 if ($marc_field =~ /\d\d\d(\w)/){
434 $subfield = $1;
435 }
436 my $metaname = $metadata_mapping->{$marc_field};
437
438 my $metavalue;
439 if (defined $subfield){
440 my %mapped_subfield = {@$matched_field};
441 $metavalue = $mapped_subfield{$subfield};
442 }
443 else{ ## get all values except info
444 my $i =0;
445 foreach my $value (@$matched_field){
446 if ($i%2 != 0 and $i != 1){
447 $metavalue .= $value." ";
448 }
449 $i++;
450 }
451 }
452
453 ## escape [ and ]
454 $metavalue =~ s/\[/\\\[/g;
455 $metavalue =~ s/\]/\\\]/g;
456 ##print STDERR "$metaname=$metavalue\n";
457 $doc_obj->add_metadata ($section, $metaname, $metavalue) ;
458 }
459
460 }
461
462}
463
464
4651;
Note: See TracBrowser for help on using the repository browser.