source: main/trunk/greenstone2/perllib/plugins/MARCPlugin.pm@ 32215

Last change on this file since 32215 was 31492, checked in by kjdon, 7 years ago

renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes

  • Property svn:keywords set to Author Date Id Revision
File size: 12.6 KB
Line 
1###########################################################################
2#
3# MARCPlugin.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlugin;
28
29use SplitTextFile;
30use MetadataRead;
31
32use Encode;
33
34use unicode;
35use util;
36use marcmapping;
37
38use strict;
39no strict 'refs'; # allow filehandles to be variables and viceversa
40
41# methods defined in superclasses that have the same signature take
42# precedence in the order given in the ISA list. We want MetaPlugins to
43# call MetadataRead's can_process_this_file_for_metadata(), rather than
44# calling BaseImporter's version of the same method, so list inherited
45# superclasses in this order.
46sub BEGIN {
47 @MARCPlugin::ISA = ('MetadataRead', 'SplitTextFile');
48 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
49}
50
51my $arguments =
52 [ { 'name' => "metadata_mapping",
53 'desc' => "{common.deprecated} {MARCPlugin.metadata_mapping}",
54 'type' => "string",
55 'deft' => "",
56 'hiddengli' => "yes", # deprecated in favour of 'metadata_mapping_file'
57 'reqd' => "no" },
58 { 'name' => "metadata_mapping_file",
59 'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
60 'type' => "string",
61 'deft' => "marc2dc.txt",
62 'reqd' => "no" },
63 { 'name' => "process_exp",
64 'desc' => "{BaseImporter.process_exp}",
65 'type' => "regexp",
66 'reqd' => "no",
67 'deft' => &get_default_process_exp() },
68 { 'name' => "split_exp",
69 'desc' => "{SplitTextFile.split_exp}",
70 'type' => "regexp",
71 'reqd' => "no",
72 'deft' => &get_default_split_exp() }
73 ];
74
75my $options = { 'name' => "MARCPlugin",
76 'desc' => "{MARCPlugin.desc}",
77 'abstract' => "no",
78 'inherits' => "yes",
79 'explodes' => "yes",
80 'args' => $arguments };
81
82require MARC::Record;
83require MARC::Batch;
84#use MARC::Record;
85#use MARC::Batch;
86
87sub new {
88 my ($class) = shift (@_);
89 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
90 push(@$pluginlist, $class);
91
92 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
93 push(@{$hashArgOptLists->{"OptList"}},$options);
94
95 # this does nothing yet, but if member vars are ever added
96 # to MetadataRead, will need to do this anyway:
97 #new MetadataRead($pluginlist, $inputargs, $hashArgOptLists);
98 my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
99
100 if ($self->{'info_only'}) {
101 # don't worry about the options
102 return bless $self, $class;
103 }
104 # 'metadata_mapping' was used in two ways in the plugin: as a plugin
105 # option (filename) and as a datastructure to represent the mapping.
106 # In MARXXMLPlug (written later) the two are separated: filename is
107 # represented through 'metadata_mapping_file' and the data-structure
108 # mapping left as 'metadata_mapping'
109 # 'metadata_mapping' still present (but hidden in GLI) for
110 # backwards compatibility, but 'metadata_mapping_file' is used by
111 # preference
112
113 if ($self->{'metadata_mapping'} ne "") {
114 print STDERR "MARCPlugin WARNING:: the metadata_mapping option is set but has been deprecated. Please use metadata_mapping_file option instead\n";
115 # If the old version is set, use it.
116 $self->{'metadata_mapping_file'} = $self->{'metadata_mapping'};
117 }
118 $self->{'metadata_mapping'} = undef;
119 $self->{'type'} = "";
120 return bless $self, $class;
121}
122
123sub init {
124 my $self = shift (@_);
125 my ($verbosity, $outhandle, $failhandle) = @_;
126
127 ## the mapping file has already been loaded
128 if (defined $self->{'metadata_mapping'} ){
129 $self->SUPER::init(@_);
130 return;
131 }
132
133 # read in the metadata mapping files
134 my $mm_files = &util::locate_config_files($self->{'metadata_mapping_file'});
135 if (scalar(@$mm_files)==0)
136 {
137 my $msg = "MARCPlugin ERROR: Can't locate mapping file \"" .
138 $self->{'metadata_mapping_file'} . "\".\n " .
139 " No metadata will be extracted from MARC files.\n";
140
141 print $outhandle $msg;
142 print $failhandle $msg;
143 $self->{'metadata_mapping'} = undef;
144 # We pick up the error in process() if there is no $mm_file
145 # If we exit here, then pluginfo.pl will exit too!
146 }
147 else {
148 $self->{'metadata_mapping'} = &marcmapping::parse_marc_metadata_mapping($mm_files, $outhandle);
149 }
150
151 ##map { print STDERR $_."=>".$self->{'metadata_mapping'}->{$_}."\n"; } keys %{$self->{'metadata_mapping'}};
152
153 $self->SUPER::init(@_);
154}
155
156
157
158sub get_default_process_exp {
159 my $self = shift (@_);
160
161 return q^(?i)(\.marc)$^;
162}
163
164
165sub get_default_split_exp {
166 # \r\n for msdos eol, \n for unix
167 return q^\r?\n\s*\r?\n|\[\w+\]Record type: USmarc^;
168}
169
170
171
172# The bulk of this function is based on read_line in multiread.pm
173# Unable to use read_line original because it expects to get its input
174# from a file. Here the line to be converted is passed in as a string
175
176sub to_utf8
177{
178 my $self = shift (@_);
179 my ($encoding, $line) = @_;
180
181 if ($encoding eq "utf8") {
182 # nothing needs to be done
183 #return $line;
184 } elsif ($encoding eq "iso_8859_1") {
185 # we'll use ascii2utf8() for this as it's faster than going
186 # through convert2unicode()
187 #return &unicode::ascii2utf8 (\$line);
188 $line = &unicode::ascii2utf8 (\$line);
189 } else {
190
191 # everything else uses unicode::convert2unicode
192 $line = &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
193 }
194 # At this point $line is a binary byte string
195 # => turn it into a Unicode aware string, so full
196 # Unicode aware pattern matching can be used.
197 # For instance: 's/\x{0101}//g' or '[[:upper:]]'
198
199 return decode ("utf8", $line);
200}
201
202
203sub read_file {
204 my $self = shift (@_);
205 my ($filename, $encoding, $language, $textref) = @_;
206
207 my $outhandle = $self->{'outhandle'};
208
209 if (! defined($self->{'metadata_mapping'}))
210 {
211 # print a warning
212 print $outhandle "MARCPlugin: no metadata mapping file! Can't extract metadata from $filename\n";
213 }
214
215 $self->{'readfile_encoding'}->{$filename} = $encoding;
216
217
218 if (!-r $filename)
219 {
220 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
221 return;
222 }
223
224 ##handle ascii marc
225 #test whether this is ascii marc file
226 if (open (FILE, $filename)) {
227 while (defined (my $line = <FILE>)) {
228 $$textref .= $line;
229 if ($line =~ /\[\w+\]Record type:/){
230 undef $/;
231 $$textref .= <FILE>;
232 $/ = "\n";
233 $self->{'type'} = "ascii";
234 close FILE;
235 return;
236 }
237 }
238 close FILE;
239 }
240
241
242 $$textref = "";
243 my @marc_entries = ();
244
245 my $batch = new MARC::Batch( 'USMARC', $filename );
246 while ( my $marc = $batch->next )
247 {
248 push(@marc_entries,$marc);
249 $$textref .= $marc->as_formatted();
250 $$textref .= "\n\n"; # for SplitTextFile - see default_split_exp above...
251 }
252
253 $self->{'marc_entries'}->{$filename} = \@marc_entries;
254}
255
256
257
258# do plugin specific processing of doc_obj
259# This gets done for each record found by SplitTextFile in marc files.
260sub process {
261 my $self = shift (@_);
262 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
263
264 my $outhandle = $self->{'outhandle'};
265 my $filename = &util::filename_cat($base_dir, $file);
266
267 my $cursection = $doc_obj->get_top_section();
268
269 # Add fileFormat as the metadata
270 $doc_obj->add_metadata($cursection, "FileFormat", "MARC");
271
272 my $marc_entries = $self->{'marc_entries'}->{$filename};
273 my $marc = shift(@$marc_entries);
274
275 my $encoding = $self->{'readfile_encoding'}->{$filename};
276 if (defined ($self->{'metadata_mapping'}) ) {
277 if ($self->{'type'} ne "ascii" ){
278 $self->extract_metadata ($marc, $metadata, $encoding, $doc_obj, $cursection);
279 }
280 else{
281 $self->extract_ascii_metadata ($$textref,$metadata,$doc_obj, $cursection);
282 }
283 }
284
285 # add spaces after the sub-field markers, for word boundaries
286 $$textref =~ s/^(.{6} _\w)/$1 /gm;
287
288 # add text to document object
289 $$textref =~ s/</&lt;/g;
290 $$textref =~ s/>/&gt;/g;
291
292 $$textref = $self->to_utf8($encoding,$$textref);
293
294 print $outhandle " Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
295 if $self->{'verbosity'} > 2;
296
297 # line wrapping
298 $$textref = &wrap_text_in_columns($$textref, 64);
299 $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
300
301 $doc_obj->add_utf8_text($cursection, $$textref);
302
303 return 1;
304}
305
306sub wrap_text_in_columns
307{
308 my ($text, $columnwidth) = @_;
309 my $newtext = "";
310 my $linelength = 0;
311
312 # Break the text into words, and display one at a time
313 my @words = split(/ /, $text);
314
315 foreach my $word (@words) {
316 # If printing this word would exceed the column end, start a new line
317 if (($linelength + length($word)) >= $columnwidth) {
318 $newtext .= "\n";
319 $linelength = 0;
320 }
321
322 # Write the word
323 $newtext .= " $word";
324 if ($word =~ /\n/) {
325 $linelength = 0;
326 } else {
327 $linelength = $linelength + length(" $word");
328 }
329 }
330
331 $newtext .= "\n";
332 return $newtext;
333}
334
335sub extract_metadata
336{
337 my $self = shift (@_);
338
339 my ($marc, $metadata, $encoding, $doc_obj, $section) = @_;
340 my $outhandle = $self->{'outhandle'};
341
342 if (!defined $marc){
343 return;
344 }
345
346 my $metadata_mapping = $self->{'metadata_mapping'};;
347
348 foreach my $marc_field ( sort keys %$metadata_mapping )
349 {
350 my $gsdl_field = $metadata_mapping->{$marc_field};
351
352 # have we got a subfield?
353 my $subfield = undef;
354 if ($marc_field =~ /(\d\d\d)(?:\$|\^)?(\w)/){
355 $marc_field = $1;
356 $subfield = $2;
357 }
358
359 foreach my $meta_value_obj ($marc->field($marc_field)) {
360 my $meta_value;
361 if (defined($subfield)) {
362 $meta_value = $meta_value_obj->subfield($subfield);
363 } else {
364 $meta_value = $meta_value_obj->as_string();
365 }
366 if (defined $meta_value) {
367 # Square brackets in metadata values need to be escaped so they don't confuse Greenstone/GLI
368
369 # Important! Check that this really works!! In MARCXMLPlugin
370 # it maps these characters to \\\\[ \\\\]
371
372 $meta_value =~ s/\[/&\#091;/g;
373 $meta_value =~ s/\]/&\#093;/g;
374 my $metavalue_str = $self->to_utf8($encoding, $meta_value);
375 $doc_obj->add_utf8_metadata ($section, $gsdl_field, $metavalue_str);
376 }
377 }
378 }
379}
380
381
382sub extract_ascii_metadata
383{
384 my $self = shift (@_);
385
386 my ($text, $metadata,$doc_obj, $section) = @_;
387 my $outhandle = $self->{'outhandle'};
388 my $metadata_mapping = $self->{'metadata_mapping'};
389 ## get fields
390 my @fields = split(/[\n\r]+/,$text);
391 my $marc_mapping ={};
392
393 foreach my $field (@fields){
394 if ($field ne ""){
395 $field =~ /^(\d\d\d)\s/;
396 my $code = $1;
397 $field = $'; #'
398 ##get subfields
399 my @subfields = split(/\$/,$field);
400 my $i=0;
401 $marc_mapping->{$code} = [];
402 foreach my $subfield (@subfields){
403 if ($i == 0){
404 ##print STDERR $subfield."\n";
405 push(@{$marc_mapping->{$code}},"info");
406 push(@{$marc_mapping->{$code}},$subfield);
407
408 $i++;
409 }
410 else{
411 $subfield =~ /(\w)\s/;
412 ##print STDERR "$1=>$'\n";
413 push(@{$marc_mapping->{$code}},$1);
414 push(@{$marc_mapping->{$code}},$'); #'
415 }
416 }
417 }
418 }
419
420
421 foreach my $marc_field ( keys %$metadata_mapping )
422 {
423
424 my $matched_field = $marc_mapping->{$marc_field};
425 my $subfield = undef;
426
427 if (defined $matched_field){
428 ## test whether this field has subfield
429 if ($marc_field =~ /\d\d\d(\w)/){
430 $subfield = $1;
431 }
432 my $metaname = $metadata_mapping->{$marc_field};
433
434 my $metavalue;
435 if (defined $subfield){
436 my %mapped_subfield = {@$matched_field};
437 $metavalue = $mapped_subfield{$subfield};
438 }
439 else{ ## get all values except info
440 my $i =0;
441 foreach my $value (@$matched_field){
442 if ($i%2 != 0 and $i != 1){
443 $metavalue .= $value." ";
444 }
445 $i++;
446 }
447 }
448
449 ## escape [ and ]
450 $metavalue =~ s/\[/\\\[/g;
451 $metavalue =~ s/\]/\\\]/g;
452 ##print STDERR "$metaname=$metavalue\n";
453 $doc_obj->add_metadata ($section, $metaname, $metavalue) ;
454 }
455
456 }
457
458}
459
460
4611;
Note: See TracBrowser for help on using the repository browser.