source: gs2-extensions/parallel-building/trunk/src/perllib/plugins/MARCPlugin.pm@ 24626

Last change on this file since 24626 was 24626, checked in by jmt12, 13 years ago

An (almost) complete copy of the perllib directory from a (circa SEP2011) head checkout from Greenstone 2 trunk - in order to try and make merging in this extension a little easier later on (as there have been some major changes to buildcol.pl commited in the main trunk but not in the x64 branch)

File size: 12.6 KB
Line 
1###########################################################################
2#
3# MARCPlugin.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlugin;
28
29use SplitTextFile;
30use MetadataRead;
31
32use unicode;
33use util;
34use marcmapping;
35
36use strict;
37no strict 'refs'; # allow filehandles to be variables and viceversa
38
39# methods defined in superclasses that have the same signature take
40# precedence in the order given in the ISA list. We want MetaPlugins to
41# call MetadataRead's can_process_this_file_for_metadata(), rather than
42# calling BasePlugin's version of the same method, so list inherited
43# superclasses in this order.
44sub BEGIN {
45 @MARCPlugin::ISA = ('MetadataRead', 'SplitTextFile');
46
47 # ensure Greenstone's CPAN is on the path (but check first) [jmt12]
48 my $inc_paths = join(':', @INC);
49 my $gsdl_perllib_path = $ENV{'GSDLHOME'} . '/perllib/cpan';
50 if ($inc_paths !~ /$gsdl_perllib_path/)
51 {
52 unshift (@INC, $gsdl_perllib_path);
53 }
54}
55
56my $arguments =
57 [ { 'name' => "metadata_mapping",
58 'desc' => "{common.deprecated} {MARCPlugin.metadata_mapping}",
59 'type' => "string",
60 'deft' => "",
61 'hiddengli' => "yes", # deprecated in favour of 'metadata_mapping_file'
62 'reqd' => "no" },
63 { 'name' => "metadata_mapping_file",
64 'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
65 'type' => "string",
66 'deft' => "marc2dc.txt",
67 'reqd' => "no" },
68 { 'name' => "process_exp",
69 'desc' => "{BasePlugin.process_exp}",
70 'type' => "regexp",
71 'reqd' => "no",
72 'deft' => &get_default_process_exp() },
73 { 'name' => "split_exp",
74 'desc' => "{SplitTextFile.split_exp}",
75 'type' => "regexp",
76 'reqd' => "no",
77 'deft' => &get_default_split_exp() }
78 ];
79
80my $options = { 'name' => "MARCPlugin",
81 'desc' => "{MARCPlugin.desc}",
82 'abstract' => "no",
83 'inherits' => "yes",
84 'explodes' => "yes",
85 'args' => $arguments };
86
87require MARC::Record;
88require MARC::Batch;
89#use MARC::Record;
90#use MARC::Batch;
91
92sub new {
93 my ($class) = shift (@_);
94 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
95 push(@$pluginlist, $class);
96
97 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
98 push(@{$hashArgOptLists->{"OptList"}},$options);
99
100 # this does nothing yet, but if member vars are ever added
101 # to MetadataRead, will need to do this anyway:
102 #new MetadataRead($pluginlist, $inputargs, $hashArgOptLists);
103 my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
104
105 if ($self->{'info_only'}) {
106 # don't worry about the options
107 return bless $self, $class;
108 }
109 # 'metadata_mapping' was used in two ways in the plugin: as a plugin
110 # option (filename) and as a datastructure to represent the mapping.
111 # In MARXXMLPlug (written later) the two are separated: filename is
112 # represented through 'metadata_mapping_file' and the data-structure
113 # mapping left as 'metadata_mapping'
114 # 'metadata_mapping' still present (but hidden in GLI) for
115 # backwards compatibility, but 'metadata_mapping_file' is used by
116 # preference
117
118 if ($self->{'metadata_mapping'} ne "") {
119 print STDERR "MARCPlugin WARNING:: the metadata_mapping option is set but has been deprecated. Please use metadata_mapping_file option instead\n";
120 # If the old version is set, use it.
121 $self->{'metadata_mapping_file'} = $self->{'metadata_mapping'};
122 }
123 $self->{'metadata_mapping'} = undef;
124 $self->{'type'} = "";
125 return bless $self, $class;
126}
127
128sub init {
129 my $self = shift (@_);
130 my ($verbosity, $outhandle, $failhandle) = @_;
131
132 ## the mapping file has already been loaded
133 if (defined $self->{'metadata_mapping'} ){
134 $self->SUPER::init(@_);
135 return;
136 }
137
138 # read in the metadata mapping files
139 my $mm_files = &util::locate_config_files($self->{'metadata_mapping_file'});
140 if (scalar(@$mm_files)==0)
141 {
142 my $msg = "MARCPlugin ERROR: Can't locate mapping file \"" .
143 $self->{'metadata_mapping_file'} . "\".\n " .
144 " No metadata will be extracted from MARC files.\n";
145
146 print $outhandle $msg;
147 print $failhandle $msg;
148 $self->{'metadata_mapping'} = undef;
149 # We pick up the error in process() if there is no $mm_file
150 # If we exit here, then pluginfo.pl will exit too!
151 }
152 else {
153 $self->{'metadata_mapping'} = &marcmapping::parse_marc_metadata_mapping($mm_files, $outhandle);
154 }
155
156 ##map { print STDERR $_."=>".$self->{'metadata_mapping'}->{$_}."\n"; } keys %{$self->{'metadata_mapping'}};
157
158 $self->SUPER::init(@_);
159}
160
161
162
163sub get_default_process_exp {
164 my $self = shift (@_);
165
166 return q^(?i)(\.marc)$^;
167}
168
169
170sub get_default_split_exp {
171 # \r\n for msdos eol, \n for unix
172 return q^\r?\n\s*\r?\n|\[\w+\]Record type: USmarc^;
173}
174
175
176
177# The bulk of this function is based on read_line in multiread.pm
178# Unable to use read_line original because it expects to get its input
179# from a file. Here the line to be converted is passed in as a string
180
181sub to_utf8
182{
183 my $self = shift (@_);
184 my ($encoding, $line) = @_;
185
186 if ($encoding eq "utf8") {
187 # nothing needs to be done
188 return $line;
189 }
190
191 if ($encoding eq "iso_8859_1") {
192 # we'll use ascii2utf8() for this as it's faster than going
193 # through convert2unicode()
194 return &unicode::ascii2utf8 (\$line);
195 }
196
197 # everything else uses unicode::convert2unicode
198 return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
199}
200
201
202sub read_file {
203 my $self = shift (@_);
204 my ($filename, $encoding, $language, $textref) = @_;
205
206 my $outhandle = $self->{'outhandle'};
207
208 if (! defined($self->{'metadata_mapping'}))
209 {
210 # print a warning
211 print $outhandle "MARCPlugin: no metadata mapping file! Can't extract metadata from $filename\n";
212 }
213
214 $self->{'readfile_encoding'}->{$filename} = $encoding;
215
216
217 if (!-r $filename)
218 {
219 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
220 return;
221 }
222
223 ##handle ascii marc
224 #test whether this is ascii marc file
225 if (open (FILE, $filename)) {
226 while (defined (my $line = <FILE>)) {
227 $$textref .= $line;
228 if ($line =~ /\[\w+\]Record type:/){
229 undef $/;
230 $$textref .= <FILE>;
231 $/ = "\n";
232 $self->{'type'} = "ascii";
233 close FILE;
234 return;
235 }
236 }
237 close FILE;
238 }
239
240
241 $$textref = "";
242 my @marc_entries = ();
243
244 my $batch = new MARC::Batch( 'USMARC', $filename );
245 while ( my $marc = $batch->next )
246 {
247 push(@marc_entries,$marc);
248 $$textref .= $marc->as_formatted();
249 $$textref .= "\n\n"; # for SplitTextFile - see default_split_exp above...
250 }
251
252 $self->{'marc_entries'}->{$filename} = \@marc_entries;
253}
254
255
256
257# do plugin specific processing of doc_obj
258# This gets done for each record found by SplitTextFile in marc files.
259sub process {
260 my $self = shift (@_);
261 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
262
263 my $outhandle = $self->{'outhandle'};
264 my $filename = &util::filename_cat($base_dir, $file);
265
266 my $cursection = $doc_obj->get_top_section();
267
268 # Add fileFormat as the metadata
269 $doc_obj->add_metadata($cursection, "FileFormat", "MARC");
270
271 my $marc_entries = $self->{'marc_entries'}->{$filename};
272 my $marc = shift(@$marc_entries);
273
274 my $encoding = $self->{'readfile_encoding'}->{$filename};
275
276 if (defined ($self->{'metadata_mapping'}) ) {
277 if ($self->{'type'} ne "ascii" ){
278 $self->extract_metadata ($marc, $metadata, $encoding, $doc_obj, $cursection);
279 }
280 else{
281 $self->extract_ascii_metadata ($$textref,$metadata,$doc_obj, $cursection);
282 }
283 }
284
285 # add spaces after the sub-field markers, for word boundaries
286 $$textref =~ s/^(.{6} _\w)/$1 /gm;
287
288 # add text to document object
289 $$textref =~ s/</&lt;/g;
290 $$textref =~ s/>/&gt;/g;
291
292 $$textref = $self->to_utf8($encoding,$$textref);
293
294 print $outhandle " Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
295 if $self->{'verbosity'} > 2;
296
297 # line wrapping
298 $$textref = &wrap_text_in_columns($$textref, 64);
299 $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
300
301 $doc_obj->add_utf8_text($cursection, $$textref);
302
303 return 1;
304}
305
306sub wrap_text_in_columns
307{
308 my ($text, $columnwidth) = @_;
309 my $newtext = "";
310 my $linelength = 0;
311
312 # Break the text into words, and display one at a time
313 my @words = split(/ /, $text);
314
315 foreach my $word (@words) {
316 # If printing this word would exceed the column end, start a new line
317 if (($linelength + length($word)) >= $columnwidth) {
318 $newtext .= "\n";
319 $linelength = 0;
320 }
321
322 # Write the word
323 $newtext .= " $word";
324 if ($word =~ /\n/) {
325 $linelength = 0;
326 } else {
327 $linelength = $linelength + length(" $word");
328 }
329 }
330
331 $newtext .= "\n";
332 return $newtext;
333}
334
335
336sub extract_metadata
337{
338 my $self = shift (@_);
339
340 my ($marc, $metadata, $encoding, $doc_obj, $section) = @_;
341 my $outhandle = $self->{'outhandle'};
342
343 if (!defined $marc){
344 return;
345 }
346
347 my $metadata_mapping = $self->{'metadata_mapping'};;
348
349 foreach my $marc_field ( keys %$metadata_mapping )
350 {
351 my $gsdl_field = $metadata_mapping->{$marc_field};
352
353 # have we got a subfield?
354 my $subfield = undef;
355 if ($marc_field =~ /(\d\d\d)(?:\$|\^)?(\w)/){
356 $marc_field = $1;
357 $subfield = $2;
358 }
359 if (defined $subfield) {
360 my $meta_value = $marc->subfield($marc_field, $subfield);
361 if (defined $meta_value) {
362 # Square brackets in metadata values need to be escaped so they don't confuse Greenstone/GLI
363 $meta_value =~ s/\[/&\#091;/g;
364 $meta_value =~ s/\]/&\#093;/g;
365 my $metavalue_str = $self->to_utf8($encoding, $meta_value);
366 $doc_obj->add_utf8_metadata ($section, $gsdl_field, $metavalue_str);
367 }
368 }
369 else
370 {
371 foreach my $meta_value_obj ($marc->field($marc_field))
372 {
373 my $meta_value = $meta_value_obj->as_string();
374
375 # Square brackets in metadata values need to be escaped so they don't confuse Greenstone/GLI
376 $meta_value =~ s/\[/&\#091;/g;
377 $meta_value =~ s/\]/&\#093;/g;
378 my $metavalue_str = $self->to_utf8($encoding, $meta_value);
379 $doc_obj->add_utf8_metadata ($section, $gsdl_field, $metavalue_str);
380 }
381 }
382 }
383}
384
385
386sub extract_ascii_metadata
387{
388 my $self = shift (@_);
389
390 my ($text, $metadata,$doc_obj, $section) = @_;
391 my $outhandle = $self->{'outhandle'};
392 my $metadata_mapping = $self->{'metadata_mapping'};
393 ## get fields
394 my @fields = split(/[\n\r]+/,$text);
395 my $marc_mapping ={};
396
397 foreach my $field (@fields){
398 if ($field ne ""){
399 $field =~ /^(\d\d\d)\s/;
400 my $code = $1;
401 $field = $'; #'
402 ##get subfields
403 my @subfields = split(/\$/,$field);
404 my $i=0;
405 $marc_mapping->{$code} = [];
406 foreach my $subfield (@subfields){
407 if ($i == 0){
408 ##print STDERR $subfield."\n";
409 push(@{$marc_mapping->{$code}},"info");
410 push(@{$marc_mapping->{$code}},$subfield);
411
412 $i++;
413 }
414 else{
415 $subfield =~ /(\w)\s/;
416 ##print STDERR "$1=>$'\n";
417 push(@{$marc_mapping->{$code}},$1);
418 push(@{$marc_mapping->{$code}},$'); #'
419 }
420 }
421 }
422 }
423
424
425 foreach my $marc_field ( keys %$metadata_mapping )
426 {
427
428 my $matched_field = $marc_mapping->{$marc_field};
429 my $subfield = undef;
430
431 if (defined $matched_field){
432 ## test whether this field has subfield
433 if ($marc_field =~ /\d\d\d(\w)/){
434 $subfield = $1;
435 }
436 my $metaname = $metadata_mapping->{$marc_field};
437
438 my $metavalue;
439 if (defined $subfield){
440 my %mapped_subfield = {@$matched_field};
441 $metavalue = $mapped_subfield{$subfield};
442 }
443 else{ ## get all values except info
444 my $i =0;
445 foreach my $value (@$matched_field){
446 if ($i%2 != 0 and $i != 1){
447 $metavalue .= $value." ";
448 }
449 $i++;
450 }
451 }
452
453 ## escape [ and ]
454 $metavalue =~ s/\[/\\\[/g;
455 $metavalue =~ s/\]/\\\]/g;
456 ##print STDERR "$metaname=$metavalue\n";
457 $doc_obj->add_metadata ($section, $metaname, $metavalue) ;
458 }
459
460 }
461
462}
463
464
4651;
Note: See TracBrowser for help on using the repository browser.