source: trunk/gsdl/perllib/plugins/MARCPlug.pm@ 13198

Last change on this file since 13198 was 13198, checked in by shaoqun, 14 years ago

now this plugin can handle ascii marc files

  • Property svn:keywords set to Author Date Id Revision
File size: 10.5 KB
Line 
1###########################################################################
2#
3# MARCPlug.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlug;
28
29use SplitPlug;
30
31use unicode;
32use util;
33
34use strict;
35no strict 'refs'; # allow filehandles to be variables and viceversa
36
37sub BEGIN {
38 @MARCPlug::ISA = ('SplitPlug');
39 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
40}
41
42my $arguments =
43 [ { 'name' => "metadata_mapping",
44 'desc' => "{MARCPlug.metadata_mapping}",
45 'type' => "string",
46 'deft' => "marctodc.txt",
47 'reqd' => "no" },
48 { 'name' => "process_exp",
49 'desc' => "{BasPlug.process_exp}",
50 'type' => "regexp",
51 'reqd' => "no",
52 'deft' => &get_default_process_exp() },
53 { 'name' => "split_exp",
54 'desc' => "{SplitPlug.split_exp}",
55 'type' => "regexp",
56 'reqd' => "no",
57 'deft' => &get_default_split_exp() }
58 ];
59
60my $options = { 'name' => "MARCPlug",
61 'desc' => "{MARCPlug.desc}",
62 'abstract' => "no",
63 'inherits' => "yes",
64 'explodes' => "yes",
65 'args' => $arguments };
66
67require MARC::Record;
68require MARC::Batch;
69#use MARC::Record;
70#use MARC::Batch;
71
72sub new {
73 my ($class) = shift (@_);
74 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
75 push(@$pluginlist, $class);
76
77 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
78 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
79
80 my $self = new SplitPlug($pluginlist, $inputargs, $hashArgOptLists);
81
82 $self->{'type'} = "";
83
84 return bless $self, $class;
85}
86
87sub init {
88 my $self = shift (@_);
89 my ($verbosity, $outhandle, $failhandle) = @_;
90
91 my @metadata_mapping = ();
92
93 # read in the metadata mapping file
94 my $mm_file =
95 &util::filename_cat( $ENV{'GSDLHOME'}, "etc", $self->{'metadata_mapping'} );
96
97 if (!-e $mm_file)
98 {
99
100 my $msg = "MARCPlug ERROR: Can't locate mapping file \"" .
101 $self->{'metadata_mapping'} . "\".\n This file should be at $mm_file\n" .
102 " No marc files can be processed.\n";
103
104 print $outhandle $msg;
105 print $failhandle $msg;
106 $self->{'metadata_mapping'} = undef;
107 # We pick up the error in process() if there is no $mm_file
108 # If we exit here, then pluginfo.pl will exit too!
109 }
110 elsif (open(MMIN, "<$mm_file"))
111 {
112 my $l=1;
113 my $line;
114 while (defined($line=<MMIN>))
115 {
116 chomp $line;
117 if ($line =~ m/^(\d+)\s*->\s*([\w\^]+)$/)
118 {
119 my $marc_info = $1;
120 my $gsdl_info = $2;
121 my $mapping = { 'marc' => $marc_info, 'gsdl' => $gsdl_info };
122 push(@metadata_mapping,$mapping);
123 }
124 elsif ($line !~ m/^\#/ # allow comments (# in first column)
125 && $line !~ m/^\s*$/) # allow blank lines
126 {
127 print $outhandle "Parse error on line $l of $mm_file:\n";
128 print $outhandle " \"$line\"\n";
129 }
130 $l++
131 }
132 close(MMIN);
133 }
134 else
135 {
136 print STDERR "Unable to open $mm_file: $!\n";
137 }
138
139 $self->{'metadata_mapping'} = \@metadata_mapping;
140
141 $self->SUPER::init(@_);
142}
143
144
145sub get_default_process_exp {
146 my $self = shift (@_);
147
148 return q^(?i)(\.marc)$^;
149}
150
151
152sub get_default_split_exp {
153 # \r\n for msdos eol, \n for unix
154 return q^\r?\n\s*\r?\n|\[\w+\]Record type: USmarc^;
155}
156
157
158
159# The bulk of this function is based on read_line in multiread.pm
160# Unable to use read_line original because it expects to get its input
161# from a file. Here the line to be converted is passed in as a string
162
163sub to_utf8
164{
165 my $self = shift (@_);
166 my ($encoding, $line) = @_;
167
168 if ($encoding eq "iso_8859_1") {
169 # we'll use ascii2utf8() for this as it's faster than going
170 # through convert2unicode()
171 return &unicode::ascii2utf8 (\$line);
172 }
173
174 # everything else uses unicode::convert2unicode
175 return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
176}
177
178
179sub read_file {
180 my $self = shift (@_);
181 my ($filename, $encoding, $language, $textref) = @_;
182
183 $self->{'readfile_encoding'}->{$filename} = $encoding;
184
185
186 if (!-r $filename)
187 {
188 my $outhandle = $self->{'outhandle'};
189 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
190 return;
191 }
192
193 ##handle ascii marc
194 #test whether this is ascii marc file
195 if (open (FILE, $filename)) {
196 while (defined (my $line = <FILE>)) {
197 $$textref .= $line;
198 if ($line =~ /\[\w+\]Record type:/){
199 undef $/;
200 $$textref .= <FILE>;
201 $/ = "\n";
202 $self->{'type'} = "ascii";
203 close FILE;
204 return;
205 }
206 }
207 close FILE;
208 }
209
210
211 $$textref = "";
212 my @marc_entries = ();
213
214 my $batch = new MARC::Batch( 'USMARC', $filename );
215 while ( my $marc = $batch->next )
216 {
217 push(@marc_entries,$marc);
218 $$textref .= $marc->as_formatted();
219 $$textref .= "\n\n"; # for SplitPlug - see default_split_exp above...
220 }
221
222 $self->{'marc_entries'}->{$filename} = \@marc_entries;
223}
224
225
226
227# do plugin specific processing of doc_obj
228# This gets done for each record found by SplitPlug in marc files.
229sub process {
230 my $self = shift (@_);
231 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
232
233 my $outhandle = $self->{'outhandle'};
234 my $filename = &util::filename_cat($base_dir, $file);
235
236 if (! defined($self->{'metadata_mapping'}))
237 {
238 print $outhandle "MARCPlug: no metadata file! Can't process $file\n";
239 return undef;
240 }
241
242 print STDERR "<Processing n='$file' p='MARCPlug'>\n" if ($gli);
243 print $outhandle "MARCPlug: processing $file\n"
244 if $self->{'verbosity'} > 1;
245
246 my $cursection = $doc_obj->get_top_section();
247
248 # Add fileFormat as the metadata
249 $doc_obj->add_metadata($cursection, "FileFormat", "MARC");
250
251 my $marc_entries = $self->{'marc_entries'}->{$filename};
252 my $marc = shift(@$marc_entries);
253
254 my $encoding = $self->{'readfile_encoding'}->{$filename};
255
256 if ($self->{'type'} ne "ascii" ){
257 $self->extract_metadata ($marc, $metadata, $encoding, $doc_obj, $cursection);
258 }
259 else{
260 $self->extract_ascii_metadata ($$textref,$metadata,$doc_obj, $cursection);
261 }
262
263 # add spaces after the sub-field markers, for word boundaries
264 $$textref =~ s/^(.{6} _\w)/$1 /gm;
265
266 # add text to document object
267 $$textref =~ s/</&lt;/g;
268 $$textref =~ s/>/&gt;/g;
269
270 $$textref = $self->to_utf8($encoding,$$textref);
271
272 print $outhandle " Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
273 if $self->{'verbosity'} > 2;
274
275 # line wrapping
276 $$textref = &wrap_text_in_columns($$textref, 64);
277 $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
278
279 $doc_obj->add_utf8_text($cursection, $$textref);
280
281 return 1;
282}
283
284sub wrap_text_in_columns
285{
286 my ($text, $columnwidth) = @_;
287 my $newtext = "";
288 my $linelength = 0;
289
290 # Break the text into words, and display one at a time
291 my @words = split(/ /, $text);
292
293 foreach my $word (@words) {
294 # If printing this word would exceed the column end, start a new line
295 if (($linelength + length($word)) >= $columnwidth) {
296 $newtext .= "\n";
297 $linelength = 0;
298 }
299
300 # Write the word
301 $newtext .= " $word";
302 if ($word =~ /\n/) {
303 $linelength = 0;
304 } else {
305 $linelength = $linelength + length(" $word");
306 }
307 }
308
309 $newtext .= "\n";
310 return $newtext;
311}
312
313
314sub extract_metadata
315{
316 my $self = shift (@_);
317 my ($marc, $metadata, $encoding, $doc_obj, $section) = @_;
318 my $outhandle = $self->{'outhandle'};
319
320 if (!defined $marc){
321 return;
322 }
323
324 my $metadata_mapping = $self->{'metadata_mapping'};
325 my $mm;
326
327 foreach $mm ( @$metadata_mapping )
328 {
329 my $marc_field = $mm->{'marc'};
330
331 my @metavalues = $marc->field($marc_field);
332
333 if (scalar(@metavalues)>0)
334 {
335 my $metaname = $mm->{'gsdl'};
336 my $metavalue;
337 foreach $metavalue ( @metavalues )
338 {
339 my $metavalue_str = $self->to_utf8($encoding,$metavalue->as_string());
340 $doc_obj->add_utf8_metadata ($section, $metaname, $metavalue_str);
341 }
342 }
343 }
344}
345
346sub extract_ascii_metadata
347{
348 my $self = shift (@_);
349 my ($text, $metadata,$doc_obj, $section) = @_;
350 my $outhandle = $self->{'outhandle'};
351 my $metadata_mapping = $self->{'metadata_mapping'};
352 ## get fields
353 my @fields = split(/[\n\r]+/,$text);
354 my $marc_mapping ={};
355
356 foreach my $field (@fields){
357 if ($field ne ""){
358 $field =~ /^(\d\d\d)\s/;
359 my $code = $1;
360 $field = $';
361 ##get subfields
362 my @subfields = split(/\$/,$field);
363 my $i=0;
364 $marc_mapping->{$code} = [];
365 foreach my $subfield (@subfields){
366 if ($i == 0){
367 ##print STDERR $subfield."\n";
368 push(@{$marc_mapping->{$code}},"info");
369 push(@{$marc_mapping->{$code}},$subfield);
370
371 $i++;
372 }
373 else{
374 $subfield =~ /(\w)\s/;
375 ##print STDERR "$1=>$'\n";
376 push(@{$marc_mapping->{$code}},$1);
377 push(@{$marc_mapping->{$code}},$');
378 }
379 }
380 }
381 }
382
383
384 foreach my $mm ( @$metadata_mapping )
385 {
386 my $marc_field = $mm->{'marc'};
387
388 my $matched_field = $marc_mapping->{$marc_field};
389 my $subfield = undef;
390
391 if (defined $matched_field){
392 ## test whether this field has subfield
393 if ($marc_field =~ /\d\d\d(\w)/){
394 $subfield = $1;
395 }
396 my $metaname = $mm->{'gsdl'};
397
398 my $metavalue;
399 if (defined $subfield){
400 my %mapped_subfield = {@$matched_field};
401 $metavalue = $mapped_subfield{$subfield};
402 }
403 else{ ## get all values except info
404 my $i =0;
405 foreach my $value (@$matched_field){
406 if ($i%2 != 0 and $i != 1){
407 $metavalue .= $value." ";
408 }
409 $i++;
410 }
411 }
412
413 ## escape [ and ]
414 $metavalue =~ s/\[/\\\[/g;
415 $metavalue =~ s/\]/\\\]/g;
416 ##print STDERR "$metaname=$metavalue\n";
417 $doc_obj->add_metadata ($section, $metaname, $metavalue) ;
418 }
419
420 }
421
422}
423
424
4251;
Note: See TracBrowser for help on using the repository browser.