source: trunk/gsdl/perllib/plugins/MARCPlug.pm@ 9494

Last change on this file since 9494 was 9494, checked in by mdewsnip, 19 years ago

A couple more fixes for the metadata_read stuff.

  • Property svn:keywords set to Author Date Id Revision
File size: 8.2 KB
Line 
1###########################################################################
2#
3# MARCPlug.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlug;
28
29use SplitPlug;
30
31use unicode;
32use util;
33use parsargv;
34
35sub BEGIN {
36 @ISA = ('SplitPlug');
37 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
38}
39
40my $arguments =
41 [ { 'name' => "metadata_mapping",
42 'desc' => "{MARCPlug.metadata_mapping}",
43 'type' => "string",
44 'deft' => "marctodc.txt",
45 'reqd' => "no" },
46 { 'name' => "process_exp",
47 'desc' => "{BasPlug.process_exp}",
48 'type' => "regexp",
49 'reqd' => "no",
50 'deft' => &get_default_process_exp() },
51 { 'name' => "split_exp",
52 'desc' => "{SplitPlug.split_exp}",
53 'type' => "regexp",
54 'reqd' => "no",
55 'deft' => &get_default_split_exp() }
56 ];
57
58my $options = { 'name' => "MARCPlug",
59 'desc' => "{MARCPlug.desc}",
60 'abstract' => "no",
61 'inherits' => "Yes",
62 'explodes' => "yes",
63 'args' => $arguments };
64
65require MARC::Record;
66require MARC::Batch;
67#use MARC::Record;
68#use MARC::Batch;
69
70sub new {
71 my $class = shift (@_);
72 my $self = new SplitPlug ($class, @_);
73 $self->{'plugin_type'} = "MARCPlug";
74 my $metadata_mapping;
75
76 if (!parsargv::parse(\@_,
77 q^metadata_mapping/.*/marctodc.txt^, \$metadata_mapping,
78 "allow_extra_options")) {
79
80 print STDERR "\nIncorrect options passed to MARCPlug, check your collect.cfg configuration file\n";
81 $self->print_txt_usage(""); # Use default resource bundle
82 die "\n";
83 }
84
85
86 $self->{'mm_file'} = $metadata_mapping; # relative to etc dir
87
88 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
89 my $option_list = $self->{'option_list'};
90 push( @{$option_list}, $options );
91
92 return bless $self, $class;
93}
94
95sub init {
96 my $self = shift (@_);
97 my ($verbosity, $outhandle, $failhandle) = @_;
98
99 my @metadata_mapping = ();
100
101 # read in the metadata mapping file
102 my $mm_file =
103 &util::filename_cat( $ENV{'GSDLHOME'}, "etc", $self->{'mm_file'} );
104
105 if (!-e $mm_file)
106 {
107
108 my $msg = "MARCPlug ERROR: Can't locate mapping file \"" .
109 $self->{'mm_file'} . "\".\n This file should be at $mm_file\n" .
110 " No marc files can be processed.\n";
111
112 print $outhandle $msg;
113 print $failhandle $msg;
114 $self->{'mm_file'} = undef;
115 # We pick up the error in process() if there is no $mm_file
116 # If we exit here, then pluginfo.pl will exit too!
117 }
118 elsif (open(MMIN, "<$mm_file"))
119 {
120 my $l=1;
121 my $line;
122 while (defined($line=<MMIN>))
123 {
124 chomp $line;
125 if ($line =~ m/^(\d+)\s*->\s*(\w+)$/)
126 {
127 my $marc_info = $1;
128 my $gsdl_info = $2;
129 my $mapping = { 'marc' => $marc_info, 'gsdl' => $gsdl_info };
130 push(@metadata_mapping,$mapping);
131 }
132 elsif ($line !~ m/^\#/ # allow comments (# in first column)
133 && $line !~ m/^\s*$/) # allow blank lines
134 {
135 print $outhandle "Parse error on line $l of $mm_file:\n";
136 print $outhandle " \"$line\"\n";
137 }
138 $l++
139 }
140 close(MMIN);
141 }
142 else
143 {
144 print STDERR "Unable to open $mm_file: $!\n";
145 }
146
147 $self->{'metadata_mapping'} = \@metadata_mapping;
148
149 $self->SUPER::init(@_);
150}
151
152
153sub get_default_process_exp {
154 my $self = shift (@_);
155
156 return q^(?i)(\.marc)$^;
157}
158
159
160sub get_default_split_exp {
161 # \r\n for msdos eol, \n for unix
162 return q^\r?\n\s*\r?\n^;
163}
164
165
166
167# The bulk of this function is based on read_line in multiread.pm
168# Unable to use read_line original because it expects to get its input
169# from a file. Here the line to be converted is passed in as a string
170
171sub to_utf8
172{
173 my $self = shift (@_);
174 my ($encoding, $line) = @_;
175
176 if ($encoding eq "iso_8859_1") {
177 # we'll use ascii2utf8() for this as it's faster than going
178 # through convert2unicode()
179 return &unicode::ascii2utf8 (\$line);
180 }
181
182 # everything else uses unicode::convert2unicode
183 return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
184}
185
186
187sub read_file {
188 my $self = shift (@_);
189 my ($filename, $encoding, $language, $textref) = @_;
190
191 $self->{'readfile_encoding'}->{$filename} = $encoding;
192
193 my @marc_entries = ();
194
195 if (!-r $filename)
196 {
197 my $outhandle = $self->{'outhandle'};
198 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
199 return;
200 }
201
202 my $batch = new MARC::Batch( 'USMARC', $filename );
203 while ( my $marc = $batch->next )
204 {
205 push(@marc_entries,$marc);
206 $$textref .= $marc->as_formatted();
207 $$textref .= "\n\n"; # for SplitPlug - see default_split_exp above...
208 }
209
210 $self->{'marc_entries'}->{$filename} = \@marc_entries;
211}
212
213
214
215# do plugin specific processing of doc_obj
216# This gets done for each record found by SplitPlug in marc files.
217sub process {
218 my $self = shift (@_);
219 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
220
221 my $outhandle = $self->{'outhandle'};
222 my $filename = &util::filename_cat($base_dir, $file);
223
224 if (! defined($self->{'mm_file'}))
225 {
226 print $outhandle "MARCPlug: no metadata file! Can't process $file\n";
227 return undef;
228 }
229
230 print STDERR "<Processing n='$file' p='MARCPlug'>\n" if ($gli);
231 print $outhandle "MARCPlug: processing $file\n"
232 if $self->{'verbosity'} > 1;
233
234 my $cursection = $doc_obj->get_top_section();
235
236 # Add fileFormat as the metadata
237 $doc_obj->add_metadata($cursection, "FileFormat", "MARC");
238
239 my $marc_entries = $self->{'marc_entries'}->{$filename};
240 my $marc = shift(@$marc_entries);
241
242 my $encoding = $self->{'readfile_encoding'}->{$filename};
243 $self->extract_metadata ($marc, $metadata, $encoding, $doc_obj, $cursection);
244
245 # add spaces after the sub-field markers, for word boundaries
246 $$textref =~ s/^(.{6} _\w)/$1 /gm;
247
248 # add text to document object
249 $$textref =~ s/</&lt;/g;
250 $$textref =~ s/>/&gt;/g;
251
252 $$textref = $self->to_utf8($encoding,$$textref);
253
254 print $outhandle " Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
255 if $self->{'verbosity'} > 2;
256
257 # line wrapping
258 $$textref = &wrap_text_in_columns($$textref, 64);
259 $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
260
261 $doc_obj->add_utf8_text($cursection, $$textref);
262
263 return 1;
264}
265
266sub wrap_text_in_columns
267{
268 my ($text, $columnwidth) = @_;
269 my $newtext = "";
270 my $linelength = 0;
271
272 # Break the text into words, and display one at a time
273 my @words = split(/ /, $text);
274
275 foreach $word (@words) {
276 # If printing this word would exceed the column end, start a new line
277 if (($linelength + length($word)) >= $columnwidth) {
278 $newtext .= "\n";
279 $linelength = 0;
280 }
281
282 # Write the word
283 $newtext .= " $word";
284 if ($word =~ /\n/) {
285 $linelength = 0;
286 } else {
287 $linelength = $linelength + length(" $word");
288 }
289 }
290
291 $newtext .= "\n";
292 return $newtext;
293}
294
295
296sub extract_metadata
297{
298 my $self = shift (@_);
299 my ($marc, $metadata, $encoding, $doc_obj, $section) = @_;
300 my $outhandle = $self->{'outhandle'};
301
302 my $metadata_mapping = $self->{'metadata_mapping'};
303 my $mm;
304 foreach $mm ( @$metadata_mapping )
305 {
306 my $marc_field = $mm->{'marc'};
307 my @metavalues = $marc->field($marc_field);
308
309 if (scalar(@metavalues)>0)
310 {
311 my $metaname = $mm->{'gsdl'};
312 my $metavalue;
313 foreach $metavalue ( @metavalues )
314 {
315 $metavalue_str = $self->to_utf8($encoding,$metavalue->as_string());
316 $doc_obj->add_utf8_metadata ($thissection, $metaname, $metavalue_str);
317 }
318 }
319 }
320}
3211;
Note: See TracBrowser for help on using the repository browser.