source: trunk/gsdl/perllib/plugins/MARCPlug.pm@ 9465

Last change on this file since 9465 was 9118, checked in by kjdon, 19 years ago

MARC plug can do exploding - set 'explodes' to yes in xml description

  • Property svn:keywords set to Author Date Id Revision
File size: 8.1 KB
Line 
1###########################################################################
2#
3# MARCPlug.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlug;
28
29use SplitPlug;
30
31use unicode;
32use util;
33use parsargv;
34
35sub BEGIN {
36 @ISA = ('SplitPlug');
37 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
38}
39
40my $arguments =
41 [ { 'name' => "metadata_mapping",
42 'desc' => "{MARCPlug.metadata_mapping}",
43 'type' => "string",
44 'deft' => "marctodc.txt",
45 'reqd' => "no" },
46 { 'name' => "process_exp",
47 'desc' => "{BasPlug.process_exp}",
48 'type' => "regexp",
49 'reqd' => "no",
50 'deft' => &get_default_process_exp() },
51 { 'name' => "split_exp",
52 'desc' => "{SplitPlug.split_exp}",
53 'type' => "regexp",
54 'reqd' => "no",
55 'deft' => &get_default_split_exp() }
56 ];
57
58my $options = { 'name' => "MARCPlug",
59 'desc' => "{MARCPlug.desc}",
60 'abstract' => "no",
61 'inherits' => "Yes",
62 'explodes' => "yes",
63 'args' => $arguments };
64
65require MARC::Record;
66require MARC::Batch;
67#use MARC::Record;
68#use MARC::Batch;
69
70sub new {
71 my $class = shift (@_);
72 my $self = new SplitPlug ($class, @_);
73 $self->{'plugin_type'} = "MARCPlug";
74 my $metadata_mapping;
75
76 if (!parsargv::parse(\@_,
77 q^metadata_mapping/.*/marctodc.txt^, \$metadata_mapping,
78 "allow_extra_options")) {
79
80 print STDERR "\nIncorrect options passed to MARCPlug, check your collect.cfg configuration file\n";
81 $self->print_txt_usage(""); # Use default resource bundle
82 die "\n";
83 }
84
85
86 $self->{'mm_file'} = $metadata_mapping; # relative to etc dir
87
88 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
89 my $option_list = $self->{'option_list'};
90 push( @{$option_list}, $options );
91
92 return bless $self, $class;
93}
94
95sub init {
96 my $self = shift (@_);
97 my ($verbosity, $outhandle, $failhandle) = @_;
98
99 my @metadata_mapping = ();
100
101 # read in the metadata mapping file
102 my $mm_file =
103 &util::filename_cat( $ENV{'GSDLHOME'}, "etc", $self->{'mm_file'} );
104
105 if (!-e $mm_file)
106 {
107
108 my $msg = "MARCPlug ERROR: Can't locate mapping file \"" .
109 $self->{'mm_file'} . "\".\n This file should be at $mm_file\n" .
110 " No marc files can be processed.\n";
111
112 print $outhandle $msg;
113 print $failhandle $msg;
114 $self->{'mm_file'} = undef;
115 # We pick up the error in process() if there is no $mm_file
116 # If we exit here, then pluginfo.pl will exit too!
117 }
118 elsif (open(MMIN, "<$mm_file"))
119 {
120 my $l=1;
121 my $line;
122 while (defined($line=<MMIN>))
123 {
124 chomp $line;
125 if ($line =~ m/^(\d+)\s*->\s*(\w+)$/)
126 {
127 my $marc_info = $1;
128 my $gsdl_info = $2;
129 my $mapping = { 'marc' => $marc_info, 'gsdl' => $gsdl_info };
130 push(@metadata_mapping,$mapping);
131 }
132 elsif ($line !~ m/^\#/ # allow comments (# in first column)
133 && $line !~ m/^\s*$/) # allow blank lines
134 {
135 print $outhandle "Parse error on line $l of $mm_file:\n";
136 print $outhandle " \"$line\"\n";
137 }
138 $l++
139 }
140 close(MMIN);
141 }
142 else
143 {
144 print STDERR "Unable to open $mm_file: $!\n";
145 }
146
147 $self->{'metadata_mapping'} = \@metadata_mapping;
148
149 $self->SUPER::init(@_);
150}
151
152
153sub get_default_process_exp {
154 my $self = shift (@_);
155
156 return q^(?i)(\.marc)$^;
157}
158
159
160sub get_default_split_exp {
161 # \r\n for msdos eol, \n for unix
162 return q^\r?\n\s*\r?\n^;
163}
164
165
166
167# The bulk of this function is based on read_line in multiread.pm
168# Unable to use read_line original because it expects to get its input
169# from a file. Here the line to be converted is passed in as a string
170
171sub to_utf8
172{
173 my $self = shift (@_);
174 my ($encoding, $line) = @_;
175
176 if ($encoding eq "iso_8859_1") {
177 # we'll use ascii2utf8() for this as it's faster than going
178 # through convert2unicode()
179 return &unicode::ascii2utf8 (\$line);
180 }
181
182 # everything else uses unicode::convert2unicode
183 return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
184}
185
186
187sub read_file {
188 my $self = shift (@_);
189 my ($filename, $encoding, $language, $textref) = @_;
190
191 $self->{'readfile_encoding'} = $encoding;
192
193 @marc_entries = ();
194
195 if (!-r $filename)
196 {
197 my $outhandle = $self->{'outhandle'};
198 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
199 return;
200 }
201
202 my $batch = new MARC::Batch( 'USMARC', $filename );
203 while ( my $marc = $batch->next )
204 {
205 push(@marc_entries,$marc);
206 $$textref .= $marc->as_formatted();
207 $$textref .= "\n\n"; # for SplitPlug - see default_split_exp above...
208 }
209
210 $self->{'marc_entries'} = \@marc_entries;
211}
212
213
214
215# do plugin specific processing of doc_obj
216# This gets done for each record found by SplitPlug in marc files.
217sub process {
218 my $self = shift (@_);
219 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
220 my $outhandle = $self->{'outhandle'};
221
222 if (! defined($self->{'mm_file'}))
223 {
224 print $outhandle "MARCPlug: no metadata file! Can't process $file\n";
225 return undef;
226 }
227
228 print STDERR "<Processing n='$file' p='MARCPlug'>\n" if ($gli);
229 print $outhandle "MARCPlug: processing $file\n"
230 if $self->{'verbosity'} > 1;
231
232 my $cursection = $doc_obj->get_top_section();
233
234 # Add fileFormat as the metadata
235 $doc_obj->add_metadata($cursection, "FileFormat", "MARC");
236
237 my $marc_entries = $self->{'marc_entries'};
238 my $marc = shift(@$marc_entries);
239
240 $self->extract_metadata ($marc, $metadata, $doc_obj, $cursection);
241
242 # add spaces after the sub-field markers, for word boundaries
243 $$textref =~ s/^(.{6} _\w)/$1 /gm;
244
245 # add text to document object
246 $$textref =~ s/</&lt;/g;
247 $$textref =~ s/>/&gt;/g;
248
249 my $encoding = $self->{'readfile_encoding'};
250 $$textref = $self->to_utf8($encoding,$$textref);
251
252 print $outhandle " Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
253 if $self->{'verbosity'} > 2;
254
255 # line wrapping
256 $$textref = &wrap_text_in_columns($$textref, 64);
257 $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
258
259 $doc_obj->add_utf8_text($cursection, $$textref);
260
261 return 1;
262}
263
264sub wrap_text_in_columns
265{
266 my ($text, $columnwidth) = @_;
267 my $newtext = "";
268 my $linelength = 0;
269
270 # Break the text into words, and display one at a time
271 my @words = split(/ /, $text);
272
273 foreach $word (@words) {
274 # If printing this word would exceed the column end, start a new line
275 if (($linelength + length($word)) >= $columnwidth) {
276 $newtext .= "\n";
277 $linelength = 0;
278 }
279
280 # Write the word
281 $newtext .= " $word";
282 if ($word =~ /\n/) {
283 $linelength = 0;
284 } else {
285 $linelength = $linelength + length(" $word");
286 }
287 }
288
289 $newtext .= "\n";
290 return $newtext;
291}
292
293
294sub extract_metadata
295{
296 my $self = shift (@_);
297 my ($marc, $metadata, $doc_obj, $section) = @_;
298 my $outhandle = $self->{'outhandle'};
299
300 my $encoding = $self->{'readfile_encoding'};
301
302 my $metadata_mapping = $self->{'metadata_mapping'};
303 my $mm;
304 foreach $mm ( @$metadata_mapping )
305 {
306 my $marc_field = $mm->{'marc'};
307 my @metavalues = $marc->field($marc_field);
308
309 if (scalar(@metavalues)>0)
310 {
311 my $metaname = $mm->{'gsdl'};
312 my $metavalue;
313 foreach $metavalue ( @metavalues )
314 {
315 $metavalue_str = $self->to_utf8($encoding,$metavalue->as_string());
316 $doc_obj->add_utf8_metadata ($thissection, $metaname, $metavalue_str);
317 }
318 }
319 }
320}
3211;
Note: See TracBrowser for help on using the repository browser.