source: trunk/gsdl/perllib/plugins/MARCPlug.pm@ 10254

Last change on this file since 10254 was 10254, checked in by kjdon, 19 years ago

added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile

  • Property svn:keywords set to Author Date Id Revision
File size: 8.1 KB
Line 
1###########################################################################
2#
3# MARCPlug.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlug;
28
29use SplitPlug;
30
31use unicode;
32use util;
33use parsargv;
34
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38sub BEGIN {
39 @MARCPlug::ISA = ('SplitPlug');
40 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
41}
42
43my $arguments =
44 [ { 'name' => "metadata_mapping",
45 'desc' => "{MARCPlug.metadata_mapping}",
46 'type' => "string",
47 'deft' => "marctodc.txt",
48 'reqd' => "no" },
49 { 'name' => "process_exp",
50 'desc' => "{BasPlug.process_exp}",
51 'type' => "regexp",
52 'reqd' => "no",
53 'deft' => &get_default_process_exp() },
54 { 'name' => "split_exp",
55 'desc' => "{SplitPlug.split_exp}",
56 'type' => "regexp",
57 'reqd' => "no",
58 'deft' => &get_default_split_exp() }
59 ];
60
61my $options = { 'name' => "MARCPlug",
62 'desc' => "{MARCPlug.desc}",
63 'abstract' => "no",
64 'inherits' => "Yes",
65 'explodes' => "yes",
66 'args' => $arguments };
67
68require MARC::Record;
69require MARC::Batch;
70#use MARC::Record;
71#use MARC::Batch;
72
73sub new {
74 my ($class) = shift (@_);
75 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
76 push(@$pluginlist, $class);
77
78 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
79 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
80
81 my $self = (defined $hashArgOptLists)? new SplitPlug($pluginlist,$inputargs,$hashArgOptLists): new SplitPlug($pluginlist,$inputargs);
82
83 return bless $self, $class;
84}
85
86sub init {
87 my $self = shift (@_);
88 my ($verbosity, $outhandle, $failhandle) = @_;
89
90 my @metadata_mapping = ();
91
92 # read in the metadata mapping file
93 my $mm_file =
94 &util::filename_cat( $ENV{'GSDLHOME'}, "etc", $self->{'metadata_mapping'} );
95
96 if (!-e $mm_file)
97 {
98
99 my $msg = "MARCPlug ERROR: Can't locate mapping file \"" .
100 $self->{'metadata_mapping'} . "\".\n This file should be at $mm_file\n" .
101 " No marc files can be processed.\n";
102
103 print $outhandle $msg;
104 print $failhandle $msg;
105 $self->{'metadata_mapping'} = undef;
106 # We pick up the error in process() if there is no $mm_file
107 # If we exit here, then pluginfo.pl will exit too!
108 }
109 elsif (open(MMIN, "<$mm_file"))
110 {
111 my $l=1;
112 my $line;
113 while (defined($line=<MMIN>))
114 {
115 chomp $line;
116 if ($line =~ m/^(\d+)\s*->\s*(\w+)$/)
117 {
118 my $marc_info = $1;
119 my $gsdl_info = $2;
120 my $mapping = { 'marc' => $marc_info, 'gsdl' => $gsdl_info };
121 push(@metadata_mapping,$mapping);
122 }
123 elsif ($line !~ m/^\#/ # allow comments (# in first column)
124 && $line !~ m/^\s*$/) # allow blank lines
125 {
126 print $outhandle "Parse error on line $l of $mm_file:\n";
127 print $outhandle " \"$line\"\n";
128 }
129 $l++
130 }
131 close(MMIN);
132 }
133 else
134 {
135 print STDERR "Unable to open $mm_file: $!\n";
136 }
137
138 $self->{'metadata_mapping'} = \@metadata_mapping;
139
140 $self->SUPER::init(@_);
141}
142
143
144sub get_default_process_exp {
145 my $self = shift (@_);
146
147 return q^(?i)(\.marc)$^;
148}
149
150
151sub get_default_split_exp {
152 # \r\n for msdos eol, \n for unix
153 return q^\r?\n\s*\r?\n^;
154}
155
156
157
158# The bulk of this function is based on read_line in multiread.pm
159# Unable to use read_line original because it expects to get its input
160# from a file. Here the line to be converted is passed in as a string
161
162sub to_utf8
163{
164 my $self = shift (@_);
165 my ($encoding, $line) = @_;
166
167 if ($encoding eq "iso_8859_1") {
168 # we'll use ascii2utf8() for this as it's faster than going
169 # through convert2unicode()
170 return &unicode::ascii2utf8 (\$line);
171 }
172
173 # everything else uses unicode::convert2unicode
174 return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
175}
176
177
178sub read_file {
179 my $self = shift (@_);
180 my ($filename, $encoding, $language, $textref) = @_;
181
182 $self->{'readfile_encoding'}->{$filename} = $encoding;
183
184 my @marc_entries = ();
185
186 if (!-r $filename)
187 {
188 my $outhandle = $self->{'outhandle'};
189 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
190 return;
191 }
192
193 my $batch = new MARC::Batch( 'USMARC', $filename );
194 while ( my $marc = $batch->next )
195 {
196 push(@marc_entries,$marc);
197 $$textref .= $marc->as_formatted();
198 $$textref .= "\n\n"; # for SplitPlug - see default_split_exp above...
199 }
200
201 $self->{'marc_entries'}->{$filename} = \@marc_entries;
202}
203
204
205
206# do plugin specific processing of doc_obj
207# This gets done for each record found by SplitPlug in marc files.
208sub process {
209 my $self = shift (@_);
210 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
211
212 my $outhandle = $self->{'outhandle'};
213 my $filename = &util::filename_cat($base_dir, $file);
214
215 if (! defined($self->{'metadata_mapping'}))
216 {
217 print $outhandle "MARCPlug: no metadata file! Can't process $file\n";
218 return undef;
219 }
220
221 print STDERR "<Processing n='$file' p='MARCPlug'>\n" if ($gli);
222 print $outhandle "MARCPlug: processing $file\n"
223 if $self->{'verbosity'} > 1;
224
225 my $cursection = $doc_obj->get_top_section();
226
227 # Add fileFormat as the metadata
228 $doc_obj->add_metadata($cursection, "FileFormat", "MARC");
229
230 my $marc_entries = $self->{'marc_entries'}->{$filename};
231 my $marc = shift(@$marc_entries);
232
233 my $encoding = $self->{'readfile_encoding'}->{$filename};
234 $self->extract_metadata ($marc, $metadata, $encoding, $doc_obj, $cursection);
235
236 # add spaces after the sub-field markers, for word boundaries
237 $$textref =~ s/^(.{6} _\w)/$1 /gm;
238
239 # add text to document object
240 $$textref =~ s/</&lt;/g;
241 $$textref =~ s/>/&gt;/g;
242
243 $$textref = $self->to_utf8($encoding,$$textref);
244
245 print $outhandle " Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
246 if $self->{'verbosity'} > 2;
247
248 # line wrapping
249 $$textref = &wrap_text_in_columns($$textref, 64);
250 $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
251
252 $doc_obj->add_utf8_text($cursection, $$textref);
253
254 return 1;
255}
256
257sub wrap_text_in_columns
258{
259 my ($text, $columnwidth) = @_;
260 my $newtext = "";
261 my $linelength = 0;
262
263 # Break the text into words, and display one at a time
264 my @words = split(/ /, $text);
265
266 foreach my $word (@words) {
267 # If printing this word would exceed the column end, start a new line
268 if (($linelength + length($word)) >= $columnwidth) {
269 $newtext .= "\n";
270 $linelength = 0;
271 }
272
273 # Write the word
274 $newtext .= " $word";
275 if ($word =~ /\n/) {
276 $linelength = 0;
277 } else {
278 $linelength = $linelength + length(" $word");
279 }
280 }
281
282 $newtext .= "\n";
283 return $newtext;
284}
285
286
287sub extract_metadata
288{
289 my $self = shift (@_);
290 my ($marc, $metadata, $encoding, $doc_obj, $section) = @_;
291 my $outhandle = $self->{'outhandle'};
292
293 my $metadata_mapping = $self->{'metadata_mapping'};
294 my $mm;
295 foreach $mm ( @$metadata_mapping )
296 {
297 my $marc_field = $mm->{'marc'};
298 my @metavalues = $marc->field($marc_field);
299
300 if (scalar(@metavalues)>0)
301 {
302 my $metaname = $mm->{'gsdl'};
303 my $metavalue;
304 foreach $metavalue ( @metavalues )
305 {
306 my $metavalue_str = $self->to_utf8($encoding,$metavalue->as_string());
307 $doc_obj->add_utf8_metadata ($section, $metaname, $metavalue_str);
308 }
309 }
310 }
311}
3121;
Note: See TracBrowser for help on using the repository browser.