source: trunk/gsdl/perllib/plugins/MARCPlug.pm@ 11090

Last change on this file since 11090 was 10347, checked in by kjdon, 19 years ago

removed the unneeded 'use parsargv'

  • Property svn:keywords set to Author Date Id Revision
File size: 8.1 KB
Line 
1###########################################################################
2#
3# MARCPlug.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlug;
28
29use SplitPlug;
30
31use unicode;
32use util;
33
34use strict;
35no strict 'refs'; # allow filehandles to be variables and viceversa
36
37sub BEGIN {
38 @MARCPlug::ISA = ('SplitPlug');
39 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
40}
41
42my $arguments =
43 [ { 'name' => "metadata_mapping",
44 'desc' => "{MARCPlug.metadata_mapping}",
45 'type' => "string",
46 'deft' => "marctodc.txt",
47 'reqd' => "no" },
48 { 'name' => "process_exp",
49 'desc' => "{BasPlug.process_exp}",
50 'type' => "regexp",
51 'reqd' => "no",
52 'deft' => &get_default_process_exp() },
53 { 'name' => "split_exp",
54 'desc' => "{SplitPlug.split_exp}",
55 'type' => "regexp",
56 'reqd' => "no",
57 'deft' => &get_default_split_exp() }
58 ];
59
60my $options = { 'name' => "MARCPlug",
61 'desc' => "{MARCPlug.desc}",
62 'abstract' => "no",
63 'inherits' => "Yes",
64 'explodes' => "yes",
65 'args' => $arguments };
66
67require MARC::Record;
68require MARC::Batch;
69#use MARC::Record;
70#use MARC::Batch;
71
72sub new {
73 my ($class) = shift (@_);
74 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
75 push(@$pluginlist, $class);
76
77 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
78 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
79
80 my $self = (defined $hashArgOptLists)? new SplitPlug($pluginlist,$inputargs,$hashArgOptLists): new SplitPlug($pluginlist,$inputargs);
81
82 return bless $self, $class;
83}
84
85sub init {
86 my $self = shift (@_);
87 my ($verbosity, $outhandle, $failhandle) = @_;
88
89 my @metadata_mapping = ();
90
91 # read in the metadata mapping file
92 my $mm_file =
93 &util::filename_cat( $ENV{'GSDLHOME'}, "etc", $self->{'metadata_mapping'} );
94
95 if (!-e $mm_file)
96 {
97
98 my $msg = "MARCPlug ERROR: Can't locate mapping file \"" .
99 $self->{'metadata_mapping'} . "\".\n This file should be at $mm_file\n" .
100 " No marc files can be processed.\n";
101
102 print $outhandle $msg;
103 print $failhandle $msg;
104 $self->{'metadata_mapping'} = undef;
105 # We pick up the error in process() if there is no $mm_file
106 # If we exit here, then pluginfo.pl will exit too!
107 }
108 elsif (open(MMIN, "<$mm_file"))
109 {
110 my $l=1;
111 my $line;
112 while (defined($line=<MMIN>))
113 {
114 chomp $line;
115 if ($line =~ m/^(\d+)\s*->\s*(\w+)$/)
116 {
117 my $marc_info = $1;
118 my $gsdl_info = $2;
119 my $mapping = { 'marc' => $marc_info, 'gsdl' => $gsdl_info };
120 push(@metadata_mapping,$mapping);
121 }
122 elsif ($line !~ m/^\#/ # allow comments (# in first column)
123 && $line !~ m/^\s*$/) # allow blank lines
124 {
125 print $outhandle "Parse error on line $l of $mm_file:\n";
126 print $outhandle " \"$line\"\n";
127 }
128 $l++
129 }
130 close(MMIN);
131 }
132 else
133 {
134 print STDERR "Unable to open $mm_file: $!\n";
135 }
136
137 $self->{'metadata_mapping'} = \@metadata_mapping;
138
139 $self->SUPER::init(@_);
140}
141
142
143sub get_default_process_exp {
144 my $self = shift (@_);
145
146 return q^(?i)(\.marc)$^;
147}
148
149
150sub get_default_split_exp {
151 # \r\n for msdos eol, \n for unix
152 return q^\r?\n\s*\r?\n^;
153}
154
155
156
157# The bulk of this function is based on read_line in multiread.pm
158# Unable to use read_line original because it expects to get its input
159# from a file. Here the line to be converted is passed in as a string
160
161sub to_utf8
162{
163 my $self = shift (@_);
164 my ($encoding, $line) = @_;
165
166 if ($encoding eq "iso_8859_1") {
167 # we'll use ascii2utf8() for this as it's faster than going
168 # through convert2unicode()
169 return &unicode::ascii2utf8 (\$line);
170 }
171
172 # everything else uses unicode::convert2unicode
173 return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
174}
175
176
177sub read_file {
178 my $self = shift (@_);
179 my ($filename, $encoding, $language, $textref) = @_;
180
181 $self->{'readfile_encoding'}->{$filename} = $encoding;
182
183 my @marc_entries = ();
184
185 if (!-r $filename)
186 {
187 my $outhandle = $self->{'outhandle'};
188 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
189 return;
190 }
191
192 my $batch = new MARC::Batch( 'USMARC', $filename );
193 while ( my $marc = $batch->next )
194 {
195 push(@marc_entries,$marc);
196 $$textref .= $marc->as_formatted();
197 $$textref .= "\n\n"; # for SplitPlug - see default_split_exp above...
198 }
199
200 $self->{'marc_entries'}->{$filename} = \@marc_entries;
201}
202
203
204
205# do plugin specific processing of doc_obj
206# This gets done for each record found by SplitPlug in marc files.
207sub process {
208 my $self = shift (@_);
209 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
210
211 my $outhandle = $self->{'outhandle'};
212 my $filename = &util::filename_cat($base_dir, $file);
213
214 if (! defined($self->{'metadata_mapping'}))
215 {
216 print $outhandle "MARCPlug: no metadata file! Can't process $file\n";
217 return undef;
218 }
219
220 print STDERR "<Processing n='$file' p='MARCPlug'>\n" if ($gli);
221 print $outhandle "MARCPlug: processing $file\n"
222 if $self->{'verbosity'} > 1;
223
224 my $cursection = $doc_obj->get_top_section();
225
226 # Add fileFormat as the metadata
227 $doc_obj->add_metadata($cursection, "FileFormat", "MARC");
228
229 my $marc_entries = $self->{'marc_entries'}->{$filename};
230 my $marc = shift(@$marc_entries);
231
232 my $encoding = $self->{'readfile_encoding'}->{$filename};
233 $self->extract_metadata ($marc, $metadata, $encoding, $doc_obj, $cursection);
234
235 # add spaces after the sub-field markers, for word boundaries
236 $$textref =~ s/^(.{6} _\w)/$1 /gm;
237
238 # add text to document object
239 $$textref =~ s/</&lt;/g;
240 $$textref =~ s/>/&gt;/g;
241
242 $$textref = $self->to_utf8($encoding,$$textref);
243
244 print $outhandle " Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
245 if $self->{'verbosity'} > 2;
246
247 # line wrapping
248 $$textref = &wrap_text_in_columns($$textref, 64);
249 $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
250
251 $doc_obj->add_utf8_text($cursection, $$textref);
252
253 return 1;
254}
255
256sub wrap_text_in_columns
257{
258 my ($text, $columnwidth) = @_;
259 my $newtext = "";
260 my $linelength = 0;
261
262 # Break the text into words, and display one at a time
263 my @words = split(/ /, $text);
264
265 foreach my $word (@words) {
266 # If printing this word would exceed the column end, start a new line
267 if (($linelength + length($word)) >= $columnwidth) {
268 $newtext .= "\n";
269 $linelength = 0;
270 }
271
272 # Write the word
273 $newtext .= " $word";
274 if ($word =~ /\n/) {
275 $linelength = 0;
276 } else {
277 $linelength = $linelength + length(" $word");
278 }
279 }
280
281 $newtext .= "\n";
282 return $newtext;
283}
284
285
286sub extract_metadata
287{
288 my $self = shift (@_);
289 my ($marc, $metadata, $encoding, $doc_obj, $section) = @_;
290 my $outhandle = $self->{'outhandle'};
291
292 my $metadata_mapping = $self->{'metadata_mapping'};
293 my $mm;
294 foreach $mm ( @$metadata_mapping )
295 {
296 my $marc_field = $mm->{'marc'};
297 my @metavalues = $marc->field($marc_field);
298
299 if (scalar(@metavalues)>0)
300 {
301 my $metaname = $mm->{'gsdl'};
302 my $metavalue;
303 foreach $metavalue ( @metavalues )
304 {
305 my $metavalue_str = $self->to_utf8($encoding,$metavalue->as_string());
306 $doc_obj->add_utf8_metadata ($section, $metaname, $metavalue_str);
307 }
308 }
309 }
310}
3111;
Note: See TracBrowser for help on using the repository browser.