source: trunk/gsdl/perllib/plugins/MARCPlug.pm@ 10218

Last change on this file since 10218 was 10218, checked in by kjdon, 19 years ago

Jeffrey's new parsing modifications, committed approx 6 July, 15.16

  • Property svn:keywords set to Author Date Id Revision
File size: 8.0 KB
Line 
1###########################################################################
2#
3# MARCPlug.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlug;
28
29use SplitPlug;
30
31use unicode;
32use util;
33use parsargv;
34
35sub BEGIN {
36 @ISA = ('SplitPlug');
37 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
38}
39
40my $arguments =
41 [ { 'name' => "metadata_mapping",
42 'desc' => "{MARCPlug.metadata_mapping}",
43 'type' => "string",
44 'deft' => "marctodc.txt",
45 'reqd' => "no" },
46 { 'name' => "process_exp",
47 'desc' => "{BasPlug.process_exp}",
48 'type' => "regexp",
49 'reqd' => "no",
50 'deft' => &get_default_process_exp() },
51 { 'name' => "split_exp",
52 'desc' => "{SplitPlug.split_exp}",
53 'type' => "regexp",
54 'reqd' => "no",
55 'deft' => &get_default_split_exp() }
56 ];
57
58my $options = { 'name' => "MARCPlug",
59 'desc' => "{MARCPlug.desc}",
60 'abstract' => "no",
61 'inherits' => "Yes",
62 'explodes' => "yes",
63 'args' => $arguments };
64
65require MARC::Record;
66require MARC::Batch;
67#use MARC::Record;
68#use MARC::Batch;
69
70sub new {
71 my ($class) = shift (@_);
72 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
73 push(@$pluginlist, $class);
74
75 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
76 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
77
78 my $self = (defined $hashArgOptLists)? new SplitPlug($pluginlist,$inputargs,$hashArgOptLists): new SplitPlug($pluginlist,$inputargs);
79
80 return bless $self, $class;
81}
82
83sub init {
84 my $self = shift (@_);
85 my ($verbosity, $outhandle, $failhandle) = @_;
86
87 my @metadata_mapping = ();
88
89 # read in the metadata mapping file
90 my $mm_file =
91 &util::filename_cat( $ENV{'GSDLHOME'}, "etc", $self->{'metadata_mapping'} );
92
93 if (!-e $mm_file)
94 {
95
96 my $msg = "MARCPlug ERROR: Can't locate mapping file \"" .
97 $self->{'metadata_mapping'} . "\".\n This file should be at $mm_file\n" .
98 " No marc files can be processed.\n";
99
100 print $outhandle $msg;
101 print $failhandle $msg;
102 $self->{'metadata_mapping'} = undef;
103 # We pick up the error in process() if there is no $mm_file
104 # If we exit here, then pluginfo.pl will exit too!
105 }
106 elsif (open(MMIN, "<$mm_file"))
107 {
108 my $l=1;
109 my $line;
110 while (defined($line=<MMIN>))
111 {
112 chomp $line;
113 if ($line =~ m/^(\d+)\s*->\s*(\w+)$/)
114 {
115 my $marc_info = $1;
116 my $gsdl_info = $2;
117 my $mapping = { 'marc' => $marc_info, 'gsdl' => $gsdl_info };
118 push(@metadata_mapping,$mapping);
119 }
120 elsif ($line !~ m/^\#/ # allow comments (# in first column)
121 && $line !~ m/^\s*$/) # allow blank lines
122 {
123 print $outhandle "Parse error on line $l of $mm_file:\n";
124 print $outhandle " \"$line\"\n";
125 }
126 $l++
127 }
128 close(MMIN);
129 }
130 else
131 {
132 print STDERR "Unable to open $mm_file: $!\n";
133 }
134
135 $self->{'metadata_mapping'} = \@metadata_mapping;
136
137 $self->SUPER::init(@_);
138}
139
140
141sub get_default_process_exp {
142 my $self = shift (@_);
143
144 return q^(?i)(\.marc)$^;
145}
146
147
148sub get_default_split_exp {
149 # \r\n for msdos eol, \n for unix
150 return q^\r?\n\s*\r?\n^;
151}
152
153
154
155# The bulk of this function is based on read_line in multiread.pm
156# Unable to use read_line original because it expects to get its input
157# from a file. Here the line to be converted is passed in as a string
158
159sub to_utf8
160{
161 my $self = shift (@_);
162 my ($encoding, $line) = @_;
163
164 if ($encoding eq "iso_8859_1") {
165 # we'll use ascii2utf8() for this as it's faster than going
166 # through convert2unicode()
167 return &unicode::ascii2utf8 (\$line);
168 }
169
170 # everything else uses unicode::convert2unicode
171 return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
172}
173
174
175sub read_file {
176 my $self = shift (@_);
177 my ($filename, $encoding, $language, $textref) = @_;
178
179 $self->{'readfile_encoding'}->{$filename} = $encoding;
180
181 my @marc_entries = ();
182
183 if (!-r $filename)
184 {
185 my $outhandle = $self->{'outhandle'};
186 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
187 return;
188 }
189
190 my $batch = new MARC::Batch( 'USMARC', $filename );
191 while ( my $marc = $batch->next )
192 {
193 push(@marc_entries,$marc);
194 $$textref .= $marc->as_formatted();
195 $$textref .= "\n\n"; # for SplitPlug - see default_split_exp above...
196 }
197
198 $self->{'marc_entries'}->{$filename} = \@marc_entries;
199}
200
201
202
203# do plugin specific processing of doc_obj
204# This gets done for each record found by SplitPlug in marc files.
205sub process {
206 my $self = shift (@_);
207 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
208
209 my $outhandle = $self->{'outhandle'};
210 my $filename = &util::filename_cat($base_dir, $file);
211
212 if (! defined($self->{'metadata_mapping'}))
213 {
214 print $outhandle "MARCPlug: no metadata file! Can't process $file\n";
215 return undef;
216 }
217
218 print STDERR "<Processing n='$file' p='MARCPlug'>\n" if ($gli);
219 print $outhandle "MARCPlug: processing $file\n"
220 if $self->{'verbosity'} > 1;
221
222 my $cursection = $doc_obj->get_top_section();
223
224 # Add fileFormat as the metadata
225 $doc_obj->add_metadata($cursection, "FileFormat", "MARC");
226
227 my $marc_entries = $self->{'marc_entries'}->{$filename};
228 my $marc = shift(@$marc_entries);
229
230 my $encoding = $self->{'readfile_encoding'}->{$filename};
231 $self->extract_metadata ($marc, $metadata, $encoding, $doc_obj, $cursection);
232
233 # add spaces after the sub-field markers, for word boundaries
234 $$textref =~ s/^(.{6} _\w)/$1 /gm;
235
236 # add text to document object
237 $$textref =~ s/</&lt;/g;
238 $$textref =~ s/>/&gt;/g;
239
240 $$textref = $self->to_utf8($encoding,$$textref);
241
242 print $outhandle " Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
243 if $self->{'verbosity'} > 2;
244
245 # line wrapping
246 $$textref = &wrap_text_in_columns($$textref, 64);
247 $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
248
249 $doc_obj->add_utf8_text($cursection, $$textref);
250
251 return 1;
252}
253
254sub wrap_text_in_columns
255{
256 my ($text, $columnwidth) = @_;
257 my $newtext = "";
258 my $linelength = 0;
259
260 # Break the text into words, and display one at a time
261 my @words = split(/ /, $text);
262
263 foreach $word (@words) {
264 # If printing this word would exceed the column end, start a new line
265 if (($linelength + length($word)) >= $columnwidth) {
266 $newtext .= "\n";
267 $linelength = 0;
268 }
269
270 # Write the word
271 $newtext .= " $word";
272 if ($word =~ /\n/) {
273 $linelength = 0;
274 } else {
275 $linelength = $linelength + length(" $word");
276 }
277 }
278
279 $newtext .= "\n";
280 return $newtext;
281}
282
283
284sub extract_metadata
285{
286 my $self = shift (@_);
287 my ($marc, $metadata, $encoding, $doc_obj, $section) = @_;
288 my $outhandle = $self->{'outhandle'};
289
290 my $metadata_mapping = $self->{'metadata_mapping'};
291 my $mm;
292 foreach $mm ( @$metadata_mapping )
293 {
294 my $marc_field = $mm->{'marc'};
295 my @metavalues = $marc->field($marc_field);
296
297 if (scalar(@metavalues)>0)
298 {
299 my $metaname = $mm->{'gsdl'};
300 my $metavalue;
301 foreach $metavalue ( @metavalues )
302 {
303 $metavalue_str = $self->to_utf8($encoding,$metavalue->as_string());
304 $doc_obj->add_utf8_metadata ($thissection, $metaname, $metavalue_str);
305 }
306 }
307 }
308}
3091;
Note: See TracBrowser for help on using the repository browser.