source: trunk/gsdl/perllib/plugins/MARCPlug.pm@ 8915

Last change on this file since 8915 was 8121, checked in by chi, 20 years ago

Add the "FileFormat" metadata to each of the Plugins.

  • Property svn:keywords set to Author Date Id Revision
File size: 8.1 KB
Line 
1###########################################################################
2#
3# MARCPlug.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlug;
28
29use SplitPlug;
30
31use unicode;
32use util;
33use parsargv;
34
35sub BEGIN {
36 @ISA = ('SplitPlug');
37 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
38}
39
40my $arguments =
41 [ { 'name' => "metadata_mapping",
42 'desc' => "{MARCPlug.metadata_mapping}",
43 'type' => "string",
44 'deft' => "marctodc.txt",
45 'reqd' => "no" },
46 { 'name' => "process_exp",
47 'desc' => "{BasPlug.process_exp}",
48 'type' => "regexp",
49 'reqd' => "no",
50 'deft' => &get_default_process_exp() },
51 { 'name' => "split_exp",
52 'desc' => "{SplitPlug.split_exp}",
53 'type' => "regexp",
54 'reqd' => "no",
55 'deft' => &get_default_split_exp() }
56 ];
57
58my $options = { 'name' => "MARCPlug",
59 'desc' => "{MARCPlug.desc}",
60 'abstract' => "no",
61 'inherits' => "Yes",
62 'args' => $arguments };
63
64require MARC::Record;
65require MARC::Batch;
66#use MARC::Record;
67#use MARC::Batch;
68
69sub new {
70 my $class = shift (@_);
71 my $self = new SplitPlug ($class, @_);
72 $self->{'plugin_type'} = "MARCPlug";
73 my $metadata_mapping;
74
75 if (!parsargv::parse(\@_,
76 q^metadata_mapping/.*/marctodc.txt^, \$metadata_mapping,
77 "allow_extra_options")) {
78
79 print STDERR "\nIncorrect options passed to MARCPlug, check your collect.cfg configuration file\n";
80 $self->print_txt_usage(""); # Use default resource bundle
81 die "\n";
82 }
83
84
85 $self->{'mm_file'} = $metadata_mapping; # relative to etc dir
86
87 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
88 my $option_list = $self->{'option_list'};
89 push( @{$option_list}, $options );
90
91 return bless $self, $class;
92}
93
94sub init {
95 my $self = shift (@_);
96 my ($verbosity, $outhandle, $failhandle) = @_;
97
98 my @metadata_mapping = ();
99
100 # read in the metadata mapping file
101 my $mm_file =
102 &util::filename_cat( $ENV{'GSDLHOME'}, "etc", $self->{'mm_file'} );
103
104 if (!-e $mm_file)
105 {
106
107 my $msg = "MARCPlug ERROR: Can't locate mapping file \"" .
108 $self->{'mm_file'} . "\".\n This file should be at $mm_file\n" .
109 " No marc files can be processed.\n";
110
111 print $outhandle $msg;
112 print $failhandle $msg;
113 $self->{'mm_file'} = undef;
114 # We pick up the error in process() if there is no $mm_file
115 # If we exit here, then pluginfo.pl will exit too!
116 }
117 elsif (open(MMIN, "<$mm_file"))
118 {
119 my $l=1;
120 my $line;
121 while (defined($line=<MMIN>))
122 {
123 chomp $line;
124 if ($line =~ m/^(\d+)\s*->\s*(\w+)$/)
125 {
126 my $marc_info = $1;
127 my $gsdl_info = $2;
128 my $mapping = { 'marc' => $marc_info, 'gsdl' => $gsdl_info };
129 push(@metadata_mapping,$mapping);
130 }
131 elsif ($line !~ m/^\#/ # allow comments (# in first column)
132 && $line !~ m/^\s*$/) # allow blank lines
133 {
134 print $outhandle "Parse error on line $l of $mm_file:\n";
135 print $outhandle " \"$line\"\n";
136 }
137 $l++
138 }
139 close(MMIN);
140 }
141 else
142 {
143 print STDERR "Unable to open $mm_file: $!\n";
144 }
145
146 $self->{'metadata_mapping'} = \@metadata_mapping;
147
148 $self->SUPER::init(@_);
149}
150
151
152sub get_default_process_exp {
153 my $self = shift (@_);
154
155 return q^(?i)(\.marc)$^;
156}
157
158
159sub get_default_split_exp {
160 # \r\n for msdos eol, \n for unix
161 return q^\r?\n\s*\r?\n^;
162}
163
164
165
166# The bulk of this function is based on read_line in multiread.pm
167# Unable to use read_line original because it expects to get its input
168# from a file. Here the line to be converted is passed in as a string
169
170sub to_utf8
171{
172 my $self = shift (@_);
173 my ($encoding, $line) = @_;
174
175 if ($encoding eq "iso_8859_1") {
176 # we'll use ascii2utf8() for this as it's faster than going
177 # through convert2unicode()
178 return &unicode::ascii2utf8 (\$line);
179 }
180
181 # everything else uses unicode::convert2unicode
182 return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
183}
184
185
186sub read_file {
187 my $self = shift (@_);
188 my ($filename, $encoding, $language, $textref) = @_;
189
190 $self->{'readfile_encoding'} = $encoding;
191
192 @marc_entries = ();
193
194 if (!-r $filename)
195 {
196 my $outhandle = $self->{'outhandle'};
197 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
198 return;
199 }
200
201 my $batch = new MARC::Batch( 'USMARC', $filename );
202 while ( my $marc = $batch->next )
203 {
204 push(@marc_entries,$marc);
205 $$textref .= $marc->as_formatted();
206 $$textref .= "\n\n"; # for SplitPlug - see default_split_exp above...
207 }
208
209 $self->{'marc_entries'} = \@marc_entries;
210}
211
212
213
214# do plugin specific processing of doc_obj
215# This gets done for each record found by SplitPlug in marc files.
216sub process {
217 my $self = shift (@_);
218 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
219 my $outhandle = $self->{'outhandle'};
220
221 if (! defined($self->{'mm_file'}))
222 {
223 print $outhandle "MARCPlug: no metadata file! Can't process $file\n";
224 return undef;
225 }
226
227 print STDERR "<Processing n='$file' p='MARCPlug'>\n" if ($gli);
228 print $outhandle "MARCPlug: processing $file\n"
229 if $self->{'verbosity'} > 1;
230
231 my $cursection = $doc_obj->get_top_section();
232
233 # Add fileFormat as the metadata
234 $doc_obj->add_metadata($cursection, "FileFormat", "MARC");
235
236 my $marc_entries = $self->{'marc_entries'};
237 my $marc = shift(@$marc_entries);
238
239 $self->extract_metadata ($marc, $metadata, $doc_obj, $cursection);
240
241 # add spaces after the sub-field markers, for word boundaries
242 $$textref =~ s/^(.{6} _\w)/$1 /gm;
243
244 # add text to document object
245 $$textref =~ s/</&lt;/g;
246 $$textref =~ s/>/&gt;/g;
247
248 my $encoding = $self->{'readfile_encoding'};
249 $$textref = $self->to_utf8($encoding,$$textref);
250
251 print $outhandle " Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
252 if $self->{'verbosity'} > 2;
253
254 # line wrapping
255 $$textref = &wrap_text_in_columns($$textref, 64);
256 $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
257
258 $doc_obj->add_utf8_text($cursection, $$textref);
259
260 return 1;
261}
262
263sub wrap_text_in_columns
264{
265 my ($text, $columnwidth) = @_;
266 my $newtext = "";
267 my $linelength = 0;
268
269 # Break the text into words, and display one at a time
270 my @words = split(/ /, $text);
271
272 foreach $word (@words) {
273 # If printing this word would exceed the column end, start a new line
274 if (($linelength + length($word)) >= $columnwidth) {
275 $newtext .= "\n";
276 $linelength = 0;
277 }
278
279 # Write the word
280 $newtext .= " $word";
281 if ($word =~ /\n/) {
282 $linelength = 0;
283 } else {
284 $linelength = $linelength + length(" $word");
285 }
286 }
287
288 $newtext .= "\n";
289 return $newtext;
290}
291
292
293sub extract_metadata
294{
295 my $self = shift (@_);
296 my ($marc, $metadata, $doc_obj, $section) = @_;
297 my $outhandle = $self->{'outhandle'};
298
299 my $encoding = $self->{'readfile_encoding'};
300
301 my $metadata_mapping = $self->{'metadata_mapping'};
302 my $mm;
303 foreach $mm ( @$metadata_mapping )
304 {
305 my $marc_field = $mm->{'marc'};
306 my @metavalues = $marc->field($marc_field);
307
308 if (scalar(@metavalues)>0)
309 {
310 my $metaname = $mm->{'gsdl'};
311 my $metavalue;
312 foreach $metavalue ( @metavalues )
313 {
314 $metavalue_str = $self->to_utf8($encoding,$metavalue->as_string());
315 $doc_obj->add_utf8_metadata ($thissection, $metaname, $metavalue_str);
316 }
317 }
318 }
319}
3201;
Note: See TracBrowser for help on using the repository browser.