source: trunk/gsdl/perllib/plugins/MARCPlug.pm@ 7243

Last change on this file since 7243 was 6408, checked in by jmt12, 20 years ago

Added two new attributes for script arguments. HiddenGLI controls whether the argument will be visible at all in GLI, while ModeGLI defines the lowest detail mode under which the argument will be visible (only really for import and buildcol). Also ensured that the scripts were reporting their correct default process expressions, and further refined argument types by adding the catagory regexp for any regular expression (which can then be hidden under lower detail modes in GLI)

  • Property svn:keywords set to Author Date Id Revision
File size: 7.7 KB
Line 
1###########################################################################
2#
3# MARCPlug.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlug;
28
29use SplitPlug;
30
31use unicode;
32use util;
33use parsargv;
34
35sub BEGIN {
36 @ISA = ('SplitPlug');
37 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
38}
39
40my $arguments =
41 [ { 'name' => "metadata_mapping",
42 'desc' => "{MARCPlug.metadata_mapping}",
43 'type' => "string",
44 'deft' => "marctodc.txt",
45 'reqd' => "no" },
46 { 'name' => "process_exp",
47 'desc' => "{BasPlug.process_exp}",
48 'type' => "regexp",
49 'reqd' => "no",
50 'deft' => &get_default_process_exp() },
51 { 'name' => "split_exp",
52 'desc' => "{SplitPlug.split_exp}",
53 'type' => "regexp",
54 'reqd' => "no",
55 'deft' => &get_default_split_exp() }
56 ];
57
58my $options = { 'name' => "MARCPlug",
59 'desc' => "{MARCPlug.desc}",
60 'abstract' => "no",
61 'inherits' => "Yes",
62 'args' => $arguments };
63
64require MARC::Record;
65require MARC::Batch;
66#use MARC::Record;
67#use MARC::Batch;
68
69# sub print_usage {
70# print STDERR "\n usage: plugin MARCPlug [options]\n\n";
71# print STDERR " options:\n";
72# print STDERR
73# " -metadata_mapping Name of file that includes mapping details from MARC
74# values to Greenstone metadata names. Defaults to
75# 'marctodc.txt' found in the site's etc directory.\n\n";
76# }
77
78sub new {
79 my $class = shift (@_);
80 my $self = new SplitPlug ($class, @_);
81 $self->{'plugin_type'} = "MARCPlug";
82 my $metadata_mapping;
83
84 if (!parsargv::parse(\@_,
85 q^metadata_mapping/.*/marctodc.txt^, \$metadata_mapping,
86 "allow_extra_options")) {
87
88 print STDERR "\nIncorrect options passed to MARCPlug, check your collect.cfg configuration file\n";
89 $self->print_txt_usage(""); # Use default resource bundle
90 die "\n";
91 }
92
93
94 $self->{'mm_file'} = $metadata_mapping; # relative to etc dir
95
96 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
97 my $option_list = $self->{'option_list'};
98 push( @{$option_list}, $options );
99
100 return bless $self, $class;
101}
102
103sub init {
104 my $self = shift (@_);
105 my ($verbosity, $outhandle, $failhandle) = @_;
106
107 my @metadata_mapping = ();
108
109 # read in the metadata mapping file
110 my $mm_file =
111 &util::filename_cat( $ENV{'GSDLHOME'}, "etc", $self->{'mm_file'} );
112
113 if (!-e $mm_file)
114 {
115
116 my $msg = "MARCPlug ERROR: Can't locate mapping file \"" .
117 $self->{'mm_file'} . "\".\n This file should be at $mm_file\n" .
118 " No marc files can be processed.\n";
119
120 print $outhandle $msg;
121 print $failhandle $msg;
122 $self->{'mm_file'} = undef;
123 # We pick up the error in process() if there is no $mm_file
124 # If we exit here, then pluginfo.pl will exit too!
125 }
126 elsif (open(MMIN, "<$mm_file"))
127 {
128 my $l=1;
129 my $line;
130 while (defined($line=<MMIN>))
131 {
132 chomp $line;
133 if ($line =~ m/^(\d+)\s*->\s*(\w+)$/)
134 {
135 my $marc_info = $1;
136 my $gsdl_info = $2;
137 my $mapping = { 'marc' => $marc_info, 'gsdl' => $gsdl_info };
138 push(@metadata_mapping,$mapping);
139 }
140 elsif ($line !~ m/^\#/ # allow comments (# in first column)
141 && $line !~ m/^\s*$/) # allow blank lines
142 {
143 print $outhandle "Parse error on line $l of $mm_file:\n";
144 print $outhandle " \"$line\"\n";
145 }
146 $l++
147 }
148 close(MMIN);
149 }
150 else
151 {
152 print STDERR "Unable to open $mm_file: $!\n";
153 }
154
155 $self->{'metadata_mapping'} = \@metadata_mapping;
156
157 $self->SUPER::init(@_);
158}
159
160
161sub get_default_process_exp {
162 my $self = shift (@_);
163
164 return q^(?i)(\.marc)$^;
165}
166
167
168sub get_default_split_exp {
169 # \r\n for msdos eol, \n for unix
170 return q^\r?\n\s*\r?\n^;
171}
172
173
174
175# The bulk of this function is based on read_line in multiread.pm
176# Unable to use read_line original because it expects to get its input
177# from a file. Here the line to be converted is passed in as a string
178
179sub to_utf8
180{
181 my $self = shift (@_);
182 my ($encoding, $line) = @_;
183
184 if ($encoding eq "iso_8859_1") {
185 # we'll use ascii2utf8() for this as it's faster than going
186 # through convert2unicode()
187 return &unicode::ascii2utf8 (\$line);
188 }
189
190 # everything else uses unicode::convert2unicode
191 return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
192}
193
194
195sub read_file {
196 my $self = shift (@_);
197 my ($filename, $encoding, $language, $textref) = @_;
198
199 $self->{'readfile_encoding'} = $encoding;
200
201 @marc_entries = ();
202
203 if (!-r $filename)
204 {
205 my $outhandle = $self->{'outhandle'};
206 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
207 return;
208 }
209
210 my $batch = new MARC::Batch( 'USMARC', $filename );
211 while ( my $marc = $batch->next )
212 {
213 push(@marc_entries,$marc);
214 $$textref .= $marc->as_formatted();
215 $$textref .= "\n\n"; # for SplitPlug - see default_split_exp above...
216 }
217
218 $self->{'marc_entries'} = \@marc_entries;
219}
220
221
222
223# do plugin specific processing of doc_obj
224# This gets done for each record found by SplitPlug in marc files.
225sub process {
226 my $self = shift (@_);
227 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
228 my $outhandle = $self->{'outhandle'};
229
230 if (! defined($self->{'mm_file'}))
231 {
232 print $outhandle "MARCPlug: no metadata file! Can't process $file\n";
233 return undef;
234 }
235
236 print STDERR "<Processing n='$file' p='MARCPlug'>\n" if ($gli);
237 print $outhandle "MARCPlug: processing $file\n"
238 if $self->{'verbosity'} > 1;
239
240 my $cursection = $doc_obj->get_top_section();
241
242 my $marc_entries = $self->{'marc_entries'};
243 my $marc = shift(@$marc_entries);
244
245 $self->extract_metadata ($marc, $metadata, $doc_obj, $cursection);
246
247 # add spaces after the sub-field markers, for word boundaries
248 $$textref =~ s/^(.{6} _\w)/$1 /gm;
249
250 # add text to document object
251 $$textref =~ s/</&lt;/g;
252 $$textref =~ s/>/&gt;/g;
253
254 my $encoding = $self->{'readfile_encoding'};
255 $$textref = $self->to_utf8($encoding,$$textref);
256
257 print $outhandle " Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
258 if $self->{'verbosity'} > 2;
259
260 $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
261
262 $doc_obj->add_utf8_text($cursection, $$textref);
263
264 return 1;
265}
266
267
268
269sub extract_metadata
270{
271 my $self = shift (@_);
272 my ($marc, $metadata, $doc_obj, $section) = @_;
273 my $outhandle = $self->{'outhandle'};
274
275 my $encoding = $self->{'readfile_encoding'};
276
277 my $metadata_mapping = $self->{'metadata_mapping'};
278 my $mm;
279 foreach $mm ( @$metadata_mapping )
280 {
281 my $marc_field = $mm->{'marc'};
282 my @metavalues = $marc->field($marc_field);
283
284 if (scalar(@metavalues)>0)
285 {
286 my $metaname = $mm->{'gsdl'};
287 my $metavalue;
288 foreach $metavalue ( @metavalues )
289 {
290 $metavalue_str = $self->to_utf8($encoding,$metavalue->as_string());
291 $doc_obj->add_utf8_metadata ($thissection, $metaname, $metavalue_str);
292 }
293 }
294 }
295}
2961;
Note: See TracBrowser for help on using the repository browser.