root/gsdl/trunk/perllib/plugins/MARCPlugin.pm @ 18563

Revision 18563, 11.9 KB (checked in by kjdon, 11 years ago)

marc mapping file name change

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# MARCPlugin.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlugin;
28
29use SplitTextFile;
30
31use unicode;
32use util;
33use marcmapping;
34
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38sub BEGIN {
39    @MARCPlugin::ISA = ('SplitTextFile');
40    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
41}
42
43my $arguments =
44    [ { 'name' => "metadata_mapping",
45    'desc' => "{common.deprecated} {MARCPlugin.metadata_mapping}",
46    'type' => "string",
47    'deft' => "",
48    'hiddengli' => "yes", # deprecated in favour of 'metadata_mapping_file'
49    'reqd' => "no" },
50      { 'name' => "metadata_mapping_file",
51    'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
52    'type' => "string",
53    'deft' => "marc2dc.txt",
54    'reqd' => "no" },
55      { 'name' => "process_exp",
56    'desc' => "{BasePlugin.process_exp}",
57    'type' => "regexp",
58    'reqd' => "no",
59    'deft' => &get_default_process_exp() },
60      { 'name' => "split_exp",
61    'desc' => "{SplitTextFile.split_exp}",
62    'type' => "regexp",
63    'reqd' => "no",
64    'deft' => &get_default_split_exp() }
65      ];
66
67my $options = { 'name'     => "MARCPlugin",
68        'desc'     => "{MARCPlugin.desc}",
69        'abstract' => "no",
70        'inherits' => "yes",
71        'explodes' => "yes",
72        'args'     => $arguments };
73
74require MARC::Record; 
75require MARC::Batch; 
76#use MARC::Record; 
77#use MARC::Batch;
78
79sub new {
80    my ($class) = shift (@_);
81    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
82    push(@$pluginlist, $class);
83
84    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
85    push(@{$hashArgOptLists->{"OptList"}},$options);
86
87    my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
88
89    if ($self->{'info_only'}) {
90    # don't worry about the options
91    return bless $self, $class;
92    }
93    # 'metadata_mapping' was used in two ways in the plugin: as a plugin
94    # option (filename) and as a datastructure to represent the mapping.
95    # In MARXXMLPlug (written later) the two are separated: filename is
96    # represented through 'metadata_mapping_file' and the data-structure
97    # mapping left as 'metadata_mapping'
98    # 'metadata_mapping' still present (but hidden in GLI) for
99    # backwards compatibility, but 'metadata_mapping_file' is used by
100    # preference
101
102    if ($self->{'metadata_mapping'} ne "") {
103    print STDERR "MARCPlugin WARNING:: the metadata_mapping option is set but has been deprecated. Please use metadata_mapping_file option instead\n";
104    # If the old version is set, use it.
105    $self->{'metadata_mapping_file'} = $self->{'metadata_mapping'};
106    }
107    $self->{'metadata_mapping'} = undef;
108    $self->{'type'} = "";
109    return bless $self, $class;
110}
111
112sub init {
113    my $self = shift (@_);
114    my ($verbosity, $outhandle, $failhandle) = @_;
115
116    ## the mapping file has already been loaded
117    if (defined $self->{'metadata_mapping'} ){
118    $self->SUPER::init(@_);
119    return;
120    }
121
122    # read in the metadata mapping files
123    my $mm_files = &util::locate_config_files($self->{'metadata_mapping_file'});
124    if (scalar(@$mm_files)==0)
125    {
126    my $msg = "MARCPlugin ERROR: Can't locate mapping file \"" .
127        $self->{'metadata_mapping_file'} . "\".\n " .
128        "    No metadata will be extracted from MARC files.\n";
129
130    print $outhandle $msg;
131    print $failhandle $msg;
132    $self->{'metadata_mapping'} = undef;
133    # We pick up the error in process() if there is no $mm_file
134    # If we exit here, then pluginfo.pl will exit too!
135    }
136    else {
137    $self->{'metadata_mapping'} = &marcmapping::parse_marc_metadata_mapping($mm_files, $outhandle);
138    }
139
140    ##map { print STDERR $_."=>".$self->{'metadata_mapping'}->{$_}."\n"; } keys %{$self->{'metadata_mapping'}};
141
142    $self->SUPER::init(@_);
143}
144
145
146
147sub get_default_process_exp {
148    my $self = shift (@_);
149
150    return q^(?i)(\.marc)$^;
151}
152
153
154sub get_default_split_exp {
155    # \r\n for msdos eol, \n for unix
156    return q^\r?\n\s*\r?\n|\[\w+\]Record type: USmarc^;
157}
158
159
160
161# The bulk of this function is based on read_line in multiread.pm
162# Unable to use read_line original because it expects to get its input
163# from a file.  Here the line to be converted is passed in as a string
164
165sub to_utf8
166{
167    my $self = shift (@_);
168    my ($encoding, $line) = @_;
169
170    if ($encoding eq "utf8") {
171    # nothing needs to be done
172    return $line;
173    }
174
175    if ($encoding eq "iso_8859_1") {
176    # we'll use ascii2utf8() for this as it's faster than going
177    # through convert2unicode()
178    return &unicode::ascii2utf8 (\$line);
179    }
180
181    # everything else uses unicode::convert2unicode
182    return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
183}
184
185
186sub read_file {
187    my $self = shift (@_);
188    my ($filename, $encoding, $language, $textref) = @_;
189
190    my $outhandle = $self->{'outhandle'};
191   
192    if (! defined($self->{'metadata_mapping'}))
193    {
194    # print a warning
195    print $outhandle "MARCPlugin: no metadata mapping file! Can't extract metadata from $filename\n";
196    }
197
198    $self->{'readfile_encoding'}->{$filename} = $encoding;
199
200       
201    if (!-r $filename)
202    {
203    print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
204    return;
205    }
206
207    ##handle ascii marc
208    #test whether this is ascii marc file
209    if (open (FILE, $filename)) {
210    while (defined (my $line = <FILE>)) {
211        $$textref .= $line;
212       if ($line =~ /\[\w+\]Record type:/){
213           undef $/;
214           $$textref .= <FILE>;
215           $/ = "\n";
216           $self->{'type'} = "ascii";
217           close FILE;
218           return;
219       }
220    }
221    close FILE;
222    }
223
224     
225    $$textref = "";
226    my @marc_entries = ();
227 
228    my $batch = new MARC::Batch( 'USMARC', $filename );
229    while ( my $marc = $batch->next )
230    {
231        push(@marc_entries,$marc);
232    $$textref .= $marc->as_formatted();
233    $$textref .= "\n\n"; # for SplitTextFile - see default_split_exp above...
234    }
235
236    $self->{'marc_entries'}->{$filename} = \@marc_entries;
237}
238
239
240
241# do plugin specific processing of doc_obj
242# This gets done for each record found by SplitTextFile in marc files.
243sub process {
244    my $self = shift (@_);
245    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
246
247    my $outhandle = $self->{'outhandle'};
248    my $filename = &util::filename_cat($base_dir, $file);
249
250    my $cursection = $doc_obj->get_top_section();
251
252    # Add fileFormat as the metadata
253    $doc_obj->add_metadata($cursection, "FileFormat", "MARC");
254   
255    my $marc_entries = $self->{'marc_entries'}->{$filename};
256    my $marc = shift(@$marc_entries);
257
258    my $encoding = $self->{'readfile_encoding'}->{$filename};
259
260    if (defined ($self->{'metadata_mapping'}) ) {
261    if ($self->{'type'} ne "ascii" ){   
262        $self->extract_metadata ($marc, $metadata, $encoding, $doc_obj, $cursection);
263    }
264    else{
265        $self->extract_ascii_metadata ($$textref,$metadata,$doc_obj, $cursection);
266    }
267    }
268
269    # add spaces after the sub-field markers, for word boundaries
270    $$textref =~ s/^(.{6} _\w)/$1 /gm;
271
272    # add text to document object
273    $$textref =~ s/</&lt;/g;
274    $$textref =~ s/>/&gt;/g;
275
276    $$textref = $self->to_utf8($encoding,$$textref);
277
278    print $outhandle "  Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
279    if $self->{'verbosity'} > 2;
280
281    # line wrapping
282    $$textref = &wrap_text_in_columns($$textref, 64);
283    $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
284
285    $doc_obj->add_utf8_text($cursection, $$textref);
286
287    return 1;
288}
289
290sub wrap_text_in_columns
291{
292    my ($text, $columnwidth) = @_;
293    my $newtext = "";
294    my $linelength = 0;
295   
296    # Break the text into words, and display one at a time
297    my @words = split(/ /, $text);
298
299    foreach my $word (@words) {
300    # If printing this word would exceed the column end, start a new line
301    if (($linelength + length($word)) >= $columnwidth) {
302        $newtext .= "\n";
303        $linelength = 0;
304    }
305   
306    # Write the word
307    $newtext .= " $word";
308    if ($word =~ /\n/) {
309        $linelength = 0;
310    } else {
311        $linelength = $linelength + length(" $word");
312    }
313    }
314
315    $newtext .= "\n";
316    return $newtext;
317}
318
319
320sub extract_metadata
321{
322    my $self = shift (@_);
323 
324    my ($marc, $metadata, $encoding, $doc_obj, $section) = @_;
325    my $outhandle = $self->{'outhandle'};
326
327    if (!defined $marc){
328    return;
329    }
330
331    my $metadata_mapping = $self->{'metadata_mapping'};;
332
333    foreach my $marc_field ( keys %$metadata_mapping )
334    {
335    my $gsdl_field = $metadata_mapping->{$marc_field};
336   
337    # have we got a subfield?
338    my $subfield = undef;
339    if ($marc_field =~ /(\d\d\d)(?:\$|\^)?(\w)/){
340        $marc_field = $1;
341        $subfield = $2;
342    }
343    if (defined $subfield) {
344        my $meta_value = $marc->subfield($marc_field, $subfield);
345        if (defined $meta_value) {
346        # Square brackets in metadata values need to be escaped so they don't confuse Greenstone/GLI
347        $meta_value =~ s/\[/&\#091;/g;
348        $meta_value =~ s/\]/&\#093;/g;
349        my $metavalue_str = $self->to_utf8($encoding, $meta_value);
350        $doc_obj->add_utf8_metadata ($section, $gsdl_field, $metavalue_str);
351        }
352    }
353    else
354    {
355        foreach my $meta_value_obj ($marc->field($marc_field))
356        {
357        my $meta_value = $meta_value_obj->as_string();
358
359        # Square brackets in metadata values need to be escaped so they don't confuse Greenstone/GLI
360        $meta_value =~ s/\[/&\#091;/g;
361        $meta_value =~ s/\]/&\#093;/g;
362        my $metavalue_str = $self->to_utf8($encoding, $meta_value);
363        $doc_obj->add_utf8_metadata ($section, $gsdl_field, $metavalue_str);
364        }
365    }
366    }
367}
368
369
370sub extract_ascii_metadata
371{
372    my $self = shift (@_);
373
374    my ($text, $metadata,$doc_obj, $section) = @_;
375    my $outhandle = $self->{'outhandle'};
376    my $metadata_mapping = $self->{'metadata_mapping'};
377    ## get fields
378    my @fields = split(/[\n\r]+/,$text);
379    my $marc_mapping ={};
380
381    foreach my $field (@fields){
382    if ($field ne ""){
383        $field =~ /^(\d\d\d)\s/;
384        my $code = $1;
385        $field = $'; #'
386        ##get subfields
387        my @subfields = split(/\$/,$field);
388        my $i=0;
389        $marc_mapping->{$code} = []; 
390        foreach my $subfield (@subfields){
391        if ($i == 0){
392            ##print STDERR $subfield."\n";
393            push(@{$marc_mapping->{$code}},"info");
394            push(@{$marc_mapping->{$code}},$subfield);
395                 
396            $i++;
397        }
398         else{
399             $subfield =~ /(\w)\s/;
400             ##print STDERR "$1=>$'\n";
401             push(@{$marc_mapping->{$code}},$1);
402                     push(@{$marc_mapping->{$code}},$'); #'
403         }
404        }
405    }
406    }
407
408
409     foreach my $marc_field ( keys %$metadata_mapping )
410    {
411       
412    my $matched_field = $marc_mapping->{$marc_field};
413    my $subfield = undef;
414
415    if (defined $matched_field){
416        ## test whether this field has subfield
417        if ($marc_field =~ /\d\d\d(\w)/){
418        $subfield = $1;
419        }
420        my $metaname = $metadata_mapping->{$marc_field};
421 
422        my $metavalue;
423        if (defined $subfield){
424        my %mapped_subfield = {@$matched_field};
425        $metavalue = $mapped_subfield{$subfield};
426        }
427        else{ ## get all values except info
428        my $i =0;
429        foreach my $value (@$matched_field){
430            if ($i%2 != 0 and $i != 1){
431            $metavalue .= $value." ";
432            }
433            $i++;
434        }
435        }
436       
437        ## escape [ and ]
438        $metavalue =~ s/\[/\\\[/g;
439        $metavalue =~ s/\]/\\\]/g;
440        ##print STDERR  "$metaname=$metavalue\n";
441        $doc_obj->add_metadata ($section, $metaname, $metavalue) ;         
442    }
443       
444    }
445
446}
447
448
4491;
Note: See TracBrowser for help on using the browser.