root/main/trunk/greenstone2/perllib/plugins/MARCPlugin.pm @ 27141

Revision 27141, 12.4 KB (checked in by kjdon, 7 years ago)

fixed extract_metadata so that it will get all ocurrences of a subfield, not just the first one. NOTE, haven't done the same for extract_ascii_metadata

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# MARCPlugin.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlugin;
28
29use SplitTextFile;
30use MetadataRead;
31
32use Encode;
33
34use unicode;
35use util;
36use marcmapping;
37
38use strict;
39no strict 'refs'; # allow filehandles to be variables and viceversa
40
41# methods defined in superclasses that have the same signature take
42# precedence in the order given in the ISA list. We want MetaPlugins to
43# call MetadataRead's can_process_this_file_for_metadata(), rather than
44# calling BasePlugin's version of the same method, so list inherited
45# superclasses in this order.
46sub BEGIN {
47    @MARCPlugin::ISA = ('MetadataRead', 'SplitTextFile');
48    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
49}
50
51my $arguments =
52    [ { 'name' => "metadata_mapping",
53    'desc' => "{common.deprecated} {MARCPlugin.metadata_mapping}",
54    'type' => "string",
55    'deft' => "",
56    'hiddengli' => "yes", # deprecated in favour of 'metadata_mapping_file'
57    'reqd' => "no" },
58      { 'name' => "metadata_mapping_file",
59    'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
60    'type' => "string",
61    'deft' => "marc2dc.txt",
62    'reqd' => "no" },
63      { 'name' => "process_exp",
64    'desc' => "{BasePlugin.process_exp}",
65    'type' => "regexp",
66    'reqd' => "no",
67    'deft' => &get_default_process_exp() },
68      { 'name' => "split_exp",
69    'desc' => "{SplitTextFile.split_exp}",
70    'type' => "regexp",
71    'reqd' => "no",
72    'deft' => &get_default_split_exp() }
73      ];
74
75my $options = { 'name'     => "MARCPlugin",
76        'desc'     => "{MARCPlugin.desc}",
77        'abstract' => "no",
78        'inherits' => "yes",
79        'explodes' => "yes",
80        'args'     => $arguments };
81
82require MARC::Record; 
83require MARC::Batch; 
84#use MARC::Record; 
85#use MARC::Batch;
86
87sub new {
88    my ($class) = shift (@_);
89    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
90    push(@$pluginlist, $class);
91
92    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
93    push(@{$hashArgOptLists->{"OptList"}},$options);
94 
95    # this does nothing yet, but if member vars are ever added
96    # to MetadataRead, will need to do this anyway:
97    #new MetadataRead($pluginlist, $inputargs, $hashArgOptLists);
98    my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
99
100    if ($self->{'info_only'}) {
101    # don't worry about the options
102    return bless $self, $class;
103    }
104    # 'metadata_mapping' was used in two ways in the plugin: as a plugin
105    # option (filename) and as a datastructure to represent the mapping.
106    # In MARXXMLPlug (written later) the two are separated: filename is
107    # represented through 'metadata_mapping_file' and the data-structure
108    # mapping left as 'metadata_mapping'
109    # 'metadata_mapping' still present (but hidden in GLI) for
110    # backwards compatibility, but 'metadata_mapping_file' is used by
111    # preference
112
113    if ($self->{'metadata_mapping'} ne "") {
114    print STDERR "MARCPlugin WARNING:: the metadata_mapping option is set but has been deprecated. Please use metadata_mapping_file option instead\n";
115    # If the old version is set, use it.
116    $self->{'metadata_mapping_file'} = $self->{'metadata_mapping'};
117    }
118    $self->{'metadata_mapping'} = undef;
119    $self->{'type'} = "";
120    return bless $self, $class;
121}
122
123sub init {
124    my $self = shift (@_);
125    my ($verbosity, $outhandle, $failhandle) = @_;
126
127    ## the mapping file has already been loaded
128    if (defined $self->{'metadata_mapping'} ){
129    $self->SUPER::init(@_);
130    return;
131    }
132
133    # read in the metadata mapping files
134    my $mm_files = &util::locate_config_files($self->{'metadata_mapping_file'});
135    if (scalar(@$mm_files)==0)
136    {
137    my $msg = "MARCPlugin ERROR: Can't locate mapping file \"" .
138        $self->{'metadata_mapping_file'} . "\".\n " .
139        "    No metadata will be extracted from MARC files.\n";
140
141    print $outhandle $msg;
142    print $failhandle $msg;
143    $self->{'metadata_mapping'} = undef;
144    # We pick up the error in process() if there is no $mm_file
145    # If we exit here, then pluginfo.pl will exit too!
146    }
147    else {
148    $self->{'metadata_mapping'} = &marcmapping::parse_marc_metadata_mapping($mm_files, $outhandle);
149    }
150
151    ##map { print STDERR $_."=>".$self->{'metadata_mapping'}->{$_}."\n"; } keys %{$self->{'metadata_mapping'}};
152
153    $self->SUPER::init(@_);
154}
155
156
157
158sub get_default_process_exp {
159    my $self = shift (@_);
160
161    return q^(?i)(\.marc)$^;
162}
163
164
165sub get_default_split_exp {
166    # \r\n for msdos eol, \n for unix
167    return q^\r?\n\s*\r?\n|\[\w+\]Record type: USmarc^;
168}
169
170
171
172# The bulk of this function is based on read_line in multiread.pm
173# Unable to use read_line original because it expects to get its input
174# from a file.  Here the line to be converted is passed in as a string
175
176sub to_utf8
177{
178    my $self = shift (@_);
179    my ($encoding, $line) = @_;
180
181    if ($encoding eq "utf8") {
182    # nothing needs to be done
183    #return $line;
184    } elsif ($encoding eq "iso_8859_1") {
185    # we'll use ascii2utf8() for this as it's faster than going
186    # through convert2unicode()
187    #return &unicode::ascii2utf8 (\$line);
188    $line = &unicode::ascii2utf8 (\$line);
189    } else {
190
191    # everything else uses unicode::convert2unicode
192    $line = &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
193    }
194    # At this point $line is a binary byte string
195    # => turn it into a Unicode aware string, so full
196    # Unicode aware pattern matching can be used.
197    # For instance: 's/\x{0101}//g' or '[[:upper:]]'
198
199    return decode ("utf8", $line);
200}
201
202
203sub read_file {
204    my $self = shift (@_);
205    my ($filename, $encoding, $language, $textref) = @_;
206
207    my $outhandle = $self->{'outhandle'};
208   
209    if (! defined($self->{'metadata_mapping'}))
210    {
211    # print a warning
212    print $outhandle "MARCPlugin: no metadata mapping file! Can't extract metadata from $filename\n";
213    }
214
215    $self->{'readfile_encoding'}->{$filename} = $encoding;
216
217       
218    if (!-r $filename)
219    {
220    print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
221    return;
222    }
223
224    ##handle ascii marc
225    #test whether this is ascii marc file
226    if (open (FILE, $filename)) {
227    while (defined (my $line = <FILE>)) {
228        $$textref .= $line;
229       if ($line =~ /\[\w+\]Record type:/){
230           undef $/;
231           $$textref .= <FILE>;
232           $/ = "\n";
233           $self->{'type'} = "ascii";
234           close FILE;
235           return;
236       }
237    }
238    close FILE;
239    }
240
241     
242    $$textref = "";
243    my @marc_entries = ();
244 
245    my $batch = new MARC::Batch( 'USMARC', $filename );
246    while ( my $marc = $batch->next )
247    {
248        push(@marc_entries,$marc);
249    $$textref .= $marc->as_formatted();
250    $$textref .= "\n\n"; # for SplitTextFile - see default_split_exp above...
251    }
252
253    $self->{'marc_entries'}->{$filename} = \@marc_entries;
254}
255
256
257
258# do plugin specific processing of doc_obj
259# This gets done for each record found by SplitTextFile in marc files.
260sub process {
261    my $self = shift (@_);
262    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
263
264    my $outhandle = $self->{'outhandle'};
265    my $filename = &util::filename_cat($base_dir, $file);
266
267    my $cursection = $doc_obj->get_top_section();
268
269    # Add fileFormat as the metadata
270    $doc_obj->add_metadata($cursection, "FileFormat", "MARC");
271   
272    my $marc_entries = $self->{'marc_entries'}->{$filename};
273    my $marc = shift(@$marc_entries);
274
275    my $encoding = $self->{'readfile_encoding'}->{$filename};
276    if (defined ($self->{'metadata_mapping'}) ) {
277    if ($self->{'type'} ne "ascii" ){   
278        $self->extract_metadata ($marc, $metadata, $encoding, $doc_obj, $cursection);
279    }
280    else{
281        $self->extract_ascii_metadata ($$textref,$metadata,$doc_obj, $cursection);
282    }
283    }
284
285    # add spaces after the sub-field markers, for word boundaries
286    $$textref =~ s/^(.{6} _\w)/$1 /gm;
287
288    # add text to document object
289    $$textref =~ s/</&lt;/g;
290    $$textref =~ s/>/&gt;/g;
291
292    $$textref = $self->to_utf8($encoding,$$textref);
293
294    print $outhandle "  Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
295    if $self->{'verbosity'} > 2;
296
297    # line wrapping
298    $$textref = &wrap_text_in_columns($$textref, 64);
299    $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
300
301    $doc_obj->add_utf8_text($cursection, $$textref);
302
303    return 1;
304}
305
306sub wrap_text_in_columns
307{
308    my ($text, $columnwidth) = @_;
309    my $newtext = "";
310    my $linelength = 0;
311   
312    # Break the text into words, and display one at a time
313    my @words = split(/ /, $text);
314
315    foreach my $word (@words) {
316    # If printing this word would exceed the column end, start a new line
317    if (($linelength + length($word)) >= $columnwidth) {
318        $newtext .= "\n";
319        $linelength = 0;
320    }
321   
322    # Write the word
323    $newtext .= " $word";
324    if ($word =~ /\n/) {
325        $linelength = 0;
326    } else {
327        $linelength = $linelength + length(" $word");
328    }
329    }
330
331    $newtext .= "\n";
332    return $newtext;
333}
334
335sub extract_metadata
336{
337    my $self = shift (@_);
338 
339    my ($marc, $metadata, $encoding, $doc_obj, $section) = @_;
340    my $outhandle = $self->{'outhandle'};
341
342    if (!defined $marc){
343    return;
344    }
345
346    my $metadata_mapping = $self->{'metadata_mapping'};;
347
348    foreach my $marc_field ( keys %$metadata_mapping )
349    {
350    my $gsdl_field = $metadata_mapping->{$marc_field};
351   
352    # have we got a subfield?
353    my $subfield = undef;
354    if ($marc_field =~ /(\d\d\d)(?:\$|\^)?(\w)/){
355        $marc_field = $1;
356        $subfield = $2;
357    }
358
359    foreach my $meta_value_obj ($marc->field($marc_field)) {
360        my $meta_value;
361        if (defined($subfield)) {
362        $meta_value = $meta_value_obj->subfield($subfield);
363        } else {
364        $meta_value = $meta_value_obj->as_string();
365        }
366        if (defined $meta_value) {
367        # Square brackets in metadata values need to be escaped so they don't confuse Greenstone/GLI
368        $meta_value =~ s/\[/&\#091;/g;
369        $meta_value =~ s/\]/&\#093;/g;
370        my $metavalue_str = $self->to_utf8($encoding, $meta_value);
371        $doc_obj->add_utf8_metadata ($section, $gsdl_field, $metavalue_str);
372        }
373    }
374    }
375}
376
377
378sub extract_ascii_metadata
379{
380    my $self = shift (@_);
381
382    my ($text, $metadata,$doc_obj, $section) = @_;
383    my $outhandle = $self->{'outhandle'};
384    my $metadata_mapping = $self->{'metadata_mapping'};
385    ## get fields
386    my @fields = split(/[\n\r]+/,$text);
387    my $marc_mapping ={};
388
389    foreach my $field (@fields){
390    if ($field ne ""){
391        $field =~ /^(\d\d\d)\s/;
392        my $code = $1;
393        $field = $'; #'
394        ##get subfields
395        my @subfields = split(/\$/,$field);
396        my $i=0;
397        $marc_mapping->{$code} = []; 
398        foreach my $subfield (@subfields){
399        if ($i == 0){
400            ##print STDERR $subfield."\n";
401            push(@{$marc_mapping->{$code}},"info");
402            push(@{$marc_mapping->{$code}},$subfield);
403                 
404            $i++;
405        }
406         else{
407             $subfield =~ /(\w)\s/;
408             ##print STDERR "$1=>$'\n";
409             push(@{$marc_mapping->{$code}},$1);
410                     push(@{$marc_mapping->{$code}},$'); #'
411         }
412        }
413    }
414    }
415
416
417     foreach my $marc_field ( keys %$metadata_mapping )
418    {
419       
420    my $matched_field = $marc_mapping->{$marc_field};
421    my $subfield = undef;
422
423    if (defined $matched_field){
424        ## test whether this field has subfield
425        if ($marc_field =~ /\d\d\d(\w)/){
426        $subfield = $1;
427        }
428        my $metaname = $metadata_mapping->{$marc_field};
429 
430        my $metavalue;
431        if (defined $subfield){
432        my %mapped_subfield = {@$matched_field};
433        $metavalue = $mapped_subfield{$subfield};
434        }
435        else{ ## get all values except info
436        my $i =0;
437        foreach my $value (@$matched_field){
438            if ($i%2 != 0 and $i != 1){
439            $metavalue .= $value." ";
440            }
441            $i++;
442        }
443        }
444       
445        ## escape [ and ]
446        $metavalue =~ s/\[/\\\[/g;
447        $metavalue =~ s/\]/\\\]/g;
448        ##print STDERR  "$metaname=$metavalue\n";
449        $doc_obj->add_metadata ($section, $metaname, $metavalue) ;         
450    }
451       
452    }
453
454}
455
456
4571;
Note: See TracBrowser for help on using the browser.