root/main/trunk/greenstone2/perllib/plugins/MARCPlugin.pm @ 27106

Revision 27106, 12.7 KB (checked in by kjdon, 7 years ago)

need to do the same utf8 decode step that is used in ReadTextFile? on the text and metadata so that we get proper utf8 strings

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# MARCPlugin.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlugin;
28
29use SplitTextFile;
30use MetadataRead;
31
32use Encode;
33
34use unicode;
35use util;
36use marcmapping;
37
38use strict;
39no strict 'refs'; # allow filehandles to be variables and viceversa
40
41# methods defined in superclasses that have the same signature take
42# precedence in the order given in the ISA list. We want MetaPlugins to
43# call MetadataRead's can_process_this_file_for_metadata(), rather than
44# calling BasePlugin's version of the same method, so list inherited
45# superclasses in this order.
46sub BEGIN {
47    @MARCPlugin::ISA = ('MetadataRead', 'SplitTextFile');
48    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
49}
50
51my $arguments =
52    [ { 'name' => "metadata_mapping",
53    'desc' => "{common.deprecated} {MARCPlugin.metadata_mapping}",
54    'type' => "string",
55    'deft' => "",
56    'hiddengli' => "yes", # deprecated in favour of 'metadata_mapping_file'
57    'reqd' => "no" },
58      { 'name' => "metadata_mapping_file",
59    'desc' => "{MARCXMLPlugin.metadata_mapping_file}",
60    'type' => "string",
61    'deft' => "marc2dc.txt",
62    'reqd' => "no" },
63      { 'name' => "process_exp",
64    'desc' => "{BasePlugin.process_exp}",
65    'type' => "regexp",
66    'reqd' => "no",
67    'deft' => &get_default_process_exp() },
68      { 'name' => "split_exp",
69    'desc' => "{SplitTextFile.split_exp}",
70    'type' => "regexp",
71    'reqd' => "no",
72    'deft' => &get_default_split_exp() }
73      ];
74
75my $options = { 'name'     => "MARCPlugin",
76        'desc'     => "{MARCPlugin.desc}",
77        'abstract' => "no",
78        'inherits' => "yes",
79        'explodes' => "yes",
80        'args'     => $arguments };
81
82require MARC::Record; 
83require MARC::Batch; 
84#use MARC::Record; 
85#use MARC::Batch;
86
87sub new {
88    my ($class) = shift (@_);
89    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
90    push(@$pluginlist, $class);
91
92    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
93    push(@{$hashArgOptLists->{"OptList"}},$options);
94 
95    # this does nothing yet, but if member vars are ever added
96    # to MetadataRead, will need to do this anyway:
97    #new MetadataRead($pluginlist, $inputargs, $hashArgOptLists);
98    my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
99
100    if ($self->{'info_only'}) {
101    # don't worry about the options
102    return bless $self, $class;
103    }
104    # 'metadata_mapping' was used in two ways in the plugin: as a plugin
105    # option (filename) and as a datastructure to represent the mapping.
106    # In MARXXMLPlug (written later) the two are separated: filename is
107    # represented through 'metadata_mapping_file' and the data-structure
108    # mapping left as 'metadata_mapping'
109    # 'metadata_mapping' still present (but hidden in GLI) for
110    # backwards compatibility, but 'metadata_mapping_file' is used by
111    # preference
112
113    if ($self->{'metadata_mapping'} ne "") {
114    print STDERR "MARCPlugin WARNING:: the metadata_mapping option is set but has been deprecated. Please use metadata_mapping_file option instead\n";
115    # If the old version is set, use it.
116    $self->{'metadata_mapping_file'} = $self->{'metadata_mapping'};
117    }
118    $self->{'metadata_mapping'} = undef;
119    $self->{'type'} = "";
120    return bless $self, $class;
121}
122
123sub init {
124    my $self = shift (@_);
125    my ($verbosity, $outhandle, $failhandle) = @_;
126
127    ## the mapping file has already been loaded
128    if (defined $self->{'metadata_mapping'} ){
129    $self->SUPER::init(@_);
130    return;
131    }
132
133    # read in the metadata mapping files
134    my $mm_files = &util::locate_config_files($self->{'metadata_mapping_file'});
135    if (scalar(@$mm_files)==0)
136    {
137    my $msg = "MARCPlugin ERROR: Can't locate mapping file \"" .
138        $self->{'metadata_mapping_file'} . "\".\n " .
139        "    No metadata will be extracted from MARC files.\n";
140
141    print $outhandle $msg;
142    print $failhandle $msg;
143    $self->{'metadata_mapping'} = undef;
144    # We pick up the error in process() if there is no $mm_file
145    # If we exit here, then pluginfo.pl will exit too!
146    }
147    else {
148    $self->{'metadata_mapping'} = &marcmapping::parse_marc_metadata_mapping($mm_files, $outhandle);
149    }
150
151    ##map { print STDERR $_."=>".$self->{'metadata_mapping'}->{$_}."\n"; } keys %{$self->{'metadata_mapping'}};
152
153    $self->SUPER::init(@_);
154}
155
156
157
158sub get_default_process_exp {
159    my $self = shift (@_);
160
161    return q^(?i)(\.marc)$^;
162}
163
164
165sub get_default_split_exp {
166    # \r\n for msdos eol, \n for unix
167    return q^\r?\n\s*\r?\n|\[\w+\]Record type: USmarc^;
168}
169
170
171
172# The bulk of this function is based on read_line in multiread.pm
173# Unable to use read_line original because it expects to get its input
174# from a file.  Here the line to be converted is passed in as a string
175
176sub to_utf8
177{
178    my $self = shift (@_);
179    my ($encoding, $line) = @_;
180
181    if ($encoding eq "utf8") {
182    # nothing needs to be done
183    #return $line;
184    } elsif ($encoding eq "iso_8859_1") {
185    # we'll use ascii2utf8() for this as it's faster than going
186    # through convert2unicode()
187    #return &unicode::ascii2utf8 (\$line);
188    $line = &unicode::ascii2utf8 (\$line);
189    } else {
190
191    # everything else uses unicode::convert2unicode
192    $line = &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
193    }
194    # At this point $line is a binary byte string
195    # => turn it into a Unicode aware string, so full
196    # Unicode aware pattern matching can be used.
197    # For instance: 's/\x{0101}//g' or '[[:upper:]]'
198
199    return decode ("utf8", $line);
200}
201
202
203sub read_file {
204    my $self = shift (@_);
205    my ($filename, $encoding, $language, $textref) = @_;
206
207    my $outhandle = $self->{'outhandle'};
208   
209    if (! defined($self->{'metadata_mapping'}))
210    {
211    # print a warning
212    print $outhandle "MARCPlugin: no metadata mapping file! Can't extract metadata from $filename\n";
213    }
214
215    $self->{'readfile_encoding'}->{$filename} = $encoding;
216
217       
218    if (!-r $filename)
219    {
220    print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
221    return;
222    }
223
224    ##handle ascii marc
225    #test whether this is ascii marc file
226    if (open (FILE, $filename)) {
227    while (defined (my $line = <FILE>)) {
228        $$textref .= $line;
229       if ($line =~ /\[\w+\]Record type:/){
230           undef $/;
231           $$textref .= <FILE>;
232           $/ = "\n";
233           $self->{'type'} = "ascii";
234           close FILE;
235           return;
236       }
237    }
238    close FILE;
239    }
240
241     
242    $$textref = "";
243    my @marc_entries = ();
244 
245    my $batch = new MARC::Batch( 'USMARC', $filename );
246    while ( my $marc = $batch->next )
247    {
248        push(@marc_entries,$marc);
249    $$textref .= $marc->as_formatted();
250    $$textref .= "\n\n"; # for SplitTextFile - see default_split_exp above...
251    }
252
253    $self->{'marc_entries'}->{$filename} = \@marc_entries;
254}
255
256
257
258# do plugin specific processing of doc_obj
259# This gets done for each record found by SplitTextFile in marc files.
260sub process {
261    my $self = shift (@_);
262    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
263
264    my $outhandle = $self->{'outhandle'};
265    my $filename = &util::filename_cat($base_dir, $file);
266
267    my $cursection = $doc_obj->get_top_section();
268
269    # Add fileFormat as the metadata
270    $doc_obj->add_metadata($cursection, "FileFormat", "MARC");
271   
272    my $marc_entries = $self->{'marc_entries'}->{$filename};
273    my $marc = shift(@$marc_entries);
274
275    my $encoding = $self->{'readfile_encoding'}->{$filename};
276    if (defined ($self->{'metadata_mapping'}) ) {
277    if ($self->{'type'} ne "ascii" ){   
278        $self->extract_metadata ($marc, $metadata, $encoding, $doc_obj, $cursection);
279    }
280    else{
281        $self->extract_ascii_metadata ($$textref,$metadata,$doc_obj, $cursection);
282    }
283    }
284
285    # add spaces after the sub-field markers, for word boundaries
286    $$textref =~ s/^(.{6} _\w)/$1 /gm;
287
288    # add text to document object
289    $$textref =~ s/</&lt;/g;
290    $$textref =~ s/>/&gt;/g;
291
292    $$textref = $self->to_utf8($encoding,$$textref);
293
294    print $outhandle "  Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
295    if $self->{'verbosity'} > 2;
296
297    # line wrapping
298    $$textref = &wrap_text_in_columns($$textref, 64);
299    $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
300
301    $doc_obj->add_utf8_text($cursection, $$textref);
302
303    return 1;
304}
305
306sub wrap_text_in_columns
307{
308    my ($text, $columnwidth) = @_;
309    my $newtext = "";
310    my $linelength = 0;
311   
312    # Break the text into words, and display one at a time
313    my @words = split(/ /, $text);
314
315    foreach my $word (@words) {
316    # If printing this word would exceed the column end, start a new line
317    if (($linelength + length($word)) >= $columnwidth) {
318        $newtext .= "\n";
319        $linelength = 0;
320    }
321   
322    # Write the word
323    $newtext .= " $word";
324    if ($word =~ /\n/) {
325        $linelength = 0;
326    } else {
327        $linelength = $linelength + length(" $word");
328    }
329    }
330
331    $newtext .= "\n";
332    return $newtext;
333}
334
335
336sub extract_metadata
337{
338    my $self = shift (@_);
339 
340    my ($marc, $metadata, $encoding, $doc_obj, $section) = @_;
341    my $outhandle = $self->{'outhandle'};
342
343    if (!defined $marc){
344    return;
345    }
346
347    my $metadata_mapping = $self->{'metadata_mapping'};;
348
349    foreach my $marc_field ( keys %$metadata_mapping )
350    {
351    my $gsdl_field = $metadata_mapping->{$marc_field};
352   
353    # have we got a subfield?
354    my $subfield = undef;
355    if ($marc_field =~ /(\d\d\d)(?:\$|\^)?(\w)/){
356        $marc_field = $1;
357        $subfield = $2;
358    }
359    if (defined $subfield) {
360        my $meta_value = $marc->subfield($marc_field, $subfield);
361        if (defined $meta_value) {
362        # Square brackets in metadata values need to be escaped so they don't confuse Greenstone/GLI
363        $meta_value =~ s/\[/&\#091;/g;
364        $meta_value =~ s/\]/&\#093;/g;
365        my $metavalue_str = $self->to_utf8($encoding, $meta_value);
366        $doc_obj->add_utf8_metadata ($section, $gsdl_field, $metavalue_str);
367        }
368    }
369    else
370    {
371        foreach my $meta_value_obj ($marc->field($marc_field))
372        {
373        my $meta_value = $meta_value_obj->as_string();
374
375        # Square brackets in metadata values need to be escaped so they don't confuse Greenstone/GLI
376        $meta_value =~ s/\[/&\#091;/g;
377        $meta_value =~ s/\]/&\#093;/g;
378        my $metavalue_str = $self->to_utf8($encoding, $meta_value);
379        $doc_obj->add_utf8_metadata ($section, $gsdl_field, $metavalue_str);
380        }
381    }
382    }
383}
384
385
386sub extract_ascii_metadata
387{
388    my $self = shift (@_);
389
390    my ($text, $metadata,$doc_obj, $section) = @_;
391    my $outhandle = $self->{'outhandle'};
392    my $metadata_mapping = $self->{'metadata_mapping'};
393    ## get fields
394    my @fields = split(/[\n\r]+/,$text);
395    my $marc_mapping ={};
396
397    foreach my $field (@fields){
398    if ($field ne ""){
399        $field =~ /^(\d\d\d)\s/;
400        my $code = $1;
401        $field = $'; #'
402        ##get subfields
403        my @subfields = split(/\$/,$field);
404        my $i=0;
405        $marc_mapping->{$code} = []; 
406        foreach my $subfield (@subfields){
407        if ($i == 0){
408            ##print STDERR $subfield."\n";
409            push(@{$marc_mapping->{$code}},"info");
410            push(@{$marc_mapping->{$code}},$subfield);
411                 
412            $i++;
413        }
414         else{
415             $subfield =~ /(\w)\s/;
416             ##print STDERR "$1=>$'\n";
417             push(@{$marc_mapping->{$code}},$1);
418                     push(@{$marc_mapping->{$code}},$'); #'
419         }
420        }
421    }
422    }
423
424
425     foreach my $marc_field ( keys %$metadata_mapping )
426    {
427       
428    my $matched_field = $marc_mapping->{$marc_field};
429    my $subfield = undef;
430
431    if (defined $matched_field){
432        ## test whether this field has subfield
433        if ($marc_field =~ /\d\d\d(\w)/){
434        $subfield = $1;
435        }
436        my $metaname = $metadata_mapping->{$marc_field};
437 
438        my $metavalue;
439        if (defined $subfield){
440        my %mapped_subfield = {@$matched_field};
441        $metavalue = $mapped_subfield{$subfield};
442        }
443        else{ ## get all values except info
444        my $i =0;
445        foreach my $value (@$matched_field){
446            if ($i%2 != 0 and $i != 1){
447            $metavalue .= $value." ";
448            }
449            $i++;
450        }
451        }
452       
453        ## escape [ and ]
454        $metavalue =~ s/\[/\\\[/g;
455        $metavalue =~ s/\]/\\\]/g;
456        ##print STDERR  "$metaname=$metavalue\n";
457        $doc_obj->add_metadata ($section, $metaname, $metavalue) ;         
458    }
459       
460    }
461
462}
463
464
4651;
Note: See TracBrowser for help on using the browser.