root/gsdl/trunk/perllib/plugins/MARCPlug.pm @ 14964

Revision 14964, 10.5 KB (checked in by davidb, 12 years ago)

Minor tweak to MARCPlug so the plugin does not try to convert utf8 data into ... utf8svn diff MARCPlug.pm

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# MARCPlug.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlug;
28
29use SplitPlug;
30
31use unicode;
32use util;
33
34use strict;
35no strict 'refs'; # allow filehandles to be variables and viceversa
36
37sub BEGIN {
38    @MARCPlug::ISA = ('SplitPlug');
39    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
40}
41
42my $arguments =
43    [ { 'name' => "metadata_mapping",
44    'desc' => "{MARCPlug.metadata_mapping}",
45    'type' => "string",
46    'deft' => "marctodc.txt",
47    'reqd' => "no" },
48      { 'name' => "process_exp",
49    'desc' => "{BasPlug.process_exp}",
50    'type' => "regexp",
51    'reqd' => "no",
52    'deft' => &get_default_process_exp() },
53      { 'name' => "split_exp",
54    'desc' => "{SplitPlug.split_exp}",
55    'type' => "regexp",
56    'reqd' => "no",
57    'deft' => &get_default_split_exp() }
58      ];
59
60my $options = { 'name'     => "MARCPlug",
61        'desc'     => "{MARCPlug.desc}",
62        'abstract' => "no",
63        'inherits' => "yes",
64        'explodes' => "yes",
65        'args'     => $arguments };
66
67require MARC::Record; 
68require MARC::Batch; 
69#use MARC::Record; 
70#use MARC::Batch;
71
72sub new {
73    my ($class) = shift (@_);
74    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
75    push(@$pluginlist, $class);
76
77    if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
78    if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
79
80    my $self = new SplitPlug($pluginlist, $inputargs, $hashArgOptLists);
81
82    $self->{'type'} = "";
83   
84    return bless $self, $class;
85}
86
87sub init {
88    my $self = shift (@_);
89    my ($verbosity, $outhandle, $failhandle) = @_;
90
91    my @metadata_mapping = ();
92
93    # read in the metadata mapping file
94    my $mm_file =
95    &util::filename_cat( $ENV{'GSDLHOME'}, "etc", $self->{'metadata_mapping'} );
96
97    if (!-e $mm_file)
98    {
99
100    my $msg = "MARCPlug ERROR: Can't locate mapping file \"" .
101        $self->{'metadata_mapping'} . "\".\n    This file should be at $mm_file\n" .
102        "    No marc files can be processed.\n";
103
104    print $outhandle $msg;
105    print $failhandle $msg;
106    $self->{'metadata_mapping'} = undef;
107    # We pick up the error in process() if there is no $mm_file
108    # If we exit here, then pluginfo.pl will exit too!
109    }
110    elsif (open(MMIN, "<$mm_file"))
111    {
112    my $l=1;
113    my $line;
114    while (defined($line=<MMIN>))
115    {
116        chomp $line;
117        if ($line =~ m/^(\d+)\s*->\s*([\w\^]+)$/)
118        {
119        my $marc_info = $1;
120        my $gsdl_info = $2;
121        my $mapping = { 'marc' => $marc_info, 'gsdl' => $gsdl_info };
122        push(@metadata_mapping,$mapping);
123        }
124        elsif ($line !~ m/^\#/       # allow comments (# in first column)
125           && $line !~ m/^\s*$/) # allow blank lines
126        {
127        print $outhandle "Parse error on line $l of $mm_file:\n";
128        print $outhandle "  \"$line\"\n";
129        }
130        $l++
131    }
132    close(MMIN);
133    }
134    else
135    {
136    print STDERR "Unable to open $mm_file: $!\n";
137    }
138
139    $self->{'metadata_mapping'} = \@metadata_mapping;
140
141    $self->SUPER::init(@_);
142}
143   
144
145sub get_default_process_exp {
146    my $self = shift (@_);
147
148    return q^(?i)(\.marc)$^;
149}
150
151
152sub get_default_split_exp {
153    # \r\n for msdos eol, \n for unix
154    return q^\r?\n\s*\r?\n|\[\w+\]Record type: USmarc^;
155}
156
157
158
159# The bulk of this function is based on read_line in multiread.pm
160# Unable to use read_line original because it expects to get its input
161# from a file.  Here the line to be converted is passed in as a string
162
163sub to_utf8
164{
165    my $self = shift (@_);
166    my ($encoding, $line) = @_;
167
168    if ($encoding eq "utf8") {
169    # nothing needs to be done
170    return $line;
171    }
172
173    if ($encoding eq "iso_8859_1") {
174    # we'll use ascii2utf8() for this as it's faster than going
175    # through convert2unicode()
176    return &unicode::ascii2utf8 (\$line);
177    }
178
179    # everything else uses unicode::convert2unicode
180    return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
181}
182
183
184sub read_file {
185    my $self = shift (@_);
186    my ($filename, $encoding, $language, $textref) = @_;
187
188    $self->{'readfile_encoding'}->{$filename} = $encoding;
189
190       
191    if (!-r $filename)
192    {
193    my $outhandle = $self->{'outhandle'};
194    print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
195    return;
196    }
197
198    ##handle ascii marc
199    #test whether this is ascii marc file
200    if (open (FILE, $filename)) {
201    while (defined (my $line = <FILE>)) {
202        $$textref .= $line;
203       if ($line =~ /\[\w+\]Record type:/){
204           undef $/;
205           $$textref .= <FILE>;
206           $/ = "\n";
207           $self->{'type'} = "ascii";
208           close FILE;
209           return;
210       }
211    }
212    close FILE;
213    }
214
215     
216    $$textref = "";
217    my @marc_entries = ();
218 
219    my $batch = new MARC::Batch( 'USMARC', $filename );
220    while ( my $marc = $batch->next )
221    {
222        push(@marc_entries,$marc);
223    $$textref .= $marc->as_formatted();
224    $$textref .= "\n\n"; # for SplitPlug - see default_split_exp above...
225    }
226
227    $self->{'marc_entries'}->{$filename} = \@marc_entries;
228}
229
230
231
232# do plugin specific processing of doc_obj
233# This gets done for each record found by SplitPlug in marc files.
234sub process {
235    my $self = shift (@_);
236    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
237
238    my $outhandle = $self->{'outhandle'};
239    my $filename = &util::filename_cat($base_dir, $file);
240
241    if (! defined($self->{'metadata_mapping'}))
242    {
243    print $outhandle "MARCPlug: no metadata file! Can't process $file\n";
244    return undef;
245    }
246
247    print STDERR "<Processing n='$file' p='MARCPlug'>\n" if ($gli);
248    print $outhandle "MARCPlug: processing $file\n"
249    if $self->{'verbosity'} > 1;
250
251    my $cursection = $doc_obj->get_top_section();
252
253    # Add fileFormat as the metadata
254    $doc_obj->add_metadata($cursection, "FileFormat", "MARC");
255   
256    my $marc_entries = $self->{'marc_entries'}->{$filename};
257    my $marc = shift(@$marc_entries);
258
259    my $encoding = $self->{'readfile_encoding'}->{$filename};
260
261    if ($self->{'type'} ne "ascii" ){   
262        $self->extract_metadata ($marc, $metadata, $encoding, $doc_obj, $cursection);
263    }
264    else{
265    $self->extract_ascii_metadata ($$textref,$metadata,$doc_obj, $cursection);
266    }
267
268    # add spaces after the sub-field markers, for word boundaries
269    $$textref =~ s/^(.{6} _\w)/$1 /gm;
270
271    # add text to document object
272    $$textref =~ s/</&lt;/g;
273    $$textref =~ s/>/&gt;/g;
274
275    $$textref = $self->to_utf8($encoding,$$textref);
276
277    print $outhandle "  Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
278    if $self->{'verbosity'} > 2;
279
280    # line wrapping
281    $$textref = &wrap_text_in_columns($$textref, 64);
282    $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
283
284    $doc_obj->add_utf8_text($cursection, $$textref);
285
286    return 1;
287}
288
289sub wrap_text_in_columns
290{
291    my ($text, $columnwidth) = @_;
292    my $newtext = "";
293    my $linelength = 0;
294   
295    # Break the text into words, and display one at a time
296    my @words = split(/ /, $text);
297
298    foreach my $word (@words) {
299    # If printing this word would exceed the column end, start a new line
300    if (($linelength + length($word)) >= $columnwidth) {
301        $newtext .= "\n";
302        $linelength = 0;
303    }
304   
305    # Write the word
306    $newtext .= " $word";
307    if ($word =~ /\n/) {
308        $linelength = 0;
309    } else {
310        $linelength = $linelength + length(" $word");
311    }
312    }
313
314    $newtext .= "\n";
315    return $newtext;
316}
317
318
319sub extract_metadata
320{
321    my $self = shift (@_);
322    my ($marc, $metadata, $encoding, $doc_obj, $section) = @_;
323    my $outhandle = $self->{'outhandle'};
324
325    if (!defined $marc){
326    return;
327    }
328
329    my $metadata_mapping = $self->{'metadata_mapping'};
330    my $mm;
331
332    foreach $mm ( @$metadata_mapping )
333    {
334    my $marc_field = $mm->{'marc'};
335
336    my @metavalues = $marc->field($marc_field);
337
338    if (scalar(@metavalues)>0)
339    {
340        my $metaname = $mm->{'gsdl'};
341        my $metavalue;
342        foreach $metavalue ( @metavalues )
343        {
344        my $metavalue_str = $self->to_utf8($encoding,$metavalue->as_string());
345        $doc_obj->add_utf8_metadata ($section, $metaname, $metavalue_str);
346        }
347    }
348    }
349}
350
351sub extract_ascii_metadata
352{
353    my $self = shift (@_);
354    my ($text, $metadata,$doc_obj, $section) = @_;
355    my $outhandle = $self->{'outhandle'};
356    my $metadata_mapping = $self->{'metadata_mapping'};
357    ## get fields
358    my @fields = split(/[\n\r]+/,$text);
359    my $marc_mapping ={};
360
361    foreach my $field (@fields){
362    if ($field ne ""){
363        $field =~ /^(\d\d\d)\s/;
364        my $code = $1;
365        $field = $';
366        ##get subfields
367        my @subfields = split(/\$/,$field);
368        my $i=0;
369        $marc_mapping->{$code} = []; 
370        foreach my $subfield (@subfields){
371        if ($i == 0){
372            ##print STDERR $subfield."\n";
373            push(@{$marc_mapping->{$code}},"info");
374            push(@{$marc_mapping->{$code}},$subfield);
375                 
376            $i++;
377        }
378         else{
379             $subfield =~ /(\w)\s/;
380             ##print STDERR "$1=>$'\n";
381             push(@{$marc_mapping->{$code}},$1);
382                     push(@{$marc_mapping->{$code}},$');
383         }
384        }
385    }
386    }
387
388
389     foreach my $mm ( @$metadata_mapping )
390    {
391    my $marc_field = $mm->{'marc'};
392       
393    my $matched_field = $marc_mapping->{$marc_field};
394    my $subfield = undef;
395
396    if (defined $matched_field){
397        ## test whether this field has subfield
398        if ($marc_field =~ /\d\d\d(\w)/){
399        $subfield = $1;
400        }
401        my $metaname = $mm->{'gsdl'};
402 
403        my $metavalue;
404        if (defined $subfield){
405        my %mapped_subfield = {@$matched_field};
406        $metavalue = $mapped_subfield{$subfield};
407        }
408        else{ ## get all values except info
409        my $i =0;
410        foreach my $value (@$matched_field){
411            if ($i%2 != 0 and $i != 1){
412            $metavalue .= $value." ";
413            }
414            $i++;
415        }
416        }
417       
418        ## escape [ and ]
419        $metavalue =~ s/\[/\\\[/g;
420        $metavalue =~ s/\]/\\\]/g;
421        ##print STDERR  "$metaname=$metavalue\n";
422        $doc_obj->add_metadata ($section, $metaname, $metavalue) ;         
423    }
424       
425    }
426
427}
428
429
4301;
Note: See TracBrowser for help on using the browser.