root/trunk/gsdl/perllib/plugins/MARCPlug.pm @ 13198

Revision 13198, 10.5 KB (checked in by shaoqun, 14 years ago)

now this plugin can handle ascii marc files

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# MARCPlug.pm -- basic MARC plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MARCPlug;
28
29use SplitPlug;
30
31use unicode;
32use util;
33
34use strict;
35no strict 'refs'; # allow filehandles to be variables and viceversa
36
37sub BEGIN {
38    @MARCPlug::ISA = ('SplitPlug');
39    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
40}
41
42my $arguments =
43    [ { 'name' => "metadata_mapping",
44    'desc' => "{MARCPlug.metadata_mapping}",
45    'type' => "string",
46    'deft' => "marctodc.txt",
47    'reqd' => "no" },
48      { 'name' => "process_exp",
49    'desc' => "{BasPlug.process_exp}",
50    'type' => "regexp",
51    'reqd' => "no",
52    'deft' => &get_default_process_exp() },
53      { 'name' => "split_exp",
54    'desc' => "{SplitPlug.split_exp}",
55    'type' => "regexp",
56    'reqd' => "no",
57    'deft' => &get_default_split_exp() }
58      ];
59
60my $options = { 'name'     => "MARCPlug",
61        'desc'     => "{MARCPlug.desc}",
62        'abstract' => "no",
63        'inherits' => "yes",
64        'explodes' => "yes",
65        'args'     => $arguments };
66
67require MARC::Record; 
68require MARC::Batch; 
69#use MARC::Record; 
70#use MARC::Batch;
71
72sub new {
73    my ($class) = shift (@_);
74    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
75    push(@$pluginlist, $class);
76
77    if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
78    if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
79
80    my $self = new SplitPlug($pluginlist, $inputargs, $hashArgOptLists);
81
82    $self->{'type'} = "";
83   
84    return bless $self, $class;
85}
86
87sub init {
88    my $self = shift (@_);
89    my ($verbosity, $outhandle, $failhandle) = @_;
90
91    my @metadata_mapping = ();
92
93    # read in the metadata mapping file
94    my $mm_file =
95    &util::filename_cat( $ENV{'GSDLHOME'}, "etc", $self->{'metadata_mapping'} );
96
97    if (!-e $mm_file)
98    {
99
100    my $msg = "MARCPlug ERROR: Can't locate mapping file \"" .
101        $self->{'metadata_mapping'} . "\".\n    This file should be at $mm_file\n" .
102        "    No marc files can be processed.\n";
103
104    print $outhandle $msg;
105    print $failhandle $msg;
106    $self->{'metadata_mapping'} = undef;
107    # We pick up the error in process() if there is no $mm_file
108    # If we exit here, then pluginfo.pl will exit too!
109    }
110    elsif (open(MMIN, "<$mm_file"))
111    {
112    my $l=1;
113    my $line;
114    while (defined($line=<MMIN>))
115    {
116        chomp $line;
117        if ($line =~ m/^(\d+)\s*->\s*([\w\^]+)$/)
118        {
119        my $marc_info = $1;
120        my $gsdl_info = $2;
121        my $mapping = { 'marc' => $marc_info, 'gsdl' => $gsdl_info };
122        push(@metadata_mapping,$mapping);
123        }
124        elsif ($line !~ m/^\#/       # allow comments (# in first column)
125           && $line !~ m/^\s*$/) # allow blank lines
126        {
127        print $outhandle "Parse error on line $l of $mm_file:\n";
128        print $outhandle "  \"$line\"\n";
129        }
130        $l++
131    }
132    close(MMIN);
133    }
134    else
135    {
136    print STDERR "Unable to open $mm_file: $!\n";
137    }
138
139    $self->{'metadata_mapping'} = \@metadata_mapping;
140
141    $self->SUPER::init(@_);
142}
143   
144
145sub get_default_process_exp {
146    my $self = shift (@_);
147
148    return q^(?i)(\.marc)$^;
149}
150
151
152sub get_default_split_exp {
153    # \r\n for msdos eol, \n for unix
154    return q^\r?\n\s*\r?\n|\[\w+\]Record type: USmarc^;
155}
156
157
158
159# The bulk of this function is based on read_line in multiread.pm
160# Unable to use read_line original because it expects to get its input
161# from a file.  Here the line to be converted is passed in as a string
162
163sub to_utf8
164{
165    my $self = shift (@_);
166    my ($encoding, $line) = @_;
167
168    if ($encoding eq "iso_8859_1") {
169    # we'll use ascii2utf8() for this as it's faster than going
170    # through convert2unicode()
171    return &unicode::ascii2utf8 (\$line);
172    }
173
174    # everything else uses unicode::convert2unicode
175    return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
176}
177
178
179sub read_file {
180    my $self = shift (@_);
181    my ($filename, $encoding, $language, $textref) = @_;
182
183    $self->{'readfile_encoding'}->{$filename} = $encoding;
184
185       
186    if (!-r $filename)
187    {
188    my $outhandle = $self->{'outhandle'};
189    print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
190    return;
191    }
192
193    ##handle ascii marc
194    #test whether this is ascii marc file
195    if (open (FILE, $filename)) {
196    while (defined (my $line = <FILE>)) {
197        $$textref .= $line;
198       if ($line =~ /\[\w+\]Record type:/){
199           undef $/;
200           $$textref .= <FILE>;
201           $/ = "\n";
202           $self->{'type'} = "ascii";
203           close FILE;
204           return;
205       }
206    }
207    close FILE;
208    }
209
210     
211    $$textref = "";
212    my @marc_entries = ();
213 
214    my $batch = new MARC::Batch( 'USMARC', $filename );
215    while ( my $marc = $batch->next )
216    {
217        push(@marc_entries,$marc);
218    $$textref .= $marc->as_formatted();
219    $$textref .= "\n\n"; # for SplitPlug - see default_split_exp above...
220    }
221
222    $self->{'marc_entries'}->{$filename} = \@marc_entries;
223}
224
225
226
227# do plugin specific processing of doc_obj
228# This gets done for each record found by SplitPlug in marc files.
229sub process {
230    my $self = shift (@_);
231    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
232
233    my $outhandle = $self->{'outhandle'};
234    my $filename = &util::filename_cat($base_dir, $file);
235
236    if (! defined($self->{'metadata_mapping'}))
237    {
238    print $outhandle "MARCPlug: no metadata file! Can't process $file\n";
239    return undef;
240    }
241
242    print STDERR "<Processing n='$file' p='MARCPlug'>\n" if ($gli);
243    print $outhandle "MARCPlug: processing $file\n"
244    if $self->{'verbosity'} > 1;
245
246    my $cursection = $doc_obj->get_top_section();
247
248    # Add fileFormat as the metadata
249    $doc_obj->add_metadata($cursection, "FileFormat", "MARC");
250   
251    my $marc_entries = $self->{'marc_entries'}->{$filename};
252    my $marc = shift(@$marc_entries);
253
254    my $encoding = $self->{'readfile_encoding'}->{$filename};
255
256    if ($self->{'type'} ne "ascii" ){   
257        $self->extract_metadata ($marc, $metadata, $encoding, $doc_obj, $cursection);
258    }
259    else{
260    $self->extract_ascii_metadata ($$textref,$metadata,$doc_obj, $cursection);
261    }
262
263    # add spaces after the sub-field markers, for word boundaries
264    $$textref =~ s/^(.{6} _\w)/$1 /gm;
265
266    # add text to document object
267    $$textref =~ s/</&lt;/g;
268    $$textref =~ s/>/&gt;/g;
269
270    $$textref = $self->to_utf8($encoding,$$textref);
271
272    print $outhandle "  Adding Marc Record:\n",substr($$textref,0,40), " ...\n"
273    if $self->{'verbosity'} > 2;
274
275    # line wrapping
276    $$textref = &wrap_text_in_columns($$textref, 64);
277    $$textref = "<pre>\n" . $$textref . "</pre>\n"; # HTML formatting...
278
279    $doc_obj->add_utf8_text($cursection, $$textref);
280
281    return 1;
282}
283
284sub wrap_text_in_columns
285{
286    my ($text, $columnwidth) = @_;
287    my $newtext = "";
288    my $linelength = 0;
289   
290    # Break the text into words, and display one at a time
291    my @words = split(/ /, $text);
292
293    foreach my $word (@words) {
294    # If printing this word would exceed the column end, start a new line
295    if (($linelength + length($word)) >= $columnwidth) {
296        $newtext .= "\n";
297        $linelength = 0;
298    }
299   
300    # Write the word
301    $newtext .= " $word";
302    if ($word =~ /\n/) {
303        $linelength = 0;
304    } else {
305        $linelength = $linelength + length(" $word");
306    }
307    }
308
309    $newtext .= "\n";
310    return $newtext;
311}
312
313
314sub extract_metadata
315{
316    my $self = shift (@_);
317    my ($marc, $metadata, $encoding, $doc_obj, $section) = @_;
318    my $outhandle = $self->{'outhandle'};
319
320    if (!defined $marc){
321    return;
322    }
323
324    my $metadata_mapping = $self->{'metadata_mapping'};
325    my $mm;
326
327    foreach $mm ( @$metadata_mapping )
328    {
329    my $marc_field = $mm->{'marc'};
330
331    my @metavalues = $marc->field($marc_field);
332
333    if (scalar(@metavalues)>0)
334    {
335        my $metaname = $mm->{'gsdl'};
336        my $metavalue;
337        foreach $metavalue ( @metavalues )
338        {
339        my $metavalue_str = $self->to_utf8($encoding,$metavalue->as_string());
340        $doc_obj->add_utf8_metadata ($section, $metaname, $metavalue_str);
341        }
342    }
343    }
344}
345
346sub extract_ascii_metadata
347{
348    my $self = shift (@_);
349    my ($text, $metadata,$doc_obj, $section) = @_;
350    my $outhandle = $self->{'outhandle'};
351    my $metadata_mapping = $self->{'metadata_mapping'};
352    ## get fields
353    my @fields = split(/[\n\r]+/,$text);
354    my $marc_mapping ={};
355
356    foreach my $field (@fields){
357    if ($field ne ""){
358        $field =~ /^(\d\d\d)\s/;
359        my $code = $1;
360        $field = $';
361        ##get subfields
362        my @subfields = split(/\$/,$field);
363        my $i=0;
364        $marc_mapping->{$code} = []; 
365        foreach my $subfield (@subfields){
366        if ($i == 0){
367            ##print STDERR $subfield."\n";
368            push(@{$marc_mapping->{$code}},"info");
369            push(@{$marc_mapping->{$code}},$subfield);
370                 
371            $i++;
372        }
373         else{
374             $subfield =~ /(\w)\s/;
375             ##print STDERR "$1=>$'\n";
376             push(@{$marc_mapping->{$code}},$1);
377                     push(@{$marc_mapping->{$code}},$');
378         }
379        }
380    }
381    }
382
383
384     foreach my $mm ( @$metadata_mapping )
385    {
386    my $marc_field = $mm->{'marc'};
387       
388    my $matched_field = $marc_mapping->{$marc_field};
389    my $subfield = undef;
390
391    if (defined $matched_field){
392        ## test whether this field has subfield
393        if ($marc_field =~ /\d\d\d(\w)/){
394        $subfield = $1;
395        }
396        my $metaname = $mm->{'gsdl'};
397 
398        my $metavalue;
399        if (defined $subfield){
400        my %mapped_subfield = {@$matched_field};
401        $metavalue = $mapped_subfield{$subfield};
402        }
403        else{ ## get all values except info
404        my $i =0;
405        foreach my $value (@$matched_field){
406            if ($i%2 != 0 and $i != 1){
407            $metavalue .= $value." ";
408            }
409            $i++;
410        }
411        }
412       
413        ## escape [ and ]
414        $metavalue =~ s/\[/\\\[/g;
415        $metavalue =~ s/\]/\\\]/g;
416        ##print STDERR  "$metaname=$metavalue\n";
417        $doc_obj->add_metadata ($section, $metaname, $metavalue) ;         
418    }
419       
420    }
421
422}
423
424
4251;
Note: See TracBrowser for help on using the browser.