source: main/trunk/greenstone2/perllib/cpan/MARC/Lint.pm@ 32205

Last change on this file since 32205 was 3430, checked in by jrm21, 22 years ago

Added MARCPlug, mostly done by David Bainbridge. It needs a configuration file,
which maps MARC fields into metadata, which is in gsdlhome/etc/marctodc.txt

  • Property svn:keywords set to Author Date Id Revision
File size: 9.9 KB
Line 
1package MARC::Lint;
2
3use strict;
4use integer;
5eval 'use warnings' if $] >= 5.006;
6
7=head1 NAME
8
9MARC::Lint - Perl extension for checking validity of MARC records
10
11=head1 VERSION
12
13Version 1.10
14
15 $Id: Lint.pm 3430 2002-09-24 05:17:39Z jrm21 $
16
17=cut
18
19use vars '$VERSION'; $VERSION = '1.10';
20
21use MARC::Record;
22use MARC::Field;
23
24=head1 SYNOPSIS
25
26 use MARC::Record;
27 use MARC::Lint;
28
29 my $linter = new MARC::Lint;
30 my $filename = shift;
31
32 open( IN, "<", $filename ) or die "Couldn't open $filename: $!\n";
33 binmode( IN ); # for the Windows folks
34 while ( !eof(IN) ) {
35 my $marc = MARC::Record::next_from_file( *IN );
36 die $MARC::Record::ERROR unless $marc;
37
38 $linter->check_record( $marc );
39
40
41 # Print the title tag
42 print $marc->subfield(245,"a"), "\n";
43
44 # Print the errors that were found
45 print join( "\n", $linter->warnings ), "\n";
46 } # while
47
48 close IN or die "Error closing $filename: $!\n";
49
50Given the following MARC record:
51
52 LDR 00000nam 22002538a 4500
53 100 14 _aWall, Larry.
54 110 1 _aO'Reilly & Associates.
55 245 90 _aProgramming Perl /
56 _aBig Book of Perl /
57 _cLarry Wall, Tom Christiansen & Jon Orwant.
58 250 _a3rd ed.
59 250 _a3rd ed.
60 260 _aCambridge, Mass. :
61 _bO'Reilly,
62 _r2000.
63 590 4 _aPersonally signed by Larry.
64 856 43 _uhttp://www.perl.com/
65
66the following errors are generated:
67
68 1XX: Only one 1XX tag is allowed, but I found 2 of them.
69 100: Indicator 2 must be blank
70 245: Indicator 1 must be 0 or 1
71 245: Subfield _a is not repeatable.
72 250: Field is not repeatable.
73 260: Subfield _r is not valid.
74 260: Must have a subfield _c.
75 590: Indicator 1 must be blank
76 856: Indicator 2 must be blank, 0, 1, 2 or 8
77
78=head1 DESCRIPTION
79
80Module for checking validity of MARC records. 99% of the users will want to do
81something like is shown in the synopsis. The other intrepid 1% will overload the
82C<MARC::Lint> module's methods and provide their own special field-level checking.
83
84What this means is that if you have certain requirements, such as making sure that
85all 952 tags have a certain call number in them, you can write a function that
86checks for that, and still get all the benefits of the MARC::Lint framework.
87
88=head1 EXPORT
89
90None. Everything is done through objects.
91
92=head1 METHODS
93
94=head2 C<new()>
95
96No parms needed. The C<MARC::Lint> object is little more than a list of warnings
97and a bunch of rules.
98
99=cut
100
101sub new() {
102 my $class = shift;
103 $class = ref($class) || $class;
104
105 my $self = {
106 _warnings => [],
107 };
108 bless $self, $class;
109
110 $self->_read_rules();
111
112 return $self;
113}
114
115=head2 C<warnings()>
116
117Returns a list of warnings found by C<check_record()> and its brethren.
118
119=cut
120
121sub warnings {
122 my $self = shift;
123
124 return wantarray ? @{$self->{_warnings}} : scalar @{$self->{_warnings}};
125}
126
127=head2 C<clear_warnings()>
128
129Clear the list of warnings for this linter object. It's automatically called
130when you call C<check_record()>.
131
132=cut
133
134sub clear_warnings {
135 my $self = shift;
136
137 $self->{_warnings} = [];
138}
139
140=head2 C<warn(str[,str...])>
141
142Create a warning message, built from strings passed, like a C<print> statement.
143
144Typically, you'll leave this to C<check_record()>, but industrious programmers
145may want to do their own checking as well.
146
147=cut
148
149sub warn {
150 my $self = shift;
151
152 push( @{$self->{_warnings}}, join( "", @_ ) );
153
154 return undef;
155}
156
157=head2 C<check_record(marc)>
158
159Does all sorts of lint-like checks on the MARC record I<marc>, both on the record as a whole,
160and on the individual fields & subfields.
161
162=cut
163
164sub check_record {
165 my $self = shift;
166 my $marc = shift;
167
168 $self->clear_warnings();
169
170 (ref($marc) eq "MARC::Record")
171 or return $self->warn( "Must pass a MARC::Record object to check_record" );
172
173 if ( (my @_1xx = $marc->field( "1XX" )) > 1 ) {
174 $self->warn( "1XX: Only one 1XX tag is allowed, but I found ", scalar @_1xx, " of them." );
175 }
176
177 if ( not $marc->field( 245 ) ) {
178 $self->warn( "245: No 245 tag." );
179 }
180
181
182 my %field_seen;
183 my $rules = $self->{_rules};
184 for my $field ( $marc->fields ) {
185 my $tagno = $field->tag;
186 my $tagrules = $rules->{$tagno} or next;
187
188 if ( $tagrules->{NR} && $field_seen{$tagno} ) {
189 $self->warn( "$tagno: Field is not repeatable." );
190 }
191
192 if ( $tagno >= 10 ) {
193 for my $ind ( 1..2 ) {
194 my $indvalue = $field->indicator($ind);
195 if ( not ($indvalue =~ $tagrules->{"ind$ind" . "_regex"}) ) {
196 $self->warn(
197 "$tagno: Indicator $ind must be ",
198 $tagrules->{"ind$ind" . "_desc"},
199 " but it's \"$indvalue\""
200 );
201 }
202 }
203
204 my %sub_seen;
205 for my $subfield ( $field->subfields ) {
206 my ($code,$data) = @$subfield;
207
208 my $rule = $tagrules->{$code};
209 if ( not defined $rule ) {
210 $self->warn( "$tagno: Subfield _$code is not allowed." );
211 } elsif ( ($rule eq "NR") && $sub_seen{$code} ) {
212 $self->warn( "$tagno: Subfield _$code is not repeatable." );
213 }
214
215 if ( $data =~ /[\t\r\n]/ ) {
216 $self->warn( "$tagno: Subfield _$code has an invalid control character" );
217 }
218
219 ++$sub_seen{$code};
220 }
221 }
222
223 # Check to see if a check_xxx() function exists, and call it on the field if it does
224 my $checker = "check_$tagno";
225 if ( $self->can( $checker ) ) {
226 $self->$checker( $field );
227 }
228
229 ++$field_seen{$tagno};
230 } # for
231
232 return;
233}
234
235=head2 C<check_I<xxx>(field)>
236
237Various functions to check the different fields. If the function doesn't exist,
238then it doesn't get checked.
239
240=cut
241
242sub check_245 {
243 my $self = shift;
244 my $field = shift;
245
246 if ( not $field->subfield( "a" ) ) {
247 $self->warn( "245: Must have a subfield _a." );
248 }
249}
250
251sub check_260 {
252 my $self = shift;
253 my $field = shift;
254
255 if ( not $field->subfield( "c" ) ) {
256 $self->warn( "260: Must have a subfield _c." );
257 }
258}
259
260
261=head1 SEE ALSO
262
263Check the docs for L<MARC::Record>. All software links are there.
264
265=head1 TODO
266
267=over 4
268
269=item * ISBN and ISSN checking
270
271We can check the 020 and 022 fields with the C<Business::ISBN> and
272C<Business::ISSN> modules, respectively.
273
274=back
275
276=head1 LICENSE
277
278This code may be distributed under the same terms as Perl itself.
279
280Please note that these modules are not products of or supported by the
281employers of the various contributors to the code.
282
283=head1 AUTHOR
284
285Andy Lester, E<lt>[email protected]<gt> or E<lt>[email protected]<gt>
286
287=cut
288
289# Used only to read the stuff from __DATA__
290sub _read_rules() {
291 my $self = shift;
292
293 my $tell = tell(DATA); # Stash the position so we can reset it for next time
294
295 local $/ = "";
296 while ( my $lines = <DATA> ) {
297 $lines =~ s/\s+$//;
298 my @keyvals = split( /\s+/, $lines );
299
300 my $tagno = shift @keyvals;
301 my $repeatable = shift @keyvals;
302
303 my @tag_range = ($tagno);
304 if ( $tagno =~ /^(\d\d)X/ ) {
305 my $base = $1;
306 @tag_range = ( "${base}0" .. "${base}9" );
307 }
308
309 # Handle the ranges of tags.
310 for my $currtag ( @tag_range ) {
311 $self->_parse_tag_rules( $currtag, $repeatable, @keyvals );
312 } # for
313 # I guess I could just have multiple references to the same tag, but I'm not that worried about memory
314 } # while
315
316 seek( DATA, $tell, 0 );
317}
318
319sub _parse_tag_rules {
320 my $self = shift;
321 my $tagno = shift;
322 my $repeatable = shift;
323 my @keyvals = @_;
324
325 my $rules = ($self->{_rules}->{$tagno} ||= {});
326 $rules->{$repeatable} = $repeatable;
327
328 while ( @keyvals ) {
329 my $key = shift @keyvals;
330 my $val = shift @keyvals;
331
332 $rules->{$key} = $val;
333
334 # Do magic for indicators
335 if ( $key =~ /^ind/ ) {
336 my $desc;
337 my $regex;
338
339 if ( $val eq "blank" ) {
340 $desc = "blank";
341 $regex = qr/^ $/;
342 } else {
343 $desc = _nice_list($val);
344 $val =~ s/^b/ /;
345 $regex = qr/^[$val]$/;
346 }
347
348 $rules->{$key."_desc"} = $desc;
349 $rules->{$key."_regex"} = $regex;
350 } # if indicator
351 } # while
352}
353
354
355sub _nice_list($) {
356 my $str = shift;
357
358 if ( $str =~ s/(\d)-(\d)/$1 thru $2/ ) {
359 return $str;
360 }
361
362 my @digits = split( //, $str );
363 $digits[0] = "blank" if $digits[0] eq "b";
364 my $last = pop @digits;
365 return join( ", ", @digits ) . " or $last";
366}
367
368sub _ind_regex($) {
369 my $str = shift;
370
371 return qr/^ $/ if $str eq "blank";
372
373 return qr/^[$str]$/;
374}
375
376
3771;
378
379__DATA__
380010 NR
381ind1 blank
382ind2 blank
383a NR
384z NR
385
386016 R
387ind1 b7
388ind2 blank
389a NR
390z R
3912 NR
392
393020 R
394ind1 blank
395ind2 blank
396a R
397c R
398
399022 R
400ind1 blank
401ind2 blank
402a NR
403
404040 NR
405ind1 blank
406ind2 blank
407a NR
408c NR
409d R
410
411100 NR
412ind1 013
413ind2 blank
414a NR
415q NR
416b R
417c R
418d NR
419e R
420
421110 NR
422ind1 012
423ind2 blank
424a NR
425b R
426
427111 NR
428ind1 012
429ind2 blank
430a NR
431n R
432d NR
433c NR
434e R
435
436130 NR
437ind1 0-9
438ind2 blank
439a NR
440n R
441p R
442h NR
443l NR
444k R
445s NR
446f NR
447
448240 NR
449ind1 01
450ind2 0-9
451
452245 NR
453ind1 01
454ind2 0-9
455a NR
456n R
457p R
458h NR
459b NR
460s NR
461c NR
462
463246 NR
464ind1 0123
465ind2 012345678
466a NR
467h NR
468b NR
469n R
470p R
471i NR
472f NR
473
474250 NR
475ind1 blank
476ind2 blank
477a NR
478b NR
479
480260 NR
481ind1 blank
482ind2 blank
483a R
484b R
485c R
486
487300 NR
488ind1 blank
489ind2 blank
490a R
491b NR
492c R
493e NR
494
495440 R
496ind1 blank
497ind2 0-9
498a NR
499n R
500p R
501v NR
502
503490 R
504ind1 01
505ind2 blank
506a R
507v R
508
509500 R
510ind1 blank
511ind2 blank
512a NR
513
514504 R
515ind1 blank
516ind2 blank
517a NR
518
519505 R
520ind1 0128
521ind2 b0
522a NR
523g R
524r R
525t R
526
527520 R
528ind1 b018
529ind2 blank
530a R
531b R
532
533521 R
534ind1 b012348
535ind2 blank
536a R
537b NR
538
539526 R
540ind1 08
541ind2 blank
542a NR
543b NR
544c NR
545d NR
546i NR
547x R
548z R
549
550538 R
551ind1 blank
552ind2 blank
553a NR
554
555546 R
556ind1 blank
557ind2 blank
558a NR
559
560586 R
561ind1 b8
562ind2 blank
563a NR
564
56559X R
566ind1 blank
567ind2 blank
568a NR
569
570600 R
571ind1 013
572ind2 012567
573a NR
574q NR
575b R
576c R
577d NR
578t NR
579v R
580x R
581y R
582z R
5832 NR
584
585610 R
586ind1 012
587ind2 012567
588a NR
589b R
590t NR
591v R
592x R
593y R
594z R
5952 NR
596
597611 R
598ind1 012
599ind2 012567
600a NR
601n R
602d NR
603c NR
604v R
605x R
606y R
607z R
6082 NR
609
610630 R
611ind1 0-9
612ind2 012567
613a NR
614n R
615p R
616l NR
617k R
618s NR
619f NR
620v R
621x R
622y R
623z R
6242 NR
625
626650 R
627ind1 blank
628ind2 012567
629a NR
630v R
631x R
632y R
633z R
6342 NR
635
636651 R
637ind1 blank
638ind2 012567
639a NR
640v R
641x R
642y R
643z R
6442 NR
645
646655 R
647ind1 blank
648ind2 7
649a NR
650v R
651x R
652y R
653z R
6542 NR
655
656658 R
657ind1 blank
658ind2 blank
659a NR
660b R
661c NR
662d NR
6632 NR
664
66569X R
666ind1 blank
667ind2 blank
668a NR
669v R
670x R
671y R
672z R
673
674700 R
675ind1 013
676ind2 b2
677a NR
678q NR
679b R
680c R
681d NR
682k R
683t NR
684e R
685f NR
686
687710 R
688ind1 012
689ind2 b2
690a NR
691b R
692e R
693t NR
694
695711 R
696ind1 012
697ind2 b2
698a NR
699n R
700d NR
701c NR
702t NR
703e R
704
705730 R
706ind1 0-9
707ind2 b2
708a NR
709n R
710p R
711h NR
712l NR
713k R
714s NR
715f NR
716
717740 R
718ind1 0-9
719ind2 b2
720a NR
721h NR
722n R
723p R
724
725
726800 R
727ind1 013
728ind2 blank
729a NR
730q NR
731b NR
732c R
733d NR
734t NR
735e R
736v NR
737
738852 R
739ind1 b01234568
740ind2 b012
741a NR
742b R
743h NR
744i R
745k NR
746m NR
747t NR
748p NR
7499 NR
750
751856 R
752ind1 b012347
753ind2 b0128
754a R
755b NR
756d R
757f R
758h NR
759i R
760u R
761x R
762z R
7632 NR
7643 NR
Note: See TracBrowser for help on using the repository browser.