Changeset 69 for trunk/gsdl/perllib/mgbuildproc.pm
- Timestamp: 1998-12-11T14:45:40+13:00 (25 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/mgbuildproc.pm
 r59  r69
  24   24       $self->{'mode'} = "text";
  25   25       $self->{'index'} = "section:text";
       26 +     $self->{'indexexparr'} = [];
  26   27       $self->{'output_handle'} = "STDOUT";
  27   28       $self->{'num_docs'} = 0;
   …    …
  74   75   sub set_index {
  75   76       my $self = shift (@_);
  76      -     my ($index ) = @_;
       77 +     my ($index, $indexexparr) = @_;
  77   78
  78   79       $self->{'index'} = $index;
       80 +     $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
  79   81   }
  80   82
   …    …
 391  393       my ($doc_obj) = @_;
 392  394       my $handle = $self->{'output_handle'};
      395 +     my $indexed_doc = 1;
 393  396
 394  397       # only output this document if it is one to be indexed
 395  398       return if ($doc_obj->get_doc_type() ne "indexed_doc");
      399 +
      400 +     # see if this document belongs to this subcollection
      401 +     foreach $indexexp (@{$self->{'indexexparr'}}) {
      402 +         $indexed_doc = 0;
      403 +         my ($field, $exp, $options) = split /\//, $indexexp;
      404 +         if (defined ($field) && defined ($exp)) {
      405 +             my ($bool) = $field =~ /^(.)/;
      406 +             $field =~ s/^.// if $bool eq '!';
      407 +             if ($field eq "filename") {
      408 +                 $field = $doc_obj->get_source_filename();
      409 +             } else {
      410 +                 $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
      411 +             }
      412 +             next unless defined $field;
      413 +             if ($bool eq '!') {
      414 +                 if ($options =~ /^i$/i) {
      415 +                     if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
      416 +                 } else {
      417 +                     if ($field !~ /$exp/) {$indexed_doc = 1; last;}
      418 +                 }
      419 +             } else {
      420 +                 if ($options =~ /^i$/i) {
      421 +                     if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
      422 +                 } else {
      423 +                     if ($field =~ /$exp/) {$indexed_doc = 1; last;}
      424 +                 }
      425 +             }
      426 +         }
      427 +     }
 396  428
 397  429       # this is another document
   …    …
 412  444           # update a few statistics
 413  445           $doc_section++;
 414      -         $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
 415  446           $self->{'num_sections'} += 1;
 416      -
 417      -         foreach $field (split (/,/, $fields)) {
 418      -             # only deal with this field if it doesn't start with top or
 419      -             # this is the first section
 420      -             my $real_field = $field;
 421      -             if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
 422      -                 my $new_text = "";
 423      -                 if ($real_field eq "text") {
 424      -                     $new_text = $doc_obj->get_text ($section);
 425      -                     $new_text =~ s/[\cB\cC]//g;
 426      -                     $new_text =~ s/(<p\b)/\cC$1/gi;
      447 +
      448 +         if ($indexed_doc) {
      449 +             $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
      450 +             foreach $field (split (/,/, $fields)) {
      451 +                 # only deal with this field if it doesn't start with top or
      452 +                 # this is the first section
      453 +                 my $real_field = $field;
      454 +                 if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
      455 +                     my $new_text = "";
      456 +                     if ($real_field eq "text") {
      457 +                         $new_text = $doc_obj->get_text ($section);
      458 +                         $new_text =~ s/[\cB\cC]//g;
      459 +                         $new_text =~ s/(<p\b)/\cC$1/gi;
      460 +
      461 +                     } else {
      462 +                         $new_text = join ("\cC", @{$doc_obj->get_metadata ($section, $real_field)});
      463 +                     }
 427  464
 428      -                 } else {
 429      -                     $new_text = join ("\cC", @{$doc_obj->get_metadata ($section, $real_field)});
 430      -                 }
 431      -
 432      -                 $text .= "$new_text\cC";
      465 +                     $text .= "$new_text\cC";
      466 +                 }
 433  467               }
 434  468           }
Note: See TracChangeset for help on using the changeset viewer.