Ignore:
Timestamp:
1998-12-11T14:45:40+13:00 (25 years ago)
Author:
sjboddie
Message:

Sub-collection indexes may now be defined within the collect.cfg file as follows:
subcollection blah1 Title/blah/i
subcollection blah2 !Title/blah/i
indexsubcollections blah1 blah2 blah1,blah2
indexes section:text document:text
This example would create section:text and document:text indexes for:

  1. the blah1 subcollection (i.e. those documents whose Title field contains 'blah')
  2. the blah2 subcollection (i.e. those documents whose Title field doesn't contain 'blah')
  3. both subcollections (i.e. all documents)

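The field to match the regular expression against (Title in this example) may be
any valid metadata tag or 'filename'. Prefixing the field with '!' (as in blah2)
selects the documents that do not match.
The regular expression (blah in this example) may be any valid Perl regular expression;
a trailing 'i' option (as in /blah/i) makes the match case-insensitive.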

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/mgbuildproc.pm

    r59 r69  
    2424    $self->{'mode'} = "text";
    2525    $self->{'index'} = "section:text";
     26    $self->{'indexexparr'} = [];
    2627    $self->{'output_handle'} = "STDOUT";
    2728    $self->{'num_docs'} = 0;
     
    7475sub set_index {
    7576    my $self = shift (@_);
    76     my ($index) = @_;
     77    my ($index, $indexexparr) = @_;
    7778
    7879    $self->{'index'} = $index;
     80    $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
    7981}
    8082
     
    391393    my ($doc_obj) = @_;
    392394    my $handle = $self->{'output_handle'};
     395    my $indexed_doc = 1;
    393396
    394397    # only output this document if it is one to be indexed
    395398    return if ($doc_obj->get_doc_type() ne "indexed_doc");
     399
     400    # see if this document belongs to this subcollection
     401    foreach $indexexp (@{$self->{'indexexparr'}}) {
     402    $indexed_doc = 0;
     403    my ($field, $exp, $options) = split /\//, $indexexp;
     404    if (defined ($field) && defined ($exp)) {
     405        my ($bool) = $field =~ /^(.)/;
     406        $field =~ s/^.// if $bool eq '!';
     407        if ($field eq "filename") {
     408        $field = $doc_obj->get_source_filename();
     409        } else {
     410        $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
     411        }
     412        next unless defined $field;
     413        if ($bool eq '!') {
     414        if ($options =~ /^i$/i) {
     415            if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
     416        } else {
     417            if ($field !~ /$exp/) {$indexed_doc = 1; last;}
     418        }
     419        } else {
     420        if ($options =~ /^i$/i) {
     421            if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
     422        } else {
     423            if ($field =~ /$exp/) {$indexed_doc = 1; last;}
     424        }
     425        }
     426    }
     427    }
    396428
    397429    # this is another document
     
    412444    # update a few statistics
    413445    $doc_section++;
    414     $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
    415446    $self->{'num_sections'} += 1;
    416    
    417     foreach $field (split (/,/, $fields)) {
    418         # only deal with this field if it doesn't start with top or
    419         # this is the first section
    420         my $real_field = $field;
    421         if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
    422         my $new_text = "";
    423         if ($real_field eq "text") {
    424             $new_text = $doc_obj->get_text ($section);
    425             $new_text =~ s/[\cB\cC]//g;
    426             $new_text =~ s/(<p\b)/\cC$1/gi;
     447
     448    if ($indexed_doc) {
     449        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
     450        foreach $field (split (/,/, $fields)) {
     451        # only deal with this field if it doesn't start with top or
     452        # this is the first section
     453        my $real_field = $field;
     454        if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
     455            my $new_text = "";
     456            if ($real_field eq "text") {
     457            $new_text = $doc_obj->get_text ($section);
     458            $new_text =~ s/[\cB\cC]//g;
     459            $new_text =~ s/(<p\b)/\cC$1/gi;
     460           
     461            } else {
     462            $new_text = join ("\cC", @{$doc_obj->get_metadata ($section, $real_field)});
     463            }
    427464           
    428         } else {
    429             $new_text = join ("\cC", @{$doc_obj->get_metadata ($section, $real_field)});
    430         }
    431        
    432         $text .= "$new_text\cC";
     465            $text .= "$new_text\cC";
     466        }
    433467        }
    434468    }
Note: See TracChangeset for help on using the changeset viewer.