Changeset 69


Ignore:
Timestamp:
1998-12-11T14:45:40+13:00 (25 years ago)
Author:
sjboddie
Message:

Sub-collection indexes may now be defined within the collect.cfg file as
subcollection blah1 Title/blah/i
subcollection blah2 !Title/blah/i
indexsubcollections blah1 blah2 blah1,blah2
indexes section:text document:text
This example would create section:text and document:text indexes for:

  1. the blah1 subcollection (i.e those documents whose Title field contains 'blah')
  2. the blah2 subcollection (i.e. those documents whose Title field doesn't contain 'blah')
  3. both subcollections (i.e. all documents)

The field to match the regular expression against (Title in this example) may be
any valid metadata tag or 'filename'.
The regular expression (blah in this example) may be any valid perl regular expression.

Location:
trunk/gsdl/perllib
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/cfgread.pm

    r4 r69  
    2828#   '^(this|that)$'
    2929sub read_cfg_file {
    30     my ($filename, $stringexp, $arrayexp) = @_;
     30    my ($filename, $stringexp, $arrayexp, $hashexp) = @_;
    3131    my ($line);
    3232    my $data = {};
     
    4141        } elsif ($key =~ /$arrayexp/) {
    4242            push (@{$data->{$key}}, @$line);
     43
     44        } elsif ($key =~ /$hashexp/) {
     45            my $k = shift @$line;
     46            my $v = shift @$line;
     47            $data->{$key}->{$k} = $v;
    4348        }
    4449        }
     
    5661#   '^(this|that)$'
    5762sub write_cfg_file {
    58     my ($filename, $data, $stringexp, $arrayexp) = @_;
     63    my ($filename, $data, $stringexp, $arrayexp, $hashexp) = @_;
    5964
    6065    if (open (COLCFG, ">$filename")) {
     
    6469        } elsif ($key =~ /$arrayexp/) {
    6570        &write_cfg_line ('COLCFG', [$key, @{$data->{$key}}]);
     71        } elsif ($key =~ /$hashexp/) {
     72        foreach $k (keys (%{$data->{$key}})) {
     73            &write_cfg_line ('COLCFG', [$key, $k, $data->{$key}->{$k}]);
     74        }
    6675        }
    6776    }
  • trunk/gsdl/perllib/colcfg.pm

    r4 r69  
    1313# {'beta'}->string
    1414# {'key'}->string
     15# {'subcollection'}->hash of key-value pairs
     16# {'indexsubcollections'}->array of strings
    1517# {'indexes'}->array of strings
    1618# {'defaultindex'}->string
     
    2224    return &cfgread::read_cfg_file ($filename,
    2325            '^(creator|public|beta|key|defaultindex)$',
    24             '^(maintainer|indexes|plugins)$');
     26            '^(maintainer|indexsubcollections|indexes|plugins)$',
     27            '^(subcollection)$');
    2528}
    2629
     
    3033    &cfgread::write_cfg_file($filename, $data,
    3134            '^(creator|public|beta|key|defaultindex)$',
    32             '^(maintainer|indexes|plugins)$');
     35            '^(maintainer|subcollection|indexsubcollections|indexes|plugins)$',
     36            '^(subcollection)$');
    3337}
    3438
  • trunk/gsdl/perllib/mgbuilder.pm

    r16 r69  
    4141
    4242
     43    # sort out subcollection indexes
     44    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
     45    my $indexes = $self->{'collect_cfg'}->{'indexes'};
     46    $self->{'collect_cfg'}->{'indexes'} = [];
     47    foreach $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
     48        foreach $index (@$indexes) {
     49        push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
     50        }
     51    }
     52    }
     53
    4354    # get the list of plugins for this collection
    4455    my @plugins = (); # some good choice of plugins .... ????
     
    177188   
    178189    foreach $index (@$indexes) {
    179     my ($level, $fields) = split (":", $index);
    180     my @fields = split (",", $fields);
     190    my ($level, $fields, $subcollection) = split (":", $index);
     191    my @fields = split (/,/, $fields);
    181192    splice (@fields, 2); # just want first two fields
    182193   
     
    189200    map {s/^(.).*?([bcdfghjklmnpqrstvwxyz]).*$/$1$2/i;} @fields;
    190201    $dirname .= join("", @fields);
     202
     203    # next comes a processed version of the subcollection if there is one.
     204    # the processed version contains the first character and the next
     205    # consonant if there's only one field, other wise the first character
     206    # of the first two fields
     207    if (defined ($subcollection) && $subcollection =~ /\w/) {
     208        @fields = split /,/, $subcollection;
     209        if (scalar @fields >= 2) {
     210        splice (@fields, 2);
     211        map {s/^(.).*$/$1/i;} @fields;
     212        $dirname .= join("", @fields);
     213        } else {
     214        $subcollection =~ s/^(.).*?([bcdfghjklmnpqrstvwxyz]).*$/$1$2/i;
     215        $dirname .= $subcollection;
     216        }
     217    }
    191218
    192219    # convert the directory name to lowercase
     
    238265    $index_level = 3 if $index =~ /^paragraph/i;
    239266
     267    # get the index expression if this index belongs
     268    # to a subcollection
     269    my $indexexparr = [];
     270    my ($level, $fields, $subcollection) = split (":", $index);
     271    my (@subcollections) = split /,/, $subcollection;
     272
     273    foreach $subcollection (@subcollections) {
     274    if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
     275        push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
     276    }
     277    }
     278   
    240279    # set up the document processor
    241280    $self->{'buildproc'}->set_output_handle ('mgbuilder::PIPEOUT');
    242281    $self->{'buildproc'}->set_mode ('text');
    243     $self->{'buildproc'}->set_index ($index);
     282    $self->{'buildproc'}->set_index ($index, $indexexparr);
    244283
    245284
  • trunk/gsdl/perllib/mgbuildproc.pm

    r59 r69  
    2424    $self->{'mode'} = "text";
    2525    $self->{'index'} = "section:text";
     26    $self->{'indexexparr'} = [];
    2627    $self->{'output_handle'} = "STDOUT";
    2728    $self->{'num_docs'} = 0;
     
    7475sub set_index {
    7576    my $self = shift (@_);
    76     my ($index) = @_;
     77    my ($index, $indexexparr) = @_;
    7778
    7879    $self->{'index'} = $index;
     80    $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
    7981}
    8082
     
    391393    my ($doc_obj) = @_;
    392394    my $handle = $self->{'output_handle'};
     395    my $indexed_doc = 1;
    393396
    394397    # only output this document if it is one to be indexed
    395398    return if ($doc_obj->get_doc_type() ne "indexed_doc");
     399
     400    # see if this document belongs to this subcollection
     401    foreach $indexexp (@{$self->{'indexexparr'}}) {
     402    $indexed_doc = 0;
     403    my ($field, $exp, $options) = split /\//, $indexexp;
     404    if (defined ($field) && defined ($exp)) {
     405        my ($bool) = $field =~ /^(.)/;
     406        $field =~ s/^.// if $bool eq '!';
     407        if ($field eq "filename") {
     408        $field = $doc_obj->get_source_filename();
     409        } else {
     410        $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
     411        }
     412        next unless defined $field;
     413        if ($bool eq '!') {
     414        if ($options =~ /^i$/i) {
     415            if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
     416        } else {
     417            if ($field !~ /$exp/) {$indexed_doc = 1; last;}
     418        }
     419        } else {
     420        if ($options =~ /^i$/i) {
     421            if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
     422        } else {
     423            if ($field =~ /$exp/) {$indexed_doc = 1; last;}
     424        }
     425        }
     426    }
     427    }
    396428
    397429    # this is another document
     
    412444    # update a few statistics
    413445    $doc_section++;
    414     $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
    415446    $self->{'num_sections'} += 1;
    416    
    417     foreach $field (split (/,/, $fields)) {
    418         # only deal with this field if it doesn't start with top or
    419         # this is the first section
    420         my $real_field = $field;
    421         if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
    422         my $new_text = "";
    423         if ($real_field eq "text") {
    424             $new_text = $doc_obj->get_text ($section);
    425             $new_text =~ s/[\cB\cC]//g;
    426             $new_text =~ s/(<p\b)/\cC$1/gi;
     447
     448    if ($indexed_doc) {
     449        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
     450        foreach $field (split (/,/, $fields)) {
     451        # only deal with this field if it doesn't start with top or
     452        # this is the first section
     453        my $real_field = $field;
     454        if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
     455            my $new_text = "";
     456            if ($real_field eq "text") {
     457            $new_text = $doc_obj->get_text ($section);
     458            $new_text =~ s/[\cB\cC]//g;
     459            $new_text =~ s/(<p\b)/\cC$1/gi;
     460           
     461            } else {
     462            $new_text = join ("\cC", @{$doc_obj->get_metadata ($section, $real_field)});
     463            }
    427464           
    428         } else {
    429             $new_text = join ("\cC", @{$doc_obj->get_metadata ($section, $real_field)});
    430         }
    431        
    432         $text .= "$new_text\cC";
     465            $text .= "$new_text\cC";
     466        }
    433467        }
    434468    }
Note: See TracChangeset for help on using the changeset viewer.