Changeset 12910


Ignore:
Timestamp:
2006-09-28T16:45:30+12:00 (18 years ago)
Author:
kjdon
Message:

New indexoptions field in collect.cfg specifies which stem indexes should be built (stem, casefold, accentfold). mg and lucene ignore this: mg does stem and casefold, lucene does none. stemindexes is output to build.cfg so that the library knows which options are available for the collection - for example, it should not offer the stem option if stemming is not implemented. Added accent-fold stem indexes for mgpp (thanks to Juan Grigera). Accent folding may be disabled in mgpp, so we check the first time we try to create an accent-fold index, and don't try to build the rest if that attempt failed.

Location:
trunk/gsdl/perllib
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/lucenebuilder.pm

    r12845 r12910  
    8787}
    8888# /** new() **/
     89
     90# lucene has none of these options
     91sub generate_index_options {
     92    my $self = shift (@_);
     93
     94    $self->{'casefold'} = 0;
     95    $self->{'stem'} = 0;
     96    $self->{'accentfold'} = 0;
     97    $self->{'stemindexes'} = 0;
     98}   
    8999
    90100sub default_buildproc {
  • trunk/gsdl/perllib/mgbuilder.pm

    r12340 r12910  
    8686}
    8787
     88sub generate_index_options {
     89    my $self = shift (@_);
     90
     91    $self->{'casefold'} = 0;
     92    $self->{'stem'} = 0;
     93    $self->{'accentfold'} = 0; #not yet implemented for mg
     94   
     95    if (!defined($self->{'collect_cfg'}->{'indexoptions'})) {
     96    # just use default options
     97    $self->{'casefold'} = 1;
     98    $self->{'stem'} = 1;
     99    } else {
     100    foreach my $option (@{$self->{'collect_cfg'}->{'indexoptions'}}) {
     101        if ($option =~ /stem/) {
     102        $self->{'stem'} = 1;
     103        } elsif ($option =~ /casefold/) {
     104        $self->{'casefold'} = 1;
     105        }
     106    }
     107    }
     108   
     109    # now we record this for the build cfg
     110    $self->{'stemindexes'} = 0;
     111    if ($self->{'casefold'}) {
     112    $self->{'stemindexes'} += 1;
     113    }
     114    if ($self->{'stem'}) {
     115    $self->{'stemindexes'} += 2;
     116    }
     117
     118
     119}
    88120
    89121sub compress_text {
     
    456488        die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
    457489    }
    458     system ("mg_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
    459     system ("mg_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
    460     system ("mg_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");
    461    
     490    # currently mg wont work if we don't generate all the stem idexes
     491    # so we generate them whatever, but don't advertise the fact
     492    #if ($self->{'casefold'}) {
     493        print STDERR "casefolding\n";
     494        system ("mg_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
     495    #}
     496    #if ($self->{'stem'}) {
     497        print STDERR "stemming\n";
     498        system ("mg_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
     499    #}
     500    #if ($self->{'casefold'} && $self->{'stem'}) {
     501        print STDERR "casefold and stem\n";
     502        system ("mg_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");
     503    #}
     504
    462505    # remove unwanted files
    463506    my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
  • trunk/gsdl/perllib/mgppbuilder.pm

    r12340 r12910  
    5959               'ib2'=>1,
    6060               'ib3'=>1,
     61               'ib4'=>1,
     62               'ib5'=>1,
     63               'ib6'=>1,
     64               'ib7'=>1,
    6165               'i'=>1,
    6266               'il'=>1,
     
    151155}
    152156
     157sub generate_index_options {
     158    my $self = shift (@_);
     159
     160    $self->{'casefold'} = 0;
     161    $self->{'stem'} = 0;
     162    $self->{'accentfold'} = 0;
     163   
     164    if (!defined($self->{'collect_cfg'}->{'indexoptions'})) {
     165    # just use default options
     166    $self->{'casefold'} = 1;
     167    $self->{'stem'} = 1;
     168    $self->{'accentfold'} = 1;
     169    } else {
     170    foreach my $option (@{$self->{'collect_cfg'}->{'indexoptions'}}) {
     171        if ($option =~ /stem/) {
     172        $self->{'stem'} = 1;
     173        } elsif ($option =~ /casefold/) {
     174        $self->{'casefold'} = 1;
     175        } elsif ($option =~ /accentfold/) {
     176        $self->{'accentfold'} = 1;
     177        }
     178    }
     179    }
     180   
     181    # now we record this for the build cfg
     182    $self->{'stemindexes'} = 0;
     183    if ($self->{'casefold'}) {
     184    $self->{'stemindexes'} += 1;
     185    }
     186    if ($self->{'stem'}) {
     187    $self->{'stemindexes'} += 2;
     188    }
     189    if ($self->{'accentfold'}) {
     190    $self->{'stemindexes'} += 4;
     191    }
     192    print STDERR "temindexes = $self->{'stemindexes'}\n";
     193}
     194
    153195sub default_buildproc {
    154196    my $self  = shift (@_);
     
    574616        die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
    575617    }
    576     system ("mgpp_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
    577     system ("mgpp_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
    578     system ("mgpp_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");
    579    
     618    my $accent_folding_enabled = 1;
     619    if ($self->{'accentfold'}) {
     620        # the first time we do this, we test for accent folding enabled
     621        if (system ("mgpp_stem_idx$exe -b 4096 -s4 -f \"$fullindexprefix\" $osextra") != 0) {
     622        # accent folding has not been enabled in mgpp
     623        $accent_folding_enabled = 0;
     624        $self->{'stemindexes'} -= 4;
     625        }
     626    }
     627    if ($self->{'casefold'}) {
     628        system ("mgpp_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
     629        if ($accent_folding_enabled && $self->{'accentfold'}) {
     630        system ("mgpp_stem_idx$exe -b 4096 -s5 -f \"$fullindexprefix\" $osextra");
     631        }
     632    }
     633    if ($self->{'stem'}) {
     634        system ("mgpp_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
     635        if ($accent_folding_enabled && $self->{'accentfold'}) {
     636        system ("mgpp_stem_idx$exe -b 4096 -s6 -f \"$fullindexprefix\" $osextra");
     637        }
     638    }
     639    if ($self->{'casefold'} && $self->{'stem'}) {
     640        system ("mgpp_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");
     641        if ($accent_folding_enabled && $self->{'accentfold'}) {
     642        system ("mgpp_stem_idx$exe -b 4096 -s7 -f \"$fullindexprefix\" $osextra");
     643        }
     644    }
     645
    580646    # remove unwanted files
    581647    my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
     
    800866
    801867
    802 sub write_cfg_file {
    803     my $self = shift(@_);
    804     my ($build_cfg) = @_;
    805 
    806     # write out the build information
    807     &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
    808                  '^(builddate|buildtype|numdocs|numsections|numbytes|textlevel|indexstem|maxnumeric)$',
    809                              '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields|indexlevels|levelmap)$');
    810 
    811 }
    812 
    813868sub build_cfg_extra {
    814869    my $self = shift (@_);
Note: See TracChangeset for help on using the changeset viewer.