Changeset 17110 for gsdl/trunk


Ignore:
Timestamp:
2008-08-30T07:35:20+12:00 (16 years ago)
Author:
kjdon
Message:

Changed the way CJK separation is done: it is no longer done in the plugins, but is now an indexoption. cnseg is called from the filter_text method, and generate_index_options sets up the separate_cjk field in buildproc.

Location:
gsdl/trunk/perllib
Files:
8 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/basebuilder.pm

    r16379 r17110  
    148148   
    149149    $self->generate_index_list();
    150     $self->generate_index_options();
    151 
     150 
    152151    # sort out subcollection indexes
    153152    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
     
    216215    die "$@" if $@;
    217216
     217   
     218   $self->generate_index_options();
     219
    218220    if (!$self->{'debug'} && !$self->{'keepold'}) {
    219221    # remove any old builds
     
    234236}
    235237
     238sub generate_index_options {
     239    my $self = shift (@_);
     240
     241    my $separate_cjk = 0;
     242   
     243    if (defined($self->{'collect_cfg'}->{'indexoptions'})) {
     244    foreach my $option (@{$self->{'collect_cfg'}->{'indexoptions'}}) {
     245        if ($option =~ /separate_cjk/) {
     246        $separate_cjk = 1;
     247        }
     248    }
     249    }
     250    # set this for building
     251    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
     252    # record it for build.cfg
     253    $self->{'separate_cjk'} = $separate_cjk;
     254}
     255 
    236256sub set_sections_index_document_metadata {
    237257    my $self = shift (@_);
     
    429449    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    430450    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
     451    if ($self->{'separate_cjk'}) {
     452    $build_cfg->{'separate_cjk'} = "true";
     453    }
    431454   
    432455    # store the number of documents and number of bytes
  • gsdl/trunk/perllib/basebuildproc.pm

    r17106 r17110  
    6868    $self->{'indexexparr'} = [];
    6969
     70    $self->{'separate_cjk'} = 0;
     71
    7072    my $found_num_data = 0;
    7173    my $buildconfigfile = undef;
     
    7476    # For incremental building need to seed num_docs etc from values
    7577    # stored in build.cfg (if present)
    76       print STDERR "Keepold!\n";
    7778    $buildconfigfile = &util::filename_cat($build_dir, "build.cfg");
    78       print STDERR "Build cfg: $buildconfigfile\n";
    7979    if (-e $buildconfigfile) {
    8080        $found_num_data = 1;
     
    8484        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
    8585                           "index", "build.cfg");
    86             print STDERR "Index cfg: $buildconfigfile\n";
    8786        if (-e $buildconfigfile) {
    8887        $found_num_data = 1;
     
    9190
    9291    }
    93     #else
    94     #  {
    95     #    print STDERR "Removeold!\n";
    96     #  }
    9792
    9893    if ($found_num_data)
     
    308303    $self->{'sections_index_document_metadata'} = $index_type;
    309304}
     305
     306sub set_separate_cjk {
     307    my $self = shift (@_);
     308    my ($sep_cjk) = @_;
     309
     310    $self->{'separate_cjk'} = $sep_cjk;
     311}
     312
    310313sub process {
    311314    my $self = shift (@_);
     
    315318}
    316319
     320# post process text depending on field. Currently don't do anything here
     321# except cjk separation
     322sub filter_text {
     323    my $self = shift (@_);
     324    my ($field, $text) = @_;
     325
     326    # lets do cjk seg here
     327    my $new_text =$text;
     328    if ($self->{'separate_cjk'}) {
     329    $new_text = &cnseg::segment($text);
     330    }
     331    return $new_text;
     332}
    317333
    318334
  • gsdl/trunk/perllib/lucenebuilder.pm

    r16379 r17110  
    8585    my $self = shift (@_);
    8686
     87    $self->SUPER::generate_index_options();
     88   
    8789    $self->{'casefold'} = 0;
    8890    $self->{'stem'} = 0;
  • gsdl/trunk/perllib/lucenebuildproc.pm

    r16506 r17110  
    270270        }
    271271        # filter the text
    272         $self->filter_text ($field, $new_text);
     272        $new_text = $self->filter_text ($field, $new_text);
    273273        $self->{'num_processed_bytes'} += length ($new_text);
    274274
  • gsdl/trunk/perllib/mgbuilder.pm

    r16379 r17110  
    8282sub generate_index_options {
    8383    my $self = shift (@_);
    84 
     84    $self->SUPER::generate_index_options();
     85   
    8586    $self->{'casefold'} = 0;
    8687    $self->{'stem'} = 0;
     
    9192    $self->{'casefold'} = 1;
    9293    $self->{'stem'} = 1;
     94   
    9395    } else {
    9496    foreach my $option (@{$self->{'collect_cfg'}->{'indexoptions'}}) {
  • gsdl/trunk/perllib/mgbuildproc.pm

    r15738 r17110  
    4747sub find_paragraphs {
    4848    $_[1] =~ s/(<p\b)/\cC$1/gi;
    49 }
    50 
    51 sub filter_text {
    52     # $self->filter_text ($field, $new_text);
    53     # don't want to do anything for this version, however,
    54     # in a particular collection you might want to override
    55     # this method to post-process certain fields depending on
    56     # the field, or whether we are outputting it for indexing
    5749}
    5850
     
    131123           
    132124            # filter the text
    133             $self->filter_text ($field, $new_text);
     125            $new_text = $self->filter_text ($field, $new_text);
    134126
    135127            $text .= "$new_text\cC";
  • gsdl/trunk/perllib/mgppbuilder.pm

    r16379 r17110  
    140140    my $self = shift (@_);
    141141
     142    $self->SUPER::generate_index_options();
     143
    142144    $self->{'casefold'} = 0;
    143145    $self->{'stem'} = 0;
  • gsdl/trunk/perllib/mgppbuildproc.pm

    r14912 r17110  
    3131
    3232use basebuildproc;
     33use cnseg;
     34
    3335use strict;
    3436no strict 'refs'; # allow filehandles to be variables and viceversa
     
    167169}
    168170   
    169    
    170 
    171 sub filter_text {
    172     # $self->filter_text ($field, $new_text);
    173     # don't want to do anything for this version, however,
    174     # in a particular collection you might want to override
    175     # this method to post-process certain fields depending on
    176     # the field, or whether we are outputting it for indexing
    177 }
    178 
    179171sub text {
    180172    my $self = shift (@_);
     
    326318       
    327319        # filter the text
    328         $self->filter_text ($field, $new_text);
     320        $new_text = $self->filter_text ($field, $new_text);
    329321       
    330322        $self->{'num_processed_bytes'} += length ($new_text);
Note: See TracChangeset for help on using the changeset viewer.