Changeset 17110

Show
Ignore:
Timestamp:
30.08.2008 07:35:20 (11 years ago)
Author:
kjdon
Message:

Changed the way CJK separation is done. It is no longer done in the plugins; instead it is now an indexoption (separate_cjk). cnseg is called from the filter_text method, and generate_index_options sets up the corresponding field in the buildproc.

Location:
gsdl/trunk/perllib
Files:
8 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/basebuilder.pm

    r16379 r17110  
    148148     
    149149    $self->generate_index_list(); 
    150     $self->generate_index_options(); 
    151  
     150  
    152151    # sort out subcollection indexes 
    153152    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) { 
     
    216215    die "$@" if $@; 
    217216 
     217     
     218   $self->generate_index_options(); 
     219 
    218220    if (!$self->{'debug'} && !$self->{'keepold'}) { 
    219221    # remove any old builds 
     
    234236} 
    235237 
     238sub generate_index_options { 
     239    my $self = shift (@_); 
     240 
     241    my $separate_cjk = 0; 
     242     
     243    if (defined($self->{'collect_cfg'}->{'indexoptions'})) { 
     244    foreach my $option (@{$self->{'collect_cfg'}->{'indexoptions'}}) { 
     245        if ($option =~ /separate_cjk/) { 
     246        $separate_cjk = 1; 
     247        } 
     248    } 
     249    } 
     250    # set this for building 
     251    $self->{'buildproc'}->set_separate_cjk($separate_cjk); 
     252    # record it for build.cfg 
     253    $self->{'separate_cjk'} = $separate_cjk; 
     254} 
     255  
    236256sub set_sections_index_document_metadata { 
    237257    my $self = shift (@_); 
     
    429449    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'}); 
    430450    $build_cfg->{'stemindexes'} = $self->{'stemindexes'}; 
     451    if ($self->{'separate_cjk'}) { 
     452    $build_cfg->{'separate_cjk'} = "true"; 
     453    } 
    431454     
    432455    # store the number of documents and number of bytes 
  • gsdl/trunk/perllib/basebuildproc.pm

    r17106 r17110  
    6868    $self->{'indexexparr'} = []; 
    6969 
     70    $self->{'separate_cjk'} = 0; 
     71 
    7072    my $found_num_data = 0; 
    7173    my $buildconfigfile = undef; 
     
    7476    # For incremental building need to seed num_docs etc from values 
    7577    # stored in build.cfg (if present) 
    76       print STDERR "Keepold!\n"; 
    7778    $buildconfigfile = &util::filename_cat($build_dir, "build.cfg"); 
    78       print STDERR "Build cfg: $buildconfigfile\n"; 
    7979    if (-e $buildconfigfile) { 
    8080        $found_num_data = 1; 
     
    8484        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},  
    8585                           "index", "build.cfg"); 
    86             print STDERR "Index cfg: $buildconfigfile\n"; 
    8786        if (-e $buildconfigfile) { 
    8887        $found_num_data = 1; 
     
    9190 
    9291    } 
    93     #else 
    94     #  { 
    95     #    print STDERR "Removeold!\n"; 
    96     #  } 
    9792 
    9893    if ($found_num_data) 
     
    308303    $self->{'sections_index_document_metadata'} = $index_type; 
    309304} 
     305 
     306sub set_separate_cjk { 
     307    my $self = shift (@_); 
     308    my ($sep_cjk) = @_; 
     309 
     310    $self->{'separate_cjk'} = $sep_cjk; 
     311} 
     312 
    310313sub process { 
    311314    my $self = shift (@_); 
     
    315318} 
    316319 
     320# post process text depending on field. Currently don't do anything here 
     321# except cjk separation 
     322sub filter_text { 
     323    my $self = shift (@_); 
     324    my ($field, $text) = @_; 
     325 
     326    # lets do cjk seg here 
     327    my $new_text =$text; 
     328    if ($self->{'separate_cjk'}) { 
     329    $new_text = &cnseg::segment($text); 
     330    } 
     331    return $new_text; 
     332} 
    317333 
    318334 
  • gsdl/trunk/perllib/lucenebuilder.pm

    r16379 r17110  
    8585    my $self = shift (@_); 
    8686 
     87    $self->SUPER::generate_index_options(); 
     88     
    8789    $self->{'casefold'} = 0; 
    8890    $self->{'stem'} = 0; 
  • gsdl/trunk/perllib/lucenebuildproc.pm

    r16506 r17110  
    270270        } 
    271271        # filter the text 
    272         $self->filter_text ($field, $new_text); 
     272        $new_text = $self->filter_text ($field, $new_text); 
    273273        $self->{'num_processed_bytes'} += length ($new_text); 
    274274 
  • gsdl/trunk/perllib/mgbuilder.pm

    r16379 r17110  
    8282sub generate_index_options { 
    8383    my $self = shift (@_); 
    84  
     84    $self->SUPER::generate_index_options(); 
     85     
    8586    $self->{'casefold'} = 0; 
    8687    $self->{'stem'} = 0; 
     
    9192    $self->{'casefold'} = 1; 
    9293    $self->{'stem'} = 1; 
     94     
    9395    } else { 
    9496    foreach my $option (@{$self->{'collect_cfg'}->{'indexoptions'}}) { 
  • gsdl/trunk/perllib/mgbuildproc.pm

    r15738 r17110  
    4747sub find_paragraphs { 
    4848    $_[1] =~ s/(<p\b)/\cC$1/gi; 
    49 } 
    50  
    51 sub filter_text { 
    52     # $self->filter_text ($field, $new_text); 
    53     # don't want to do anything for this version, however, 
    54     # in a particular collection you might want to override 
    55     # this method to post-process certain fields depending on 
    56     # the field, or whether we are outputting it for indexing 
    5749} 
    5850 
     
    131123             
    132124            # filter the text 
    133             $self->filter_text ($field, $new_text); 
     125            $new_text = $self->filter_text ($field, $new_text); 
    134126 
    135127            $text .= "$new_text\cC"; 
  • gsdl/trunk/perllib/mgppbuilder.pm

    r16379 r17110  
    140140    my $self = shift (@_); 
    141141 
     142    $self->SUPER::generate_index_options(); 
     143 
    142144    $self->{'casefold'} = 0; 
    143145    $self->{'stem'} = 0; 
  • gsdl/trunk/perllib/mgppbuildproc.pm

    r14912 r17110  
    3131 
    3232use basebuildproc; 
     33use cnseg; 
     34 
    3335use strict; 
    3436no strict 'refs'; # allow filehandles to be variables and viceversa 
     
    167169} 
    168170     
    169      
    170  
    171 sub filter_text { 
    172     # $self->filter_text ($field, $new_text); 
    173     # don't want to do anything for this version, however, 
    174     # in a particular collection you might want to override 
    175     # this method to post-process certain fields depending on 
    176     # the field, or whether we are outputting it for indexing 
    177 } 
    178  
    179171sub text { 
    180172    my $self = shift (@_); 
     
    326318         
    327319        # filter the text 
    328         $self->filter_text ($field, $new_text); 
     320        $new_text = $self->filter_text ($field, $new_text); 
    329321         
    330322        $self->{'num_processed_bytes'} += length ($new_text);