Changeset 17110
- Timestamp: 2008-08-30T07:35:20+12:00 (16 years ago)
- Location: gsdl/trunk/perllib
- Files: 8 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/perllib/basebuilder.pm
r16379 r17110 148 148 149 149 $self->generate_index_list(); 150 $self->generate_index_options(); 151 150 152 151 # sort out subcollection indexes 153 152 if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) { … … 216 215 die "$@" if $@; 217 216 217 218 $self->generate_index_options(); 219 218 220 if (!$self->{'debug'} && !$self->{'keepold'}) { 219 221 # remove any old builds … … 234 236 } 235 237 238 sub generate_index_options { 239 my $self = shift (@_); 240 241 my $separate_cjk = 0; 242 243 if (defined($self->{'collect_cfg'}->{'indexoptions'})) { 244 foreach my $option (@{$self->{'collect_cfg'}->{'indexoptions'}}) { 245 if ($option =~ /separate_cjk/) { 246 $separate_cjk = 1; 247 } 248 } 249 } 250 # set this for building 251 $self->{'buildproc'}->set_separate_cjk($separate_cjk); 252 # record it for build.cfg 253 $self->{'separate_cjk'} = $separate_cjk; 254 } 255 236 256 sub set_sections_index_document_metadata { 237 257 my $self = shift (@_); … … 429 449 $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'}); 430 450 $build_cfg->{'stemindexes'} = $self->{'stemindexes'}; 451 if ($self->{'separate_cjk'}) { 452 $build_cfg->{'separate_cjk'} = "true"; 453 } 431 454 432 455 # store the number of documents and number of bytes -
gsdl/trunk/perllib/basebuildproc.pm
r17106 r17110 68 68 $self->{'indexexparr'} = []; 69 69 70 $self->{'separate_cjk'} = 0; 71 70 72 my $found_num_data = 0; 71 73 my $buildconfigfile = undef; … … 74 76 # For incremental building need to seed num_docs etc from values 75 77 # stored in build.cfg (if present) 76 print STDERR "Keepold!\n";77 78 $buildconfigfile = &util::filename_cat($build_dir, "build.cfg"); 78 print STDERR "Build cfg: $buildconfigfile\n";79 79 if (-e $buildconfigfile) { 80 80 $found_num_data = 1; … … 84 84 $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, 85 85 "index", "build.cfg"); 86 print STDERR "Index cfg: $buildconfigfile\n";87 86 if (-e $buildconfigfile) { 88 87 $found_num_data = 1; … … 91 90 92 91 } 93 #else94 # {95 # print STDERR "Removeold!\n";96 # }97 92 98 93 if ($found_num_data) … … 308 303 $self->{'sections_index_document_metadata'} = $index_type; 309 304 } 305 306 sub set_separate_cjk { 307 my $self = shift (@_); 308 my ($sep_cjk) = @_; 309 310 $self->{'separate_cjk'} = $sep_cjk; 311 } 312 310 313 sub process { 311 314 my $self = shift (@_); … … 315 318 } 316 319 320 # post process text depending on field. Currently don't do anything here 321 # except cjk separation 322 sub filter_text { 323 my $self = shift (@_); 324 my ($field, $text) = @_; 325 326 # lets do cjk seg here 327 my $new_text =$text; 328 if ($self->{'separate_cjk'}) { 329 $new_text = &cnseg::segment($text); 330 } 331 return $new_text; 332 } 317 333 318 334 -
gsdl/trunk/perllib/lucenebuilder.pm
r16379 r17110 85 85 my $self = shift (@_); 86 86 87 $self->SUPER::generate_index_options(); 88 87 89 $self->{'casefold'} = 0; 88 90 $self->{'stem'} = 0; -
gsdl/trunk/perllib/lucenebuildproc.pm
r16506 r17110 270 270 } 271 271 # filter the text 272 $ self->filter_text ($field, $new_text);272 $new_text = $self->filter_text ($field, $new_text); 273 273 $self->{'num_processed_bytes'} += length ($new_text); 274 274 -
gsdl/trunk/perllib/mgbuilder.pm
r16379 r17110 82 82 sub generate_index_options { 83 83 my $self = shift (@_); 84 84 $self->SUPER::generate_index_options(); 85 85 86 $self->{'casefold'} = 0; 86 87 $self->{'stem'} = 0; … … 91 92 $self->{'casefold'} = 1; 92 93 $self->{'stem'} = 1; 94 93 95 } else { 94 96 foreach my $option (@{$self->{'collect_cfg'}->{'indexoptions'}}) { -
gsdl/trunk/perllib/mgbuildproc.pm
r15738 r17110 47 47 sub find_paragraphs { 48 48 $_[1] =~ s/(<p\b)/\cC$1/gi; 49 }50 51 sub filter_text {52 # $self->filter_text ($field, $new_text);53 # don't want to do anything for this version, however,54 # in a particular collection you might want to override55 # this method to post-process certain fields depending on56 # the field, or whether we are outputting it for indexing57 49 } 58 50 … … 131 123 132 124 # filter the text 133 $ self->filter_text ($field, $new_text);125 $new_text = $self->filter_text ($field, $new_text); 134 126 135 127 $text .= "$new_text\cC"; -
gsdl/trunk/perllib/mgppbuilder.pm
r16379 r17110 140 140 my $self = shift (@_); 141 141 142 $self->SUPER::generate_index_options(); 143 142 144 $self->{'casefold'} = 0; 143 145 $self->{'stem'} = 0; -
gsdl/trunk/perllib/mgppbuildproc.pm
r14912 r17110 31 31 32 32 use basebuildproc; 33 use cnseg; 34 33 35 use strict; 34 36 no strict 'refs'; # allow filehandles to be variables and viceversa … … 167 169 } 168 170 169 170 171 sub filter_text {172 # $self->filter_text ($field, $new_text);173 # don't want to do anything for this version, however,174 # in a particular collection you might want to override175 # this method to post-process certain fields depending on176 # the field, or whether we are outputting it for indexing177 }178 179 171 sub text { 180 172 my $self = shift (@_); … … 326 318 327 319 # filter the text 328 $ self->filter_text ($field, $new_text);320 $new_text = $self->filter_text ($field, $new_text); 329 321 330 322 $self->{'num_processed_bytes'} += length ($new_text);
Note: See TracChangeset for help on using the changeset viewer.