Changeset 13590 for trunk/gsdl/perllib/lucenebuilder.pm
- Timestamp:
- 2007-01-12T14:18:53+13:00 (17 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/lucenebuilder.pm
r13589 r13590 156 156 $handle = lucenebuilder::PIPEOUT; 157 157 } 158 my $levels = $self->{'levels'}; 159 my $gdbm_level = "document"; 160 if ($levels->{'section'}) 161 { 162 $gdbm_level = "section"; 163 } 164 165 undef $levels->{'paragraph'}; # get rid of para if we had it. 158 159 # stored text is always Doc and Sec levels 160 my $levels = { 'document' => 1, 'section' => 1 }; 161 # always do gdbm at section level 162 my $gdbm_level = "section"; 163 166 164 # set up the document processr 167 165 $self->{'buildproc'}->set_output_handle ($handle); … … 232 230 } 233 231

# Build one Lucene index by feeding the archive documents, one level at a
# time, through the document processor into a pipe to lucene_passes.
#
# Arguments:
#   $index  - index specification of the form
#             "fields[:subcollections[:languages]]", where subcollections
#             and languages are comma-separated lists
#   $llevel - tag level passed on the lucene_passes command line; it is
#             looked up in %mgppbuilder::level_map to find the matching key
#             of $self->{'levels'} (falls back to "document" with a warning)
#
# In debug mode nothing is piped: the processor writes to STDOUT instead.
# Otherwise dies if the lucene_passes script does not exist or the pipe
# cannot be opened.
sub build_index {
    my $self = shift (@_);
    my ($index, $llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # the perl script to call to run lucene, and the command that runs it
    my $full_lucene_passes     = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # define the section names and possibly the doc name for lucenepasses
    my $lucene_passes_sections = $llevel;

    my $opt_create_index = ($self->{'keepold'}) ? "" : "-create";

    # get any os specific stuff
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    # get the index expressions if this index belongs to a subcollection;
    # subcollection names with no entry in collect.cfg are skipped
    my $indexexparr = [];
    if (defined $subcollection) {
        my $subcol_cfg = $self->{'collect_cfg'}->{'subcollection'};
        foreach my $subcol_name (split /,/, $subcollection) {
            if (defined ($subcol_cfg->{$subcol_name})) {
                push (@$indexexparr, $subcol_cfg->{$subcol_name});
            }
        }
    }

    # add expressions for languages if this index belongs to a language
    # subcollection - only the languages we want in the index.  Entries are
    # passed through unchanged, including any leading "!" negation marker.
    my $language_metadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }
    my $langarr = [];
    push (@$langarr, split (/,/, $language)) if (defined $language);

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};

    # The command goes through the shell because $osextra may carry a
    # redirection.
    my $lucene_cmd = "$full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra";

    # Handles are handed to the document processor by name, so they are
    # kept as (package-qualified) handle-name strings rather than globs.
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    }
    else {
        print STDERR "Cmd: $lucene_cmd\n";
        if (!-e "$full_lucene_passes" ||
            !open (PIPEOUT, "| $lucene_cmd")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            # report the command we actually tried to run
            die "lucenebuilder::build_index - couldn't run $full_lucene_passes_exe\n";
        }
        $handle = 'lucenebuilder::PIPEOUT';
    }

    my $store_levels = $self->{'levels'};
    my $gdbm_level = "section"; #always

    # find which of our stored levels corresponds to the requested tag level
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        my $mapped_level = $mgppbuilder::level_map{$key};
        if (defined $mapped_level && $mapped_level eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_index_languages ($language_metadata, $langarr) if (defined $language);
    $buildproc->set_indexing_text (1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($local_levels);
    $buildproc->set_gdbm_level($gdbm_level);
    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    if (!$self->{'debug'}) {
        # closing a pipe waits for the child process; a false return means
        # the write failed or the command exited with a non-zero status
        close ($handle)
            or print STDERR "Warning: lucene_passes did not finish cleanly for index $index (status $?)\n";
    }

    $self->print_stats();

    # put back the full set of levels for whatever runs after us
    $buildproc->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}

# /** A modified version of the basebuilder.pm's function that generates the
#  *  information database (GDBM) from the GA documents. We need to change this
#  *  so that if we've been asked to do an incremental build we only add
#  *  metadata to autohierarchy classifiers via the IncrementalBuildUtils
#  *  module. All other classifiers and metadata will be ignored.
#  *
#  *  Any arguments are passed straight through to
#  *  basebuilder::make_infodatabase when the non-incremental path is taken.
#  */
sub make_infodatabase
{
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # the database file extension depends on the byte order of this machine
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $infodb_file = &util::filename_cat($self->{'build_dir'}, "text", $self->{'collection'} . $dbext);

    # If we aren't doing an incremental addition, then we just call the super-
    # classes version
    # Note: Incremental addition can only occur if the text/<collection>
    #       database file already exists. If it doesn't, let the super
    #       classes function be called once to generate it.
    if (!$self->{'incremental_dlc'} || !(-e $infodb_file))
    {
        # Note: this must be a method call; calling
        #       basebuilder::make_infodatabase(@_) directly would lose $self.
        $self->basebuilder::make_infodatabase(@_);
        return;
    }

    # Carry on with an incremental addition
    print $outhandle "\n*** performing an incremental addition to the info database\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # 1. Init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # 2. Init the buildproc settings.
    #    Note: we still need this to process any associated files - but we
    #    don't expect to pipe anything to txt2db so we can do away with the
    #    complex output handle.
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($assocdir);
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_mode ('incinfodb'); # Very Important
    $buildproc->set_assocdir ($assocdir);

    # 3. Read in all the metadata from the files in the archives directory using
    #    the GAPlug and using ourselves as the document processor!
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    print STDERR "</Stage>\n" if $self->{'gli'};
}

234 406 # /** Lucene specific document removal function. This works by calling lucene_passes.pl with 235 407 # * -remove and the document id on the command line. … … 249 421 # /** remove_document_from_database **/ 250 422 251 sub build_index {252 my $self = shift (@_);253 my ($index,$llevel) = @_;254 my $outhandle = $self->{'outhandle'};255 my $build_dir = $self->{'build_dir'};256 257 # get the full index directory path and make sure it exists258 my $indexdir = $self->{'index_mapping'}->{$index};259 &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));260 261 # get any os specific stuff262 my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";263 my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";264 265 # Find the perl script to call to run lucene266 my $full_lucene_passes = $self->{'full_lucene_passes'};267 my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};268 269 # define the section names for lucenepasses270 # define the section names and possibly the doc name for lucenepasses271 my $lucene_passes_sections = $llevel;272 273 my $opt_create_index = ($self->{'keepold'}) ? 
"" : "-create";274 275 my $osextra = "";276 if ($ENV{'GSDLOS'} =~ /^windows$/i) {277 $build_dir =~ s@/@\\@g;278 } else {279 if ($outhandle ne "STDERR") {280 # so lucene_passes doesn't print to stderr if we redirect output281 $osextra .= " 2>/dev/null";282 }283 }284 285 # get the index expression if this index belongs286 # to a subcollection287 my $indexexparr = [];288 my $langarr = [];289 290 # there may be subcollection info, and language info.291 my ($fields, $subcollection, $language) = split (":", $index);292 my @subcollections = ();293 @subcollections = split /,/, $subcollection if (defined $subcollection);294 295 foreach $subcollection (@subcollections) {296 if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {297 push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});298 }299 }300 301 # add expressions for languages if this index belongs to302 # a language subcollection - only put languages expressions for the303 # ones we want in the index304 my @languages = ();305 my $language_metadata = "Language";306 if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {307 $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};308 }309 @languages = split /,/, $language if (defined $language);310 foreach my $language (@languages) {311 my $not=0;312 if ($language =~ s/^\!//) {313 $not = 1;314 }315 if($not) {316 push (@$langarr, "!$language");317 } else {318 push (@$langarr, "$language");319 }320 }321 322 # Build index dictionary. 
Uses verbatim stem method323 print $outhandle "\n creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1);324 print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};325 my ($handle);326 327 if ($self->{'debug'}) {328 $handle = STDOUT;329 } else {330 print STDERR "Cmd: $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra\n";331 if (!-e "$full_lucene_passes" ||332 !open (PIPEOUT, "| $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra")) {333 print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};334 die "lucenebuilder::build_index - couldn't run $lucene_passes_exe\n";335 }336 $handle = lucenebuilder::PIPEOUT;337 }338 339 my $store_levels = $self->{'levels'};340 my $gdbm_level = "document";341 if ($store_levels->{'section'}) {342 $gdbm_level = "section";343 }344 345 my $dom_level = "";346 foreach my $key (keys %$store_levels) {347 if ($mgppbuilder::level_map{$key} eq $llevel) {348 $dom_level = $key;349 }350 }351 if ($dom_level eq "") {352 print STDERR "Warning: unrecognized tag level $llevel\n";353 $dom_level = "document";354 }355 356 my $local_levels = { $dom_level => 1 }; # work on one level at a time357 358 # set up the document processr359 $self->{'buildproc'}->set_output_handle ($handle);360 $self->{'buildproc'}->set_mode ('text');361 $self->{'buildproc'}->set_index ($index, $indexexparr);362 $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language);363 $self->{'buildproc'}->set_indexing_text (1);364 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});365 $self->{'buildproc'}->set_levels ($local_levels);366 $self->{'buildproc'}->set_gdbm_level($gdbm_level);367 $self->{'buildproc'}->reset();368 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},369 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});370 close 
($handle) unless $self->{'debug'};371 372 $self->print_stats();373 374 $self->{'buildproc'}->set_levels ($store_levels);375 print STDERR "</Stage>\n" if $self->{'gli'};376 }377 378 # /** A modified version of the basebuilder.pm's function that generates the379 # * information database (GDBM) from the GA documents. We need to change this380 # * so that if we've been asked to do an incremental build we only add381 # * metadata to autohierarchy classifiers via the IncrementalBuildUtils382 # * module. All other classifiers and metadata will be ignored.383 # */384 sub make_infodatabase385 {386 my $self = shift (@_);387 my $outhandle = $self->{'outhandle'};388 389 my $dbext = ".bdb";390 $dbext = ".ldb" if &util::is_little_endian();391 my $infodb_file = &util::filename_cat($self->{'build_dir'}, "text", $self->{'collection'} . $dbext);392 393 # If we aren't doing an incremental addition, then we just call the super-394 # classes version395 # Note: Incremental addition can only occur if a text/<collection>.ldb396 # already exists. If it doesn't, let the super classes function be397 # called once to generate it.398 if (!$self->{'incremental_dlc'} || !(-e $infodb_file))399 {400 # basebuilder::make_infodatabase(@_);401 # Note: this doesn't work as the direct reference means all the $self402 # data is lost.403 $self->basebuilder::make_infodatabase(@_);404 return;405 }406 407 # Carry on with an incremental addition408 print $outhandle "\n*** performing an incremental addition to the info database\n" if ($self->{'verbosity'} >= 1);409 print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};410 411 # 1. Init all the classifiers412 &classify::init_classifiers ($self->{'classifiers'});413 # 2. 
Init the buildproc settings.414 # Note: we still need this to process any associated files - but we415 # don't expect to pipe anything to txt2db so we can do away with the416 # complex output handle.417 my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");418 &util::mk_all_dir ($assocdir);419 $self->{'buildproc'}->set_mode ('incinfodb'); # Very Important420 $self->{'buildproc'}->set_assocdir ($assocdir);421 # 3. Read in all the metadata from the files in the archives directory using422 # the GAPlug and using ourselves as the document processor!423 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});424 425 print STDERR "</Stage>\n" if $self->{'gli'};426 }427 423 428 424 1;
Note:
See TracChangeset
for help on using the changeset viewer.