Changeset 13590
- Timestamp:
- 2007-01-12T14:18:53+13:00 (17 years ago)
- Location:
- trunk/gsdl/perllib
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/lucenebuilder.pm
r13589 r13590 156 156 $handle = lucenebuilder::PIPEOUT; 157 157 } 158 my $levels = $self->{'levels'}; 159 my $gdbm_level = "document"; 160 if ($levels->{'section'}) 161 { 162 $gdbm_level = "section"; 163 } 164 165 undef $levels->{'paragraph'}; # get rid of para if we had it. 158 159 # stored text is always Doc and Sec levels 160 my $levels = { 'document' => 1, 'section' => 1 }; 161 # always do gdbm at section level 162 my $gdbm_level = "section"; 163 166 164 # set up the document processr 167 165 $self->{'buildproc'}->set_output_handle ($handle); … … 232 230 } 233 231 232 233 sub build_index { 234 my $self = shift (@_); 235 my ($index,$llevel) = @_; 236 my $outhandle = $self->{'outhandle'}; 237 my $build_dir = $self->{'build_dir'}; 238 239 # get the full index directory path and make sure it exists 240 my $indexdir = $self->{'index_mapping'}->{$index}; 241 &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir)); 242 243 # get any os specific stuff 244 my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}"; 245 my $scriptdir = "$ENV{'GSDLHOME'}/bin/script"; 246 247 # Find the perl script to call to run lucene 248 my $full_lucene_passes = $self->{'full_lucene_passes'}; 249 my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'}; 250 251 # define the section names for lucenepasses 252 # define the section names and possibly the doc name for lucenepasses 253 my $lucene_passes_sections = $llevel; 254 255 my $opt_create_index = ($self->{'keepold'}) ? "" : "-create"; 256 257 my $osextra = ""; 258 if ($ENV{'GSDLOS'} =~ /^windows$/i) { 259 $build_dir =~ s@/@\\@g; 260 } else { 261 if ($outhandle ne "STDERR") { 262 # so lucene_passes doesn't print to stderr if we redirect output 263 $osextra .= " 2>/dev/null"; 264 } 265 } 266 267 # get the index expression if this index belongs 268 # to a subcollection 269 my $indexexparr = []; 270 my $langarr = []; 271 272 # there may be subcollection info, and language info. 273 my ($fields, $subcollection, $language) = split (":", $index); 274 my @subcollections = (); 275 @subcollections = split /,/, $subcollection if (defined $subcollection); 276 277 foreach $subcollection (@subcollections) { 278 if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) { 279 push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection}); 280 } 281 } 282 283 # add expressions for languages if this index belongs to 284 # a language subcollection - only put languages expressions for the 285 # ones we want in the index 286 my @languages = (); 287 my $language_metadata = "Language"; 288 if (defined ($self->{'collect_cfg'}->{'language_metadata'})) { 289 $language_metadata = $self->{'collect_cfg'}->{'language_metadata'}; 290 } 291 @languages = split /,/, $language if (defined $language); 292 foreach my $language (@languages) { 293 my $not=0; 294 if ($language =~ s/^\!//) { 295 $not = 1; 296 } 297 if($not) { 298 push (@$langarr, "!$language"); 299 } else { 300 push (@$langarr, "$language"); 301 } 302 } 303 304 # Build index dictionary. Uses verbatim stem method 305 print $outhandle "\n creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1); 306 print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'}; 307 my ($handle); 308 309 if ($self->{'debug'}) { 310 $handle = STDOUT; 311 } else { 312 print STDERR "Cmd: $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra\n"; 313 if (!-e "$full_lucene_passes" || 314 !open (PIPEOUT, "| $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra")) { 315 print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'}; 316 die "lucenebuilder::build_index - couldn't run $lucene_passes_exe\n"; 317 } 318 $handle = lucenebuilder::PIPEOUT; 319 } 320 321 my $store_levels = $self->{'levels'}; 322 my $gdbm_level = "section"; #always 323 my $dom_level = ""; 324 foreach my $key (keys %$store_levels) { 325 if ($mgppbuilder::level_map{$key} eq $llevel) { 326 $dom_level = $key; 327 } 328 } 329 if ($dom_level eq "") { 330 print STDERR "Warning: unrecognized tag level $llevel\n"; 331 $dom_level = "document"; 332 } 333 334 my $local_levels = { $dom_level => 1 }; # work on one level at a time 335 336 # set up the document processr 337 $self->{'buildproc'}->set_output_handle ($handle); 338 $self->{'buildproc'}->set_mode ('text'); 339 $self->{'buildproc'}->set_index ($index, $indexexparr); 340 $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language); 341 $self->{'buildproc'}->set_indexing_text (1); 342 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); 343 $self->{'buildproc'}->set_levels ($local_levels); 344 $self->{'buildproc'}->set_gdbm_level($gdbm_level); 345 $self->{'buildproc'}->reset(); 346 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 347 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'}); 348 close ($handle) unless $self->{'debug'}; 349 350 $self->print_stats(); 351 352 $self->{'buildproc'}->set_levels ($store_levels); 353 print STDERR "</Stage>\n" if $self->{'gli'}; 354 } 355 356 # /** A modified version of the basebuilder.pm's function that generates the 357 # * information database (GDBM) from the GA documents. We need to change this 358 # * so that if we've been asked to do an incremental build we only add 359 # * metadata to autohierarchy classifiers via the IncrementalBuildUtils 360 # * module. All other classifiers and metadata will be ignored. 361 # */ 362 sub make_infodatabase 363 { 364 my $self = shift (@_); 365 my $outhandle = $self->{'outhandle'}; 366 367 my $dbext = ".bdb"; 368 $dbext = ".ldb" if &util::is_little_endian(); 369 my $infodb_file = &util::filename_cat($self->{'build_dir'}, "text", $self->{'collection'} . $dbext); 370 371 # If we aren't doing an incremental addition, then we just call the super- 372 # classes version 373 # Note: Incremental addition can only occur if a text/<collection>.ldb 374 # already exists. If it doesn't, let the super classes function be 375 # called once to generate it. 376 if (!$self->{'incremental_dlc'} || !(-e $infodb_file)) 377 { 378 # basebuilder::make_infodatabase(@_); 379 # Note: this doesn't work as the direct reference means all the $self 380 # data is lost. 381 $self->basebuilder::make_infodatabase(@_); 382 return; 383 } 384 385 # Carry on with an incremental addition 386 print $outhandle "\n*** performing an incremental addition to the info database\n" if ($self->{'verbosity'} >= 1); 387 print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'}; 388 389 # 1. Init all the classifiers 390 &classify::init_classifiers ($self->{'classifiers'}); 391 # 2. Init the buildproc settings. 392 # Note: we still need this to process any associated files - but we 393 # don't expect to pipe anything to txt2db so we can do away with the 394 # complex output handle. 395 my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc"); 396 &util::mk_all_dir ($assocdir); 397 $self->{'buildproc'}->set_mode ('incinfodb'); # Very Important 398 $self->{'buildproc'}->set_assocdir ($assocdir); 399 # 3. Read in all the metadata from the files in the archives directory using 400 # the GAPlug and using ourselves as the document processor! 401 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'}); 402 403 print STDERR "</Stage>\n" if $self->{'gli'}; 404 } 405 234 406 # /** Lucene specific document removal function. This works by calling lucene_passes.pl with 235 407 # * -remove and the document id on the command line. … … 249 421 # /** remove_document_from_database **/ 250 422 251 sub build_index {252 my $self = shift (@_);253 my ($index,$llevel) = @_;254 my $outhandle = $self->{'outhandle'};255 my $build_dir = $self->{'build_dir'};256 257 # get the full index directory path and make sure it exists258 my $indexdir = $self->{'index_mapping'}->{$index};259 &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));260 261 # get any os specific stuff262 my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";263 my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";264 265 # Find the perl script to call to run lucene266 my $full_lucene_passes = $self->{'full_lucene_passes'};267 my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};268 269 # define the section names for lucenepasses270 # define the section names and possibly the doc name for lucenepasses271 my $lucene_passes_sections = $llevel;272 273 my $opt_create_index = ($self->{'keepold'}) ? "" : "-create";274 275 my $osextra = "";276 if ($ENV{'GSDLOS'} =~ /^windows$/i) {277 $build_dir =~ s@/@\\@g;278 } else {279 if ($outhandle ne "STDERR") {280 # so lucene_passes doesn't print to stderr if we redirect output281 $osextra .= " 2>/dev/null";282 }283 }284 285 # get the index expression if this index belongs286 # to a subcollection287 my $indexexparr = [];288 my $langarr = [];289 290 # there may be subcollection info, and language info.291 my ($fields, $subcollection, $language) = split (":", $index);292 my @subcollections = ();293 @subcollections = split /,/, $subcollection if (defined $subcollection);294 295 foreach $subcollection (@subcollections) {296 if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {297 push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});298 }299 }300 301 # add expressions for languages if this index belongs to302 # a language subcollection - only put languages expressions for the303 # ones we want in the index304 my @languages = ();305 my $language_metadata = "Language";306 if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {307 $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};308 }309 @languages = split /,/, $language if (defined $language);310 foreach my $language (@languages) {311 my $not=0;312 if ($language =~ s/^\!//) {313 $not = 1;314 }315 if($not) {316 push (@$langarr, "!$language");317 } else {318 push (@$langarr, "$language");319 }320 }321 322 # Build index dictionary. Uses verbatim stem method323 print $outhandle "\n creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1);324 print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};325 my ($handle);326 327 if ($self->{'debug'}) {328 $handle = STDOUT;329 } else {330 print STDERR "Cmd: $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra\n";331 if (!-e "$full_lucene_passes" ||332 !open (PIPEOUT, "| $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra")) {333 print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};334 die "lucenebuilder::build_index - couldn't run $lucene_passes_exe\n";335 }336 $handle = lucenebuilder::PIPEOUT;337 }338 339 my $store_levels = $self->{'levels'};340 my $gdbm_level = "document";341 if ($store_levels->{'section'}) {342 $gdbm_level = "section";343 }344 345 my $dom_level = "";346 foreach my $key (keys %$store_levels) {347 if ($mgppbuilder::level_map{$key} eq $llevel) {348 $dom_level = $key;349 }350 }351 if ($dom_level eq "") {352 print STDERR "Warning: unrecognized tag level $llevel\n";353 $dom_level = "document";354 }355 356 my $local_levels = { $dom_level => 1 }; # work on one level at a time357 358 # set up the document processr359 $self->{'buildproc'}->set_output_handle ($handle);360 $self->{'buildproc'}->set_mode ('text');361 $self->{'buildproc'}->set_index ($index, $indexexparr);362 $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language);363 $self->{'buildproc'}->set_indexing_text (1);364 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});365 $self->{'buildproc'}->set_levels ($local_levels);366 $self->{'buildproc'}->set_gdbm_level($gdbm_level);367 $self->{'buildproc'}->reset();368 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},369 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});370 close ($handle) unless $self->{'debug'};371 372 $self->print_stats();373 374 $self->{'buildproc'}->set_levels ($store_levels);375 print STDERR "</Stage>\n" if $self->{'gli'};376 }377 378 # /** A modified version of the basebuilder.pm's function that generates the379 # * information database (GDBM) from the GA documents. We need to change this380 # * so that if we've been asked to do an incremental build we only add381 # * metadata to autohierarchy classifiers via the IncrementalBuildUtils382 # * module. All other classifiers and metadata will be ignored.383 # */384 sub make_infodatabase385 {386 my $self = shift (@_);387 my $outhandle = $self->{'outhandle'};388 389 my $dbext = ".bdb";390 $dbext = ".ldb" if &util::is_little_endian();391 my $infodb_file = &util::filename_cat($self->{'build_dir'}, "text", $self->{'collection'} . $dbext);392 393 # If we aren't doing an incremental addition, then we just call the super-394 # classes version395 # Note: Incremental addition can only occur if a text/<collection>.ldb396 # already exists. If it doesn't, let the super classes function be397 # called once to generate it.398 if (!$self->{'incremental_dlc'} || !(-e $infodb_file))399 {400 # basebuilder::make_infodatabase(@_);401 # Note: this doesn't work as the direct reference means all the $self402 # data is lost.403 $self->basebuilder::make_infodatabase(@_);404 return;405 }406 407 # Carry on with an incremental addition408 print $outhandle "\n*** performing an incremental addition to the info database\n" if ($self->{'verbosity'} >= 1);409 print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};410 411 # 1. Init all the classifiers412 &classify::init_classifiers ($self->{'classifiers'});413 # 2. Init the buildproc settings.414 # Note: we still need this to process any associated files - but we415 # don't expect to pipe anything to txt2db so we can do away with the416 # complex output handle.417 my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");418 &util::mk_all_dir ($assocdir);419 $self->{'buildproc'}->set_mode ('incinfodb'); # Very Important420 $self->{'buildproc'}->set_assocdir ($assocdir);421 # 3. Read in all the metadata from the files in the archives directory using422 # the GAPlug and using ourselves as the document processor!423 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});424 425 print STDERR "</Stage>\n" if $self->{'gli'};426 }427 423 428 424 1; -
trunk/gsdl/perllib/mgppbuilder.pm
r13341 r13590 46 46 'Sec'=>'_textsection_', 47 47 'Para'=>'_textparagraph_'); 48 49 #$doc_level = "Doc";50 #$sec_level = "Sec";51 #$para_level = "Para";52 48 53 49 our %wanted_index_files = ('td'=>1, … … 127 123 } 128 124 129 $self->{'doc_level'} = "document";130 if (! $self->{'levels'}->{'document'}) {131 if ($self->{'levels'}->{'section'}) {132 $self->{'doc_level'} = "section";133 } else {134 die "you must have either document or section level specified!!\n";135 }136 }137 138 125 $self->{'buildtype'} = "mgpp"; 139 126 … … 229 216 # the compressor doesn't need to know about paragraphs - never want to 230 217 # retrieve them 231 my $mgpp_passes_sections = ""; 232 my ($doc_level) = $self->{'doc_level'}; 233 $mgpp_passes_sections .= "-J " . $level_map{$doc_level} . " "; 234 foreach my $level (keys %{$self->{'levels'}}) { 235 if ($level ne $doc_level && $level ne "paragraph") { 236 $mgpp_passes_sections .= "-K " . $level_map{$level} . " "; 237 } 238 } 218 219 # always use Doc and Sec levels 220 my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." "; 239 221 240 222 print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1); … … 250 232 $handle = STDOUT; 251 233 } else { 252 #print $outhandle "trying to run (compress 1) mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra\n";253 234 if (!-e "$mgpp_passes_exe" || 254 235 !open (PIPEOUT, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) { … … 258 239 $handle = mgppbuilder::PIPEOUT; 259 240 } 260 261 # gdbm_level 262 my $gdbm_level = "document"; 263 if ($self->{'levels'}->{'section'}) { 264 $gdbm_level = "section"; 265 } 266 241 242 my $gdbm_level = "section"; 243 267 244 $self->{'buildproc'}->set_output_handle ($handle); 268 245 $self->{'buildproc'}->set_mode ('text'); … … 300 277 301 278 if (!$self->{'debug'}) { 302 #print $outhandle "trying to run (compress 2) mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra\n";303 279 if (!-e "$mgpp_passes_exe" || 304 280 !open ($handle, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) { … … 351 327 foreach my $index (@$indexes) { 352 328 my ($fields, $subcollection, $languages) = split (":", $index); 353 # the directory name starts with a processed version of index fields 354 #my ($pindex) = $self->process_field($fields); 355 #$pindex = lc ($pindex); 356 # now we only ever have one index, and its called 'idx' 329 330 # we only ever have one index, and its called 'idx' 357 331 my $pindex = 'idx'; 358 332 … … 441 415 442 416 # define the section names for mgpasses 443 # define the section names and possibly the doc name for mgpasses 444 my $mgpp_passes_sections = ""; 445 my ($doc_level) = $self->{'doc_level'}; 446 $mgpp_passes_sections .= "-J " . $level_map{$doc_level} ." "; 447 448 foreach my $level (keys %{$self->{'levels'}}) { 449 if ($level ne $doc_level) { 450 $mgpp_passes_sections .= "-K " . $level_map{$level}. " "; 451 } 417 my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." "; 418 if ($self->{'levels'}->{'paragraph'}) { 419 $mgpp_passes_sections .= "-K " . $level_map{'paragraph'}. " "; 452 420 } 453 421 … … 526 494 } 527 495 528 # gdbm_level 529 my $gdbm_level = "document"; 530 if ($self->{'levels'}->{'section'}) { 531 $gdbm_level = "section"; 532 } 496 # gdbm_level is always section 497 my $gdbm_level = "section"; 533 498 534 499 # set up the document processr … … 878 843 $build_cfg->{'levelmap'} = \@levelmap; 879 844 880 if ($self->{'levels'}->{'section'}) { 881 $build_cfg->{'textlevel'} = $level_map{'section'}; 882 } else { 883 $build_cfg->{'textlevel'} = $level_map{'document'}; 884 } 845 # text level (and gdbm level) is always section 846 $build_cfg->{'textlevel'} = $level_map{'section'}; 885 847 886 848 } -
trunk/gsdl/perllib/mgppbuildproc.pm
r12951 r13590 85 85 $self->{'strip_html'}=$strip; 86 86 } 87 88 89 sub get_gdbm_level {90 my $self = shift (@_);91 92 #if a Section level index is not built, the gdbm file should be at doc93 #level not Section94 if ($self->{'levels'}->{'section'}) {95 return "section";96 }97 return "document";98 }99 100 87 101 88 #sub find_paragraphs { … … 208 195 my ($fields) = split (/:/, $self->{'index'}); 209 196 210 my ($documenttag) = ""; 211 my($documentendtag) = ""; 212 if ($self->{'levels'}->{'document'}) { 213 $documenttag = "\n<". $level_map{'document'} . ">\n"; 214 $documentendtag = "\n</". $level_map{'document'} . ">\n"; 215 } 216 my ($sectiontag) = ""; 217 if ($self->{'levels'}->{'section'}) { 218 $sectiontag = "\n<". $level_map{'section'} . ">\n"; 219 } 197 # we always do text and index on Doc and Sec levels 198 my ($documenttag) = "\n<". $level_map{'document'} . ">\n"; 199 my ($documentendtag) = "\n</". $level_map{'document'} . ">\n"; 200 my ($sectiontag) = "\n<". $level_map{'section'} . ">\n"; 220 201 my ($paratag) = ""; 221 202
Note:
See TracChangeset
for help on using the changeset viewer.