Changeset 12844 for trunk/gsdl/perllib/lucenebuilder.pm
- Timestamp:
- 2006-09-25T14:17:10+12:00 (18 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/lucenebuilder.pm
r11175 r12844 3 3 # lucenebuilder.pm -- perl wrapper for building index with Lucene 4 4 # A component of the Greenstone digital library software 5 # from the New Zealand Digital Library Project at the 5 # from the New Zealand Digital Library Project at the 6 6 # University of Waikato, New Zealand. 7 7 # … … 24 24 ########################################################################### 25 25 26 ########################################################################### 27 # /* 28 # * @version 1.0 ? 29 # * @version 2.0 Incremental building assistance added, including 30 # * remove_document_from_database which implements the granddad's 31 # * empty function to call the lucene_passes.pl and full_lucene_passes_exe 32 # * so there is one place in the code that works out where the 33 # * perl script is. John Rowe 34 # * 35 # * @author John Rowe, DL Consulting Ltd. 36 # */ 37 ########################################################################### 38 26 39 package lucenebuilder; 27 40 28 41 # Use same basic XML structure setup by mgppbuilder/mgppbuildproc 29 42 30 use mgppbuilder; 43 use mgppbuilder; 44 45 use IncrementalBuildUtils; 31 46 32 47 sub BEGIN { … … 34 49 } 35 50 36 51 # /** 52 # * @author John Thompson, DL Consulting Ltd. 53 # */ 37 54 sub new { 38 55 my $class = shift(@_); 39 my ($collection, $source_dir, $build_dir, $verbosity, 40 $maxdocs, $debug, $keepold, $allclassifications, 41 $outhandle, $no_text, $gli) = @_; 56 my ($collection, $source_dir, $build_dir, $verbosity, $maxdocs, $debug, $keepold, $allclassifications, $outhandle, $no_text, $faillog, $gli, $incremental) = @_; 42 57 43 58 my $self = new mgppbuilder (@_); … … 46 61 $self->{'buildtype'} = "lucene"; 47 62 63 # Do we need to put exe on the end? 64 my $exe = &util::get_os_exe (); 65 my $scriptdir = "$ENV{'GSDLHOME'}/bin/script"; 66 67 # So where is lucene_passes.pl anyway? 68 my $lucene_passes_script = &util::filename_cat($scriptdir, "lucene_passes.pl"); 69 70 # So tack perl on the beginning to ensure execution 71 $self->{'full_lucene_passes'} = "$lucene_passes_script"; 72 if ($exe eq ".exe") 73 { 74 $self->{'full_lucene_passes_exe'} = "perl$exe \"$lucene_passes_script\""; 75 } 76 else 77 { 78 $self->{'full_lucene_passes_exe'} = "perl -S \"$lucene_passes_script\""; 79 } 80 81 # We must also record whether we have been asked to do just an incremental 82 # build (which makes no difference to the Lucene indexing bit, just the 83 # building of the classifiers in the GDBM). 84 $self->{'incremental'} = $incremental; 85 48 86 return $self; 49 87 } 88 # /** new() **/ 50 89 51 90 sub default_buildproc { … … 56 95 57 96 # this writes a nice version of the text docs 58 sub compress_text {59 97 sub compress_text 98 { 60 99 my $self = shift (@_); 61 62 100 # we don't do anything if we don't want compressed text 63 101 return if $self->{'no_text'}; … … 72 110 73 111 my $osextra = ""; 74 if ($ENV{'GSDLOS'} =~ /^windows$/i) { 75 $text_dir =~ s@/@\\@g; 76 } else { 77 if ($outhandle ne "STDERR") { 78 # so lucene_passes doesn't print to stderr if we redirect output 79 $osextra .= " 2>/dev/null"; 80 } 81 } 82 112 if ($ENV{'GSDLOS'} =~ /^windows$/i) 113 { 114 $text_dir =~ s@/@\\@g; 115 } 116 else 117 { 118 if ($outhandle ne "STDERR") 119 { 120 # so lucene_passes doesn't print to stderr if we redirect output 121 $osextra .= " 2>/dev/null"; 122 } 123 } 83 124 84 125 # get any os specific stuff 85 126 my $scriptdir = "$ENV{'GSDLHOME'}/bin/script"; 86 127 87 my $lucene_passes_exe = &util::filename_cat($scriptdir, "lucene_passes.pl"); 88 my $full_lucene_passes_exe = "\"$lucene_passes_exe\""; 89 if ($ENV{'GSDLOS'} =~ /^windows$/i) { 90 $full_lucene_passes_exe = "perl.exe -S \"$lucene_passes_exe\""; 91 } 128 # Find the perl script to call to run lucene 129 my $full_lucene_passes = $self->{'full_lucene_passes'}; 130 my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'}; 131 92 132 my $lucene_passes_sections = "Doc"; 93 133 94 134 my ($handle); 95 135 96 if ($self->{'debug'}) { 97 $handle = STDOUT; 98 } else { 99 if (!-e "$lucene_passes_exe" || 100 !open (PIPEOUT, "| $full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\" $osextra")) { 101 print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'}; 102 die "lucenebuilder::build_index - couldn't run $lucene_passes_exe\n"; 103 } 104 $handle = lucenebuilder::PIPEOUT; 105 } 136 if ($self->{'debug'}) 137 { 138 $handle = STDOUT; 139 } 140 else 141 { 142 print STDERR "Full Path: $full_lucene_passes\n"; 143 print STDERR "Executable: $full_lucene_passes_exe\n"; 144 print STDERR "Sections: $lucene_passes_sections\n"; 145 print STDERR "Build Dir: $build_dir\n"; 146 print STDERR "Cmd: $full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\" $osextra\n"; 147 if (!-e "$full_lucene_passes" || 148 !open (PIPEOUT, "| $full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\" $osextra")) 149 { 150 print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'}; 151 die "lucenebuilder::build_index - couldn't run $full_lucene_passes_exe\n"; 152 } 153 $handle = lucenebuilder::PIPEOUT; 154 } 106 155 my $levels = $self->{'levels'}; 107 156 my $gdbm_level = "document"; 108 if ($levels->{'section'}) { 109 $gdbm_level = "section"; 110 } 157 if ($levels->{'section'}) 158 { 159 $gdbm_level = "section"; 160 } 111 161 112 162 undef $levels->{'paragraph'}; # get rid of para if we had it. … … 117 167 $self->{'buildproc'}->set_indexing_text (0); 118 168 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); 119 $self->{'buildproc'}->set_levels ($levels); 120 $self->{'buildproc'}->set_gdbm_level ($gdbm_level); 169 $self->{'buildproc'}->set_levels ($levels); 170 $self->{'buildproc'}->set_gdbm_level ($gdbm_level); 121 171 $self->{'buildproc'}->reset(); 122 &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'}, 123 $self->{'buildproc'}, $self->{'maxdocs'}); 124 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 125 172 &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'}, 173 $self->{'buildproc'}, $self->{'maxdocs'}); 174 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 175 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'}); 126 176 &plugin::end($self->{'pluginfo'}); 127 177 close ($handle) unless $self->{'debug'}; … … 130 180 131 181 print STDERR "</Stage>\n" if $self->{'gli'}; 132 133 } 182 } 134 183 135 184 sub build_indexes { … … 140 189 my $indexes = []; 141 190 if (defined $indexname && $indexname =~ /\w/) { 142 191 push @$indexes, $indexname; 143 192 } else { 144 145 } 146 147 # create the mapping between the index descriptions 193 $indexes = $self->{'collect_cfg'}->{'indexes'}; 194 } 195 196 # create the mapping between the index descriptions 148 197 # and their directory names (includes subcolls and langs) 149 198 $self->{'index_mapping'} = $self->create_index_mapping ($indexes); … … 151 200 # build each of the indexes 152 201 foreach $index (@$indexes) { 153 154 155 156 157 158 159 # should probably check that new name with level 160 161 162 163 164 165 print $outhandle "\n*** building index $index at level $llevel in subdirectory " . 166 167 168 169 170 171 172 173 174 175 202 if ($self->want_built($index)) { 203 204 my $idx = $self->{'index_mapping'}->{$index}; 205 foreach my $level (keys %{$self->{'levels'}}) { 206 next if $level =~ /paragraph/; # we don't do para indexing 207 my ($pindex) = $level =~ /^(.)/; 208 # should probably check that new name with level 209 # is unique ... but currently (with doc sec and para) 210 # each has unique first letter. 211 $self->{'index_mapping'}->{$index} = $pindex.$idx; 212 213 my $llevel = $mgppbuilder::level_map{$level}; 214 print $outhandle "\n*** building index $index at level $llevel in subdirectory " . 215 "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1); 216 print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'}; 217 218 $self->build_index($index,$llevel); 219 } 220 $self->{'index_mapping'}->{$index} = $idx; 221 222 } else { 223 print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1); 224 } 176 225 } 177 226 … … 179 228 $self->make_final_field_list(); 180 229 } 230 231 # /** Lucene specific document removal function. This works by calling lucene_passes.pl with 232 # * -remove and the document id on the command line. 233 # * 234 # * @param oid is the document identifier to be removed. 235 # * 236 # * @author John Rowe, DL Consulting Ltd. 237 # */ 238 sub remove_document_from_database 239 { 240 my ($self, $oid) = @_; 241 # Find the perl script to call to run lucene 242 my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'}; 243 # Call lucene_passes.pl with -remove and the document ID on the command line 244 `$full_lucene_passes_exe -remove "$oid"`; 245 } 246 # /** remove_document_from_database **/ 181 247 182 248 sub build_index { … … 194 260 my $scriptdir = "$ENV{'GSDLHOME'}/bin/script"; 195 261 196 my $exe = &util::get_os_exe (); 197 my $lucene_passes_exe = &util::filename_cat($scriptdir, "lucene_passes.pl"); 198 my $full_lucene_passes_exe = "\"$lucene_passes_exe\""; 199 if ($ENV{'GSDLOS'} =~ /^windows$/i) { 200 $full_lucene_passes_exe = "perl.exe -S \"$lucene_passes_exe\""; 201 } 262 # Find the perl script to call to run lucene 263 my $full_lucene_passes = $self->{'full_lucene_passes'}; 264 my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'}; 202 265 203 266 # define the section names for lucenepasses … … 209 272 my $osextra = ""; 210 273 if ($ENV{'GSDLOS'} =~ /^windows$/i) { 211 274 $build_dir =~ s@/@\\@g; 212 275 } else { 213 214 215 216 217 } 218 276 if ($outhandle ne "STDERR") { 277 # so lucene_passes doesn't print to stderr if we redirect output 278 $osextra .= " 2>/dev/null"; 279 } 280 } 281 219 282 # get the index expression if this index belongs 220 283 # to a subcollection … … 222 285 my $langarr = []; 223 286 224 # there may be subcollection info, and language info. 287 # there may be subcollection info, and language info. 225 288 my ($fields, $subcollection, $language) = split (":", $index); 226 289 my @subcollections = (); … … 228 291 229 292 foreach $subcollection (@subcollections) { 230 231 232 } 233 } 234 293 if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) { 294 push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection}); 295 } 296 } 297 235 298 # add expressions for languages if this index belongs to 236 # a language subcollection - only put languages expressions for the 299 # a language subcollection - only put languages expressions for the 237 300 # ones we want in the index 238 239 301 my @languages = (); 240 302 my $language_metadata = "Language"; 241 303 if (defined ($self->{'collect_cfg'}->{'language_metadata'})) { 242 304 $language_metadata = $self->{'collect_cfg'}->{'language_metadata'}; 243 305 } 244 306 @languages = split /,/, $language if (defined $language); 245 307 foreach my $language (@languages) { 246 247 248 249 250 251 252 253 254 308 my $not=0; 309 if ($language =~ s/^\!//) { 310 $not = 1; 311 } 312 if($not) { 313 push (@$langarr, "!$language"); 314 } else { 315 push (@$langarr, "$language"); 316 } 255 317 } 256 318 … … 261 323 262 324 if ($self->{'debug'}) { 263 325 $handle = STDOUT; 264 326 } else { 265 if (!-e "$lucene_passes_exe" || 266 !open (PIPEOUT, "| $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra")) { 267 print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'}; 268 die "lucenebuilder::build_index - couldn't run $lucene_passes_exe\n"; 269 } 270 $handle = lucenebuilder::PIPEOUT; 271 } 272 327 print STDERR "Cmd: $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra\n"; 328 if (!-e "$full_lucene_passes" || 329 !open (PIPEOUT, "| $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra")) { 330 print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'}; 331 die "lucenebuilder::build_index - couldn't run $lucene_passes_exe\n"; 332 } 333 $handle = lucenebuilder::PIPEOUT; 334 } 335 273 336 my $store_levels = $self->{'levels'}; 274 337 my $gdbm_level = "document"; 275 338 if ($store_levels->{'section'}) { 276 277 } 278 339 $gdbm_level = "section"; 340 } 341 279 342 my $dom_level = ""; 280 343 foreach my $key (keys %$store_levels) { 281 282 283 344 if ($mgppbuilder::level_map{$key} eq $llevel) { 345 $dom_level = $key; 346 } 284 347 } 285 348 if ($dom_level eq "") { 286 287 349 print STDERR "Warning: unrecognized tag level $llevel\n"; 350 $dom_level = "document"; 288 351 } 289 352 … … 297 360 $self->{'buildproc'}->set_indexing_text (1); 298 361 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); 299 $self->{'buildproc'}->set_levels ($local_levels); 362 $self->{'buildproc'}->set_levels ($local_levels); 300 363 $self->{'buildproc'}->set_gdbm_level($gdbm_level); 301 364 $self->{'buildproc'}->reset(); 302 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 303 365 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 366 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'}); 304 367 close ($handle) unless $self->{'debug'}; 305 368 306 369 $self->print_stats(); 307 370 308 $self->{'buildproc'}->set_levels ($store_levels); 371 $self->{'buildproc'}->set_levels ($store_levels); 309 372 print STDERR "</Stage>\n" if $self->{'gli'}; 310 } 373 } 374 375 # /** A modified version of the basebuilder.pm's function that generates the 376 # * information database (GDBM) from the GA documents. We need to change this 377 # * so that if we've been asked to do an incremental build we only add 378 # * metadata to autohierarchy classifiers via the IncrementalBuildUtils 379 # * module. All other classifiers and metadata will be ignored. 380 # */ 381 sub make_infodatabase 382 { 383 my $self = shift (@_); 384 my $outhandle = $self->{'outhandle'}; 385 386 my $dbext = ".bdb"; 387 $dbext = ".ldb" if &util::is_little_endian(); 388 my $infodb_file = &util::filename_cat($self->{'build_dir'}, "text", $self->{'collection'} . $dbext); 389 390 # If we aren't doing an incremental addition, then we just call the super- 391 # classes version 392 # Note: Incremental addition can only occur if a text/<collection>.ldb 393 # already exists. If it doesn't, let the super classes function be 394 # called once to generate it. 395 if (!$self->{'incremental'} || !(-e $infodb_file)) 396 { 397 # basebuilder::make_infodatabase(@_); 398 # Note: this doesn't work as the direct reference means all the $self 399 # data is lost. 400 $self->basebuilder::make_infodatabase(@_); 401 return; 402 } 403 404 # Carry on with an incremental addition 405 print $outhandle "\n*** performing an incremental addition to the info database\n" if ($self->{'verbosity'} >= 1); 406 print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'}; 407 408 # 1. Init all the classifiers 409 &classify::init_classifiers ($self->{'classifiers'}); 410 # 2. Init the buildproc settings. 411 # Note: we still need this to process any associated files - but we 412 # don't expect to pipe anything to txt2db so we can do away with the 413 # complex output handle. 414 my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc"); 415 &util::mk_all_dir ($assocdir); 416 $self->{'buildproc'}->set_mode ('incinfodb'); # Very Important 417 $self->{'buildproc'}->set_assocdir ($assocdir); 418 # 3. Read in all the metadata from the files in the archives directory using 419 # the GAPlug and using ourselves as the document processor! 420 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'}); 421 422 print STDERR "</Stage>\n" if $self->{'gli'}; 423 } 311 424 312 425 1;
Note:
See TracChangeset
for help on using the changeset viewer.