Changeset 13590


Ignore:
Timestamp:
2007-01-12T14:18:53+13:00 (17 years ago)
Author:
kjdon
Message:

mgpp and lucene: made them always use doc and sec levels for the text, regardless of the index level specification. mgpp will always index at doc and sec level, but these options may not be presented to the user. This is to ensure that if we have sectioned documents, we don't need to turn on section indexing in order for the document display to use sections.

Location:
trunk/gsdl/perllib
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/lucenebuilder.pm

    r13589 r13590  
    156156    $handle = lucenebuilder::PIPEOUT;
    157157    }
    158     my $levels = $self->{'levels'};
    159     my $gdbm_level = "document";
    160     if ($levels->{'section'})
    161     {
    162     $gdbm_level = "section";
    163     }
    164 
    165     undef $levels->{'paragraph'}; # get rid of para if we had it.
     158
     159    # stored text is always Doc and Sec levels   
     160    my $levels = { 'document' => 1, 'section' => 1 };
     161    # always do gdbm at section level
     162    my $gdbm_level = "section";
     163
    166164    # set up the document processr
    167165    $self->{'buildproc'}->set_output_handle ($handle);
     
    232230}
    233231
     232
     233sub build_index {
     234    my $self = shift (@_);
     235    my ($index,$llevel) = @_;
     236    my $outhandle = $self->{'outhandle'};
     237    my $build_dir = $self->{'build_dir'};
     238
     239    # get the full index directory path and make sure it exists
     240    my $indexdir = $self->{'index_mapping'}->{$index};
     241    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));
     242
     243    # get any os specific stuff
     244    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
     245    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";
     246
     247    # Find the perl script to call to run lucene
     248    my $full_lucene_passes = $self->{'full_lucene_passes'};
     249    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};
     250
     251    # define the section names for lucenepasses
     252    # define the section names and possibly the doc name for lucenepasses
     253    my $lucene_passes_sections = $llevel;
     254
     255    my $opt_create_index = ($self->{'keepold'}) ? "" : "-create";
     256
     257    my $osextra = "";
     258    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
     259    $build_dir =~ s@/@\\@g;
     260    } else {
     261    if ($outhandle ne "STDERR") {
     262        # so lucene_passes doesn't print to stderr if we redirect output
     263        $osextra .= " 2>/dev/null";
     264    }
     265    }
     266
     267    # get the index expression if this index belongs
     268    # to a subcollection
     269    my $indexexparr = [];
     270    my $langarr = [];
     271
     272    # there may be subcollection info, and language info.
     273    my ($fields, $subcollection, $language) = split (":", $index);
     274    my @subcollections = ();
     275    @subcollections = split /,/, $subcollection if (defined $subcollection);
     276
     277    foreach $subcollection (@subcollections) {
     278    if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
     279        push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
     280    }
     281    }
     282
     283    # add expressions for languages if this index belongs to
     284    # a language subcollection - only put languages expressions for the
     285    # ones we want in the index
     286    my @languages = ();
     287    my $language_metadata = "Language";
     288    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
     289    $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
     290    }
     291    @languages = split /,/, $language if (defined $language);
     292    foreach my $language (@languages) {
     293    my $not=0;
     294    if ($language =~ s/^\!//) {
     295        $not = 1;
     296    }
     297    if($not) {
     298        push (@$langarr, "!$language");
     299    } else {
     300        push (@$langarr, "$language");
     301    }
     302    }
     303
     304    # Build index dictionary. Uses verbatim stem method
     305    print $outhandle "\n    creating index dictionary (lucene_passes -I1)\n"  if ($self->{'verbosity'} >= 1);
     306    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
     307    my ($handle);
     308
     309    if ($self->{'debug'}) {
     310    $handle = STDOUT;
     311    } else {
     312    print STDERR "Cmd: $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\"   $osextra\n";
     313    if (!-e "$full_lucene_passes" ||
     314        !open (PIPEOUT, "| $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\"   $osextra")) {
     315        print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
     316        die "lucenebuilder::build_index - couldn't run $lucene_passes_exe\n";
     317    }
     318    $handle = lucenebuilder::PIPEOUT;
     319    }
     320
     321    my $store_levels = $self->{'levels'};
     322    my $gdbm_level = "section"; #always
     323    my $dom_level = "";
     324    foreach my $key (keys %$store_levels) {
     325    if ($mgppbuilder::level_map{$key} eq $llevel) {
     326        $dom_level = $key;
     327    }
     328    }
     329    if ($dom_level eq "") {
     330    print STDERR "Warning: unrecognized tag level $llevel\n";
     331    $dom_level = "document";
     332    }
     333
     334    my $local_levels = { $dom_level => 1 }; # work on one level at a time
     335
     336    # set up the document processr
     337    $self->{'buildproc'}->set_output_handle ($handle);
     338    $self->{'buildproc'}->set_mode ('text');
     339    $self->{'buildproc'}->set_index ($index, $indexexparr);
     340    $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language);
     341    $self->{'buildproc'}->set_indexing_text (1);
     342    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
     343    $self->{'buildproc'}->set_levels ($local_levels);
     344    $self->{'buildproc'}->set_gdbm_level($gdbm_level);
     345    $self->{'buildproc'}->reset();
     346    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
     347           "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
     348    close ($handle) unless $self->{'debug'};
     349
     350    $self->print_stats();
     351
     352    $self->{'buildproc'}->set_levels ($store_levels);
     353    print STDERR "</Stage>\n" if $self->{'gli'};
     354}
     355
     356# /** A modified version of the basebuilder.pm's function that generates the
     357#  *  information database (GDBM) from the GA documents. We need to change this
     358#  *  so that if we've been asked to do an incremental build we only add
     359#  *  metadata to autohierarchy classifiers via the IncrementalBuildUtils
     360#  *  module. All other classifiers and metadata will be ignored.
     361#  */
     362sub make_infodatabase
     363{
     364    my $self = shift (@_);
     365    my $outhandle = $self->{'outhandle'};
     366
     367    my $dbext = ".bdb";
     368    $dbext = ".ldb" if &util::is_little_endian();
     369    my $infodb_file = &util::filename_cat($self->{'build_dir'}, "text", $self->{'collection'} . $dbext);
     370
     371    # If we aren't doing an incremental addition, then we just call the super-
     372    # classes version
     373    # Note: Incremental addition can only occur if a text/<collection>.ldb
     374    #       already exists. If it doesn't, let the super classes function be
     375    #       called once to generate it.
     376    if (!$self->{'incremental_dlc'} || !(-e $infodb_file))
     377    {
     378        # basebuilder::make_infodatabase(@_);
     379        # Note: this doesn't work as the direct reference means all the $self
     380        #       data is lost.
     381        $self->basebuilder::make_infodatabase(@_);
     382        return;
     383    }
     384
     385    # Carry on with an incremental addition
     386    print $outhandle "\n*** performing an incremental addition to the info database\n" if ($self->{'verbosity'} >= 1);
     387    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};
     388
     389    # 1. Init all the classifiers
     390    &classify::init_classifiers ($self->{'classifiers'});
     391    # 2. Init the buildproc settings.
     392    #    Note: we still need this to process any associated files - but we
     393    #    don't expect to pipe anything to txt2db so we can do away with the
     394    #    complex output handle.
     395    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
     396    &util::mk_all_dir ($assocdir);
     397    $self->{'buildproc'}->set_mode ('incinfodb'); # Very Important
     398    $self->{'buildproc'}->set_assocdir ($assocdir);
     399    # 3. Read in all the metadata from the files in the archives directory using
     400    #    the GAPlug and using ourselves as the document processor!
     401    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});
     402
     403    print STDERR "</Stage>\n" if $self->{'gli'};
     404}
     405
    234406# /** Lucene specific document removal function. This works by calling lucene_passes.pl with
    235407#  *  -remove and the document id on the command line.
     
    249421# /** remove_document_from_database **/
    250422
    251 sub build_index {
    252     my $self = shift (@_);
    253     my ($index,$llevel) = @_;
    254     my $outhandle = $self->{'outhandle'};
    255     my $build_dir = $self->{'build_dir'};
    256 
    257     # get the full index directory path and make sure it exists
    258     my $indexdir = $self->{'index_mapping'}->{$index};
    259     &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));
    260 
    261     # get any os specific stuff
    262     my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    263     my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";
    264 
    265     # Find the perl script to call to run lucene
    266     my $full_lucene_passes = $self->{'full_lucene_passes'};
    267     my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};
    268 
    269     # define the section names for lucenepasses
    270     # define the section names and possibly the doc name for lucenepasses
    271     my $lucene_passes_sections = $llevel;
    272 
    273     my $opt_create_index = ($self->{'keepold'}) ? "" : "-create";
    274 
    275     my $osextra = "";
    276     if ($ENV{'GSDLOS'} =~ /^windows$/i) {
    277     $build_dir =~ s@/@\\@g;
    278     } else {
    279     if ($outhandle ne "STDERR") {
    280         # so lucene_passes doesn't print to stderr if we redirect output
    281         $osextra .= " 2>/dev/null";
    282     }
    283     }
    284 
    285     # get the index expression if this index belongs
    286     # to a subcollection
    287     my $indexexparr = [];
    288     my $langarr = [];
    289 
    290     # there may be subcollection info, and language info.
    291     my ($fields, $subcollection, $language) = split (":", $index);
    292     my @subcollections = ();
    293     @subcollections = split /,/, $subcollection if (defined $subcollection);
    294 
    295     foreach $subcollection (@subcollections) {
    296     if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
    297         push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
    298     }
    299     }
    300 
    301     # add expressions for languages if this index belongs to
    302     # a language subcollection - only put languages expressions for the
    303     # ones we want in the index
    304     my @languages = ();
    305     my $language_metadata = "Language";
    306     if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
    307     $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    308     }
    309     @languages = split /,/, $language if (defined $language);
    310     foreach my $language (@languages) {
    311     my $not=0;
    312     if ($language =~ s/^\!//) {
    313         $not = 1;
    314     }
    315     if($not) {
    316         push (@$langarr, "!$language");
    317     } else {
    318         push (@$langarr, "$language");
    319     }
    320     }
    321 
    322     # Build index dictionary. Uses verbatim stem method
    323     print $outhandle "\n    creating index dictionary (lucene_passes -I1)\n"  if ($self->{'verbosity'} >= 1);
    324     print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    325     my ($handle);
    326 
    327     if ($self->{'debug'}) {
    328     $handle = STDOUT;
    329     } else {
    330     print STDERR "Cmd: $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\"   $osextra\n";
    331     if (!-e "$full_lucene_passes" ||
    332         !open (PIPEOUT, "| $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\"   $osextra")) {
    333         print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
    334         die "lucenebuilder::build_index - couldn't run $lucene_passes_exe\n";
    335     }
    336     $handle = lucenebuilder::PIPEOUT;
    337     }
    338 
    339     my $store_levels = $self->{'levels'};
    340     my $gdbm_level = "document";
    341     if ($store_levels->{'section'}) {
    342     $gdbm_level = "section";
    343     }
    344 
    345     my $dom_level = "";
    346     foreach my $key (keys %$store_levels) {
    347     if ($mgppbuilder::level_map{$key} eq $llevel) {
    348         $dom_level = $key;
    349     }
    350     }
    351     if ($dom_level eq "") {
    352     print STDERR "Warning: unrecognized tag level $llevel\n";
    353     $dom_level = "document";
    354     }
    355 
    356     my $local_levels = { $dom_level => 1 }; # work on one level at a time
    357 
    358     # set up the document processr
    359     $self->{'buildproc'}->set_output_handle ($handle);
    360     $self->{'buildproc'}->set_mode ('text');
    361     $self->{'buildproc'}->set_index ($index, $indexexparr);
    362     $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language);
    363     $self->{'buildproc'}->set_indexing_text (1);
    364     $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    365     $self->{'buildproc'}->set_levels ($local_levels);
    366     $self->{'buildproc'}->set_gdbm_level($gdbm_level);
    367     $self->{'buildproc'}->reset();
    368     &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
    369            "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    370     close ($handle) unless $self->{'debug'};
    371 
    372     $self->print_stats();
    373 
    374     $self->{'buildproc'}->set_levels ($store_levels);
    375     print STDERR "</Stage>\n" if $self->{'gli'};
    376 }
    377 
    378 # /** A modified version of the basebuilder.pm's function that generates the
    379 #  *  information database (GDBM) from the GA documents. We need to change this
    380 #  *  so that if we've been asked to do an incremental build we only add
    381 #  *  metadata to autohierarchy classifiers via the IncrementalBuildUtils
    382 #  *  module. All other classifiers and metadata will be ignored.
    383 #  */
    384 sub make_infodatabase
    385 {
    386     my $self = shift (@_);
    387     my $outhandle = $self->{'outhandle'};
    388 
    389     my $dbext = ".bdb";
    390     $dbext = ".ldb" if &util::is_little_endian();
    391     my $infodb_file = &util::filename_cat($self->{'build_dir'}, "text", $self->{'collection'} . $dbext);
    392 
    393     # If we aren't doing an incremental addition, then we just call the super-
    394     # classes version
    395     # Note: Incremental addition can only occur if a text/<collection>.ldb
    396     #       already exists. If it doesn't, let the super classes function be
    397     #       called once to generate it.
    398     if (!$self->{'incremental_dlc'} || !(-e $infodb_file))
    399     {
    400         # basebuilder::make_infodatabase(@_);
    401         # Note: this doesn't work as the direct reference means all the $self
    402         #       data is lost.
    403         $self->basebuilder::make_infodatabase(@_);
    404         return;
    405     }
    406 
    407     # Carry on with an incremental addition
    408     print $outhandle "\n*** performing an incremental addition to the info database\n" if ($self->{'verbosity'} >= 1);
    409     print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};
    410 
    411     # 1. Init all the classifiers
    412     &classify::init_classifiers ($self->{'classifiers'});
    413     # 2. Init the buildproc settings.
    414     #    Note: we still need this to process any associated files - but we
    415     #    don't expect to pipe anything to txt2db so we can do away with the
    416     #    complex output handle.
    417     my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    418     &util::mk_all_dir ($assocdir);
    419     $self->{'buildproc'}->set_mode ('incinfodb'); # Very Important
    420     $self->{'buildproc'}->set_assocdir ($assocdir);
    421     # 3. Read in all the metadata from the files in the archives directory using
    422     #    the GAPlug and using ourselves as the document processor!
    423     &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});
    424 
    425     print STDERR "</Stage>\n" if $self->{'gli'};
    426 }
    427423
    4284241;
  • trunk/gsdl/perllib/mgppbuilder.pm

    r13341 r13590  
    4646          'Sec'=>'_textsection_',
    4747          'Para'=>'_textparagraph_');
    48 
    49 #$doc_level = "Doc";
    50 #$sec_level = "Sec";
    51 #$para_level = "Para";
    5248
    5349our %wanted_index_files = ('td'=>1,
     
    127123    }
    128124   
    129     $self->{'doc_level'} = "document";
    130     if (! $self->{'levels'}->{'document'}) {
    131     if ($self->{'levels'}->{'section'}) {
    132         $self->{'doc_level'} = "section";
    133     } else {
    134         die "you must have either document or section level specified!!\n";
    135     }
    136     }
    137 
    138125    $self->{'buildtype'} = "mgpp";
    139126
     
    229216    # the compressor doesn't need to know about paragraphs - never want to
    230217    # retrieve them
    231     my $mgpp_passes_sections = "";
    232     my ($doc_level) = $self->{'doc_level'};
    233     $mgpp_passes_sections .= "-J " . $level_map{$doc_level} . " ";
    234     foreach my $level (keys %{$self->{'levels'}}) {
    235     if ($level ne $doc_level && $level ne "paragraph") {
    236         $mgpp_passes_sections .= "-K " . $level_map{$level} . " ";
    237     }
    238     }
     218   
     219    # always use Doc and Sec levels
     220    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";
    239221
    240222    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
     
    250232    $handle = STDOUT;
    251233    } else {
    252     #print $outhandle "trying to run (compress 1) mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra\n";
    253234    if (!-e "$mgpp_passes_exe" ||
    254235        !open (PIPEOUT, "| mgpp_passes$exe  -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) {
     
    258239    $handle = mgppbuilder::PIPEOUT;
    259240    }
    260 
    261     # gdbm_level
    262     my $gdbm_level = "document";
    263     if ($self->{'levels'}->{'section'}) {
    264     $gdbm_level = "section";
    265     }
    266    
     241   
     242    my $gdbm_level = "section";
     243
    267244    $self->{'buildproc'}->set_output_handle ($handle);
    268245    $self->{'buildproc'}->set_mode ('text');
     
    300277
    301278    if (!$self->{'debug'}) {
    302         #print $outhandle "trying to run (compress 2) mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra\n";
    303279        if (!-e "$mgpp_passes_exe" ||
    304280        !open ($handle, "| mgpp_passes$exe  -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) {
     
    351327    foreach my $index (@$indexes) {
    352328    my ($fields, $subcollection, $languages) = split (":", $index);
    353     # the directory name starts with a processed version of index fields
    354     #my ($pindex) = $self->process_field($fields);
    355     #$pindex = lc ($pindex);
    356     # now we only ever have one index, and its called 'idx'
     329   
     330    # we only ever have one index, and its called 'idx'
    357331    my $pindex = 'idx';
    358332   
     
    441415
    442416    # define the section names for mgpasses
    443     # define the section names and possibly the doc name for mgpasses
    444     my $mgpp_passes_sections = "";
    445     my ($doc_level) = $self->{'doc_level'};
    446     $mgpp_passes_sections .= "-J " . $level_map{$doc_level} ." ";
    447    
    448     foreach my $level (keys %{$self->{'levels'}}) {
    449     if ($level ne $doc_level) {
    450         $mgpp_passes_sections .= "-K " . $level_map{$level}. " ";
    451     }
     417    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";
     418    if ($self->{'levels'}->{'paragraph'}) {
     419    $mgpp_passes_sections .= "-K " . $level_map{'paragraph'}. " ";
    452420    }
    453421
     
    526494    }
    527495       
    528     # gdbm_level
    529     my $gdbm_level = "document";
    530     if ($self->{'levels'}->{'section'}) {
    531     $gdbm_level = "section";
    532     }
     496    # gdbm_level is always section
     497    my $gdbm_level = "section";
    533498
    534499    # set up the document processr
     
    878843    $build_cfg->{'levelmap'} = \@levelmap;
    879844
    880     if ($self->{'levels'}->{'section'}) {
    881     $build_cfg->{'textlevel'} = $level_map{'section'};
    882     } else {   
    883     $build_cfg->{'textlevel'} = $level_map{'document'};
    884     }
     845    # text level (and gdbm level) is always section
     846    $build_cfg->{'textlevel'} = $level_map{'section'};
    885847   
    886848}
  • trunk/gsdl/perllib/mgppbuildproc.pm

    r12951 r13590  
    8585    $self->{'strip_html'}=$strip;
    8686}
    87 
    88 
    89 sub get_gdbm_level {
    90     my $self = shift (@_);
    91    
    92     #if a Section level index is not built, the gdbm file should be at doc
    93     #level not Section
    94     if ($self->{'levels'}->{'section'}) {
    95     return "section";
    96     }
    97     return "document";
    98 }
    99 
    10087
    10188#sub find_paragraphs {
     
    208195    my ($fields) = split (/:/, $self->{'index'});
    209196
    210     my ($documenttag) = "";
    211     my($documentendtag) = "";
    212     if ($self->{'levels'}->{'document'}) {
    213     $documenttag = "\n<". $level_map{'document'} . ">\n";
    214     $documentendtag = "\n</". $level_map{'document'} . ">\n";
    215     }
    216     my ($sectiontag) = "";
    217     if ($self->{'levels'}->{'section'}) {
    218     $sectiontag = "\n<". $level_map{'section'} . ">\n";
    219     }
     197    # we always do text and index on Doc and Sec levels
     198    my ($documenttag) = "\n<". $level_map{'document'} . ">\n";
     199    my ($documentendtag) = "\n</". $level_map{'document'} . ">\n";
     200    my ($sectiontag) = "\n<". $level_map{'section'} . ">\n";
    220201    my ($paratag) = "";
    221202   
Note: See TracChangeset for help on using the changeset viewer.