Changeset 9179


Ignore:
Timestamp:
2005-02-24T16:57:53+13:00 (19 years ago)
Author:
kjdon
Message:

added a text pass to store the documents - we want all the html in it (but escaped) and no metadata

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/lucenebuilder.pm

    r8716 r9179  
    6666}
    6767
     68# this writes a nice version of the text docs
           #
           # compress_text($textindex)
           #
           # Runs the "text" pass: the documents are fed through the build
           # processor (mode 'text', indexing of text off, storing of text on)
           # and its output is piped into lucene_passes.pl, invoked as
           #   lucene_passes.pl text Doc "<build_dir>" "poo"
           # When $self->{'debug'} is set the output goes to STDOUT instead and
           # lucene_passes.pl is not started.
           #
           # Arguments:
           #   $self      - the builder object; this sub reads its outhandle,
           #                build_dir, debug, gli, levels, buildproc,
           #                indexfieldmap, pluginfo, source_dir and maxdocs
           #                entries
           #   $textindex - handed unchanged to the build processor's set_index
           #
           # Dies (non-debug mode only) if lucene_passes.pl does not exist or
           # the pipe to it cannot be opened.
           # Side effect: $self->{'levels'}->{'paragraph'} is set to undef.
    6869sub compress_text {
    6970
    7071    my $self = shift (@_);
    7172    my ($textindex) = @_;
     73    my $outhandle = $self->{'outhandle'};
     74    print STDERR "Saving the document text\n";
     75    # the text directory
     76    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
     77    my $build_dir = $self->{'build_dir'};
     78    &util::mk_all_dir ($text_dir);
           # NOTE(review): apart from being created above (and having its
           # slashes converted on Windows below) $text_dir is not used again in
           # this sub; the command line further down is given $build_dir.
     79
     80    my $osextra = "";
     81    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
     82    $text_dir =~ s@/@\\@g;
     83    } else {
     84    if ($outhandle ne "STDERR") {
     85        # so lucene_passes doesn't print to stderr if we redirect output
     86        $osextra .= " 2>/dev/null";
     87    }
     88    }
     89   
     90
     91    # get any os specific stuff
     92    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";
     93
     94    my $lucene_passes_exe = &util::filename_cat($scriptdir, "lucene_passes.pl");
     95    my $lucene_passes_sections = "Doc";
     96
     97    my ($handle);
     98
           # debug mode: write the pass output straight to STDOUT;
           # otherwise: pipe it into lucene_passes.pl
     99    if ($self->{'debug'}) {
     100    $handle = STDOUT;
     101    } else {
            # NOTE(review): the last argument "poo" looks like a placeholder
            # for the position where the index pass passes "$indexdir" -
            # confirm what lucene_passes.pl expects here in text mode.
     102    if (!-e "$lucene_passes_exe" ||
     103        !open (PIPEOUT, "| $lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"poo\"   $osextra")) {
     104        print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            # NOTE(review): the message names build_index although this is
            # compress_text - looks like a copy/paste leftover.
     105        die "lucenebuilder::build_index - couldn't run $lucene_passes_exe\n";
     106    }
     107    $handle = lucenebuilder::PIPEOUT;
     108    }
            # $levels is the same hash reference as $self->{'levels'}, so the
            # undef below is visible through $self too. undef only clears the
            # value: the 'paragraph' key itself stays in the hash.
     109    my $levels = $self->{'levels'};
     110    undef $levels->{'paragraph'}; # get rid of para if we had it.
     111    # set up the document processor
     112    $self->{'buildproc'}->set_output_handle ($handle);
     113    $self->{'buildproc'}->set_mode ('text');
     114    $self->{'buildproc'}->set_index ($textindex);
     115    $self->{'buildproc'}->set_indexing_text (0);
     116    $self->{'buildproc'}->set_store_text(1);
     117    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
     118    $self->{'buildproc'}->set_levels ($levels);                       
     119    $self->{'buildproc'}->reset();
            # run the plugins over the source directory with the build
            # processor configured above
     120    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
     121           $self->{'buildproc'}, $self->{'maxdocs'});   
     122    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
     123           "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
     124    &plugin::end($self->{'pluginfo'});
            # NOTE(review): outside debug mode $handle refers to PIPEOUT, so
            # the pipe appears to be closed twice here; in debug mode PIPEOUT
            # was never opened. Neither close has its result checked.
     125    close ($handle) unless $self->{'debug'};
     126    close PIPEOUT;
     127    $self->print_stats();
     128
     129    print STDERR "</Stage>\n" if $self->{'gli'};
     130
    72131}
    73132
     
    94153        my $idx = $self->{'index_mapping'}->{$index};
    95154        foreach my $level (keys %{$self->{'levels'}}) {
     155        next if $level =~ /paragraph/; # we don't do para indexing
    96156        my ($pindex) = $level =~ /^(.)/;
    97157        # should probably check that new name with level
     
    199259    } else {
    200260    if (!-e "$lucene_passes_exe" ||
    201         !open (PIPEOUT, "| $lucene_passes_exe $lucene_passes_sections \"$build_dir\" \"$indexdir\"   $osextra")) {
     261        !open (PIPEOUT, "| $lucene_passes_exe index $lucene_passes_sections \"$build_dir\" \"$indexdir\"   $osextra")) {
    202262        print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
    203263        die "lucenebuilder::build_index - couldn't run $lucene_passes_exe\n";
Note: See TracChangeset for help on using the changeset viewer.