Ignore:
Timestamp:
2009-02-03T09:48:19+13:00 (15 years ago)
Author:
davidb
Message:

Additions to support deleting documents from the index. This only works for indexers that support incremental building, e.g. Lucene.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/lucenebuildproc.pm

    r17797 r18456  
    6363
    6464
    65 sub text {
    66     my $self = shift (@_);
    67     my ($doc_obj,$file) = @_;
    68     my $handle = $self->{'output_handle'};
     65sub textedit {
     66    my $self = shift (@_);
     67    my ($doc_obj,$file,$edit_mode) = @_;
     68
     69    my $lucenehandle = $self->{'output_handle'};
    6970    my $outhandle = $self->{'outhandle'};
    7071
     
    7273    return if ($doc_obj->get_doc_type() ne "indexed_doc");
    7374
     75    # skip this document if in "compress-text" mode and asked to delete it
     76    return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));
     77
    7478    my $indexed_doc = $self->is_subcollection_doc($doc_obj);
    7579
    7680    # this is another document
    77     $self->{'num_docs'} += 1;
     81    if (($edit_mode eq "add") || ($edit_mode eq "reindex")) {
     82    $self->{'num_docs'} += 1;
     83    }
     84    else {
     85    $self->{'num_docs'} -= 1;
     86    }
    7887
    7988    # get the parameters for the output
     
    8695    my $ldoc_level = $levels->{'document'};
    8796    my $lsec_level = $levels->{'section'};
    88     #my $lpar_level = $levels->{'paragraph'};
    89 
     97
     98    # gs2_id should be deprecated #####
    9099    my $gs2_id = "";
    91100    if ($ldoc_level)
     
    102111    }
    103112    my $gs2_docOID = $doc_obj->get_OID();
    104     my $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\">\n";
    105     my $documentendtag = "\n</$doc_tag_name>\n";
     113    my $documenttag = undef;
     114    my $documentendtag = undef;
     115
     116    $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
     117    $documentendtag = "\n</$doc_tag_name>\n";
    106118
    107119    my $sec_tag_name = "";
     
    123135    $self->{'num_sections'}++;
    124136
    125     if ($sec_tag_name ne "")
    126     {
    127         my $sec_gs2_id = $self->{'num_sections'};
    128         my $sec_gs2_docOID = $gs2_docOID . "." . $section;
    129         $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\">\n";
    130     }
     137    my $sec_gs2_id = $self->{'num_sections'};
     138    my $sec_gs2_docOID = $gs2_docOID;
     139    $sec_gs2_docOID .= ".$section" if ($section ne "");
    131140
    132141    # if we are doing subcollections, then some docs shouldn't be indexed.
     
    135144    my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
    136145    if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
    137         $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
     146        if ($sec_tag_name ne "") {
     147        $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n";
     148        $text .= "\n</$sec_tag_name>\n"
     149        }
    138150            $section = $doc_obj->get_next_section($section);
    139151        next;
    140152          }
    141153
    142     $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
     154    if ($sec_tag_name ne "")
     155    {
     156        $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
     157    }
     158
     159    if (($edit_mode eq "add") || ($edit_mode eq "reindex")) {
     160        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
     161    }
     162    else {
     163        # delete
     164        $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
     165    }
     166
    143167
    144168    # has the user added a 'metadata' index?
     
    233257        # filter the text
    234258        $new_text = $self->filter_text ($field, $new_text);
    235         $self->{'num_processed_bytes'} += length ($new_text);
     259
     260        if (($edit_mode eq "add") || ($edit_mode eq "reindex")) {
     261            $self->{'num_processed_bytes'} += length ($new_text);
     262            $text .= "$new_text";
     263        }
     264        else {
     265            # delete
     266            $self->{'num_processed_bytes'} -= length ($new_text);
     267        }
    236268       
    237         $text .= "$new_text";
    238269
    239270        if ($self->{'indexing_text'} && $new_field) {
     
    287318        $new_text = $self->filter_text ("metadata", $new_text);
    288319       
    289         $self->{'num_processed_bytes'} += length ($new_text);
    290         $text .= "$new_text";
    291 
    292        
     320        if (($edit_mode eq "add") || ($edit_mode eq "reindex")) {
     321        $self->{'num_processed_bytes'} += length ($new_text);
     322        $text .= "$new_text";
     323        }
     324        else {
     325        # delete
     326        $self->{'num_processed_bytes'} -= length ($new_text);
     327        }       
    293328    }
    294329
     
    302337        $new_text = $self->filter_text ("allfields", $new_text);
    303338       
    304         $self->{'num_processed_bytes'} += length ($new_text);
    305         $text .= "$new_text";
    306     }
    307    
     339        if (($edit_mode eq "add") || ($edit_mode eq "reindex")) {
     340        $self->{'num_processed_bytes'} += length ($new_text);
     341        $text .= "$new_text";
     342        }
     343        else {
     344        # delete
     345        $self->{'num_processed_bytes'} -= length ($new_text);
     346        }
     347    }
     348       
    308349    $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
    309350
    310351        $section = $doc_obj->get_next_section($section);
    311     } #while defined section
    312     print $handle "$text\n$documentendtag";
    313     #print STDOUT "$text\n$documentendtag";
    314 }
     352    } # while defined section
     353
     354    print $lucenehandle "$text\n$documentendtag";
     355
     356##    if ($edit_mode eq "delete") {
     357##       print STDERR "$text\n$documentendtag";
     358##    }
     359
     360}
     361
     362sub text {
     363    my $self = shift (@_);
     364    my ($doc_obj,$file) = @_;
     365
     366    $self->textedit($doc_obj,$file,"add");
     367}
     368
     369sub textreindex
     370{
     371    my $self = shift (@_);
     372    my ($doc_obj,$file) = @_;
     373
     374    $self->textedit($doc_obj,$file,"reindex");
     375}
     376
     377sub textdelete
     378{
     379    my $self = shift (@_);
     380    my ($doc_obj,$file) = @_;
     381
     382    $self->textedit($doc_obj,$file,"delete");
     383}
     384
     385
     386
     387
    315388
    316389# /** We make this builder pretend to be a document processor so we can get
     
    4945671;
    495568
     569
Note: See TracChangeset for help on using the changeset viewer.