Ignore:
Timestamp:
10/13/10 14:25:54 (10 years ago)
Author:
kjdon
Message:

Still working on incremental infodb updating. Cleaning up the code now that reconstructed docs are added after processing new/changed ones, so we don't need to do deletion from the infodb.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/basebuildproc.pm

    r23121 r23133  
    148148    # reconstructed docs have no text, just metadata, so we need to
    149149    # remember how many bytes we had initially
    150     $self->{'num_bytes'}     = $self->{'starting_num_bytes'};
    151    
     150    #$self->{'num_bytes'}     = $self->{'starting_num_bytes'};
     151    $self->{'num_bytes'} = 0; # we'll store num bytes in db for reconstructed docs.
    152152    $self->{'num_processed_bytes'} = 0;
    153153}
     
    389389    my $self = shift (@_);
    390390    my ($doc_obj, $filename, $edit_mode) = @_;
    391 
     391   
    392392    # only output this document if it is a "indexed_doc" or "info_doc" (database only) document
    393393    my $doctype = $doc_obj->get_doc_type();
    394394    return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");
    395 
     395    print STDERR "infodbedit, mode=$edit_mode, ".$doc_obj->get_OID()."\n";
    396396    my $archivedir = "";
    397397    if (defined $filename)
     
    416416    }
    417417
    418     if (($edit_mode eq "add") || ($edit_mode eq "update")) {
    419     #add this document to the browse structure
    420     push(@{$self->{'doclist'}},$doc_obj->get_OID())
    421         unless ($doctype eq "classification");
    422     if ($edit_mode eq "update") {
    423         $self->{'dont_reconstruct'}->{$doc_obj->get_OID()} = 1;
    424     }
    425     &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
    426     $self->{'num_docs'} += 1 unless ($doctype eq "classification");
    427     }
    428     elsif ($edit_mode eq "delete") {
     418    if ($edit_mode eq "delete") {
    429419    # record this doc so we don't process the reconstructed doc later
    430     my $del_doc_oid = $doc_obj->get_OID();
    431     $self->{'dont_reconstruct'}->{$del_doc_oid} = 1;
     420    $self->{'dont_reconstruct'}->{$doc_obj->get_OID()} = 1;
     421    # we don't need to do anything else for the info database for a deleted document. The infodb starts from scratch each time, so no deletion is necessary
    432422    # do we need this??? where did num_docs come from, from reconstruction??
    433     $self->{'num_docs'} -= 1 unless ($doctype eq "classification");
    434     }
    435 
     423    #$self->{'num_docs'} -= 1 unless ($doctype eq "classification");
     424    return;
     425    }
     426   
     427    if ($edit_mode eq "update") {
     428    # we don't want to process the reconstructed doc later, but we will process this version now.
     429    $self->{'dont_reconstruct'}->{$doc_obj->get_OID()} = 1;
     430    }
     431
     432    # rest of code used for add and update. In both cases, we add to the classifiers and to the info database.
     433
     434    #add this document to the browse structure
     435    push(@{$self->{'doclist'}},$doc_obj->get_OID())
     436    unless ($doctype eq "classification");
     437    $self->{'num_docs'} += 1 unless ($doctype eq "classification");
     438   
     439    if (!defined $filename) {
     440    # a reconstructed doc
     441    $self->{'num_bytes'} += $doc_obj->get_metadata_element ($doc_obj->get_top_section (), "total_numbytes");
     442    print STDERR "new numbytes = $self->{'num_bytes'}\n";
     443    }
     444    # classify the document
     445    &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
     446   
     447    # now add all the section to the infodb.
     448   
    436449    # is this a paged or a hierarchical document
    437450    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
     
    453466    my %section_infodb = ();
    454467
    455     # update a few statistics - how do we do update????
    456     if (($edit_mode eq "add")) {# || ($edit_mode eq "update")) {
    457 
    458         $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
    459         $self->{'num_sections'} += 1 unless ($doctype eq "classification");
    460     }
    461     elsif ($edit_mode eq "delete") {
    462         # delete
    463         $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
    464         $self->{'num_sections'} -= 1 unless ($doctype eq "classification");
    465     }
    466 
     468    # update a few statistics
     469    $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
     470    $self->{'num_sections'} += 1 unless ($doctype eq "classification");
     471   
    467472    # output the fact that this document is a document (unless doctype
    468473    # has been set to something else from within a plugin
     
    472477    }
    473478
     479    if ($first && defined $filename) {
     480        # if we are at the top level of the document, and we are not a reconstructed document, set the total_text_length - used to count bytes when we reconstruct later
     481        my $length = $doc_obj->get_total_text_length();
     482        $section_infodb{"total_numbytes"} = [ $length ];
     483    }
    474484    # Output whether this node contains text
    475485    #
     
    504514        # special case for URL metadata
    505515        if ($field =~ /^URL$/i) {
    506             if (($edit_mode eq "add") || ($edit_mode eq "update")) {
    507 
    508             &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value, { 'section' => [ $section_OID ] });
    509             }
    510             else {
    511             # delete
    512             &dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value);
    513             }
    514 
    515            
     516            &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value, { 'section' => [ $section_OID ] });
    516517        }
    517 
     518       
    518519        if (!defined $self->{'dontdb'}->{$field}) {
    519520            push(@{$section_infodb{$field}}, $value);
     
    590591    }
    591592   
    592     if (($edit_mode eq "add") || ($edit_mode eq "update")) {
    593         # in case of update, this will overwrite old entry??
    594         &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID, \%section_infodb);
    595     }
    596     else {
    597         # delete
    598         &dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID);
    599     }
    600 
    601    
    602     # output a database entry for the document number, except for Lucene (which no longer needs this information)
    603     unless (ref($self) eq "lucenebuildproc")
     593    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID, \%section_infodb);
     594       
     595    # output a database entry for the document number, unless we are incremental
     596    unless ($self->is_incremental_capable())
    604597    {
    605         if (($edit_mode eq "add") || ($edit_mode eq "update")) {
    606        
    607         if ($self->{'db_level'} eq "document") {
    608             &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'}, { 'section' => [ $doc_OID ] });
    609         }
    610         else {
    611             &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'}, { 'section' => [ $section_OID ] });
    612         }
     598        print STDERR "outputting db entry for doc number\n";
     599        if ($self->{'db_level'} eq "document") {
     600        &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'}, { 'section' => [ $doc_OID ] });
    613601        }
    614602        else {
    615 
    616         if ($self->{'db_level'} eq "document") {
    617             &dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'});
    618         }
    619         else {
    620             &dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'});
    621         }
    622 
    623         }
    624     }
    625 
     603        &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'}, { 'section' => [ $section_OID ] });
     604        }
     605    }
     606       
    626607    $first = 0;
    627608    $section = $doc_obj->get_next_section($section);
    628609    last if ($self->{'db_level'} eq "document"); # if no sections wanted, only add the docs
    629     }
     610    } # while defined section
     611
     612    print STDERR "end of infodb edit\n";
    630613}
    631614
Note: See TracChangeset for help on using the changeset viewer.