Changeset 23133

Show
Ignore:
Timestamp:
13.10.2010 14:25:54 (9 years ago)
Author:
kjdon
Message:

Still working on incremental infodb updating. Cleaning up code now that reconstructed docs are added after processing new/changed ones, so we don't need to do deletion from the infodb.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/basebuildproc.pm

    r23121 r23133  
    148148    # reconstructed docs have no text, just metadata, so we need to  
    149149    # remember how many bytes we had initially 
    150     $self->{'num_bytes'}     = $self->{'starting_num_bytes'}; 
    151      
     150    #$self->{'num_bytes'}     = $self->{'starting_num_bytes'}; 
     151    $self->{'num_bytes'} = 0; # we'll store num bytes in db for reconstructed docs. 
    152152    $self->{'num_processed_bytes'} = 0; 
    153153} 
     
    389389    my $self = shift (@_); 
    390390    my ($doc_obj, $filename, $edit_mode) = @_; 
    391  
     391     
    392392    # only output this document if it is a "indexed_doc" or "info_doc" (database only) document 
    393393    my $doctype = $doc_obj->get_doc_type(); 
    394394    return if ($doctype ne "indexed_doc" && $doctype ne "info_doc"); 
    395  
     395    print STDERR "infodbedit, mode=$edit_mode, ".$doc_obj->get_OID()."\n"; 
    396396    my $archivedir = ""; 
    397397    if (defined $filename) 
     
    416416    } 
    417417 
    418     if (($edit_mode eq "add") || ($edit_mode eq "update")) { 
    419     #add this document to the browse structure 
    420     push(@{$self->{'doclist'}},$doc_obj->get_OID())  
    421         unless ($doctype eq "classification"); 
    422     if ($edit_mode eq "update") { 
    423         $self->{'dont_reconstruct'}->{$doc_obj->get_OID()} = 1; 
    424     } 
    425     &classify::classify_doc ($self->{'classifiers'}, $doc_obj); 
    426     $self->{'num_docs'} += 1 unless ($doctype eq "classification"); 
    427     } 
    428     elsif ($edit_mode eq "delete") { 
     418    if ($edit_mode eq "delete") { 
    429419    # record this doc so we don't process the reconstructed doc later 
    430     my $del_doc_oid = $doc_obj->get_OID(); 
    431     $self->{'dont_reconstruct'}->{$del_doc_oid} = 1; 
     420    $self->{'dont_reconstruct'}->{$doc_obj->get_OID()} = 1; 
     421    # we don't need to do anything else for the info database for a deleted document. The infodb starts from scratch each time, so no deletion is necessary 
    432422    # do we need this??? where did num_docs come from, from reconstruction?? 
    433     $self->{'num_docs'} -= 1 unless ($doctype eq "classification"); 
    434     } 
    435  
     423    #$self->{'num_docs'} -= 1 unless ($doctype eq "classification"); 
     424    return; 
     425    } 
     426     
     427    if ($edit_mode eq "update") { 
     428    # we don't want to process the reconstructed doc later, but we will process this version now. 
     429    $self->{'dont_reconstruct'}->{$doc_obj->get_OID()} = 1; 
     430    } 
     431 
     432    # rest of code used for add and update. In both cases, we add to the classifiers and to the info database.  
     433 
     434    #add this document to the browse structure 
     435    push(@{$self->{'doclist'}},$doc_obj->get_OID())  
     436    unless ($doctype eq "classification"); 
     437    $self->{'num_docs'} += 1 unless ($doctype eq "classification"); 
     438     
     439    if (!defined $filename) { 
     440    # a reconstructed doc 
     441    $self->{'num_bytes'} += $doc_obj->get_metadata_element ($doc_obj->get_top_section (), "total_numbytes"); 
     442    print STDERR "new numbytes = $self->{'num_bytes'}\n"; 
     443    } 
     444    # classify the document 
     445    &classify::classify_doc ($self->{'classifiers'}, $doc_obj); 
     446     
     447    # now add all the section to the infodb. 
     448     
    436449    # is this a paged or a hierarchical document 
    437450    my ($thistype, $childtype) = $self->get_document_type ($doc_obj); 
     
    453466    my %section_infodb = (); 
    454467 
    455     # update a few statistics - how do we do update???? 
    456     if (($edit_mode eq "add")) {# || ($edit_mode eq "update")) { 
    457  
    458         $self->{'num_bytes'} += $doc_obj->get_text_length ($section); 
    459         $self->{'num_sections'} += 1 unless ($doctype eq "classification"); 
    460     } 
    461     elsif ($edit_mode eq "delete") { 
    462         # delete 
    463         $self->{'num_bytes'} -= $doc_obj->get_text_length ($section); 
    464         $self->{'num_sections'} -= 1 unless ($doctype eq "classification"); 
    465     } 
    466  
     468    # update a few statistics  
     469    $self->{'num_bytes'} += $doc_obj->get_text_length ($section); 
     470    $self->{'num_sections'} += 1 unless ($doctype eq "classification"); 
     471     
    467472    # output the fact that this document is a document (unless doctype 
    468473    # has been set to something else from within a plugin 
     
    472477    } 
    473478 
     479    if ($first && defined $filename) { 
     480        # if we are at the top level of the document, and we are not a reconstructed document, set the total_text_length - used to count bytes when we reconstruct later 
     481        my $length = $doc_obj->get_total_text_length(); 
     482        $section_infodb{"total_numbytes"} = [ $length ]; 
     483    } 
    474484    # Output whether this node contains text 
    475485    # 
     
    504514        # special case for URL metadata 
    505515        if ($field =~ /^URL$/i) { 
    506             if (($edit_mode eq "add") || ($edit_mode eq "update")) { 
    507  
    508             &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value, { 'section' => [ $section_OID ] }); 
    509             } 
    510             else { 
    511             # delete 
    512             &dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value); 
    513             } 
    514  
    515              
     516            &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value, { 'section' => [ $section_OID ] }); 
    516517        } 
    517  
     518         
    518519        if (!defined $self->{'dontdb'}->{$field}) { 
    519520            push(@{$section_infodb{$field}}, $value); 
     
    590591    }  
    591592     
    592     if (($edit_mode eq "add") || ($edit_mode eq "update")) { 
    593         # in case of update, this will overwrite old entry?? 
    594         &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID, \%section_infodb); 
    595     } 
    596     else { 
    597         # delete 
    598         &dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID); 
    599     } 
    600  
    601      
    602     # output a database entry for the document number, except for Lucene (which no longer needs this information) 
    603     unless (ref($self) eq "lucenebuildproc") 
     593    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID, \%section_infodb); 
     594         
     595    # output a database entry for the document number, unless we are incremental 
     596    unless ($self->is_incremental_capable()) 
    604597    { 
    605         if (($edit_mode eq "add") || ($edit_mode eq "update")) { 
    606          
    607         if ($self->{'db_level'} eq "document") { 
    608             &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'}, { 'section' => [ $doc_OID ] }); 
    609         } 
    610         else { 
    611             &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'}, { 'section' => [ $section_OID ] }); 
    612         } 
     598        print STDERR "outputting db entry for doc number\n"; 
     599        if ($self->{'db_level'} eq "document") { 
     600        &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'}, { 'section' => [ $doc_OID ] }); 
    613601        } 
    614602        else { 
    615  
    616         if ($self->{'db_level'} eq "document") { 
    617             &dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'}); 
    618         } 
    619         else { 
    620             &dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'}); 
    621         } 
    622  
    623         } 
    624     } 
    625  
     603        &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'}, { 'section' => [ $section_OID ] }); 
     604        } 
     605    } 
     606         
    626607    $first = 0; 
    627608    $section = $doc_obj->get_next_section($section); 
    628609    last if ($self->{'db_level'} eq "document"); # if no sections wanted, only add the docs 
    629     } 
     610    } # while defined section 
     611 
     612    print STDERR "end of infodb edit\n"; 
    630613} 
    631614