Timestamp: 2009-09-10T10:46:36+12:00 (15 years ago)
Author: davidb
Message:

Opening of txt2db moved to earlier in the buildcol process. This was done to avoid a huge memory spike that occurred with incremental building. Previously we reconstructed all the documents from the GDBM database. Then the code added, edited, and removed documents as required (i.e. the incremental bit), and then it wrote everything back out to GDBM. The problem was that the reconstructed document set could grow quite large -- an example PagedImage collection of 100,000 documents took 2.4 GB when read in. When it got to the stage of opening a pipe to the database with open('|txt2db'), the fork() call that occurs inside this function requires the system to (briefly) hold *two* 2.4 GB processes, before quickly replacing the child process with the much smaller 'txt2db' process. It is at this point of duplication that the computer can run out of memory. In the PagedImage example, the machine had 2 GB of main memory and 2 GB of swap, so there was no way it could sustain two 2.4 GB processes.

Long explanation. The good news is that shifting the open() to before the documents are reconstructed solves the problem.
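
To make the ordering issue concrete, here is a minimal sketch of the pattern (not the actual basebuilder.pm code; the helper subs, record format, and database path are made up for illustration). Perl's open() on a '| command' pipe fork()s the current process and then exec()s the command in the child, so the pipe should be opened while the perl process is still small, before the large reconstruction step runs.

    #!/usr/bin/perl
    use strict;
    use warnings;

    # Hypothetical stand-ins for the real reconstruction/serialisation code.
    sub reconstruct_all_documents { return [ { OID => "HASH0001" }, { OID => "HASH0002" } ]; }
    sub serialise_for_txt2db { my ($doc) = @_; return "[$doc->{OID}]\n<doctype>doc\n" . ("-" x 70) . "\n"; }

    my $infodb_file_path = "archiveinf-doc.gdb";    # example path only

    # Open the pipe to txt2db FIRST, while this process is still small.
    # open() on "| command" fork()s the current process and exec()s the
    # command in the child, so doing it now means the fork only has to
    # duplicate a small process image.
    open(my $infodb_handle, "| txt2db \"$infodb_file_path\"")
        or die "Couldn't run txt2db: $!";

    # Only now reconstruct the (potentially multi-GB) document set.
    my $reconstructed_docs = reconstruct_all_documents($infodb_file_path);

    # ... add/edit/remove documents here (the incremental bit) ...

    # Stream everything out through the already-open pipe.
    foreach my $doc (@$reconstructed_docs) {
        print $infodb_handle serialise_for_txt2db($doc);
    }
    close($infodb_handle);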

File: 1 edited

Legend: lines prefixed with + were added, lines prefixed with - were removed; other lines are unmodified context.
  • gsdl/trunk/perllib/basebuilder.pm

--- gsdl/trunk/perllib/basebuilder.pm (r20100)
+++ gsdl/trunk/perllib/basebuilder.pm (r20575)
@@ -357,5 +357,6 @@
 
     # Get info database file path
-    my $infodb_file_path = &dbutil::get_infodb_file_path($self->{'infodbtype'}, $self->{'collection'}, $textdir);
+    my $infodb_type = $self->{'infodbtype'};
+    my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $textdir);
 
     print $outhandle "\n*** creating the info database and processing associated files\n"
@@ -367,10 +368,27 @@
 
     my $reconstructed_docs = undef;
+    my $database_recs = undef;
+
     if ($self->{'keepold'}) {
-        # reconstruct doc_obj metadata from database for all docs
-        $reconstructed_docs = &classify::reconstruct_doc_objs_metadata($self->{'infodbtype'}, $infodb_file_path);
-    }
-
-    # set up the document processor
+        $database_recs = {};
+
+        &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $database_recs);
+    }
+
+
+    # Important (for memory usage reasons) that we obtain the filehandle
+    # here for writing out to the database, rather than after
+    # $reconstructed_docs has been set up (assuming -keepold is on)
+    #
+    # This is because when we open a pipe to txt2db [using open()]
+    # this triggers a fork() followed by exec().  $reconstructed_docs
+    # can get very large, and so if we did the open() after this, it means
+    # the fork creates a clone of the *large* process image which (admittedly)
+    # is then quickly replaced in the execve() with the much smaller image for
+    # 'txt2db'.  The trouble is, in that for a seismic second caused by
+    # the fork(), the system really does need to have all that memory available
+    # even though it isn't ultimately used.  The result is an out of memory
+    # error.
+
     my ($infodb_handle);
     if ($self->{'debug'}) {
@@ -378,5 +396,5 @@
     }
     else {
-        $infodb_handle = &dbutil::open_infodb_write_handle($self->{'infodbtype'}, $infodb_file_path);
+        $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
         if (!defined($infodb_handle))
         {
@@ -386,5 +404,15 @@
     }
 
-    $self->{'buildproc'}->set_infodbtype ($self->{'infodbtype'});
+    if ($self->{'keepold'}) {
+        # reconstruct doc_obj metadata from database for all docs
+        $reconstructed_docs
+            = &classify::reconstruct_doc_objs_metadata($infodb_type,
+                                                       $infodb_file_path,
+                                                       $database_recs);
+    }
+
+    # set up the document processor
+
+    $self->{'buildproc'}->set_infodbtype ($infodb_type);
     $self->{'buildproc'}->set_output_handle ($infodb_handle);
     $self->{'buildproc'}->set_mode ('infodb');
@@ -420,5 +448,5 @@
 
     # output classification information
-    &classify::output_classify_info ($self->{'classifiers'}, $self->{'infodbtype'}, $infodb_handle,
+    &classify::output_classify_info ($self->{'classifiers'}, $infodb_type, $infodb_handle,
                                      $self->{'remove_empty_classifications'},
                                      $self->{'gli'});
@@ -434,7 +462,7 @@
                                   'thistype' => [ "Invisible" ],
                                   'contains' => [ join(";", @doc_list) ] };
-    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "browselist", $browselist_infodb);
-
-    &dbutil::close_infodb_write_handle($self->{'infodbtype'}, $infodb_handle) if !$self->{'debug'};
+    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", $browselist_infodb);
+
+    &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle) if !$self->{'debug'};
 
     print STDERR "</Stage>\n" if $self->{'gli'};