Ignore:
Timestamp:
2006-07-04T15:06:34+12:00 (18 years ago)
Author:
davidb
Message:

Improved support for incremental addition. Previously, the classifier pass
of buildcol.pl had to be run from scratch (i.e. all documents were read in
from the archives folder) so that correct browse structures were formed -- a
simple-to-implement strategy, but not a very efficient one. Now the first
layer of a classifier structure is reconstructed from the GDBM file, then
the new files in the archives directory are added, and finally the
completed browse structure is formed.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/basebuildproc.pm

    r11793 r11994  
    296296    return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");
    297297
    298     my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
    299     $archivedir = "" unless defined $archivedir;
    300     $archivedir =~ s/\\/\//g;
    301     $archivedir =~ s/^\/+//;
    302     $archivedir =~ s/\/+$//;
    303 
    304     # resolve the final filenames of the files associated with this document
    305     $self->assoc_files ($doc_obj, $archivedir);
     298    my $archivedir = "";
     299
     300    if (defined $filename)
     301    {
     302    # doc_obj derived directly from file
     303
     304    my ($dir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
     305    $dir = "" unless defined $dir;
     306    $dir =~ s/\\/\//g;
     307    $dir =~ s/^\/+//;
     308    $dir =~ s/\/+$//;
     309
     310    $archivedir = $dir;
     311
     312    # resolve the final filenames of the files associated with this document
     313    $self->assoc_files ($doc_obj, $archivedir);
     314    }
     315    else
     316    {
     317    # doc_obj reconstructed from GDBM (has metadata, doc structure but no text)
     318    my $top_section = $doc_obj->get_top_section();
     319    $archivedir = $doc_obj->get_metadata_element($top_section,"archivedir");
     320    }
     321
    306322
    307323    #GRB: moved 1/06/2004 from GRB01062004
     
    340356    }
    341357
    342     # output whether this node contains text
    343     if ($doc_obj->get_text_length($section) > 0) {
    344         print $handle "<hastxt>1\n";
    345     } else {
    346         print $handle "<hastxt>0\n";
     358    # Output whether this node contains text
     359    #
     360    # If doc_obj reconstructed from GDBM file then no need to
     361    # explicitly add <hastxt> as this is preserved as metadata when
     362    # the GDBM file is loaded in
     363
     364    if (defined $filename)
     365    {
     366        # doc_obj derived directly from file
     367        if ($doc_obj->get_text_length($section) > 0) {
     368        print $handle "<hastxt>1\n";
     369        } else {
     370        print $handle "<hastxt>0\n";
     371        }
    347372    }
    348373
     
    380405    }
    381406
    382     # output archivedir if at top level
    383     if ($section eq $doc_obj->get_top_section()) {
    384         print $handle "<archivedir>$archivedir\n";
     407
     408    # If doc_obj reconstructed from GDBM file then no need to
     409    # explicitly add <archivedir> as this is preserved as metadata when
     410    # the GDBM file is loaded in
     411
     412    if (defined $filename)
     413    {
     414        # output archivedir if at top level
     415        if ($section eq $doc_obj->get_top_section()) {
     416        print $handle "<archivedir>$archivedir\n";
     417        }
    385418    }
    386419
     
    389422        print $handle "<thistype>$thistype\n";
    390423    }
     424
    391425
    392426    if ($self->{'gdbm_level'} eq "document") {
Note: See TracChangeset for help on using the changeset viewer.