Changeset 11994


Ignore:
Timestamp:
2006-07-04T15:06:34+12:00 (18 years ago)
Author:
davidb
Message:

Improved support for incremental addition. Previously the classifier pass
of buildcol.pl had to be run from scratch (i.e. reading in all documents
from the archives folder) so that correct browse structures were formed --
a strategy that is simple to implement, but not very efficient. Now the
first layer of a classifier structure is reconstructed from the GDBM file,
the new files in the archives directory are then added to it, and finally
the completed browse structure is formed.

Location:
trunk/gsdl/perllib
Files:
1 added
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/basebuilder.pm

    r11965 r11994  
    274274}
    275275
     276
     277
    276278sub make_infodatabase {
    277279    my $self = shift (@_);
     
    299301    # init all the classifiers
    300302    &classify::init_classifiers ($self->{'classifiers'});
     303
     304
     305    my $reconstructed_docs = undef;
     306    if ($self->{'keepold'}) {
     307    # reconstruct doc_obj metadata from gdbm for all docs
     308    $reconstructed_docs = &classify::reconstruct_doc_objs_metadata($fulldbname);
     309    }
    301310   
    302311    # set up the document processor
     
    319328    $self->{'buildproc'}->set_indexing_text (0);
    320329    $self->{'buildproc'}->set_store_text(1);
    321     # make_infodatabase does not support incremental build
    322     # => full reset needed
    323     $self->{'buildproc'}->zero_reset();
     330
     331    # make_infodatabase needs full reset even for incremental build
     332    # as incremental works by reconstructing all docs from GDBM and
     333    # then adding in the new ones
     334    $self->{'buildproc'}->zero_reset();
     335
     336    if ($self->{'keepold'}) {
     337    # create flat classify structure, ready for new docs to be added
     338    foreach my $doc_obj ( @$reconstructed_docs ) {     
     339        print $outhandle "  Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
     340        $self->{'buildproc'}->process($doc_obj,undef);
     341    }
     342    }
     343
    324344   
    325345    # this has changed to only output collection meta if its
  • trunk/gsdl/perllib/basebuildproc.pm

    r11793 r11994  
    296296    return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");
    297297
    298     my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
    299     $archivedir = "" unless defined $archivedir;
    300     $archivedir =~ s/\\/\//g;
    301     $archivedir =~ s/^\/+//;
    302     $archivedir =~ s/\/+$//;
    303 
    304     # resolve the final filenames of the files associated with this document
    305     $self->assoc_files ($doc_obj, $archivedir);
     298    my $archivedir = "";
     299
     300    if (defined $filename)
     301    {
     302    # doc_obj derived directly from file
     303
     304    my ($dir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
     305    $dir = "" unless defined $dir;
     306    $dir =~ s/\\/\//g;
     307    $dir =~ s/^\/+//;
     308    $dir =~ s/\/+$//;
     309
     310    $archivedir = $dir;
     311
     312    # resolve the final filenames of the files associated with this document
     313    $self->assoc_files ($doc_obj, $archivedir);
     314    }
     315    else
     316    {
     317    # doc_obj reconstructed from GDBM (has metadata, doc structure but no text)
     318    my $top_section = $doc_obj->get_top_section();
     319    $archivedir = $doc_obj->get_metadata_element($top_section,"archivedir");
     320    }
     321
    306322
    307323    #GRB: moved 1/06/2004 from GRB01062004
     
    340356    }
    341357
    342     # output whether this node contains text
    343     if ($doc_obj->get_text_length($section) > 0) {
    344         print $handle "<hastxt>1\n";
    345     } else {
    346         print $handle "<hastxt>0\n";
     358    # Output whether this node contains text
     359    #
     360    # If doc_obj reconstructed from GDBM file then no need to
     361    # explicitly add <hastxt> as this is preserved as metadata when
     362    # the GDBM file is loaded in
     363
     364    if (defined $filename)
     365    {
     366        # doc_obj derived directly from file
     367        if ($doc_obj->get_text_length($section) > 0) {
     368        print $handle "<hastxt>1\n";
     369        } else {
     370        print $handle "<hastxt>0\n";
     371        }
    347372    }
    348373
     
    380405    }
    381406
    382     # output archivedir if at top level
    383     if ($section eq $doc_obj->get_top_section()) {
    384         print $handle "<archivedir>$archivedir\n";
     407
     408    # If doc_obj reconstructed from GDBM file then no need to
     409    # explicitly add <archivedir> as this is preserved as metadata when
     410    # the GDBM file is loaded in
     411
     412    if (defined $filename)
     413    {
     414        # output archivedir if at top level
     415        if ($section eq $doc_obj->get_top_section()) {
     416        print $handle "<archivedir>$archivedir\n";
     417        }
    385418    }
    386419
     
    389422        print $handle "<thistype>$thistype\n";
    390423    }
     424
    391425
    392426    if ($self->{'gdbm_level'} eq "document") {
  • trunk/gsdl/perllib/classify.pm

    r11644 r11994  
    3131require AllList;
    3232use gsprintf;
     33
     34use GDBM_File;
    3335
    3436
     
    139141    }
    140142}
     143
     144
     145
     146# takes a hashref containing the metadata for a gdbmfile entry, and extracts
     147# the childrens numbers (from the 'contains' entry).   
     148# assumes format is ".1;".2;".3
     149sub get_children { 
     150    my ($doc_db_hash) = @_;
     151
     152    my $children = undef;
     153
     154    $childs = $doc_db_hash->{'contains'};
     155    if (defined ($childs)) {
     156    $childs =~ s/\@$//;  #remove trailing @
     157    $childs =~ s/^\"\.//; #remove initial ".
     158    @$children = split /\;\"\./, $childs;
     159   
     160    }
     161
     162    return $children;
     163}
     164
     165   
     166sub recurse_sections {
     167    my ($doc_obj, $children, $parentoid, $parentsection, $gdbm_recs) = @_;
     168
     169    return if (!defined $children);
     170
     171    foreach my $child (sort { $a <=> $b} @$children) {
     172    $doc_obj->create_named_section("$parentsection.$child");
     173    my $doc_db_rec = $gdbm_recs->{"$parentoid.$child"};
     174    my $doc_db_hash = db_rec_to_hash($doc_db_rec);
     175
     176    # get child's children
     177    my $newchildren = &get_children($doc_db_hash);
     178
     179    # add content for current section
     180    add_section_content($doc_obj, "$parentsection.$child", $doc_db_hash);
     181
     182    # process all the children if there are any
     183    if (defined ($newchildren))
     184    {
     185        recurse_sections($doc_obj, $newchildren, "$parentoid.$child",
     186                 "$parentsection.$child", $gdbm_recs);
     187    }
     188    }
     189}                       
     190
     191
     192sub add_section_content {
     193    my ($doc_obj, $cursection, $doc_db_hash) = @_;
     194 
     195    foreach $key (keys %$doc_db_hash) {
     196    #don't need to store these metadata
     197    next if $key =~ /(thistype|childtype|contains|docnum|doctype|classifytype)/i;
     198    # but do want things like hastxt and archivedir
     199    my @items = split /@/, $doc_db_hash->{$key};
     200    map {$doc_obj->add_metadata ($cursection, $key, $_); } @items;
     201
     202    }
     203}
     204
     205
     206# gets all the metadata from a gdbm file entry, and puts it into a hashref
     207sub db_rec_to_hash {
     208   
     209    my ($gdb_str_ref) = @_;
     210
     211    my $hashref = {};
     212
     213    my @entries = split(/\n/, $gdb_str_ref);
     214    foreach $entry (@entries) {
     215    my($key, $value) = ($entry =~ /^<([^>]*)>(.*?)$/ );
     216    $hashref->{$key} .= '@' if defined $hashref->{$key};
     217    $hashref->{$key} .= $value;
     218   
     219    }
     220   
     221    return $hashref;
     222}                     
     223
     224
     225sub reconstruct_doc_objs_metadata
     226{
     227    my ($fulldbname) = @_;
     228
     229    tie %gdbm_recs, 'GDBM_File', $fulldbname, &GDBM_WRCREAT, 0640;
     230
     231    # dig out top level doc sections
     232    my %top_sections = ();
     233    foreach my $key ( keys %gdbm_recs )
     234    {
     235    my $md_rec = $gdbm_recs{$key};
     236    my $md_hash = db_rec_to_hash($md_rec);
     237
     238    if ((defined $md_hash->{'doctype'}) && ($md_hash->{'doctype'} eq "doc")) {
     239        next if ($key =~ m/\./);
     240        $top_sections{$key} = $md_hash;
     241    }
     242    }
     243
     244    # for greenstone document objects based on metadata in gdbm file
     245    my @all_docs = ();
     246    foreach my $oid ( keys %top_sections )
     247    {
     248    my $doc_db_hash = $top_sections{$oid};
     249
     250    my $doc_obj = new doc();
     251    $doc_obj->set_OID($oid);
     252
     253    my $top = $doc_obj->get_top_section();
     254        add_section_content ($doc_obj, $top, $doc_db_hash);
     255        my $children = &get_children($doc_db_hash);
     256        recurse_sections($doc_obj, $children, $oid, $top, \%gdbm_recs);
     257
     258    push(@all_docs,$doc_obj);
     259    }   
     260
     261    untie %gdbm_recs;
     262
     263    return \@all_docs;   
     264}
     265
     266
     267
     268
    141269
    142270# classify_doc lets each of the classifiers classify a document
Note: See TracChangeset for help on using the changeset viewer.