Ignore:
Timestamp:
2006-07-04T15:06:34+12:00 (18 years ago)
Author:
davidb
Message:

Improved support for incremental addition. Previously the classifier pass
of buildcol.pl had to be run from scratch (i.e. reading in all documents
from the archives folder) so that correct browse structures were formed --
a simple-to-implement strategy, but not a very efficient one. Now the first
layer of a classifier structure is reconstructed from the GDBM file, then
the new files in the archives directory are added, and finally the
completed browse structure is formed.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/classify.pm

    r11644 r11994  
    3131require AllList;
    3232use gsprintf;
     33
     34use GDBM_File;
    3335
    3436
     
    139141    }
    140142}
     143
     144
     145
     146# takes a hashref containing the metadata for a gdbmfile entry, and extracts
     147# the childrens numbers (from the 'contains' entry).   
     148# assumes format is ".1;".2;".3
     149sub get_children { 
     150    my ($doc_db_hash) = @_;
     151
     152    my $children = undef;
     153
     154    $childs = $doc_db_hash->{'contains'};
     155    if (defined ($childs)) {
     156    $childs =~ s/\@$//;  #remove trailing @
     157    $childs =~ s/^\"\.//; #remove initial ".
     158    @$children = split /\;\"\./, $childs;
     159   
     160    }
     161
     162    return $children;
     163}
     164
     165   
     166sub recurse_sections {
     167    my ($doc_obj, $children, $parentoid, $parentsection, $gdbm_recs) = @_;
     168
     169    return if (!defined $children);
     170
     171    foreach my $child (sort { $a <=> $b} @$children) {
     172    $doc_obj->create_named_section("$parentsection.$child");
     173    my $doc_db_rec = $gdbm_recs->{"$parentoid.$child"};
     174    my $doc_db_hash = db_rec_to_hash($doc_db_rec);
     175
     176    # get child's children
     177    my $newchildren = &get_children($doc_db_hash);
     178
     179    # add content for current section
     180    add_section_content($doc_obj, "$parentsection.$child", $doc_db_hash);
     181
     182    # process all the children if there are any
     183    if (defined ($newchildren))
     184    {
     185        recurse_sections($doc_obj, $newchildren, "$parentoid.$child",
     186                 "$parentsection.$child", $gdbm_recs);
     187    }
     188    }
     189}                       
     190
     191
     192sub add_section_content {
     193    my ($doc_obj, $cursection, $doc_db_hash) = @_;
     194 
     195    foreach $key (keys %$doc_db_hash) {
     196    #don't need to store these metadata
     197    next if $key =~ /(thistype|childtype|contains|docnum|doctype|classifytype)/i;
     198    # but do want things like hastxt and archivedir
     199    my @items = split /@/, $doc_db_hash->{$key};
     200    map {$doc_obj->add_metadata ($cursection, $key, $_); } @items;
     201
     202    }
     203}
     204
     205
     206# gets all the metadata from a gdbm file entry, and puts it into a hashref
     207sub db_rec_to_hash {
     208   
     209    my ($gdb_str_ref) = @_;
     210
     211    my $hashref = {};
     212
     213    my @entries = split(/\n/, $gdb_str_ref);
     214    foreach $entry (@entries) {
     215    my($key, $value) = ($entry =~ /^<([^>]*)>(.*?)$/ );
     216    $hashref->{$key} .= '@' if defined $hashref->{$key};
     217    $hashref->{$key} .= $value;
     218   
     219    }
     220   
     221    return $hashref;
     222}                     
     223
     224
     225sub reconstruct_doc_objs_metadata
     226{
     227    my ($fulldbname) = @_;
     228
     229    tie %gdbm_recs, 'GDBM_File', $fulldbname, &GDBM_WRCREAT, 0640;
     230
     231    # dig out top level doc sections
     232    my %top_sections = ();
     233    foreach my $key ( keys %gdbm_recs )
     234    {
     235    my $md_rec = $gdbm_recs{$key};
     236    my $md_hash = db_rec_to_hash($md_rec);
     237
     238    if ((defined $md_hash->{'doctype'}) && ($md_hash->{'doctype'} eq "doc")) {
     239        next if ($key =~ m/\./);
     240        $top_sections{$key} = $md_hash;
     241    }
     242    }
     243
     244    # for greenstone document objects based on metadata in gdbm file
     245    my @all_docs = ();
     246    foreach my $oid ( keys %top_sections )
     247    {
     248    my $doc_db_hash = $top_sections{$oid};
     249
     250    my $doc_obj = new doc();
     251    $doc_obj->set_OID($oid);
     252
     253    my $top = $doc_obj->get_top_section();
     254        add_section_content ($doc_obj, $top, $doc_db_hash);
     255        my $children = &get_children($doc_db_hash);
     256        recurse_sections($doc_obj, $children, $oid, $top, \%gdbm_recs);
     257
     258    push(@all_docs,$doc_obj);
     259    }   
     260
     261    untie %gdbm_recs;
     262
     263    return \@all_docs;   
     264}
     265
     266
     267
     268
    141269
    142270# classify_doc lets each of the classifiers classify a document
Note: See TracChangeset for help on using the changeset viewer.