Context Navigation

← Previous Changeset
Next Changeset →

Changeset 11994

Timestamp:

2006-07-04T15:06:34+12:00 (18 years ago)

Author:

davidb

Message:

Improved support for incremental addition. Previously the classifier pass
of buildcol.pl had to be run from scratch (i.e. reading in all documents
from the archives folder) so that correct browse structures were formed --
a strategy that is simple to implement, but not very efficient. Now the
first layer of a classifier structure is reconstructed from the GDBM file,
then the new files in the archives directory are added, and finally the
completed browse structure is formed.

Location:

trunk/gsdl/perllib

Files:

: 1 added
: 3 edited

basebuilder.pm (modified) (3 diffs)
basebuildproc.pm (modified) (4 diffs)
classify.pm (modified) (2 diffs)
manifest.pm (added)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/basebuilder.pm

-              r11965
+              r11994
+}
 sub make_infodatabase {
     my $self = shift (@_);
 …
     # init all the classifiers
     &classify::init_classifiers ($self->{'classifiers'});
+    my $reconstructed_docs = undef;
+    if ($self->{'keepold'}) {
+    # reconstruct doc_obj metadata from gdbm for all docs
+    $reconstructed_docs = &classify::reconstruct_doc_objs_metadata($fulldbname);
+    }
     # set up the document processor
 …
     $self->{'buildproc'}->set_indexing_text (0);
     $self->{'buildproc'}->set_store_text(1);
+    # make_infodatabase does not support incremental build
+    # => full reset needed
+    $self->{'buildproc'}->zero_reset();
+    # make_infodatabase needs full reset even for incremental build
+    # as incremental works by reconstructing all docs from GDBM and
+    # then adding in the new ones
+    $self->{'buildproc'}->zero_reset();
+    if ($self->{'keepold'}) {
+    # create flat classify structure, ready for new docs to be added
+    foreach my $doc_obj ( @$reconstructed_docs ) {
+        print $outhandle "  Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
+        $self->{'buildproc'}->process($doc_obj,undef);
+    }
+    }
     # this has changed to only output collection meta if its

trunk/gsdl/perllib/basebuildproc.pm

-              r11793
+              r11994
     return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");
+    my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
+    $archivedir = "" unless defined $archivedir;
+    $archivedir =~ s/\\/\//g;
+    $archivedir =~ s/^\/+//;
+    $archivedir =~ s/\/+$//;
+    # resolve the final filenames of the files associated with this document
+    $self->assoc_files ($doc_obj, $archivedir);
+    my $archivedir = "";
+    if (defined $filename)
+    {
+    # doc_obj derived directly from file
+    my ($dir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
+    $dir = "" unless defined $dir;
+    $dir =~ s/\\/\//g;
+    $dir =~ s/^\/+//;
+    $dir =~ s/\/+$//;
+    $archivedir = $dir;
+    # resolve the final filenames of the files associated with this document
+    $self->assoc_files ($doc_obj, $archivedir);
+    }
+    else
+    {
+    # doc_obj reconstructed from GDBM (has metadata, doc structure but no text)
+    my $top_section = $doc_obj->get_top_section();
+    $archivedir = $doc_obj->get_metadata_element($top_section,"archivedir");
+    }
     #GRB: moved 1/06/2004 from GRB01062004
 …
+    }
+    # output whether this node contains text
+    if ($doc_obj->get_text_length($section) > 0) {
+        print $handle "<hastxt>1\n";
+    } else {
+        print $handle "<hastxt>0\n";
+    # Output whether this node contains text
+    #
+    # If doc_obj reconstructed from GDBM file then no need to
+    # explicitly add <hastxt> as this is preserved as metadata when
+    # the GDBM file is loaded in
+    if (defined $filename)
+    {
+        # doc_obj derived directly from file
+        if ($doc_obj->get_text_length($section) > 0) {
+        print $handle "<hastxt>1\n";
+        } else {
+        print $handle "<hastxt>0\n";
+        }
+    }
 …
+    }
+    # output archivedir if at top level
+    if ($section eq $doc_obj->get_top_section()) {
+        print $handle "<archivedir>$archivedir\n";
+    # If doc_obj reconstructed from GDBM file then no need to
+    # explicitly add <archivedir> as this is preserved as metadata when
+    # the GDBM file is loaded in
+    if (defined $filename)
+    {
+        # output archivedir if at top level
+        if ($section eq $doc_obj->get_top_section()) {
+        print $handle "<archivedir>$archivedir\n";
+        }
+    }
 …
         print $handle "<thistype>$thistype\n";
+    }
     if ($self->{'gdbm_level'} eq "document") {

trunk/gsdl/perllib/classify.pm

-              r11644
+              r11994
 require AllList;
 use gsprintf;
+use GDBM_File;
 …
+    }
+}
# Extracts the child section numbers from the 'contains' entry of a gdbm
# record.
#
# $doc_db_hash - hashref holding the metadata of one gdbm file entry
#
# The 'contains' value is assumed to have the format ".1;".2;".3
# (optionally followed by a trailing '@').
#
# Returns an arrayref of the child numbers (e.g. [1, 2, 3]), or undef when
# the record has no 'contains' entry.
sub get_children {
    my ($doc_db_hash) = @_;

    # work on a lexical copy so neither the record nor a package global
    # is modified
    my $contains = $doc_db_hash->{'contains'};

    # explicit undef (not a bare return) so that list-context callers keep
    # receiving a single undef element
    return undef unless defined $contains;

    $contains =~ s/\@$//;     # remove trailing @
    $contains =~ s/^\"\.//;   # remove initial ".

    my @children = split /\;\"\./, $contains;
    return \@children;
}
# Recreates the sub-sections below one section of a document object, using
# the records stored in the gdbm file, and descends into each sub-section
# in turn.
#
# $doc_obj       - document object the sections are added to
# $children      - arrayref of child section numbers, or undef for a leaf
# $parentoid     - gdbm key of the parent section
# $parentsection - section name of the parent inside $doc_obj
# $gdbm_recs     - hashref giving access to the gdbm records by key
sub recurse_sections {
    my ($doc_obj, $children, $parentoid, $parentsection, $gdbm_recs) = @_;

    # leaf section: nothing below it
    if (!defined $children) {
        return;
    }

    # visit the children in numeric order
    my @ordered = sort { $a <=> $b } @$children;

    for my $num (@ordered) {
        my $section = "$parentsection.$num";
        my $oid     = "$parentoid.$num";

        $doc_obj->create_named_section($section);

        # parse the record belonging to this child section
        my $rec_hash = db_rec_to_hash($gdbm_recs->{$oid});

        # find out which sections hang below this child
        my $grandchildren = get_children($rec_hash);

        # copy the child's own metadata into the document object
        add_section_content($doc_obj, $section, $rec_hash);

        # descend only when there is something below
        if (defined $grandchildren) {
            recurse_sections($doc_obj, $grandchildren, $oid, $section,
                             $gdbm_recs);
        }
    }
}
# Copies the metadata of one gdbm record into a section of a document
# object.
#
# $doc_obj     - document object; must provide
#                add_metadata($section, $name, $value)
# $cursection  - name of the section that receives the metadata
# $doc_db_hash - hashref of the record's metadata; multiple values for one
#                key are joined with '@' (as produced by db_rec_to_hash)
#
# Structural entries (thistype, childtype, contains, docnum, doctype,
# classifytype) are not copied; everything else, including things like
# hastxt and archivedir, is.
sub add_section_content {
    my ($doc_obj, $cursection, $doc_db_hash) = @_;

    foreach my $key (keys %$doc_db_hash) {
        # don't need to store these metadata
        # NOTE(review): the pattern is unanchored, so any key merely
        # containing one of these words is skipped too -- confirm intended
        next if $key =~ /(thistype|childtype|contains|docnum|doctype|classifytype)/i;

        my $value = $doc_db_hash->{$key};
        next unless defined $value;

        # one add_metadata call per '@'-separated value
        foreach my $item (split /@/, $value) {
            $doc_obj->add_metadata($cursection, $key, $item);
        }
    }

    return;
}
# Gets all the metadata from a gdbm file entry and puts it into a hashref.
#
# $gdb_str_ref - the record as one string (despite the name it is used as
#                a plain string, not a reference), holding one
#                "<key>value" entry per line
#
# Returns a hashref mapping each key to its value.  When a key occurs more
# than once its values are joined with '@' in order of appearance.  Lines
# that are not of the form "<key>value" are ignored, and an undefined
# record yields an empty hashref.
sub db_rec_to_hash {
    my ($gdb_str_ref) = @_;

    my $hashref = {};
    return $hashref unless defined $gdb_str_ref;

    foreach my $entry (split(/\n/, $gdb_str_ref)) {
        my ($key, $value) = ($entry =~ /^<([^>]*)>(.*?)$/);

        # blank or malformed line: skip it rather than creating an entry
        # under an undefined key
        next unless defined $key;

        if (defined $hashref->{$key}) {
            $hashref->{$key} .= '@' . $value;
        }
        else {
            $hashref->{$key} = $value;
        }
    }

    return $hashref;
}
# Rebuilds document objects (metadata and section structure only) from the
# records stored in a gdbm file.
#
# $fulldbname - filename of the gdbm database
#
# Returns an arrayref of document objects, one per top-level record whose
# 'doctype' is "doc", ordered by OID.  If the database cannot be opened a
# warning is issued and an empty arrayref is returned.
sub reconstruct_doc_objs_metadata
{
    my ($fulldbname) = @_;

    # lexical hash: the tie must not outlive this call through a package
    # global
    my %gdbm_recs;

    # NOTE(review): the database is only read here, yet it is opened
    # read/write and created when missing; the mode is kept as it was --
    # confirm whether GDBM_READER would be sufficient
    if (!tie(%gdbm_recs, 'GDBM_File', $fulldbname, GDBM_WRCREAT(), 0640)) {
        warn "classify::reconstruct_doc_objs_metadata: unable to open $fulldbname: $!\n";
        return [];
    }

    # dig out top level doc sections
    my %top_sections = ();
    foreach my $key (keys %gdbm_recs) {
        # keys containing '.' are never top level: skip them before
        # paying for parsing the record
        next if ($key =~ m/\./);

        my $md_hash = db_rec_to_hash($gdbm_recs{$key});
        my $doctype = $md_hash->{'doctype'};
        if ((defined $doctype) && ($doctype eq "doc")) {
            $top_sections{$key} = $md_hash;
        }
    }

    # form greenstone document objects based on metadata in gdbm file;
    # the OIDs are sorted so the result has the same order on every run
    my @all_docs = ();
    foreach my $oid (sort keys %top_sections) {
        my $doc_db_hash = $top_sections{$oid};

        my $doc_obj = doc->new();
        $doc_obj->set_OID($oid);

        my $top = $doc_obj->get_top_section();
        add_section_content($doc_obj, $top, $doc_db_hash);

        # rebuild everything below the top section
        my $children = get_children($doc_db_hash);
        recurse_sections($doc_obj, $children, $oid, $top, \%gdbm_recs);

        push(@all_docs, $doc_obj);
    }

    untie %gdbm_recs;

    return \@all_docs;
}
 # classify_doc lets each of the classifiers classify a document

Note: See TracChangeset for help on using the changeset viewer.