Changeset 215


Ignore:
Timestamp:
1999-03-30T17:08:18+12:00 (25 years ago)
Author:
rjmcnab
Message:

Added code to build a new gdbm format.

Location:
trunk/gsdl
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/buildcol.pl

    r130 r215  
    2929{
    3030    if (!parsargv::parse(\@ARGV,
     31             'newgdbm', \$newgdbm,
    3132             'verbosity/\d+/2', \$verbosity,
    3233             'archivedir/.*/', \$archivedir,
     
    3637    die "\n";
    3738    }
     39
     40    $newgdbm = 0 unless $newgdbm;
    3841
    3942    # get and check the collection
     
    109112
    110113    eval("\$builder = new $buildertype(\$collection, " .
    111      "\$realarchivedir, \$realbuilddir, \$verbosity)");
     114     "\$realarchivedir, \$realbuilddir, \$verbosity, \$newgdbm)");
    112115    die "$@" if $@;
    113116
  • trunk/gsdl/perllib/doc.pm

    r97 r215  
    116116    my $sectionref = $self;
    117117
    118     while ($section ne "") {
     118    while (defined $section && $section ne "") {
    119119    ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
    120120    $num =~ s/^0+(\d)/$1/; # remove leading 0s
     
    260260
    261261# returns a reference to a list of children
    262 
    263262sub get_children {
    264263    my $self = shift (@_);
     
    516515
    517516    foreach $data (@{$section_ptr->{'metadata'}}) {
    518     return $data->[1] if ($data->[0] eq $field);
     517    return $data->[1] if (scalar(@$data) >= 2 && $data->[0] eq $field);
    519518    }
    520519   
  • trunk/gsdl/perllib/mgbuilder.pm

    r139 r215  
    44package mgbuilder;
    55
     6use classify;
    67use cfgread;
    78use colcfg;
     
    2425
    2526sub new {
    26     my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;
     27    my ($class, $collection, $source_dir, $build_dir, $verbosity, $newgdbm) = @_;
    2728
    2829    # create an mgbuilder object
     
    3031              'source_dir'=>$source_dir,
    3132              'build_dir'=>$build_dir,
    32               'verbosity'=>$verbosity}, $class;
     33              'verbosity'=>$verbosity,
     34              'newgdbm'=>$newgdbm}, $class;
    3335
    3436
    3537    # read in the collection configuration file
    36     if (!-e "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg") {
     38    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
     39    if (!-e $colcfgname) {
    3740    die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    3841    }
    39     $self->{'collect_cfg'} = &colcfg::read_collect_cfg ("$ENV{'GSDLHOME'}/collect/" .
    40                             "$collection/etc/collect.cfg");
    41 
     42    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
    4243
    4344    # sort out subcollection indexes
     
    7677    die "\n";
    7778    }
    78    
     79
     80    # load all the classifiers
     81    if ($self->{'newgdbm'}) {
     82    $self->{'classifiers'} = [];
     83    if (open (COLCFG, $colcfgname)) {
     84        while (defined ($line = &cfgread::read_cfg_line('mgbuilder::COLCFG'))) {
     85        if (scalar(@$line) >= 2) {
     86            my $key = shift (@$line);
     87            if ($key eq "classify") {
     88            my $classinfo = &classify::load_classifier($line);
     89            push (@{$self->{'classifiers'}}, $classinfo)
     90                if defined $classinfo;
     91            }
     92        }
     93        }
     94        close (COLCFG);
     95    } else {
     96        print STDERR "mgbuilder::new couldn't read the cfg file $colcfgname\n";
     97        print STDERR "               no classifiers were loaded\n";
     98    }
     99    }
    79100
    80101    # load up the document processor for building
     
    92113
    93114    eval("\$self->{'buildproc'} = new $buildproctype(\$collection, " .
    94      "\$source_dir, \$build_dir, \$verbosity)");
     115     "\$source_dir, \$build_dir, \$verbosity, \$newgdbm)");
    95116    die "$@" if $@;
    96117
     
    436457    print STDERR "\n*** creating the info database\n" if ($self->{'verbosity'} >= 1);
    437458
     459    # init all the classifiers
     460    if ($self->{'newgdbm'}) {
     461    &classify::init_classifiers ($self->{'classifiers'});
     462    }
     463
    438464    # set up the document processor
    439465    $self->{'buildproc'}->set_output_handle ('mgbuilder::PIPEOUT');
    440     $self->{'buildproc'}->set_mode ('infodb');
    441    
    442     # collect the statistics for the text
    443     # -b $maxdocsize sets the maximum document size to be 12 meg
     466    if ($self->{'newgdbm'}) {
     467    $self->{'buildproc'}->set_mode ('newinfodb');
     468    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
     469    } else {
     470    $self->{'buildproc'}->set_mode ('infodb');
     471    }
     472   
     473    # create the infodatabase
    444474    if (!-e "$exedir/txt2db$exe" || !open (PIPEOUT,
    445475            "| $exedir/txt2db$exe $fulldbname")) {
     
    449479    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
    450480           "", {}, $self->{'buildproc'});
     481
     482    # output classification information
     483    if ($self->{'newgdbm'}) {
     484    &classify::output_classify_info ($self->{'classifiers'}, 'mgbuilder::PIPEOUT');
     485    }
     486
    451487    close (PIPEOUT);
    452488}
  • trunk/gsdl/perllib/mgbuildproc.pm

    r139 r215  
    55package mgbuildproc;
    66
     7use classify;
     8use doc;
    79use docproc;
    810use util;
     
    2224    $self->{'build_dir'} = $build_dir;
    2325    $self->{'verbosity'} = $verbosity;
     26    $self->{'classifiers'} = [];
    2427    $self->{'mode'} = "text";
    2528    $self->{'index'} = "section:text";
     
    8184}
    8285
     86sub set_classifiers {
     87    my $self = shift (@_);
     88    my ($classifiers) = @_;
     89
     90    $self->{'classifiers'} = $classifiers;
     91}
     92
    8393sub process {
    8494    my $self = shift (@_);
     
    8797    $self->$method(@_);
    8898}
     99
     100sub newinfodb {
     101    my $self = shift (@_);
     102    my ($doc_obj, $filename) = @_;
     103    my $handle = $self->{'output_handle'};
     104#    $handle = "main::STDOUT";
     105
     106    # this was used in the old version
     107    return if ($doc_obj eq 'classifications');
     108
     109    my $doctype = $doc_obj->get_doc_type();
     110
     111    # only output this document if it is one to be indexed
     112    return if ($doctype ne "indexed_doc");
     113
     114    # this is another document
     115    $self->{'num_docs'} += 1 unless ($doctype eq "classification");
     116
     117    my $section = $doc_obj->get_top_section ();
     118    my $doc_OID = $doc_obj->get_OID();
     119    while (defined $section) {
     120    # update a few statistics
     121    $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
     122    $self->{'num_sections'} += 1 unless ($doctype eq "classification");
     123
     124    # output the section name
     125    if ($section eq "") { print $handle "[$doc_OID]\n"; }
     126    else { print $handle "[$doc_OID.$section]\n"; }
     127
     128    # output the fact that this document is a document
     129    print $handle "<doctype>doc\n";
     130
     131    # output whether this node contains text
     132    if ($doc_obj->get_text_length($section) > 0) {
     133        print $handle "<hastxt>1\n";
     134    } else {
     135        print $handle "<hastxt>0\n";
     136    }
     137
     138    # output all the section metadata
     139    my $metadata = $doc_obj->get_all_metadata ($section);
     140    foreach $pair (@$metadata) {
     141        my ($field, $value) = (@$pair);
     142
     143        if ($field ne "Identifier" && $field !~ /^gsdl/ &&
     144        defined $value && $value ne "") {
     145        # escape problematic stuff
     146        $value =~ s/\\/\\\\/g;
     147        $value =~ s/\n/\\n/g;
     148        $value =~ s/\r/\\r/g;
     149       
     150        print $handle "<$field>$value\n";
     151        }
     152    }
     153
     154    # output a list of children
     155    my $children = $doc_obj->get_children ();
     156    if (scalar(@$children) > 0) {
     157        print $handle "<contains>";
     158        my $firstchild = 1;
     159        foreach $child (@$children) {
     160        print $handle ";" unless $firstchild;
     161        $firstchild = 0;
     162        if ($child eq "") { print $handle "$doc_OID"; }
     163        else { print $handle "$doc_OID.$child"; }
     164        }
     165        print $handle "\n";
     166    }
     167
     168    # output the matching document number
     169    print $handle "<docnum>$self->{'num_sections'}\n";
     170
     171    print $handle '-' x 70, "\n";
     172
     173   
     174    # output a database entry for the document number
     175    print $handle "[$self->{'num_sections'}]\n";
     176    if ($section eq "") { print $handle "<section>$doc_OID\n"; }
     177    else { print $handle "<section>$doc_OID.$section\n"; }
     178    print $handle '-' x 70, "\n";
     179
     180
     181    $section = $doc_obj->get_next_section($section);
     182    }
     183
     184    # classify this document
     185    &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
     186
     187}
     188
     189
    89190
    90191sub infodb {
Note: See TracChangeset for help on using the changeset viewer.