Ignore:
Timestamp:
2000-11-23T11:55:50+13:00 (23 years ago)
Author:
kjm18
Message:

updated to resemble the corresponding updated mg versions

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/mgppbuildproc.pm

    r932 r1694  
    4242
    4343sub new {
    44     my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;
     44    my ($class, $collection, $source_dir, $build_dir,
     45    $verbosity, $outhandle) = @_;
    4546    my $self = new docproc ();
     47
     48    # outhandle is where all the debugging info goes
     49    # output_handle is where the output of the plugins is piped
     50    # to (i.e. mg, gdbm etc.)
     51    $outhandle = STDERR unless defined $outhandle;
    4652
    4753    $self->{'collection'} = $collection;
     
    5965    $self->{'num_sections'} = 0;
    6066    $self->{'num_bytes'} = 0;
     67    $self->{'num_processed_bytes'} = 0;
     68    $self->{'outhandle'} = $outhandle;
    6169
    6270    $self->{'indexing_text'} = 0;
     
    7078    $self->{'num_docs'} = 0;
    7179    $self->{'num_sections'} = 0;
     80    $self->{'num_processed_bytes'} = 0;
    7281    $self->{'num_bytes'} = 0;
    7382}
     
    8594}
    8695
     96# num_bytes is the actual number of bytes in the collection
     97# this is normally the same as what's processed during text compression
    8798sub get_num_bytes {
    8899    my $self = shift (@_);
    89100
    90101    return $self->{'num_bytes'};
     102}
     103
     104# num_processed_bytes is the number of bytes actually passed
     105# to mgpp for the current index
     106sub get_num_processed_bytes {
     107    my $self = shift (@_);
     108
     109    return $self->{'num_processed_bytes'};
    91110}
    92111
     
    127146}
    128147
     148sub get_index {
     149    my $self = shift (@_);
     150
     151    return $self->{'index'};
     152}
     153
    129154sub set_classifiers {
    130155    my $self = shift (@_);
     
    139164
    140165    $self->{'indexing_text'} = $indexing_text;
     166}
     167
     168sub get_indexing_text {
     169    my $self = shift (@_);
     170
     171    return $self->{'indexing_text'};
    141172}
    142173
     
    240271    else { print $handle "[$doc_OID.$section]\n"; }
    241272
    242     # output the fact that this document is a document
    243     #print $handle "<doctype>doc\n";
     273    # output the fact that this document is a document (unless doctype
     274    # has been set to something else from within a plugin)
     275    my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
     276    if (!defined $dtype || $dtype !~ /\w/) {
     277        print $handle "<doctype>doc\n";
     278    }
    244279
    245280    # output whether this node contains text
     
    251286
    252287    # output all the section metadata
    253     my $found_doctype = 0;
     288    #my $found_doctype = 0;
    254289    my $metadata = $doc_obj->get_all_metadata ($section);
    255290    foreach $pair (@$metadata) {
    256291        my ($field, $value) = (@$pair);
    257292
    258         $found_doctype = 1 if $field eq "doctype";
     293        #$found_doctype = 1 if $field eq "doctype";
    259294        if ($field ne "Identifier" && $field !~ /^gsdl/ &&
    260295        defined $value && $value ne "") {
     
    282317    # (unless doctype was already output as part of
    283318    # metadata)
    284     if (!$found_doctype && !defined $self->{'dontgdbm'}->{'doctype'}) {
    285         print $handle "<doctype>doc\n";
    286     }
     319    #if (!$found_doctype && !defined $self->{'dontgdbm'}->{'doctype'}) {
     320    #    print $handle "<doctype>doc\n";
     321    #}
    287322   
    288323   
Note: See TracChangeset for help on using the changeset viewer.