Ignore:
Timestamp:
2007-12-20T21:53:14+13:00 (16 years ago)
Author:
davidb
Message:

Changes to allow statistic calculations for metadata coverage, i.e. for this document, which metadata set prefixes are used, which fields within those prefixes are used, and how many times. This is then aggregated over all the documents and the summary stored as collection-level metadata.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/basebuildproc.pm

    r12844 r14934  
    297297}
    298298
     299
     300
     301sub infodb_metadata_stats
     302{
     303    my $self = shift (@_);
     304    my ($field) = @_;
     305
     306    # Keep some statistics relating to metadata sets used and
     307    # frequency of particular metadata fields within each set
     308
     309    # Union of metadata prefixes and frequency of fields
     310    # (both scoped for this document alone, and across whole collection)
     311   
     312    if ($field =~ m/^(.+)\.(.*)$/) {
     313    my $prefix = $1;
     314    my $core_field = $2;
     315
     316    $self->{'doc_mdprefix_fields'}->{$prefix}->{$core_field}++;
     317    $self->{'mdprefix_fields'}->{$prefix}->{$core_field}++;
     318    }
     319    elsif ($field =~ m/^[[:upper:]]/) {
     320    # implicit 'ex' metadata set
     321
     322    $self->{'doc_mdprefix_fields'}->{'ex'}->{$field}++;
     323    $self->{'mdprefix_fields'}->{'ex'}->{$field}++;
     324    }
     325
     326}
     327
     328
    299329sub infodb {
    300330    my $self = shift (@_);
     
    351381    my $first = 1;
    352382    my $url = "";
     383
     384    $self->{'doc_mdprefix_fields'} = {};
     385
    353386    while (defined $section) {
    354387    # update a few statistics
     
    412445        if (!defined $self->{'dontgdbm'}->{$field}) {
    413446            print $handle "<$field>$value\n";
     447
     448            if ($section eq "")
     449            {
     450            $self->infodb_metadata_stats($field);
     451            }
    414452        }
    415453        }
    416454    }
    417455
     456    if ($section eq "")
     457    {
     458        my $doc_mdprefix_fields = $self->{'doc_mdprefix_fields'};
     459
     460        foreach my $prefix (keys %$doc_mdprefix_fields)
     461        {
     462        print $handle "<metadataset>$prefix\n";
     463
     464        foreach my $field (keys %{$doc_mdprefix_fields->{$prefix}})
     465        {
     466            my $val = $doc_mdprefix_fields->{$prefix}->{$field};
     467
     468            print $handle "<metadatalist-$prefix>$field\n";
     469            print $handle "<metadatafreq-$prefix-$field>$val\n";
     470        }
     471
     472        }
     473    }
    418474
    419475    # If doc_obj reconstructed from GDBM file then no need to
Note: See TracChangeset for help on using the changeset viewer.