Changeset 14934


Ignore:
Timestamp:
2007-12-20T21:53:14+13:00 (14 years ago)
Author:
davidb
Message:

Changes to allow statistic calculations for metadata coverage, i.e. for this document, which metadata set prefixes are used, which fields within those prefixes are used, and how many times. This is then aggregated over all the documents and the summary stored as collection-level metadata.

Location:
gsdl/trunk/perllib
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/basebuilder.pm

    r14930 r14934  
    368368    $self->{'buildproc'}->zero_reset();
    369369
     370    $self->{'buildproc'}->{'mdprefix_fields'} = {};
     371
    370372    if ($self->{'keepold'}) {
    371373    # create flat classify structure, ready for new docs to be added
     
    377379
    378380   
     381    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
     382           "", {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});
     383
    379384    # this has changed to only output collection meta if its
    380385    # not in the config file
    381386    $self->output_collection_meta($handle);
    382     &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
    383            "", {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});
    384387   
    385388    # output classification information
     
    562565}
    563566
    564 # default is to output an empty [collection] entry
     567
     568
     569sub output_collection_meta_start {
     570    my $self = shift(@_);
     571    my ($handle) = @_;
     572   
     573    print $handle "[collection]\n";
     574
     575
     576
     577sub output_collection_meta_sets {
     578    my $self = shift(@_);
     579    my ($handle) = @_;
     580
     581    my $mdprefix_fields = $self->{'buildproc'}->{'mdprefix_fields'};
     582
     583    foreach my $prefix (keys %$mdprefix_fields)
     584    {
     585    print $handle "<metadataset>$prefix\n";
     586
     587    foreach my $field (keys %{$mdprefix_fields->{$prefix}})
     588    {
     589        my $val = $mdprefix_fields->{$prefix}->{$field};
     590
     591        print $handle "<metadatalist-$prefix>$field\n";
     592        print $handle "<metadatafreq-$prefix-$field>$val\n";
     593    }
     594
     595    }
     596
     597
     598
     599sub output_collection_meta_end {
     600    my $self = shift(@_);
     601    my ($handle) = @_;
     602   
     603    print $handle ('-' x 70) . "\n";;
     604
     605
     606
     607
     608# default is to output the metadata sets (prefixes) used in collection
     609
    565610sub output_collection_meta {
    566611    my $self = shift(@_);
    567612    my ($handle) = @_;
    568    
    569     print $handle "[collection]\n". ('-' x 70) . "\n";;
     613
     614    $self->output_collection_meta_start($handle);
     615    $self->output_collection_meta_sets($handle);
     616    $self->output_collection_meta_end($handle);
    570617
    571618
  • gsdl/trunk/perllib/basebuildproc.pm

    r12844 r14934  
    297297}
    298298
     299
     300
     301sub infodb_metadata_stats
     302{
     303    my $self = shift (@_);
     304    my ($field) = @_;
     305
     306    # Keep some statistics relating to metadata sets used and
     307    # frequency of particular metadata fields within each set
     308
     309    # Union of metadata prefixes and frequency of fields
     310    # (both scoped for this document alone, and across whole collection)
     311   
     312    if ($field =~ m/^(.+)\.(.*)$/) {
     313    my $prefix = $1;
     314    my $core_field = $2;
     315
     316    $self->{'doc_mdprefix_fields'}->{$prefix}->{$core_field}++;
     317    $self->{'mdprefix_fields'}->{$prefix}->{$core_field}++;
     318    }
     319    elsif ($field =~ m/^[[:upper:]]/) {
     320    # implicit 'ex' metadata set
     321
     322    $self->{'doc_mdprefix_fields'}->{'ex'}->{$field}++;
     323    $self->{'mdprefix_fields'}->{'ex'}->{$field}++;
     324    }
     325
     326}
     327
     328
    299329sub infodb {
    300330    my $self = shift (@_);
     
    351381    my $first = 1;
    352382    my $url = "";
     383
     384    $self->{'doc_mdprefix_fields'} = {};
     385
    353386    while (defined $section) {
    354387    # update a few statistics
     
    412445        if (!defined $self->{'dontgdbm'}->{$field}) {
    413446            print $handle "<$field>$value\n";
     447
     448            if ($section eq "")
     449            {
     450            $self->infodb_metadata_stats($field);
     451            }
    414452        }
    415453        }
    416454    }
    417455
     456    if ($section eq "")
     457    {
     458        my $doc_mdprefix_fields = $self->{'doc_mdprefix_fields'};
     459
     460        foreach my $prefix (keys %$doc_mdprefix_fields)
     461        {
     462        print $handle "<metadataset>$prefix\n";
     463
     464        foreach my $field (keys %{$doc_mdprefix_fields->{$prefix}})
     465        {
     466            my $val = $doc_mdprefix_fields->{$prefix}->{$field};
     467
     468            print $handle "<metadatalist-$prefix>$field\n";
     469            print $handle "<metadatafreq-$prefix-$field>$val\n";
     470        }
     471
     472        }
     473    }
    418474
    419475    # If doc_obj reconstructed from GDBM file then no need to
  • gsdl/trunk/perllib/lucenebuildproc.pm

    r14923 r14934  
    398398# /** process() **/
    399399
     400
     401# Following methods seem to be no different to those defined in basebuildproc.pm
     402# From inspection, it looks like these ones can be removed
     403
     404
    400405sub get_num_docs {
    401406    my $self = shift (@_);
  • gsdl/trunk/perllib/mgppbuilder.pm

    r14666 r14934  
    634634
    635635    # do the collection info
    636     print $handle "[collection]\n";
    637    
     636    $self->output_collection_meta_start($handle);
     637    $self->output_collection_meta_sets($handle);
     638
    638639    # first do the collection meta stuff - everything without a dot
    639640    my $collmetadefined = 0;
     
    701702    }
    702703    }
    703     print $handle $lang_entry;
    704     # end the collection entry
    705     print $handle "\n" . ('-' x 70) . "\n";       
    706 
    707 
     704    print $handle "$lang_entry\n";
     705
     706    $self->output_collection_meta_end($handle);
    708707}
    709708
Note: See TracChangeset for help on using the changeset viewer.