Changeset 14934

Show
Ignore:
Timestamp:
20.12.2007 21:53:14 (12 years ago)
Author:
davidb
Message:

Changes to allow statistics to be calculated for metadata coverage, i.e. for each document, which metadata set prefixes are used, which fields within those prefixes are used, and how many times. This is then aggregated over all the documents and the summary stored as collection-level metadata.

Location:
gsdl/trunk/perllib
Files:
4 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/basebuilder.pm

    r14930 r14934  
    368368    $self->{'buildproc'}->zero_reset();  
    369369 
     370    $self->{'buildproc'}->{'mdprefix_fields'} = {}; 
     371 
    370372    if ($self->{'keepold'}) { 
    371373    # create flat classify structure, ready for new docs to be added 
     
    377379 
    378380    
     381    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},  
     382           "", {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'}); 
     383 
    379384    # this has changed to only output collection meta if its  
    380385    # not in the config file 
    381386    $self->output_collection_meta($handle); 
    382     &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},  
    383            "", {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'}); 
    384387     
    385388    # output classification information 
     
    562565} 
    563566 
    564 # default is to output an empty [collection] entry 
     567 
     568 
     569sub output_collection_meta_start { 
     570    my $self = shift(@_); 
     571    my ($handle) = @_; 
     572     
     573    print $handle "[collection]\n"; 
     574 
     575 
     576 
     577sub output_collection_meta_sets { 
     578    my $self = shift(@_); 
     579    my ($handle) = @_; 
     580 
     581    my $mdprefix_fields = $self->{'buildproc'}->{'mdprefix_fields'}; 
     582 
     583    foreach my $prefix (keys %$mdprefix_fields) 
     584    { 
     585    print $handle "<metadataset>$prefix\n"; 
     586 
     587    foreach my $field (keys %{$mdprefix_fields->{$prefix}}) 
     588    { 
     589        my $val = $mdprefix_fields->{$prefix}->{$field}; 
     590 
     591        print $handle "<metadatalist-$prefix>$field\n"; 
     592        print $handle "<metadatafreq-$prefix-$field>$val\n"; 
     593    } 
     594 
     595    } 
     596 
     597 
     598 
     599sub output_collection_meta_end { 
     600    my $self = shift(@_); 
     601    my ($handle) = @_; 
     602     
     603    print $handle ('-' x 70) . "\n";; 
     604 
     605 
     606 
     607 
     608# default is to output the metadata sets (prefixes) used in collection 
     609 
    565610sub output_collection_meta { 
    566611    my $self = shift(@_); 
    567612    my ($handle) = @_; 
    568      
    569     print $handle "[collection]\n". ('-' x 70) . "\n";; 
     613 
     614    $self->output_collection_meta_start($handle); 
     615    $self->output_collection_meta_sets($handle); 
     616    $self->output_collection_meta_end($handle); 
    570617 
    571618 
  • gsdl/trunk/perllib/basebuildproc.pm

    r12844 r14934  
    297297} 
    298298 
     299 
     300 
     301sub infodb_metadata_stats 
     302{ 
     303    my $self = shift (@_); 
     304    my ($field) = @_; 
     305 
     306    # Keep some statistics relating to metadata sets used and 
     307    # frequency of particular metadata fields within each set 
     308 
     309    # Union of metadata prefixes and frequency of fields 
     310    # (both scoped for this document alone, and across whole collection) 
     311     
     312    if ($field =~ m/^(.+)\.(.*)$/) { 
     313    my $prefix = $1; 
     314    my $core_field = $2; 
     315 
     316    $self->{'doc_mdprefix_fields'}->{$prefix}->{$core_field}++; 
     317    $self->{'mdprefix_fields'}->{$prefix}->{$core_field}++; 
     318    } 
     319    elsif ($field =~ m/^[[:upper:]]/) { 
     320    # implicit 'ex' metadata set 
     321 
     322    $self->{'doc_mdprefix_fields'}->{'ex'}->{$field}++; 
     323    $self->{'mdprefix_fields'}->{'ex'}->{$field}++; 
     324    } 
     325 
     326} 
     327 
     328 
    299329sub infodb { 
    300330    my $self = shift (@_); 
     
    351381    my $first = 1; 
    352382    my $url = ""; 
     383 
     384    $self->{'doc_mdprefix_fields'} = {}; 
     385 
    353386    while (defined $section) { 
    354387    # update a few statistics 
     
    412445        if (!defined $self->{'dontgdbm'}->{$field}) { 
    413446            print $handle "<$field>$value\n"; 
     447 
     448            if ($section eq "") 
     449            { 
     450            $self->infodb_metadata_stats($field); 
     451            } 
    414452        } 
    415453        } 
    416454    } 
    417455 
     456    if ($section eq "") 
     457    { 
     458        my $doc_mdprefix_fields = $self->{'doc_mdprefix_fields'}; 
     459 
     460        foreach my $prefix (keys %$doc_mdprefix_fields) 
     461        { 
     462        print $handle "<metadataset>$prefix\n"; 
     463 
     464        foreach my $field (keys %{$doc_mdprefix_fields->{$prefix}}) 
     465        { 
     466            my $val = $doc_mdprefix_fields->{$prefix}->{$field}; 
     467 
     468            print $handle "<metadatalist-$prefix>$field\n"; 
     469            print $handle "<metadatafreq-$prefix-$field>$val\n"; 
     470        } 
     471 
     472        } 
     473    } 
    418474 
    419475    # If doc_obj reconstructed from GDBM file then no need to  
  • gsdl/trunk/perllib/lucenebuildproc.pm

    r14923 r14934  
    398398# /** process() **/ 
    399399 
     400 
     401# Following methods seem to be no different to those defined in basebuildproc.pm 
     402# From inspection, it looks like these ones can be removed 
     403 
     404 
    400405sub get_num_docs { 
    401406    my $self = shift (@_); 
  • gsdl/trunk/perllib/mgppbuilder.pm

    r14666 r14934  
    634634 
    635635    # do the collection info 
    636     print $handle "[collection]\n"; 
    637      
     636    $self->output_collection_meta_start($handle); 
     637    $self->output_collection_meta_sets($handle); 
     638 
    638639    # first do the collection meta stuff - everything without a dot 
    639640    my $collmetadefined = 0; 
     
    701702    } 
    702703    } 
    703     print $handle $lang_entry; 
    704     # end the collection entry 
    705     print $handle "\n" . ('-' x 70) . "\n";         
    706  
    707  
     704    print $handle "$lang_entry\n"; 
     705 
     706    $self->output_collection_meta_end($handle); 
    708707} 
    709708