Changeset 38882 for main/trunk


Ignore:
Timestamp:
2024-03-27T11:19:44+13:00 (3 months ago)
Author:
kjdon
Message:

Fine-tuning of bucket generation: if one side (alpha or numeric) has made buckets, then the other side must make them too - we don't want to end up with an HList that has some buckets and some documents in it.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/classify/List.pm

    r38738 r38882  
    637637}
    638638
    639 
     639# when using approximate_size or constant_size buckets, if there are less items than partition_size, then we generally don't make a bucket.
     640# However, if the alpha values have partitions and the numerica values don't, then this doesn't display nicely (get documents in an HList)
     641# so if one side has partitions, then we force the other side to have them too.
     642sub will_we_have_partitions
     643{
     644    my $self = shift(@_);
     645    my $metadata_group = shift(@_);
     646    my $metadata_value_to_OIDs_hash_ref = shift(@_);
     647    my $numeric_metadata_value_to_OIDs_hash_ref = shift(@_);
     648
     649    my $partition_type = $self->{$metadata_group . ".partition_type_within_level"};
     650    my $partition_size = $self->{$metadata_group . ".partition_size_within_level"};
     651    my $bookshelf_type = $self->{$metadata_group. ".bookshelf_type"};
     652    my $numeric_partition_type = $self->{$metadata_group . ".numeric_partition_type_within_level"};
     653    my $numeric_partition_size = $self->{$metadata_group . ".numeric_partition_size_within_level"};
     654
     655    if ($partition_type eq "none") { #if this is none, then numeric will also be none, so only need to check one of them.
     656        return 0;
     657    }
     658    # if either alpha or numeric has partitions, then we force partitions both sides
     659    if ($partition_type =~ /^(per_letter|all_values)$/ || $numeric_partition_type =~ /^(single_partition|per_digit|all_values)/) {
     660        return 1;
     661    }
     662    # we are using approximate/constant size buckets - lets count the items to see if either side will be making buckets
     663    my $num_alpha = 0;
     664    my $num_numeric = 0;
     665    if ($bookshelf_type =~ /^(always|duplicate_only)$/) {
     666        # each individual value will be a bookshelf or a single item
     667        $num_alpha = scalar(keys %$metadata_value_to_OIDs_hash_ref);
     668        $num_numeric = scalar(keys %$numeric_metadata_value_to_OIDs_hash_ref);
     669    } else {
     670        # we have to count actual values to see how many entires there are
     671        foreach my $metadata_value (keys %{$metadata_value_to_OIDs_hash_ref}) {
     672            $num_alpha += scalar( @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}});
     673        }
     674        foreach my $metadata_value (keys %{$numeric_metadata_value_to_OIDs_hash_ref}) {
     675            $num_numeric += scalar( @{$numeric_metadata_value_to_OIDs_hash_ref->{$metadata_value}});
     676        }
     677       
     678    }
     679
     680    my $tolerance = 0;
     681    my $numeric_tolerance = 0;
     682    if ($partition_type eq "approximate_size") {
     683        $tolerance = ($partition_size > 40 ? 10 : int($partition_size/4));
     684    }
     685    if ($numeric_partition_type eq "approximate_size" ) {
     686        $numeric_tolerance = ($numeric_partition_size > 40 ? 10 : int($numeric_partition_size/4));
     687    }
     688   
     689    # now, test to see if we have partition on either side - in which case we need to force partitions on the other side
     690    if ($num_alpha > $partition_size+$tolerance || $num_numeric > $numeric_partition_size+$numeric_tolerance) {
     691        return 1;
     692    }
     693    return 0;
     694}
    640695sub add_level
    641696{
     
    679734    my $bookshelf_type_within_level = $self->{$metadata_group. ".bookshelf_type"};
    680735   
     736    my ($has_partitions) = $self->will_we_have_partitions($metadata_group, \%metadata_value_to_OIDs_hash, \%numeric_metadata_value_to_OIDs_hash);
    681737   
    682738    #############################################
     
    684740    #############################################
    685741    if ($self->{'numbers_first'} && keys(%numeric_metadata_value_to_OIDs_hash)) {
    686     $self->partition_numeric_values(\@metadata_groups, $classifier_node, \%numeric_metadata_value_to_OIDs_hash);
     742    $self->partition_numeric_values(\@metadata_groups, $classifier_node, \%numeric_metadata_value_to_OIDs_hash, $has_partitions);
    687743    }
    688744
     
    694750        $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $partition_sort_mode_within_level, \%metadata_value_to_OIDs_hash);
    695751    }
    696     elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
     752    elsif ($partition_type_within_level =~ /^approximate_size$/i && $has_partitions) { # scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
    697753            $self->split_approximate_size(\@metadata_groups, $classifier_node, $partition_size_within_level, $partition_sort_mode_within_level, $bookshelf_type_within_level, \%metadata_value_to_OIDs_hash, $self->{'partition_name_length'}, $self->{'max_partition_name_length'});
    698754       
    699755    }
    700     elsif ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
     756    elsif ($partition_type_within_level =~ /^constant_size$/i && $has_partitions) { #scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
    701757        $self->split_constant_size(\@metadata_groups, $classifier_node, $partition_size_within_level, $partition_sort_mode_within_level, $bookshelf_type_within_level, \%metadata_value_to_OIDs_hash, $self->{'partition_name_length'}, $self->{'max_partition_name_length'});
    702758    }
     
    716772    ###########################################
    717773    if (!$self->{'numbers_first'} && keys(%numeric_metadata_value_to_OIDs_hash)) {
    718     $self->partition_numeric_values(\@metadata_groups, $classifier_node, \%numeric_metadata_value_to_OIDs_hash);
     774    $self->partition_numeric_values(\@metadata_groups, $classifier_node, \%numeric_metadata_value_to_OIDs_hash, $has_partitions);
    719775    }
    720776}
     
    726782    my $classifier_node = shift(@_);
    727783    my $numeric_metadata_value_to_OIDs_hash_ref = shift(@_);
     784    my $has_partitions = shift(@_);
    728785
    729786    my $metadata_group = $metadata_groups[0];
     
    744801    $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $numeric_partition_sort_mode_within_level, $numeric_metadata_value_to_OIDs_hash_ref, 1, $numeric_partition_name_length_within_level);
    745802    }
    746     elsif ($numeric_partition_type_within_level eq "constant_size" && scalar(keys %$numeric_metadata_value_to_OIDs_hash_ref) > $numeric_partition_size_within_level) {
     803    elsif ($numeric_partition_type_within_level eq "constant_size" && $has_partitions) { #scalar(keys %$numeric_metadata_value_to_OIDs_hash_ref) > $numeric_partition_size_within_level) {
    747804    # Generate hlists of a certain size
    748805   
    749806    $self->split_constant_size(\@metadata_groups, $classifier_node, $numeric_partition_size_within_level, $numeric_partition_sort_mode_within_level, $bookshelf_type_within_level, $numeric_metadata_value_to_OIDs_hash_ref, $numeric_partition_name_length_within_level,$numeric_partition_name_length_within_level, 1);
    750     } elsif ($numeric_partition_type_within_level eq "approximate_size" && scalar(keys %$numeric_metadata_value_to_OIDs_hash_ref) > $numeric_partition_size_within_level) {
     807    } elsif ($numeric_partition_type_within_level eq "approximate_size" && $has_partitions) { #scalar(keys %$numeric_metadata_value_to_OIDs_hash_ref) > $numeric_partition_size_within_level) {
    751808    $self->split_approximate_size(\@metadata_groups, $classifier_node, $numeric_partition_size_within_level, $numeric_partition_sort_mode_within_level, $bookshelf_type_within_level, $numeric_metadata_value_to_OIDs_hash_ref, $numeric_partition_name_length_within_level, $numeric_partition_name_length_within_level, 1);
    752809    }
     
    779836    my $is_numeric = shift(@_);
    780837
    781     my $tolerance = 10; # should this be an option??
     838    my $tolerance = ($partition_size > 40 ? 10 : int($partition_size/4)); # should this be an option??
    782839
    783840    # Generate hlist based on the first letter of the metadata value (like per_letter), or based on
Note: See TracChangeset for help on using the changeset viewer.