Changeset 38882 for main/trunk
- Timestamp:
- 2024-03-27T11:19:44+13:00 (3 months ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/classify/List.pm
r38738 r38882 637 637 } 638 638 639 639 # when using approximate_size or constant_size buckets, if there are fewer items than partition_size, then we generally don't make a bucket. 640 # However, if the alpha values have partitions and the numeric values don't, then this doesn't display nicely (get documents in an HList) 641 # so if one side has partitions, then we force the other side to have them too. 642 sub will_we_have_partitions 643 { 644 my $self = shift(@_); 645 my $metadata_group = shift(@_); 646 my $metadata_value_to_OIDs_hash_ref = shift(@_); 647 my $numeric_metadata_value_to_OIDs_hash_ref = shift(@_); 648 649 my $partition_type = $self->{$metadata_group . ".partition_type_within_level"}; 650 my $partition_size = $self->{$metadata_group . ".partition_size_within_level"}; 651 my $bookshelf_type = $self->{$metadata_group. ".bookshelf_type"}; 652 my $numeric_partition_type = $self->{$metadata_group . ".numeric_partition_type_within_level"}; 653 my $numeric_partition_size = $self->{$metadata_group . ".numeric_partition_size_within_level"}; 654 655 if ($partition_type eq "none") { #if this is none, then numeric will also be none, so only need to check one of them. 
656 return 0; 657 } 658 # if either alpha or numeric has partitions, then we force partitions on both sides 659 if ($partition_type =~ /^(per_letter|all_values)$/ || $numeric_partition_type =~ /^(single_partition|per_digit|all_values)/) { 660 return 1; 661 } 662 # we are using approximate/constant size buckets - let's count the items to see if either side will be making buckets 663 my $num_alpha = 0; 664 my $num_numeric = 0; 665 if ($bookshelf_type =~ /^(always|duplicate_only)$/) { 666 # each individual value will be a bookshelf or a single item 667 $num_alpha = scalar(keys %$metadata_value_to_OIDs_hash_ref); 668 $num_numeric = scalar(keys %$numeric_metadata_value_to_OIDs_hash_ref); 669 } else { 670 # we have to count actual values to see how many entries there are 671 foreach my $metadata_value (keys %{$metadata_value_to_OIDs_hash_ref}) { 672 $num_alpha += scalar( @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}); 673 } 674 foreach my $metadata_value (keys %{$numeric_metadata_value_to_OIDs_hash_ref}) { 675 $num_numeric += scalar( @{$numeric_metadata_value_to_OIDs_hash_ref->{$metadata_value}}); 676 } 677 678 } 679 680 my $tolerance = 0; 681 my $numeric_tolerance = 0; 682 if ($partition_type eq "approximate_size") { 683 $tolerance = ($partition_size > 40 ? 10 : int($partition_size/4)); 684 } 685 if ($numeric_partition_type eq "approximate_size" ) { 686 $numeric_tolerance = ($numeric_partition_size > 40 ? 10 : int($numeric_partition_size/4)); 687 } 688 689 # now, test to see if we have partitions on either side - in which case we need to force partitions on the other side 690 if ($num_alpha > $partition_size+$tolerance || $num_numeric > $numeric_partition_size+$numeric_tolerance) { 691 return 1; 692 } 693 return 0; 694 } 640 695 sub add_level 641 696 { … … 679 734 my $bookshelf_type_within_level = $self->{$metadata_group. 
".bookshelf_type"}; 680 735 736 my ($has_partitions) = $self->will_we_have_partitions($metadata_group, \%metadata_value_to_OIDs_hash, \%numeric_metadata_value_to_OIDs_hash); 681 737 682 738 ############################################# … … 684 740 ############################################# 685 741 if ($self->{'numbers_first'} && keys(%numeric_metadata_value_to_OIDs_hash)) { 686 $self->partition_numeric_values(\@metadata_groups, $classifier_node, \%numeric_metadata_value_to_OIDs_hash );742 $self->partition_numeric_values(\@metadata_groups, $classifier_node, \%numeric_metadata_value_to_OIDs_hash, $has_partitions); 687 743 } 688 744 … … 694 750 $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $partition_sort_mode_within_level, \%metadata_value_to_OIDs_hash); 695 751 } 696 elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {752 elsif ($partition_type_within_level =~ /^approximate_size$/i && $has_partitions) { # scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) { 697 753 $self->split_approximate_size(\@metadata_groups, $classifier_node, $partition_size_within_level, $partition_sort_mode_within_level, $bookshelf_type_within_level, \%metadata_value_to_OIDs_hash, $self->{'partition_name_length'}, $self->{'max_partition_name_length'}); 698 754 699 755 } 700 elsif ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {756 elsif ($partition_type_within_level =~ /^constant_size$/i && $has_partitions) { #scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) { 701 757 $self->split_constant_size(\@metadata_groups, $classifier_node, $partition_size_within_level, $partition_sort_mode_within_level, $bookshelf_type_within_level, \%metadata_value_to_OIDs_hash, $self->{'partition_name_length'}, $self->{'max_partition_name_length'}); 702 758 } … … 716 772 
########################################### 717 773 if (!$self->{'numbers_first'} && keys(%numeric_metadata_value_to_OIDs_hash)) { 718 $self->partition_numeric_values(\@metadata_groups, $classifier_node, \%numeric_metadata_value_to_OIDs_hash );774 $self->partition_numeric_values(\@metadata_groups, $classifier_node, \%numeric_metadata_value_to_OIDs_hash, $has_partitions); 719 775 } 720 776 } … … 726 782 my $classifier_node = shift(@_); 727 783 my $numeric_metadata_value_to_OIDs_hash_ref = shift(@_); 784 my $has_partitions = shift(@_); 728 785 729 786 my $metadata_group = $metadata_groups[0]; … … 744 801 $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $numeric_partition_sort_mode_within_level, $numeric_metadata_value_to_OIDs_hash_ref, 1, $numeric_partition_name_length_within_level); 745 802 } 746 elsif ($numeric_partition_type_within_level eq "constant_size" && scalar(keys %$numeric_metadata_value_to_OIDs_hash_ref) > $numeric_partition_size_within_level) {803 elsif ($numeric_partition_type_within_level eq "constant_size" && $has_partitions) { #scalar(keys %$numeric_metadata_value_to_OIDs_hash_ref) > $numeric_partition_size_within_level) { 747 804 # Generate hlists of a certain size 748 805 749 806 $self->split_constant_size(\@metadata_groups, $classifier_node, $numeric_partition_size_within_level, $numeric_partition_sort_mode_within_level, $bookshelf_type_within_level, $numeric_metadata_value_to_OIDs_hash_ref, $numeric_partition_name_length_within_level,$numeric_partition_name_length_within_level, 1); 750 } elsif ($numeric_partition_type_within_level eq "approximate_size" && scalar(keys %$numeric_metadata_value_to_OIDs_hash_ref) > $numeric_partition_size_within_level) {807 } elsif ($numeric_partition_type_within_level eq "approximate_size" && $has_partitions) { #scalar(keys %$numeric_metadata_value_to_OIDs_hash_ref) > $numeric_partition_size_within_level) { 751 808 $self->split_approximate_size(\@metadata_groups, $classifier_node, 
$numeric_partition_size_within_level, $numeric_partition_sort_mode_within_level, $bookshelf_type_within_level, $numeric_metadata_value_to_OIDs_hash_ref, $numeric_partition_name_length_within_level, $numeric_partition_name_length_within_level, 1); 752 809 } … … 779 836 my $is_numeric = shift(@_); 780 837 781 my $tolerance = 10; # should this be an option??838 my $tolerance = ($partition_size > 40 ? 10 : int($partition_size/4)); # should this be an option?? 782 839 783 840 # Generate hlist based on the first letter of the metadata value (like per_letter), or based on
Note:
See TracChangeset
for help on using the changeset viewer.