Changeset 38738
- Timestamp:
- 2024-02-13T11:12:53+13:00 (4 months ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/classify/List.pm
r36911 r38738 568 568 my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)}; 569 569 foreach my $metadata_value (@metadata_values) { 570 #print STDERR "working with value $metadata_value\n";571 570 # Strip leading and trailing whitespace 572 571 $metadata_value =~ s/^\s*//; … … 696 695 } 697 696 elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) { 698 697 $self->split_approximate_size(\@metadata_groups, $classifier_node, $partition_size_within_level, $partition_sort_mode_within_level, $bookshelf_type_within_level, \%metadata_value_to_OIDs_hash, $self->{'partition_name_length'}, $self->{'max_partition_name_length'}); 699 698 700 699 } … … 763 762 764 763 } 764 765 765 766 766 … … 811 811 $last_partition = $metadata_valuepartition; 812 812 } else { 813 $num_items_in_bucket += ($bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1); #scalar(@metadata_values_in_bucket);813 $num_items_in_bucket += ($bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1); 814 814 push (@metadata_values_in_bucket, $metadata_value); 815 815 } … … 883 883 my $num_splits = int($items_in_partition/$partition_size); 884 884 $num_splits++ if (($items_in_partition - $partition_size*$num_splits) > $tolerance); 885 my $this_partition_size = $items_in_partition/$num_splits; 885 886 my $this_partition_size = int($items_in_partition/$num_splits); 887 if ($this_partition_size < $items_in_partition/$num_splits) { $this_partition_size++ }; 888 886 889 # regenerate-the start, noticing that we are a split 887 890 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $partition_name_length, $max_partition_name_length, $is_numeric, 1); … … 893 896 # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values 894 897 my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1; 895 896 if ($items_done + $items_for_this_md_value > $this_partition_size && $items_done != 0 && ($items_in_partition - ($prev_done+$items_done) >= $tolerance)) { 897 # Save the stored items into a partition 898 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partition_name, \%metadata_values_to_OIDs_subhashes); 898 899 900 if ($items_done >= $this_partition_size) { 901 # what we have stored already is enough for a partition 902 # store this partition 903 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partition_name, \%metadata_values_to_OIDs_subhashes); 899 904 $last_partition_end = $partition_end; 900 905 $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $max_partition_name_length, $is_numeric, 1); … … 903 908 %metadata_values_to_OIDs_subhashes = (); 904 909 } 910 $partition_end = $self->generate_partition_end($metadata_value, $partition_start, $partition_name_length, $max_partition_name_length, $is_numeric); 911 $partition_name = $partition_start; 912 if ($partition_end ne $partition_start) { 913 $partition_name = $partition_name . "-" . $partition_end; 914 } 915 916 if ($items_done + $items_for_this_md_value <= $this_partition_size) { 917 # store all the values for the current metadata to the 918 # current partition 919 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value}; 920 $items_done += $items_for_this_md_value; 921 922 } else { 923 # we only want to add some of the values 924 my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}; 925 my @OIDs_for_this_partition = (); 926 for (my $j=0; $j< $items_for_this_md_value; $j++) { 927 push (@OIDs_for_this_partition, $OIDs_for_this_value[$j]); 928 $items_done++; 929 930 if ($items_done >= $this_partition_size ) { 931 # add the partition 932 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition; 933 934 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partition_name, \%metadata_values_to_OIDs_subhashes); 935 $last_partition_end = $partition_end; 936 $partition_start = $partition_end; # is that right? $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $max_partition_name_length, $is_numeric, 1); 937 $prev_done += $items_done; 938 $items_done = 0; 939 %metadata_values_to_OIDs_subhashes = (); 940 @OIDs_for_this_partition = (); 941 } 942 943 } # for each OID in the list 944 # at the end, have we got some ids not added to a partition? 945 # add them into the subhash 946 if (scalar(@OIDs_for_this_partition)) { 947 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition; 948 } 949 } 950 } # end foreach metadata value 951 # The last partition? 952 if($items_done >0) { 953 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partition_name, \%metadata_values_to_OIDs_subhashes); 954 } 905 955 906 # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions 907 if ($bookshelf_type eq "never" && $items_for_this_md_value > $this_partition_size) { 908 # if this is the case, we would have already stored what we have done in the previous step. 909 my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $max_partition_name_length, $is_numeric); 910 # Get the number of partitions needed for this value 911 my $num_splits = int($items_for_this_md_value / $this_partition_size); 912 $num_splits++ if ($items_for_this_md_value / $this_partition_size > $num_splits); 913 my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}; 914 for (my $i = 0; $i < $num_splits; $i++) { 915 my %OIDs_subhashes_for_this_value = (); 916 my @OIDs_for_this_partition = (); 917 for (my $d = $i * $this_partition_size; $d < (($i+1) * $this_partition_size > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size); $d++) { 918 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]); 919 } 920 921 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values 922 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $this_partition_size) { 923 $partition_start = $partitionname_for_this_value; 924 $partition_name = $partition_start; 925 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition; 926 $items_done += scalar(@OIDs_for_this_partition); 927 $last_partition_end = $partitionname_for_this_value 928 } else { 929 930 # Add an HList for this bucket 931 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition; 932 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value); 933 $last_partition_end = $partitionname_for_this_value; 934 } 935 } 936 } else { 937 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value}; 938 $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1; 939 $partition_end = $self->generate_partition_end($metadata_value, $partition_start, $partition_name_length, $max_partition_name_length, $is_numeric); 940 $partition_name = $partition_start; 941 if ($partition_end ne $partition_start) { 942 $partition_name = $partition_name . "-" . $partition_end; 943 } 944 945 } 946 947 # The last partition 948 if($i == scalar(@metadata_values) - 1 && $items_done >0) { 949 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partition_name, \%metadata_values_to_OIDs_subhashes); 950 } 951 952 } 953 } # end if items in partition > partition size 954 955 } 956 } # end if items in partition > partition size 957 958 } # end for each partition bucket 956 959 957 960 # The partitions are stored in an HList … … 959 962 960 963 } 964 965 961 966 sub split_approximate_size_ORIG 962 967 { … … 1407 1412 my $metadata_group = shift(@metadata_groups); 1408 1413 $classifier_node->{'mdtype'} = $metadata_group; 1409 1410 1414 my $sort_type = $self->{$metadata_group .".metadata_sort_mode_within_level"}; 1411 1415 # Create an entry in the vlist for each value … … 1422 1426 # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values 1423 1427 elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") { 1424 1425 1428 @OIDs = $self->sort_leaf_items(\@OIDs); 1426 1429 foreach my $OID (@OIDs) { … … 1461 1464 # For those docs that don't belong in the sub/next_metadata_group, but which belong 1462 1465 # at this level, just add the documents as children of this list at the current level 1463 1466 @current_level_OIDs = $self->sort_leaf_items(\@current_level_OIDs); 1464 1467 foreach my $current_level_OID (@current_level_OIDs) { 1465 1468 my $offset = $self->metadata_offset($metadata_group, $current_level_OID, $metadata_value); … … 1512 1515 my $sort_type = $self->{$sort_leaf_nodes_usingmetaelem . ".metadata_sort_mode_within_level"}; 1513 1516 # Force a stable sort (Perl 5.6's sort isn't stable) 1514 # !! The [0] bits aren't ideal (multiple metadata values) !! 1517 # !! The [0] bits aren't ideal (multiple metadata values) !! 1518 1515 1519 @OIDs = @OIDs[ sort { 1516 1520 if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]}) && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]}))
Note:
See TracChangeset
for help on using the changeset viewer.