Changeset 38738


Ignore:
Timestamp:
2024-02-13T11:12:53+13:00 (4 months ago)
Author:
kjdon
Message:

Some more work on the logic of split_approximate_size for the case where a large bucket needs splitting and contains a metadata value with multiple OIDs. Hopefully I have it right now.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/classify/List.pm

    r36911 r38738  
    568568        my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
    569569        foreach my $metadata_value (@metadata_values) {
    570             #print STDERR "working with value $metadata_value\n";
    571570            # Strip leading and trailing whitespace
    572571            $metadata_value =~ s/^\s*//;
     
    696695    }
    697696    elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
    698         $self->split_approximate_size(\@metadata_groups, $classifier_node, $partition_size_within_level, $partition_sort_mode_within_level, $bookshelf_type_within_level, \%metadata_value_to_OIDs_hash, $self->{'partition_name_length'}, $self->{'max_partition_name_length'});
     697            $self->split_approximate_size(\@metadata_groups, $classifier_node, $partition_size_within_level, $partition_sort_mode_within_level, $bookshelf_type_within_level, \%metadata_value_to_OIDs_hash, $self->{'partition_name_length'}, $self->{'max_partition_name_length'});
    699698       
    700699    }
     
    763762   
    764763}
     764
    765765
    766766
     
    811811        $last_partition = $metadata_valuepartition;
    812812    } else {
    813         $num_items_in_bucket += ($bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1); #scalar(@metadata_values_in_bucket);
     813        $num_items_in_bucket += ($bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1);
    814814        push (@metadata_values_in_bucket, $metadata_value);
    815815    }
     
    883883            my $num_splits = int($items_in_partition/$partition_size);
    884884            $num_splits++ if (($items_in_partition - $partition_size*$num_splits) > $tolerance);
    885             my $this_partition_size = $items_in_partition/$num_splits;
     885           
     886            my $this_partition_size = int($items_in_partition/$num_splits);
     887            if ($this_partition_size < $items_in_partition/$num_splits) { $this_partition_size++ };
     888
    886889            # regenerate-the start, noticing that we are a split
    887890            $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $partition_name_length, $max_partition_name_length, $is_numeric, 1);
     
    893896        # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
    894897        my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1;
    895        
    896         if ($items_done + $items_for_this_md_value > $this_partition_size && $items_done != 0 && ($items_in_partition - ($prev_done+$items_done) >= $tolerance)) {
    897             # Save the stored items into a partition
    898             $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partition_name, \%metadata_values_to_OIDs_subhashes);
     898
     899               
     900                if ($items_done >= $this_partition_size) {
     901                    # what we have stored already is enough for a partition
     902                    # store this partition
     903            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partition_name, \%metadata_values_to_OIDs_subhashes);
    899904            $last_partition_end = $partition_end;           
    900905            $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $max_partition_name_length, $is_numeric, 1);
     
    903908            %metadata_values_to_OIDs_subhashes = ();
    904909        }
     910                $partition_end = $self->generate_partition_end($metadata_value, $partition_start, $partition_name_length, $max_partition_name_length, $is_numeric);
     911                $partition_name = $partition_start;
     912                if ($partition_end ne $partition_start) {
     913                    $partition_name = $partition_name . "-" . $partition_end;
     914                }
     915               
     916                if ($items_done + $items_for_this_md_value <= $this_partition_size) {
     917                    # store all the values for the current metadata to the
     918                    # current partition
     919                    $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
     920            $items_done += $items_for_this_md_value;
     921
     922                } else {
     923                    # we only want to add some of the values
     924                    my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
     925                    my @OIDs_for_this_partition = ();
     926                    for (my $j=0; $j< $items_for_this_md_value; $j++) {
     927                        push (@OIDs_for_this_partition, $OIDs_for_this_value[$j]);
     928                        $items_done++;
     929
     930                        if ($items_done >= $this_partition_size ) {
     931                            # add the partition
     932                            $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
     933                                                       
     934                            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partition_name, \%metadata_values_to_OIDs_subhashes);
     935                            $last_partition_end = $partition_end;           
     936                            $partition_start = $partition_end; # is that right? $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $max_partition_name_length, $is_numeric, 1);
     937                            $prev_done += $items_done;
     938                            $items_done = 0;
     939                            %metadata_values_to_OIDs_subhashes = ();
     940                            @OIDs_for_this_partition = ();
     941                        }
     942                       
     943                    } # for each OID in the list
     944                    # at the end, have we got some ids not added to a partition?
     945                    # add them into the subhash
     946                    if (scalar(@OIDs_for_this_partition)) {
     947                        $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
     948                    }
     949                }
     950            } # end foreach metadata value
     951            # The last partition?
     952            if($items_done >0) {
     953                $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partition_name, \%metadata_values_to_OIDs_subhashes);
     954            }
    905955       
    906         # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
    907         if ($bookshelf_type eq "never" && $items_for_this_md_value > $this_partition_size) {
    908                     # if this is the case, we would have already stored what we have done in the previous step.
    909             my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $max_partition_name_length, $is_numeric);
    910             # Get the number of partitions needed for this value
    911             my $num_splits = int($items_for_this_md_value / $this_partition_size);
    912             $num_splits++ if ($items_for_this_md_value / $this_partition_size > $num_splits);
    913             my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
    914             for (my $i = 0; $i < $num_splits; $i++) {
    915             my %OIDs_subhashes_for_this_value = ();
    916             my @OIDs_for_this_partition = ();
    917             for (my $d = $i * $this_partition_size; $d < (($i+1) * $this_partition_size > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size); $d++) {
    918                 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
    919             }
    920            
    921             # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
    922             if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $this_partition_size) {
    923                 $partition_start = $partitionname_for_this_value;
    924                 $partition_name = $partition_start;
    925                 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
    926                 $items_done += scalar(@OIDs_for_this_partition);
    927                 $last_partition_end = $partitionname_for_this_value
    928             } else {
    929            
    930                 # Add an HList for this bucket
    931                 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
    932                 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
    933                 $last_partition_end = $partitionname_for_this_value;
    934             }
    935             }
    936         } else {
    937             $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
    938             $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;
    939              $partition_end = $self->generate_partition_end($metadata_value, $partition_start, $partition_name_length, $max_partition_name_length, $is_numeric);
    940              $partition_name = $partition_start;
    941             if ($partition_end ne $partition_start) {
    942             $partition_name = $partition_name . "-" . $partition_end;
    943             }
    944            
    945         }
    946        
    947         # The last partition
    948         if($i == scalar(@metadata_values) - 1 && $items_done >0) {
    949             $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partition_name, \%metadata_values_to_OIDs_subhashes);
    950         }
    951        
    952         }
    953     } # end if items in partition > partition size
    954 
    955     }   
     956        } # end if items in partition > partition size
     957
     958    } # end for each partition bucket
    956959   
    957960    # The partitions are stored in an HList
     
    959962
    960963}
     964
     965
    961966sub split_approximate_size_ORIG
    962967{
     
    14071412    my $metadata_group = shift(@metadata_groups);
    14081413    $classifier_node->{'mdtype'} = $metadata_group;
    1409 
    14101414    my $sort_type = $self->{$metadata_group .".metadata_sort_mode_within_level"};
    14111415    # Create an entry in the vlist for each value
     
    14221426    # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
    14231427    elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
    1424        
    14251428        @OIDs = $self->sort_leaf_items(\@OIDs);
    14261429        foreach my $OID (@OIDs) {
     
    14611464        # For those docs that don't belong in the sub/next_metadata_group, but which belong
    14621465        # at this level, just add the documents as children of this list at the current level
    1463         @current_level_OIDs = $self->sort_leaf_items(\@current_level_OIDs);
     1466                @current_level_OIDs = $self->sort_leaf_items(\@current_level_OIDs);
    14641467        foreach my $current_level_OID (@current_level_OIDs) {
    14651468            my $offset = $self->metadata_offset($metadata_group, $current_level_OID, $metadata_value);
     
    15121515    my $sort_type = $self->{$sort_leaf_nodes_usingmetaelem . ".metadata_sort_mode_within_level"};
    15131516    # Force a stable sort (Perl 5.6's sort isn't stable)
    1514     # !! The [0] bits aren't ideal (multiple metadata values) !!
     1517    # !! The [0] bits aren't ideal (multiple metadata values) !!
     1518       
    15151519    @OIDs = @OIDs[ sort {
    15161520      if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]}) && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]}))
Note: See TracChangeset for help on using the changeset viewer.