Changeset 33452

Show
Ignore:
Timestamp:
03.09.2019 13:15:48 (2 weeks ago)
Author:
kjdon
Message:

Revamp of the List classifier, with more precise handling of numeric metadata values. You can now specify what happens to numeric values separately from what happens to word values; e.g. a classifier can have A, B, C, D, etc. partitions, then a single 0-9 bucket at the end (or the start). Numeric values can be sorted lexically (10 < 9) or numerically (9 < 10), and this can be set differently for sorting the partitions and for sorting the values inside the partitions. Numeric values can be partitioned by the first digit, by the whole number, or by a set number of digits, e.g. 4 digits to get a date list when partitioning yyyymmdd dates.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/classify/List.pm

    r31767 r33452  
    99# University of Waikato, New Zealand. 
    1010# 
    11 # Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ 
    1211# 
    1312# Copyright (C) 2005 New Zealand Digital Library Project 
     
    3736 
    3837use BaseClassifier; 
    39  
     38use Sort::Naturally; 
    4039use strict; 
    4140 
     
    4544} 
    4645 
    47 my $meta_select_type_list = 
     46 
     47my $metadata_selection_mode_list = 
    4848    [  
    4949      { 'name' => "firstvalue", 
     
    5353      { 'name' => "allvalues", 
    5454    'desc' => "{List.metadata_selection.allvalues}"} ]; 
    55 my $valid_meta_select_types = { 'firstvalue' => 1, 
     55 
     56my $metadata_selection_mode_default = "firstvalidmetadata"; 
     57 
     58my $valid_metadata_selection_modes = { 'firstvalue' => 1, 
    5659                'firstvalidmetadata' => 1, 
    5760                'allvalues' => 1 }; 
     61 
     62my $metadata_sort_mode_list = 
     63    [ 
     64     { 'name' => "unicode", 
     65       'desc' => "{List.metadata_sort.unicode}"}, 
     66     { 'name' => "alphabetic", 
     67    'desc' => "{List.metadata_sort.alphabetic}"}, 
     68     { 'name' => "alphanumeric", 
     69       'desc' => "{List.metadata_sort.alphanumeric}"} ]; 
     70 
     71my $metadata_sort_mode_default = "alphanumeric"; 
     72 
     73my $valid_metadata_sort_modes = { 'unicode' => 1, 
     74                  'alphabetic' => 1, 
     75                  'alphanumeric' => 1}; 
    5876my $partition_type_list =  
    5977    [ { 'name' => "per_letter", 
     
    6684    'desc' => "{List.level_partition.none}" } ]; 
    6785 
    68 # following used to check types later on 
     86my $partition_type_default = "per_letter"; 
     87 
    6988my $valid_partition_types = { 'per_letter' => 1, 
    7089                  'constant_size' => 1, 
    71                   'per_letter_fixed_size' => 1, 
    7290                  'approximate_size' => 1, 
    7391                  'none' => 1}; 
     92 
     93my $partition_size_default = 30; 
     94 
     95my $numeric_partition_type_list = 
     96    [ { 'name' => "per_digit", 
     97    'desc' => "{List.level_partition.per_digit}" }, 
     98      { 'name' => "per_number", 
     99    'desc' => "{List.level_partition.per_number}" }, 
     100      { 'name' => "single_partition", 
     101    'desc' => "{List.level_partition_single}" }, 
     102      { 'name' => "approximate_size", 
     103    'desc' => "{List.level_partition.approximate_size_numeric}"},  
     104      { 'name' => "constant_size", 
     105    'desc' => "{List.level_partition.constant_size}" },       
     106      { 'name' => "none", 
     107    'desc' => "{List.level_partition.none}" } ]; 
     108 
     109my $numeric_partition_type_default = "single_partition"; 
     110 
     111my $valid_numeric_partition_types = { 'per_digit' => 1, 
     112                      'per_number' => 1, 
     113                      'constant_size' => 1, 
     114                      'single_partition' => 1, 
     115                      'approximate_size' => 1, 
     116                                      'none' =>1  }; 
     117 
     118my $numeric_partition_size_default = 30; 
     119 
     120my $numeric_partition_name_length_default = "-1"; # use the full number 
    74121 
    75122my $bookshelf_type_list =  
     
    81128    'desc' => "{List.bookshelf_type.never}" } ];   
    82129 
     130my $bookshelf_type_default = "never"; 
     131my $sort_leaf_nodes_using_default = "Title"; 
    83132my $arguments =  
    84133    [ { 'name' => "metadata", 
     
    86135    'type' => "metadata", 
    87136    'reqd' => "yes" }, 
    88       { 'name' => "metadata_selection_mode", 
     137       
     138      { 'name' => "metadata_selection_mode_within_level", 
    89139    'desc' => "{List.metadata_selection_mode}", 
    90140    'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/') 
    91     'list' => $meta_select_type_list, 
    92     'deft' => "firstvalidmetadata" }, 
    93       # The interesting options 
     141    'list' => $metadata_selection_mode_list, 
     142    'deft' => $metadata_selection_mode_default }, 
     143       
     144      { 'name' => "metadata_sort_mode_within_level", 
     145    'desc' => "{List.metadata_sort_mode}", 
     146    'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/') 
     147    'list' => $metadata_sort_mode_list, 
     148    'deft' => $metadata_sort_mode_default }, 
     149       
    94150      { 'name' => "bookshelf_type",  
    95151    'desc' => "{List.bookshelf_type}", 
    96152    'type' => "enum", 
    97153    'list' => $bookshelf_type_list, 
    98     'deft' => "never" },  
     154    'deft' => $bookshelf_type_default }, 
     155       
    99156      { 'name' => "classify_sections", 
    100157    'desc' => "{List.classify_sections}", 
    101158    'type' => "flag" }, 
     159       
    102160      { 'name' => "partition_type_within_level", 
    103161    'desc' => "{List.partition_type_within_level}", 
    104162    'type' => "enumstring",  # Must be enumstring because multiple values can be specified (separated by '/') 
    105163    'list' => $partition_type_list, 
    106     'deft' => "per_letter" }, 
     164    'deft' => $partition_type_default }, 
     165       
    107166      { 'name' => "partition_size_within_level", 
    108167    'desc' => "{List.partition_size_within_level}", 
    109     'type' => "string" },  # Must be string because multiple values can be specified (separated by '/') 
     168    'type' => "string",  # Must be string because multiple values can be specified (separated by '/') 
     169    'deft' => $partition_size_default}, 
     170       
    110171      { 'name' => "partition_name_length", 
    111172    'desc' => "{List.partition_name_length}", 
    112173    'type' => "string" }, 
     174       
     175      {'name' => "partition_sort_mode_within_level", 
     176       'desc' => "{List.partition_sort_mode_within_level}", 
     177       'type' => "enumstring",  # Must be enumstring because multiple values can be specified (separated by '/') 
     178       'list' => $metadata_sort_mode_list, 
     179       'deft' => $metadata_sort_mode_default }, 
     180       
     181      { 'name' => "numeric_partition_type_within_level", 
     182    'desc' => "{List.numeric_partition_type_within_level}", 
     183    'type' => "enumstring",  # Must be enumstring because multiple values can be specified (separated by '/') 
     184    'list' => $numeric_partition_type_list, 
     185    'deft' => $numeric_partition_type_default }, 
     186       
     187      {'name' => "numeric_partition_sort_mode_within_level", 
     188       'desc' => "{List.numeric_partition_sort_mode_within_level}", 
     189       'type' => "enumstring",  # Must be enumstring because multiple values can be specified (separated by '/') 
     190       'list' => $metadata_sort_mode_list, 
     191       'deft' => $metadata_sort_mode_default }, 
     192       
     193      { 'name' => "numeric_partition_size_within_level", 
     194    'desc' => "{List.numeric_partition_size_within_level}", 
     195    'type' => "string",  # Must be string because multiple values can be specified (separated by '/') 
     196    'deft' => $numeric_partition_size_default}, 
     197       
     198      { 'name' => "numeric_partition_name_length_within_level", 
     199    'desc' => "{List.numeric_partition_name_length_within_level}", 
     200    'type' => "string", 
     201    'deft' => $numeric_partition_name_length_default }, 
     202       
     203      { 'name' => "numbers_first", 
     204    'desc' => "{List.numbers_first", 
     205    'type' => 'flag'}, 
     206       
    113207      { 'name' => "sort_leaf_nodes_using", 
    114208    'desc' => "{List.sort_leaf_nodes_using}", 
    115209    'type' => "metadata", 
    116     'deft' => "Title" }, 
     210    'deft' => $sort_leaf_nodes_using_default }, 
     211       
     212      { 'name' => "sort_leaf_nodes_sort_mode", 
     213    'desc' => "{List.sort_leaf_nodes_sort_mode}", 
     214    'type' => "enum",  
     215    'list' => $metadata_sort_mode_list, 
     216    'deft' => $metadata_sort_mode_default }, 
     217       
    117218      { 'name' => "reverse_sort_leaf_nodes", 
    118219    'desc' => "{List.reverse_sort_leaf_nodes}", 
    119220    'type' => "flag"}, 
     221       
    120222      { 'name' => "sort_using_unicode_collation", 
    121     'desc' => "{List.sort_using_unicode_collation}", 
     223    'desc' => "{List.metadata_sort.unicode} {List.sort_using_unicode_collation}", 
    122224    'type' => "flag" }, 
     225       
    123226      { 'name' => "use_hlist_for", 
    124227    'desc' => "{List.use_hlist_for}", 
    125228    'type' => "string" }, 
     229       
    126230      {'name' => "filter_metadata", 
    127231       'desc' => "{List.filter_metadata}", 
    128232       'type' => "metadata"}, 
     233       
    129234      {'name' => "filter_regex", 
    130235       'desc' => "{List.filter_regex}", 
    131236       'type' => "regexp"}, 
     237       
    132238      { 'name' => "standardize_capitalization", 
    133239    'desc' => "{List.standardize_capitalization}", 
    134240    'type' => "flag"}, 
     241       
    135242      { 'name' => "removeprefix", 
    136243    'desc' => "{BasClas.removeprefix}", 
    137244    'type' => "regexp" }, 
     245       
    138246      { 'name' => "removesuffix", 
    139247    'desc' => "{BasClas.removesuffix}", 
     
    158266    my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists); 
    159267 
     268    bless $self, $class; 
     269 
    160270    if ($self->{'info_only'}) { 
    161271    # don't worry about any options etc 
    162     return bless $self, $class; 
     272    return $self; 
    163273    } 
    164274 
     
    179289 
    180290    # meta selection mode for each level 
    181     if (!$self->{'metadata_selection_mode'}) { 
    182     foreach my $metadata_group (@metadata_groups) { 
    183         $self->{$metadata_group . ".metadata_selection_mode"} = "firstvalidmetadata"; 
    184     } 
     291    $self->set_metadata_groups_info_per_level("metadata_selection_mode", $metadata_selection_mode_default, $valid_metadata_selection_modes); 
     292    
     293    # meta sort mode for each level 
     294    if ($self->{'sort_using_unicode_collation'}) { 
     295    print STDERR "WARNING: sort_using_unicode_collation is set, setting metadata_sort_mode_within_level to unicode for all levels, regardless of current setting\n"; 
     296    $self->{'metadata_sort_mode_within_level'} = "unicode"; 
     297    $metadata_sort_mode_default = "unicode"; 
    185298    } else { 
    186     my @metadata_selection_mode_list = split(/\//, $self->{'metadata_selection_mode'}); 
    187     foreach my $metadata_group (@metadata_groups) { 
    188         my $meta_select_mode = shift(@metadata_selection_mode_list); 
    189         if (defined($meta_select_mode) && defined $valid_meta_select_types->{$meta_select_mode}) { 
    190         $self->{$metadata_group . ".metadata_selection_mode"} = $meta_select_mode; 
    191         } else { 
    192         $self->{$metadata_group . ".metadata_selection_mode"} = "firstvalidmetadata";    
    193         } 
    194     } 
    195     }    
     299    if ($self->{'metadata_sort_mode_within_level'} =~ /unicode/) { 
     300        $self->{'sort_using_unicode_collation'} = 1; 
     301    } 
     302    } 
     303     
     304     
     305    $self->set_metadata_groups_info_per_level('metadata_sort_mode_within_level', $metadata_sort_mode_default, $valid_metadata_sort_modes); 
     306     
    196307    # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last) 
    197308    foreach my $metadata_group (@metadata_groups) { 
     
    210321    } 
    211322 
    212     # How the items are grouped into partitions (default: no partition)  
     323    # How the items are grouped into partitions (default: by letter)  
    213324    # for each level (metadata group), separated by '/' 
    214     if (!$self->{"partition_type_within_level"}) { 
    215     foreach my $metadata_group (@metadata_groups) { 
    216         $self->{$metadata_group . ".partition_type_within_level"} = "none"; 
    217     } 
    218     } else { 
    219     my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'});   
    220          
    221     my $first = 1; 
    222     foreach my $metadata_group (@metadata_groups) { 
    223         my $partition_type_within_levelelem = shift(@partition_type_within_levellist); 
    224         if (defined($partition_type_within_levelelem) && $partition_type_within_levelelem eq "per_letter_fixed_size") { 
    225         print STDERR "per letter fixed size, changing to approximate size\n"; 
    226         $partition_type_within_levelelem = "approximate_size"; 
    227         } 
    228         if (defined($partition_type_within_levelelem) && defined $valid_partition_types->{$partition_type_within_levelelem}) { 
    229         $self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem; 
    230         } 
    231         else { 
    232         if ($first) { 
    233             $self->{$metadata_group . ".partition_type_within_level"} = "none"; 
    234             $first = 0; 
    235         } else { 
    236             $self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"}; 
    237         } 
    238         if (defined($partition_type_within_levelelem)) { 
    239             # ie invalid entry 
    240             print STDERR "invalid partition type for level $metadata_group: $partition_type_within_levelelem, defaulting to ". $self->{$metadata_group . ".partition_type_within_level"} ."\n"; 
    241         } 
    242         } 
    243     } 
    244     } 
    245      
    246     # The number of items in each partition 
    247     if (!$self->{'partition_size_within_level'}) { 
    248     # Default: 20 
    249     foreach my $metadata_group (@metadata_groups) { 
    250         $self->{$metadata_group . ".partition_size_within_level"} = 20; 
    251     } 
    252     } 
    253     else { 
    254     my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'}); 
    255  
    256     # Assign values based on the partition_size_within_level parameter 
    257     foreach my $metadata_group (@metadata_groups) { 
    258         my $partition_size_within_levelelem = shift(@partition_size_within_levellist); 
    259         if (defined($partition_size_within_levelelem)) { 
    260         $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem; 
    261         } 
    262         else { 
    263         $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"}; 
    264         } 
    265     } 
    266     } 
    267  
     325 
     326    $self->set_metadata_groups_info_per_level("partition_type_within_level", $partition_type_default, $valid_partition_types); 
     327 
     328    $self->set_metadata_groups_info_per_level("numeric_partition_type_within_level", $numeric_partition_type_default, $valid_numeric_partition_types); 
     329 
     330    $self->set_metadata_groups_info_per_level("partition_size_within_level", $partition_size_default); 
     331    $self->set_metadata_groups_info_per_level("numeric_partition_size_within_level", $numeric_partition_size_default); 
     332 
     333    $self->set_metadata_groups_info_per_level('partition_sort_mode_within_level', $metadata_sort_mode_default, $valid_metadata_sort_modes); 
     334 
     335    $self->set_metadata_groups_info_per_level('numeric_partition_sort_mode_within_level', $metadata_sort_mode_default, $valid_metadata_sort_modes); 
     336 
     337    $self->set_metadata_groups_info_per_level("numeric_partition_name_length_within_level", $numeric_partition_name_length_default);  
     338     
    268339    # The removeprefix and removesuffix expressions 
    269340    if ($self->{'removeprefix'}) { 
     
    303374 
    304375    # The metadata elements to use to sort the leaf nodes (default: Title) 
    305     my @sort_leaf_nodes_using_metadata_groups = ( "Title" ); 
    306     if ($self->{'sort_leaf_nodes_using'}) { 
    307     @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'}); 
    308     } 
     376    my @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'}); 
    309377    $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups; 
     378 
    310379    foreach my $sort_group (@sort_leaf_nodes_using_metadata_groups) { 
    311380    # set metadata_select_type, if not already set - might be already set if the same group was used in -metadata 
    312381    if (!defined $self->{$sort_group . ".metadata_selection_mode"}) { 
    313         $self->{$sort_group . ".metadata_selection_mode"} = "firstvalue"; 
    314     } 
    315     } 
     382        $self->{$sort_group . ".metadata_selection_mode"} = $metadata_selection_mode_default; 
     383    } 
     384     
     385    } 
     386     
     387    my @leaf_nodes_sort_modes = split (/\|/, $self->{'sort_leaf_nodes_sort_mode'}); 
     388    foreach my $sort_group (@sort_leaf_nodes_using_metadata_groups) { 
     389    my $leaf_sort_mode = shift(@leaf_nodes_sort_modes); 
     390    if (!defined $self->{$sort_group . ".metadata_sort_mode_within_level"}) { 
     391        if (defined $leaf_sort_mode && defined $valid_metadata_sort_modes->{$leaf_sort_mode}) {  
     392        $self->{$sort_group . ".metadata_sort_mode_within_level"} = $leaf_sort_mode; 
     393        } 
     394        else { 
     395        $self->{$sort_group . ".metadata_sort_mode_within_level"} = $metadata_sort_mode_default; 
     396        } 
     397    } 
     398 
     399    } 
     400  
    316401    # Create an instance of the Unicode::Collate object if better Unicode sorting is desired 
    317402    if ($self->{'sort_using_unicode_collation'}) { 
     
    327412    # A hash for all the doc ids that we have seen, so we don't classify something twice 
    328413    $self->{'all_doc_OIDs'} = {}; 
    329     return bless $self, $class; 
     414    #return bless $self, $class; 
     415    return $self; 
    330416} 
    331417 
     
    336422} 
    337423 
     424sub set_metadata_groups_info_per_level 
     425{ 
     426    my $self = shift(@_); 
     427    my $info_name = shift(@_); 
     428    my $info_default = shift(@_); 
     429    my $info_valid_types_hash_ref = shift(@_); 
     430     
     431    my @info_list = split(/\//, $self->{$info_name}); 
     432 
     433    my $first = 1; 
     434    foreach my $metadata_group (@{$self->{'metadata_groups'}}) { 
     435    my $info_elem = shift(@info_list); 
     436    if (defined ($info_elem) && (!defined $info_valid_types_hash_ref || defined $info_valid_types_hash_ref->{$info_elem})) { 
     437        $self->{$metadata_group .".$info_name"} = $info_elem; 
     438    } else { 
     439        # its empty or an invalid entry 
     440        my $new_info_elem; 
     441        if ($first) { 
     442        $new_info_elem = $info_default; 
     443        } else { 
     444        # get the value we had at first 
     445        $new_info_elem = $self->{@{$self->{'metadata_groups'}}[0] . ".$info_name"}; 
     446        } 
     447        $self->{$metadata_group .".$info_name"} = $new_info_elem; 
     448        if (defined $info_elem) { 
     449        print STDERR "List Error: $info_elem is not a valid value for $info_name, changing it to $new_info_elem\n"; 
     450         
     451        } 
     452    } 
     453    $first = 0; 
     454    } 
     455 
     456} 
    338457 
    339458# Called for each document in the collection 
     
    347466    return; 
    348467    } 
     468     
    349469    $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()} = 1; 
     470     
    350471    # check against filter here 
    351472    if ($self->{'filter_metadata'}) { 
    352     #print STDERR "filtering documents on $self->{'filter_metadata'}\n"; 
    353473    my $meta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'filter_metadata'}); 
    354474    return unless defined $meta; 
    355475    if ($self->{'filter_regex'} ne "" && $meta !~ /$self->{'filter_regex'}/) { 
    356         #print STDERR "doesn't match regex\n"; 
     476        print STDERR "doc $doc_obj doesn't pass filtering requirement\n" if ($self->{'verbosity'} > 3); 
    357477        return; 
    358  
    359478    } 
    360479    } 
     
    381500 
    382501    my @metadata_groups = @{$self->{'metadata_groups'}}; 
    383  
    384      
     502    
    385503    # Only classify the section if it has a value for one of the metadata elements in the first group 
    386504    my $classify_section = 0; 
     
    391509    my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element); 
    392510    my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element);   
    393      
    394511    # Remove prefix/suffix if requested 
    395512    if (defined ($first_metadata_group_element_value)) { 
     
    402519        } 
    403520    } 
     521 
     522    $first_metadata_group_element_value  = &sorttools::format_metadata_for_sorting($first_metadata_group, $first_metadata_group_element_value, $doc_obj) unless $self->{'no_metadata_formatting'}; 
    404523    if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") { 
    405         # This section must be included in the classifier 
     524        # This section must be included in the classifier as we have found a value 
    406525        $classify_section = 1; 
    407526        last; 
    408527    }  
    409528    } 
    410  
     529     
    411530    # We're not classifying this section because it doesn't have the required metadata 
    412531    return if (!$classify_section); 
     
    428547        my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)}; 
    429548        foreach my $metadata_value (@metadata_values) { 
     549            #print STDERR "working with value $metadata_value\n"; 
    430550            # Strip leading and trailing whitespace 
    431551            $metadata_value =~ s/^\s*//; 
     
    444564            # text-transform if stored uppercase). 2 CSS text-transforms have been added to core.css 
    445565            my $lc_metadata_value = lc($metadata_value); 
    446              
    447             # We are already working with unicode aware strings at this  
    448             # stage, so we no longer need to convert from utf8 to unicode  
    449             #my $metadata_value_unicode_string = $metadata_value; # $self->convert_utf8_string_to_unicode_string($metadata_value); 
    450  
    451             # Add the metadata value into the list for this combination of metadata group and section 
     566            $lc_metadata_value  = &sorttools::format_metadata_for_sorting($real_metadata_element, $lc_metadata_value, $doc_obj) unless $self->{'no_metadata_formatting'}; 
     567            #print STDERR "formatted value = $lc_metadata_value\n"; 
     568     
     569            # Add the metadata value into the list for this combination of metadata group  
     570            # and section - if we have some non-whitespace chars 
    452571            # text that we have some non-whitespace chars 
    453572            if ($lc_metadata_value =~ /\S/) { 
     
    455574            push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $lc_metadata_value); 
    456575             
    457              
    458             # add the actual value into the stored values so we can remember the case 
    459             if (!$self->{'standardize_capitalization'}) { 
    460             if (defined $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}) { 
    461                 $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}++; 
    462             } else { 
    463                 $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value} = 1; 
     576            # add the actual value into the stored values so we can remember the case 
     577            if (!$self->{'standardize_capitalization'}) { 
     578                if (defined $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}) { 
     579                $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}++; 
     580                } else { 
     581                $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value} = 1; 
     582                } 
    464583            } 
    465             } 
    466584            last if ($self->{$metadata_group . ".metadata_selection_mode"} eq "firstvalue"); 
    467585            } 
     
    508626     
    509627    my $metadata_group = $metadata_groups[0]; 
    510      
    511628    if (!defined($self->{$metadata_group . ".list"})) { 
    512629    print STDERR "Warning: No metadata values assigned to $metadata_group.\n"; 
     
    516633    # Create a mapping from metadata value to OID 
    517634    my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"}; 
    518     my %metadata_value_to_OIDs_hash = ();     
     635    my %metadata_value_to_OIDs_hash = ();  
     636    my %numeric_metadata_value_to_OIDs_hash = (); 
    519637    foreach my $OID (@OIDs) 
    520638    { 
     
    524642        foreach my $metadata_value (@metadata_values) 
    525643        { 
    526         push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID); 
     644        if ($metadata_value =~ /^[0-9]/) { 
     645            push(@{$numeric_metadata_value_to_OIDs_hash{$metadata_value}}, $OID); 
     646        } else { 
     647            push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID); 
     648        } 
    527649        } 
    528650    } 
    529651    } 
    530652    #print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n"; 
     653    #print STDERR "Number of distinct numeric values: " . scalar(keys %numeric_metadata_value_to_OIDs_hash) . "\n"; 
    531654 
    532655    # Partition the values (if necessary) 
    533656    my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"}; 
    534657    my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"}; 
    535     if ($partition_type_within_level =~ /^per_letter$/i) { 
    536     # Generate one hlist for each letter 
    537     my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash)); 
    538     my %metadata_value_to_OIDs_subhash = (); 
    539  
    540     my $lastpartition = substr($sortedmetadata_values[0], 0, 1); 
    541     foreach my $metadata_value (@sortedmetadata_values) { 
    542         my $metadata_valuepartition = substr($metadata_value, 0, 1); 
    543  
    544         # Is this the start of a new partition? 
    545         if ($metadata_valuepartition ne $lastpartition) { 
    546         $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash); 
    547         %metadata_value_to_OIDs_subhash = (); 
    548         $lastpartition = $metadata_valuepartition; 
    549         } 
    550  
    551         $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};        
    552     } 
    553  
    554     # Don't forget to add the last partition 
    555     $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash); 
    556  
    557     # The partitions are stored in an HList 
    558     $classifier_node->{'childtype'} = "HList"; 
    559     } 
    560     elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) { 
    561     # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size 
    562     # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible 
    563     # If a letter has more items than specified, split into several hlists. 
    564     # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise) 
    565     my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};         
    566     my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash)); 
    567     my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"}; 
     658    my $partition_sort_mode_within_level = $self->{$metadata_group . ".partition_sort_mode_within_level"}; 
     659    my $bookshelf_type_within_level = $self->{$metadata_group. ".bookshelf_type"}; 
     660     
     661     
     662    ############################################# 
     663    ### DO THE NUMBERS IF THEY ARE TO COME FIRST 
     664    ############################################# 
     665    if ($self->{'numbers_first'} && keys(%numeric_metadata_value_to_OIDs_hash)) { 
     666    $self->partition_numeric_values(\@metadata_groups, $classifier_node, \%numeric_metadata_value_to_OIDs_hash); 
     667    } 
     668 
     669    ############################################ 
     670    #  DO THE LETTERS  
     671    ############################################ 
     672    if (keys(%metadata_value_to_OIDs_hash)){ # make sure we have some values 
     673    if ($partition_type_within_level =~ /^per_letter$/i) { 
     674        $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $partition_sort_mode_within_level, \%metadata_value_to_OIDs_hash); 
     675    } 
     676    elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) { 
     677        $self->split_approximate_size(\@metadata_groups, $classifier_node, $partition_size_within_level, $partition_sort_mode_within_level, $bookshelf_type_within_level, \%metadata_value_to_OIDs_hash, $self->{'partition_name_length'}); 
     678         
     679    }  
     680    elsif ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) { 
     681        $self->split_constant_size(\@metadata_groups, $classifier_node, $partition_size_within_level, $partition_sort_mode_within_level, \%metadata_value_to_OIDs_hash, $self->{'partition_name_length'}); 
     682    } 
     683         
     684    # Otherwise just add all the values to a VList 
     685    else { 
     686        $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash); 
     687    } 
     688    } 
     689 
     690    ########################################### 
     691    ### DO THE NUMBERS IF THEY ARE TO COME LAST 
     692    ########################################### 
     693    if (!$self->{'numbers_first'} && keys(%numeric_metadata_value_to_OIDs_hash)) { 
     694    $self->partition_numeric_values(\@metadata_groups, $classifier_node, \%numeric_metadata_value_to_OIDs_hash); 
     695    } 
     696} 
     697 
     698sub partition_numeric_values 
     699{ 
     700    my $self = shift(@_); 
     701    my @metadata_groups = @{shift(@_)}; 
     702    my $classifier_node = shift(@_); 
     703    my $numeric_metadata_value_to_OIDs_hash_ref = shift(@_); 
     704 
     705    my $metadata_group = $metadata_groups[0]; 
     706    my $numeric_partition_type_within_level = $self->{$metadata_group . ".numeric_partition_type_within_level"}; 
     707    my $numeric_partition_size_within_level = $self->{$metadata_group . ".numeric_partition_size_within_level"}; 
     708    my $numeric_partition_sort_mode_within_level = $self->{$metadata_group . ".numeric_partition_sort_mode_within_level"}; 
     709    my $numeric_partition_name_length_within_level = $self->{$metadata_group . ".numeric_partition_name_length_within_level"}; 
     710    my $bookshelf_type_within_level = $self->{$metadata_group. ".bookshelf_type"}; 
     711     
     712    if ($numeric_partition_type_within_level eq "single_partition") { 
     713    $self->add_hlist_partition(\@metadata_groups, $classifier_node, "0-9", $numeric_metadata_value_to_OIDs_hash_ref); 
     714    }  
     715    elsif ($numeric_partition_type_within_level eq "per_digit") { 
     716    $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $numeric_partition_sort_mode_within_level, $numeric_metadata_value_to_OIDs_hash_ref, 1, 1); 
     717    } 
     718    elsif ($numeric_partition_type_within_level eq "per_number") { 
     719    # each different number is a bucket 
     720    $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $numeric_partition_sort_mode_within_level, $numeric_metadata_value_to_OIDs_hash_ref, 1, $numeric_partition_name_length_within_level); 
     721    } 
     722    elsif ($numeric_partition_type_within_level eq "constant_size" && scalar(keys %$numeric_metadata_value_to_OIDs_hash_ref) > $numeric_partition_size_within_level) { 
     723    # Generate hlists of a certain size 
    568724     
    569     # Separate values by their first letter, each form a bucket, like the per_letter partition type 
    570     my $last_partition = substr($sortedmetadata_values[0], 0, 1); 
    571     my %partition_buckets = (); 
    572     my @metadata_values_in_bucket = (); 
    573     my $num_items_in_bucket = 0; 
    574     foreach my $metadata_value (@sortedmetadata_values) { 
    575         my $metadata_valuepartition = substr($metadata_value, 0, 1); 
    576         if ($metadata_valuepartition ne $last_partition) { 
    577         my @temp_array = @metadata_values_in_bucket; 
    578         # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values  
    579         my %partition_info = ();         
    580         $partition_info{'metadata_values'} = \@temp_array; 
    581         $partition_info{'size'} = $num_items_in_bucket;      
    582         $partition_buckets{$last_partition} = \%partition_info;      
     725    $self->split_constant_size(\@metadata_groups, $classifier_node, $numeric_partition_size_within_level, $numeric_partition_sort_mode_within_level, $numeric_metadata_value_to_OIDs_hash_ref, $numeric_partition_name_length_within_level, 1); 
     726    } elsif ($numeric_partition_type_within_level eq "approximate_size" && scalar(keys %$numeric_metadata_value_to_OIDs_hash_ref) > $numeric_partition_size_within_level) { 
     727    $self->split_approximate_size(\@metadata_groups, $classifier_node, $numeric_partition_size_within_level, $numeric_partition_sort_mode_within_level, $bookshelf_type_within_level, $numeric_metadata_value_to_OIDs_hash_ref, $numeric_partition_name_length_within_level, 1); 
     728    } 
     729    # Otherwise just add all the values to a VList 
     730    else { 
     731    $self->add_vlist(\@metadata_groups, $classifier_node, $numeric_metadata_value_to_OIDs_hash_ref); 
     732    } 
     733     
     734} 
     735 
     736sub split_approximate_size 
     737{ 
     738 
     739    my $self = shift(@_); 
     740    my @metadata_groups = @{shift(@_)}; 
     741    my $classifier_node = shift(@_); 
     742    my $partition_size = shift(@_); 
     743    my $sort_mode = shift(@_); 
     744    my $bookshelf_type = shift(@_);  
     745    my $metadata_value_to_OIDs_hash_ref = shift(@_); 
     746    my $partition_name_length = shift(@_); 
     747    my $is_numeric = shift(@_); 
     748 
     749    # Generate hlist based on the first letter of the metadata value (like per_letter), or based on 
     750    # numbers, but also with restriction on the partition size 
     751    # If a partition has fewer items than specified by the "partition_size_within_level", then group them together if possible 
     752    # If a partition has more items than specified, split into several hlists. 
     753    # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise) 
     754 
     755    my @sortedmetadata_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref)); 
     756     
     757    # Separate values by their first letter, each form a bucket, like the per_letter partition type 
     758    my $last_partition = $self->generate_partition_name($sortedmetadata_values[0], $partition_name_length, $is_numeric); 
     759     
     760    my @partition_buckets = (); 
     761    my @metadata_values_in_bucket = (); 
     762    my $num_items_in_bucket = 0; 
     763 
     764    foreach my $metadata_value (@sortedmetadata_values) { 
     765    my $metadata_valuepartition = $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric); 
     766    if ($metadata_valuepartition ne $last_partition) { 
     767        my @temp_array = @metadata_values_in_bucket; 
     768        # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values  
     769        my %partition_info = ();         
     770        $partition_info{'metadata_values'} = \@temp_array; 
     771        $partition_info{'size'} = $num_items_in_bucket; 
     772        $partition_info{'name'} = $last_partition; 
     773        push (@partition_buckets, \%partition_info); 
     774         
     775        @metadata_values_in_bucket = ($metadata_value); 
     776        $num_items_in_bucket = ($bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : scalar(@metadata_values_in_bucket)); 
     777        $last_partition = $metadata_valuepartition; 
     778    } else { 
     779        $num_items_in_bucket += ($bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1); #scalar(@metadata_values_in_bucket); 
     780        push (@metadata_values_in_bucket, $metadata_value); 
     781    } 
     782    } 
     783    # Last one 
     784    my %partition_info = (); 
     785    $partition_info{'metadata_values'} = \@metadata_values_in_bucket; 
     786    $partition_info{'size'} = $num_items_in_bucket; 
     787    $partition_info{'name'} = $last_partition; 
     788     
     789    push (@partition_buckets, \%partition_info); 
     790 
     791    # now go through the array of buckets, and merge small buckets 
     792    my @new_partition_buckets = (); 
     793    for (my $i = 0; $i < scalar(@partition_buckets) - 1; $i++) { 
     794     
     795    my $this_bucket = $partition_buckets[$i]; #->{'name'}; 
     796    my $next_bucket = $partition_buckets[$i+1]; #->{'name'}; 
     797             
     798    my $items_in_partition = $this_bucket->{'size'};                 
     799     
     800    if ($items_in_partition < $partition_size ) { 
     801        my $items_in_next_partition = $next_bucket->{'size'}; 
     802        if ($items_in_partition + $items_in_next_partition <= $partition_size ) { 
     803        # merge this bucket into the next bucket 
     804        foreach my $metadata_value_to_merge (@{$this_bucket->{'metadata_values'}}) { 
     805            push(@{$next_bucket->{'metadata_values'}}, $metadata_value_to_merge); 
     806        }            
     807        $next_bucket->{'size'} += $items_in_partition; 
    583808         
    584         @metadata_values_in_bucket = ($metadata_value); 
    585         $num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket); 
    586         $last_partition = $metadata_valuepartition; 
    587809        } else { 
    588         $num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket); 
    589         push (@metadata_values_in_bucket, $metadata_value); 
    590         } 
    591     } 
    592     # Last one 
    593     my %partition_info = (); 
    594     $partition_info{'metadata_values'} = \@metadata_values_in_bucket; 
    595     $partition_info{'size'} = $num_items_in_bucket;  
    596     $partition_buckets{$last_partition} = \%partition_info; 
     810        # remember this bucket 
     811        push (@new_partition_buckets, $this_bucket); 
     812        } 
     813    } else { 
     814        # remember this bucket 
     815        push (@new_partition_buckets, $this_bucket); 
     816    } 
     817    } 
     818    # add in the last bucket 
     819    my $last_bucket = $partition_buckets[scalar(@partition_buckets) - 1]; 
     820    push (@new_partition_buckets, $last_bucket); 
     821     
     822    # Add partitions to the main list, but divide big bucket into several 
     823    my $last_partition_end = ""; 
     824    my $partition_start = ""; 
     825    foreach my $partition (@new_partition_buckets) { 
     826    my @metadata_values = $self->sort_metadata_values_array($sort_mode, @{$partition->{'metadata_values'}}); 
     827    my $items_in_partition = $partition->{'size'}; 
     828    $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $partition_name_length, $is_numeric); 
     829     
     830    if ($items_in_partition > $partition_size) { 
     831        my $items_done = 0; 
     832        my %metadata_values_to_OIDs_subhashes = (); 
     833        for (my $i = 0; $i < scalar(@metadata_values); $i++) { 
     834        my $metadata_value = $metadata_values[$i]; 
     835        # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values 
     836        my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1; 
    597837         
    598     my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));    
    599     for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) { 
    600         my $partition = $partition_keys[$i]; 
    601         my $items_in_partition = $partition_buckets{$partition}->{'size'};               
    602         # Merge small buckets together, but keep the numeric bucket apart 
    603         if ($items_in_partition < $partition_size_within_level) { 
    604         my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'}; 
    605         if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level  
    606             && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)  
    607              || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) { 
    608             foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) { 
    609             push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge); 
    610             }            
    611             $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition; 
    612             delete $partition_buckets{$partition};                       
    613         }  
    614         } 
    615     } 
    616     @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets)); 
    617      
    618     # Add partitions, and divide big bucket into several 
    619     my $last_partition_end = ""; 
    620     my $partition_start = ""; 
    621     foreach my $partition (@partition_keys) { 
    622         my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}}); 
    623         my $items_in_partition = $partition_buckets{$partition}->{'size'};  
    624         $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"}); 
    625          
    626         if ($items_in_partition > $partition_size_within_level) {    
    627         my $items_done = 0; 
    628         my %metadata_values_to_OIDs_subhashes = (); 
    629         for (my $i = 0; $i < scalar(@metadata_values); $i++) { 
    630             my $metadata_value = $metadata_values[$i]; 
    631             # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values 
    632             my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1; 
    633  
    634             my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"}); 
    635             my $partitionname = $partition_start; 
    636             if ($partitionend ne $partition_start) { 
    637             $partitionname = $partitionname . "-" . $partitionend; 
    638             } 
    639              
    640             # Start a new partition 
    641             if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) { 
    642             $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes); 
    643             $last_partition_end = $partitionend;             
    644             $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"}); 
    645             $items_done = 0; 
    646             %metadata_values_to_OIDs_subhashes = (); 
    647             }  
    648              
    649             # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions 
    650                     if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) { 
    651             my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"}); 
    652             # Get the number of partitions needed for this value 
    653             my $num_splits = int($items_for_this_md_value / $partition_size_within_level); 
    654             $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits); 
    655  
    656             my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}}; 
    657             for (my $i = 0; $i < $num_splits; $i++) { 
    658                 my %OIDs_subhashes_for_this_value = (); 
    659                 my @OIDs_for_this_partition = (); 
    660                 for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) { 
    661                 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]); 
    662                 }  
    663                  
    664                 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values  
    665                 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) { 
    666                 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition; 
    667                 $items_done += scalar(@OIDs_for_this_partition); 
    668                 next; 
    669                 } 
    670                  
    671                 # Add an HList for this bucket 
    672                 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition; 
    673                 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value); 
    674                 $last_partition_end = $partitionname_for_this_value; 
    675             } 
    676             next; 
    677                     } 
    678                          
    679             $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value}; 
    680             $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;          
    681              
    682             # The last partition 
    683             if($i == scalar(@metadata_values) - 1) { 
    684             $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes); 
    685             } 
    686         }        
    687         } 
    688         else { 
    689         # The easier case, just add a partition 
    690         my %metadata_values_to_OIDs_subhashes = (); 
    691         for (my $i = 0; $i < scalar(@metadata_values); $i++) { 
    692             my $metadata_value = $metadata_values[$i]; 
    693             $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};             
    694         } 
    695         my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1]; 
    696         my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"}); 
     838        my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $partition_name_length, $is_numeric); 
    697839        my $partitionname = $partition_start; 
    698840        if ($partitionend ne $partition_start) { 
    699841            $partitionname = $partitionname . "-" . $partitionend; 
    700842        } 
    701         $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);            
    702         $last_partition_end = $partitionend;         
    703         } 
    704     }    
     843         
     844        if ($items_done + $items_for_this_md_value > $partition_size && $items_done != 0) { 
     845            # Start a new partition 
     846            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes); 
     847            $last_partition_end = $partitionend;             
     848            $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $is_numeric); 
     849            $items_done = 0; 
     850            %metadata_values_to_OIDs_subhashes = (); 
     851        }  
     852         
     853        # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions 
     854        if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size) { 
     855             
     856            my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $is_numeric); 
     857            # Get the number of partitions needed for this value 
     858            my $num_splits = int($items_for_this_md_value / $partition_size); 
     859            $num_splits++ if ($items_for_this_md_value / $partition_size > $num_splits); 
     860             
     861            my @OIDs_for_this_value = $metadata_value_to_OIDs_hash_ref->{$metadata_value}; 
     862            for (my $i = 0; $i < $num_splits; $i++) { 
     863            my %OIDs_subhashes_for_this_value = (); 
     864            my @OIDs_for_this_partition = (); 
     865            for (my $d = $i * $partition_size; $d < (($i+1) * $partition_size > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size); $d++) { 
     866                push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]); 
     867            }  
     868             
     869            # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values  
     870            if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size) { 
     871                $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition; 
     872                $items_done += scalar(@OIDs_for_this_partition); 
     873                next; 
     874            } 
     875             
     876            # Add an HList for this bucket 
     877            $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition; 
     878            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value); 
     879            $last_partition_end = $partitionname_for_this_value; 
     880            } 
     881            next; 
     882        } 
     883                         
     884        $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value}; 
     885        $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;          
     886         
     887        # The last partition 
     888        if($i == scalar(@metadata_values) - 1) { 
     889            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes); 
     890        } 
     891        }        
     892    } # end if items in partition > partition size 
     893    else { 
     894        # The easier case, just add a partition 
     895        my %metadata_values_to_OIDs_subhashes = (); 
     896        for (my $i = 0; $i < scalar(@metadata_values); $i++) { 
     897        my $metadata_value = $metadata_values[$i]; 
     898        $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value};           
     899        } 
     900        my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1]; 
     901        my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $partition_name_length, $is_numeric); 
     902        my $partitionname = $partition_start; 
     903        if ($partitionend ne $partition_start) { 
     904        $partitionname = $partitionname . "-" . $partitionend; 
     905        } 
    705906     
    706     # The partitions are stored in an HList 
    707     $classifier_node->{'childtype'} = "HList"; 
    708  
    709     } # end approximate_size 
    710     else { 
    711     # Generate hlists of a certain size 
    712     if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) { 
    713         my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash)); 
    714         my $itemsdone = 0; 
    715         my %metadata_value_to_OIDs_subhash = (); 
    716         my $lastpartitionend = ""; 
    717         my $partitionstart; 
    718         foreach my $metadata_value (@sortedmetadata_values) { 
    719         $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value}; 
    720         $itemsdone++; 
    721         my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash); 
    722  
    723         # Is this the start of a new partition? 
    724         if ($itemsinpartition == 1) { 
    725             $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"}); 
    726         } 
    727  
    728         # Is this the end of the partition? 
    729         if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) { 
    730             my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"}); 
    731             my $partitionname = $partitionstart; 
    732             if ($partitionend ne $partitionstart) { 
    733             $partitionname = $partitionname . "-" . $partitionend; 
    734             } 
    735  
    736             $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash); 
    737             %metadata_value_to_OIDs_subhash = (); 
    738             $lastpartitionend = $partitionend; 
    739         } 
    740         } 
    741  
    742         # The partitions are stored in an HList 
    743         $classifier_node->{'childtype'} = "HList"; 
    744     } 
    745  
    746     # Otherwise just add all the values to a VList 
    747     else { 
    748         $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash); 
    749     } 
    750     } 
    751 } 
    752  
     907        $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);            
     908        $last_partition_end = $partitionend;         
     909    } 
     910    }    
     911     
     912    # The partitions are stored in an HList 
     913    $classifier_node->{'childtype'} = "HList"; 
     914 
     915} 
     916 
     917sub split_constant_size 
     918{ 
     919    my $self = shift(@_); 
     920    my @metadata_groups = @{shift(@_)}; 
     921    my $classifier_node = shift(@_); 
     922    my $partition_size = shift(@_); 
     923    my $sort_mode = shift(@_); 
     924    my $metadata_value_to_OIDs_hash_ref = shift(@_); 
     925    my $partition_name_length = shift(@_); 
     926    my $is_numeric = shift(@_); 
     927 
     928    my @sortedmetadata_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref)); 
     929    my $itemsdone = 0; 
     930    my $numitems = scalar(@sortedmetadata_values); 
     931    my %metadata_value_to_OIDs_subhash = (); 
     932    my $lastpartitionend = ""; 
     933    my $partitionstart; 
     934    foreach my $metadata_value (@sortedmetadata_values) { 
     935    $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value}; 
     936    $itemsdone++; 
     937    my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash); 
     938 
     939    # Is this the start of a new partition? 
     940    if ($itemsinpartition == 1) { 
     941        $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $partition_name_length, $is_numeric); 
     942    } 
     943 
     944    # Is this the end of the partition? 
     945    if ($itemsinpartition == $partition_size || $itemsdone == $numitems) { 
     946        my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $partition_name_length, $is_numeric); 
     947        my $partitionname = $partitionstart; 
     948        if ($partitionend ne $partitionstart) { 
     949        $partitionname = $partitionname . "-" . $partitionend; 
     950        } 
     951 
     952        $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash); 
     953        %metadata_value_to_OIDs_subhash = (); 
     954        $lastpartitionend = $partitionend; 
     955    } 
     956    } 
     957 
     958    # The partitions are stored in an HList 
     959    $classifier_node->{'childtype'} = "HList"; 
     960 
     961} 
     962 
     963sub split_per_letter_or_digit 
     964{ 
     965    my $self = shift(@_); 
     966    my @metadata_groups = @{shift(@_)}; 
     967    my $classifier_node = shift(@_); 
     968    my $sort_mode = shift(@_); 
     969    my $metadata_value_to_OIDs_hash_ref = shift(@_); 
     970    my $is_numeric = shift(@_); 
     971    my $numeric_partition_length = shift(@_); 
     972 
     973    if (not defined $is_numeric) { 
     974    $is_numeric = 0; 
     975    } 
     976    if ($is_numeric && not defined($numeric_partition_length)) { 
     977    $numeric_partition_length = 1; 
     978    } 
     979    # Generate one hlist for each letter 
     980    my @sortedmetadata_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref)); 
     981    my %metadata_value_to_OIDs_subhash = (); 
     982 
     983    my $lastpartition = $self->generate_partition_name($sortedmetadata_values[0], $numeric_partition_length, $is_numeric); 
     984    foreach my $metadata_value (@sortedmetadata_values) { 
     985     
     986    my $metadata_valuepartition =  $self->generate_partition_name($metadata_value, $numeric_partition_length, $is_numeric); 
     987 
     988    # Is this the start of a new partition? 
     989    if ($metadata_valuepartition ne $lastpartition) { 
     990        $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash); 
     991        %metadata_value_to_OIDs_subhash = (); 
     992        $lastpartition = $metadata_valuepartition; 
     993    } 
     994     
     995    $metadata_value_to_OIDs_subhash{$metadata_value} = %$metadata_value_to_OIDs_hash_ref{$metadata_value};       
     996    } 
     997     
     998    # Don't forget to add the last partition 
     999    $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash); 
     1000     
     1001    # The partitions are stored in an HList 
     1002    $classifier_node->{'childtype'} = "HList"; 
     1003 
     1004} 
     1005 
     1006sub generate_partition_name 
     1007{ 
     1008    my $self = shift(@_); 
     1009    my $mvalue = shift(@_); 
     1010    my $numeric_partition_length = shift(@_); 
     1011    my $is_numeric = shift(@_); 
     1012 
     1013    if (!$is_numeric || $numeric_partition_length == 1 ) { 
     1014    return substr($mvalue, 0, 1); 
     1015    } 
     1016    if ($numeric_partition_length == -1) { 
     1017    my ($all_digits) = $mvalue =~ /^([0-9]+)/; 
     1018    return $all_digits; 
     1019    } 
     1020    my ($some_digits) = $mvalue =~ /^([0-9]{1,$numeric_partition_length})/; 
     1021    return $some_digits; 
     1022} 
    7531023 
    7541024sub generate_partition_start 
     
    7581028    my $lastpartitionend = shift(@_); 
    7591029    my $partition_name_length = shift(@_); 
    760  
     1030    my $is_numeric = shift(@_); 
     1031 
     1032    if ($is_numeric) { 
     1033    return $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric); 
     1034    } 
     1035     
    7611036    if ($partition_name_length) { 
    7621037    return substr($metadata_value, 0, $partition_name_length); 
     
    7821057    my $partitionstart = shift(@_); 
    7831058    my $partition_name_length = shift(@_); 
    784  
     1059    my $is_numeric = shift(@_); 
     1060 
     1061    if ($is_numeric) { 
     1062    return $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric); 
     1063    } 
    7851064    if ($partition_name_length) { 
    7861065    return substr($metadata_value, 0, $partition_name_length); 
     
    8341113    $classifier_node->{'mdtype'} = $metadata_group; 
    8351114 
     1115    my $sort_type = $self->{$metadata_group .".metadata_sort_mode_within_level"}; 
    8361116    # Create an entry in the vlist for each value 
    837     foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref}))) 
     1117    foreach my $metadata_value ($self->sort_metadata_values_array($sort_type, keys(%{$metadata_value_to_OIDs_hash_ref}))) 
    8381118    { 
    8391119    my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}; 
     
    8561136    else { 
    8571137        my $metadata_value_display = $self->get_metadata_value_display($metadata_group, $metadata_value); 
    858         # Note that we don't need to convert from unicode-aware strings  
    859         # to utf8 here, as that is handled elsewhere in the code 
    860         my %child_classifier_node = ( 'Title' => $metadata_value_display, # 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value), 
     1138        my %child_classifier_node = ( 'Title' => $metadata_value_display,  
    8611139                      'childtype' => "VList", 
    8621140                      'mdtype' => $metadata_group, 
    8631141                      'contains' => [] ); 
    8641142 
    865         #@OIDs = $self->sort_leaf_items(\@OIDs); 
    8661143        # If there are metadata elements remaining, recursively apply the process 
    8671144        if (@metadata_groups > 0) { 
     
    9321209    my $self = shift(@_); 
    9331210    my @OIDs = @{shift(@_)}; 
    934 #    my $classifier_node = shift(@_); 
    935      
     1211 
    9361212    # Sort leaf nodes and add to list 
    9371213    my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}}; 
    9381214    foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) { 
    9391215    my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"}; 
     1216    my $sort_type = $self->{$sort_leaf_nodes_usingmetaelem . ".metadata_sort_mode_within_level"}; 
    9401217    # Force a stable sort (Perl 5.6's sort isn't stable) 
    9411218    # !! The [0] bits aren't ideal (multiple metadata values) !!  
     
    9431220      if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]}) && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]})) 
    9441221      { 
    945         $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0]; 
     1222          if ($sort_type eq "numeric") { 
     1223          $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] <=> $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0]; 
     1224          } elsif ($sort_type eq "alphabetic") { 
     1225          $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0]; 
     1226          } else { 
     1227          ncmp($OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0], $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0]); 
     1228          } 
    9461229      } 
    9471230      else 
     
    9521235    } 
    9531236    if ($self->{'reverse_sort_leaf_nodes'}) { 
    954     #print STDERR "reversing\n"; 
    9551237    return reverse @OIDs; 
    9561238    } 
     
    9591241 
    9601242 
    961  
    # Sort a list of metadata values according to the requested sort mode.
    #
    # Arguments:
    #   $self       - the classifier object; its 'unicode_collator' entry is
    #                 used for the "unicode" mode when it is set
    #   $sort_mode  - "unicode", "numeric" or "alphabetic"; any other value
    #                 gives a natural sort
    #   remaining   - the metadata values to sort
    #
    # Returns the sorted list of values.
    sub sort_metadata_values_array
    {
        my ($self, $sort_mode, @values_to_sort) = @_;

        if ($sort_mode eq "unicode") {
            my $collator = $self->{'unicode_collator'};
            return $collator->sort(@values_to_sort) if ($collator);

            # No collator available, so use the default mode instead
            $sort_mode = "alphanumeric";
        }

        # Numbers compared by value (9 < 10)
        return sort { $a <=> $b } @values_to_sort if ($sort_mode eq "numeric");

        # Plain string comparison (10 < 9)
        return sort { $a cmp $b } @values_to_sort if ($sort_mode eq "alphabetic");

        # Everything else gets a natural sort
        return nsort(@values_to_sort);
    }
     1265 
     1266# we are not using this any more. Using nsort instead 
    9751267# $a and $b args automatically passed in and shouldn't be declared 
    9761268sub alpha_numeric_cmp