Changeset 33452


Ignore:
Timestamp:
2019-09-03T13:15:48+12:00 (5 years ago)
Author:
kjdon
Message:

Revamp of the List classifier, with more precise handling of numeric metadata values. You can now specify what happens to numeric values separately from what happens to word values; e.g. a classifier can have partitions A, B, C, D, etc., followed (or preceded) by a single 0-9 bucket. Numeric values can be sorted lexically (10 < 9) or numerically (9 < 10), and the sort mode used for ordering the partitions can differ from the one used for ordering the values inside each partition. Numeric values can be partitioned by their first digit, by the whole number, or by a set number of digits, e.g. 4 to get a list by year when partitioning yyyymmdd dates.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/classify/List.pm

    r31767 r33452  
    99# University of Waikato, New Zealand.
    1010#
    11 # Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
    1211#
    1312# Copyright (C) 2005 New Zealand Digital Library Project
     
    3736
    3837use BaseClassifier;
    39 
     38use Sort::Naturally;
    4039use strict;
    4140
     
    4544}
    4645
    47 my $meta_select_type_list =
     46
     47my $metadata_selection_mode_list =
    4848    [
    4949      { 'name' => "firstvalue",
     
    5353      { 'name' => "allvalues",
    5454    'desc' => "{List.metadata_selection.allvalues}"} ];
    55 my $valid_meta_select_types = { 'firstvalue' => 1,
     55
     56my $metadata_selection_mode_default = "firstvalidmetadata";
     57
     58my $valid_metadata_selection_modes = { 'firstvalue' => 1,
    5659                'firstvalidmetadata' => 1,
    5760                'allvalues' => 1 };
     61
     62my $metadata_sort_mode_list =
     63    [
     64     { 'name' => "unicode",
     65       'desc' => "{List.metadata_sort.unicode}"},
     66     { 'name' => "alphabetic",
     67    'desc' => "{List.metadata_sort.alphabetic}"},
     68     { 'name' => "alphanumeric",
     69       'desc' => "{List.metadata_sort.alphanumeric}"} ];
     70
     71my $metadata_sort_mode_default = "alphanumeric";
     72
     73my $valid_metadata_sort_modes = { 'unicode' => 1,
     74                  'alphabetic' => 1,
     75                  'alphanumeric' => 1};
    5876my $partition_type_list =
    5977    [ { 'name' => "per_letter",
     
    6684    'desc' => "{List.level_partition.none}" } ];
    6785
    68 # following used to check types later on
     86my $partition_type_default = "per_letter";
     87
    6988my $valid_partition_types = { 'per_letter' => 1,
    7089                  'constant_size' => 1,
    71                   'per_letter_fixed_size' => 1,
    7290                  'approximate_size' => 1,
    7391                  'none' => 1};
     92
     93my $partition_size_default = 30;
     94
     95my $numeric_partition_type_list =
     96    [ { 'name' => "per_digit",
     97    'desc' => "{List.level_partition.per_digit}" },
     98      { 'name' => "per_number",
     99    'desc' => "{List.level_partition.per_number}" },
     100      { 'name' => "single_partition",
     101    'desc' => "{List.level_partition_single}" },
     102      { 'name' => "approximate_size",
     103    'desc' => "{List.level_partition.approximate_size_numeric}"},
     104      { 'name' => "constant_size",
     105    'desc' => "{List.level_partition.constant_size}" },     
     106      { 'name' => "none",
     107    'desc' => "{List.level_partition.none}" } ];
     108
     109my $numeric_partition_type_default = "single_partition";
     110
     111my $valid_numeric_partition_types = { 'per_digit' => 1,
     112                      'per_number' => 1,
     113                      'constant_size' => 1,
     114                      'single_partition' => 1,
     115                      'approximate_size' => 1,
     116                                      'none' =>1  };
     117
     118my $numeric_partition_size_default = 30;
     119
     120my $numeric_partition_name_length_default = "-1"; # use the full number
    74121
    75122my $bookshelf_type_list =
     
    81128    'desc' => "{List.bookshelf_type.never}" } ]; 
    82129
     130my $bookshelf_type_default = "never";
     131my $sort_leaf_nodes_using_default = "Title";
    83132my $arguments =
    84133    [ { 'name' => "metadata",
     
    86135    'type' => "metadata",
    87136    'reqd' => "yes" },
    88       { 'name' => "metadata_selection_mode",
     137     
     138      { 'name' => "metadata_selection_mode_within_level",
    89139    'desc' => "{List.metadata_selection_mode}",
    90140    'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
    91     'list' => $meta_select_type_list,
    92     'deft' => "firstvalidmetadata" },
    93       # The interesting options
     141    'list' => $metadata_selection_mode_list,
     142    'deft' => $metadata_selection_mode_default },
     143     
     144      { 'name' => "metadata_sort_mode_within_level",
     145    'desc' => "{List.metadata_sort_mode}",
     146    'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
     147    'list' => $metadata_sort_mode_list,
     148    'deft' => $metadata_sort_mode_default },
     149     
    94150      { 'name' => "bookshelf_type",
    95151    'desc' => "{List.bookshelf_type}",
    96152    'type' => "enum",
    97153    'list' => $bookshelf_type_list,
    98     'deft' => "never" },
     154    'deft' => $bookshelf_type_default },
     155     
    99156      { 'name' => "classify_sections",
    100157    'desc' => "{List.classify_sections}",
    101158    'type' => "flag" },
     159     
    102160      { 'name' => "partition_type_within_level",
    103161    'desc' => "{List.partition_type_within_level}",
    104162    'type' => "enumstring",  # Must be enumstring because multiple values can be specified (separated by '/')
    105163    'list' => $partition_type_list,
    106     'deft' => "per_letter" },
     164    'deft' => $partition_type_default },
     165     
    107166      { 'name' => "partition_size_within_level",
    108167    'desc' => "{List.partition_size_within_level}",
    109     'type' => "string" },  # Must be string because multiple values can be specified (separated by '/')
     168    'type' => "string",  # Must be string because multiple values can be specified (separated by '/')
     169    'deft' => $partition_size_default},
     170     
    110171      { 'name' => "partition_name_length",
    111172    'desc' => "{List.partition_name_length}",
    112173    'type' => "string" },
     174     
     175      {'name' => "partition_sort_mode_within_level",
     176       'desc' => "{List.partition_sort_mode_within_level}",
     177       'type' => "enumstring",  # Must be enumstring because multiple values can be specified (separated by '/')
     178       'list' => $metadata_sort_mode_list,
     179       'deft' => $metadata_sort_mode_default },
     180     
     181      { 'name' => "numeric_partition_type_within_level",
     182    'desc' => "{List.numeric_partition_type_within_level}",
     183    'type' => "enumstring",  # Must be enumstring because multiple values can be specified (separated by '/')
     184    'list' => $numeric_partition_type_list,
     185    'deft' => $numeric_partition_type_default },
     186     
     187      {'name' => "numeric_partition_sort_mode_within_level",
     188       'desc' => "{List.numeric_partition_sort_mode_within_level}",
     189       'type' => "enumstring",  # Must be enumstring because multiple values can be specified (separated by '/')
     190       'list' => $metadata_sort_mode_list,
     191       'deft' => $metadata_sort_mode_default },
     192     
     193      { 'name' => "numeric_partition_size_within_level",
     194    'desc' => "{List.numeric_partition_size_within_level}",
     195    'type' => "string",  # Must be string because multiple values can be specified (separated by '/')
     196    'deft' => $numeric_partition_size_default},
     197     
     198      { 'name' => "numeric_partition_name_length_within_level",
     199    'desc' => "{List.numeric_partition_name_length_within_level}",
     200    'type' => "string",
     201    'deft' => $numeric_partition_name_length_default },
     202     
     203      { 'name' => "numbers_first",
     204    'desc' => "{List.numbers_first}",
     205    'type' => 'flag'},
     206     
    113207      { 'name' => "sort_leaf_nodes_using",
    114208    'desc' => "{List.sort_leaf_nodes_using}",
    115209    'type' => "metadata",
    116     'deft' => "Title" },
     210    'deft' => $sort_leaf_nodes_using_default },
     211     
     212      { 'name' => "sort_leaf_nodes_sort_mode",
     213    'desc' => "{List.sort_leaf_nodes_sort_mode}",
     214    'type' => "enum",
     215    'list' => $metadata_sort_mode_list,
     216    'deft' => $metadata_sort_mode_default },
     217     
    117218      { 'name' => "reverse_sort_leaf_nodes",
    118219    'desc' => "{List.reverse_sort_leaf_nodes}",
    119220    'type' => "flag"},
     221     
    120222      { 'name' => "sort_using_unicode_collation",
    121     'desc' => "{List.sort_using_unicode_collation}",
     223    'desc' => "{List.metadata_sort.unicode} {List.sort_using_unicode_collation}",
    122224    'type' => "flag" },
     225     
    123226      { 'name' => "use_hlist_for",
    124227    'desc' => "{List.use_hlist_for}",
    125228    'type' => "string" },
     229     
    126230      {'name' => "filter_metadata",
    127231       'desc' => "{List.filter_metadata}",
    128232       'type' => "metadata"},
     233     
    129234      {'name' => "filter_regex",
    130235       'desc' => "{List.filter_regex}",
    131236       'type' => "regexp"},
     237     
    132238      { 'name' => "standardize_capitalization",
    133239    'desc' => "{List.standardize_capitalization}",
    134240    'type' => "flag"},
     241     
    135242      { 'name' => "removeprefix",
    136243    'desc' => "{BasClas.removeprefix}",
    137244    'type' => "regexp" },
     245     
    138246      { 'name' => "removesuffix",
    139247    'desc' => "{BasClas.removesuffix}",
     
    158266    my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
    159267
     268    bless $self, $class;
     269
    160270    if ($self->{'info_only'}) {
    161271    # don't worry about any options etc
    162     return bless $self, $class;
     272    return $self;
    163273    }
    164274
     
    179289
    180290    # meta selection mode for each level
    181     if (!$self->{'metadata_selection_mode'}) {
    182     foreach my $metadata_group (@metadata_groups) {
    183         $self->{$metadata_group . ".metadata_selection_mode"} = "firstvalidmetadata";
    184     }
     291    $self->set_metadata_groups_info_per_level("metadata_selection_mode", $metadata_selection_mode_default, $valid_metadata_selection_modes);
     292   
     293    # meta sort mode for each level
     294    if ($self->{'sort_using_unicode_collation'}) {
     295    print STDERR "WARNING: sort_using_unicode_collation is set, setting metadata_sort_mode_within_level to unicode for all levels, regardless of current setting\n";
     296    $self->{'metadata_sort_mode_within_level'} = "unicode";
     297    $metadata_sort_mode_default = "unicode";
    185298    } else {
    186     my @metadata_selection_mode_list = split(/\//, $self->{'metadata_selection_mode'});
    187     foreach my $metadata_group (@metadata_groups) {
    188         my $meta_select_mode = shift(@metadata_selection_mode_list);
    189         if (defined($meta_select_mode) && defined $valid_meta_select_types->{$meta_select_mode}) {
    190         $self->{$metadata_group . ".metadata_selection_mode"} = $meta_select_mode;
    191         } else {
    192         $self->{$metadata_group . ".metadata_selection_mode"} = "firstvalidmetadata";   
    193         }
    194     }
    195     }   
     299    if ($self->{'metadata_sort_mode_within_level'} =~ /unicode/) {
     300        $self->{'sort_using_unicode_collation'} = 1;
     301    }
     302    }
     303   
     304   
     305    $self->set_metadata_groups_info_per_level('metadata_sort_mode_within_level', $metadata_sort_mode_default, $valid_metadata_sort_modes);
     306   
    196307    # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
    197308    foreach my $metadata_group (@metadata_groups) {
     
    210321    }
    211322
    212     # How the items are grouped into partitions (default: no partition)
     323    # How the items are grouped into partitions (default: by letter)
    213324    # for each level (metadata group), separated by '/'
    214     if (!$self->{"partition_type_within_level"}) {
    215     foreach my $metadata_group (@metadata_groups) {
    216         $self->{$metadata_group . ".partition_type_within_level"} = "none";
    217     }
    218     } else {
    219     my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'}); 
    220        
    221     my $first = 1;
    222     foreach my $metadata_group (@metadata_groups) {
    223         my $partition_type_within_levelelem = shift(@partition_type_within_levellist);
    224         if (defined($partition_type_within_levelelem) && $partition_type_within_levelelem eq "per_letter_fixed_size") {
    225         print STDERR "per letter fixed size, changing to approximate size\n";
    226         $partition_type_within_levelelem = "approximate_size";
    227         }
    228         if (defined($partition_type_within_levelelem) && defined $valid_partition_types->{$partition_type_within_levelelem}) {
    229         $self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem;
    230         }
    231         else {
    232         if ($first) {
    233             $self->{$metadata_group . ".partition_type_within_level"} = "none";
    234             $first = 0;
    235         } else {
    236             $self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"};
    237         }
    238         if (defined($partition_type_within_levelelem)) {
    239             # ie invalid entry
    240             print STDERR "invalid partition type for level $metadata_group: $partition_type_within_levelelem, defaulting to ". $self->{$metadata_group . ".partition_type_within_level"} ."\n";
    241         }
    242         }
    243     }
    244     }
    245    
    246     # The number of items in each partition
    247     if (!$self->{'partition_size_within_level'}) {
    248     # Default: 20
    249     foreach my $metadata_group (@metadata_groups) {
    250         $self->{$metadata_group . ".partition_size_within_level"} = 20;
    251     }
    252     }
    253     else {
    254     my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
    255 
    256     # Assign values based on the partition_size_within_level parameter
    257     foreach my $metadata_group (@metadata_groups) {
    258         my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
    259         if (defined($partition_size_within_levelelem)) {
    260         $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
    261         }
    262         else {
    263         $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
    264         }
    265     }
    266     }
    267 
     325
     326    $self->set_metadata_groups_info_per_level("partition_type_within_level", $partition_type_default, $valid_partition_types);
     327
     328    $self->set_metadata_groups_info_per_level("numeric_partition_type_within_level", $numeric_partition_type_default, $valid_numeric_partition_types);
     329
     330    $self->set_metadata_groups_info_per_level("partition_size_within_level", $partition_size_default);
     331    $self->set_metadata_groups_info_per_level("numeric_partition_size_within_level", $numeric_partition_size_default);
     332
     333    $self->set_metadata_groups_info_per_level('partition_sort_mode_within_level', $metadata_sort_mode_default, $valid_metadata_sort_modes);
     334
     335    $self->set_metadata_groups_info_per_level('numeric_partition_sort_mode_within_level', $metadata_sort_mode_default, $valid_metadata_sort_modes);
     336
     337    $self->set_metadata_groups_info_per_level("numeric_partition_name_length_within_level", $numeric_partition_name_length_default);
     338   
    268339    # The removeprefix and removesuffix expressions
    269340    if ($self->{'removeprefix'}) {
     
    303374
    304375    # The metadata elements to use to sort the leaf nodes (default: Title)
    305     my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
    306     if ($self->{'sort_leaf_nodes_using'}) {
    307     @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
    308     }
     376    my @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
    309377    $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
     378
    310379    foreach my $sort_group (@sort_leaf_nodes_using_metadata_groups) {
    311380    # set metadata_select_type, if not already set - might be already set if the same group was used in -metadata
    312381    if (!defined $self->{$sort_group . ".metadata_selection_mode"}) {
    313         $self->{$sort_group . ".metadata_selection_mode"} = "firstvalue";
    314     }
    315     }
     382        $self->{$sort_group . ".metadata_selection_mode"} = $metadata_selection_mode_default;
     383    }
     384   
     385    }
     386   
     387    my @leaf_nodes_sort_modes = split (/\|/, $self->{'sort_leaf_nodes_sort_mode'});
     388    foreach my $sort_group (@sort_leaf_nodes_using_metadata_groups) {
     389    my $leaf_sort_mode = shift(@leaf_nodes_sort_modes);
     390    if (!defined $self->{$sort_group . ".metadata_sort_mode_within_level"}) {
     391        if (defined $leaf_sort_mode && defined $valid_metadata_sort_modes->{$leaf_sort_mode}) {
     392        $self->{$sort_group . ".metadata_sort_mode_within_level"} = $leaf_sort_mode;
     393        }
     394        else {
     395        $self->{$sort_group . ".metadata_sort_mode_within_level"} = $metadata_sort_mode_default;
     396        }
     397    }
     398
     399    }
     400 
    316401    # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
    317402    if ($self->{'sort_using_unicode_collation'}) {
     
    327412    # A hash for all the doc ids that we have seen, so we don't classify something twice
    328413    $self->{'all_doc_OIDs'} = {};
    329     return bless $self, $class;
     414    #return bless $self, $class;
     415    return $self;
    330416}
    331417
     
    336422}
    337423
     424sub set_metadata_groups_info_per_level
     425{
     426    my $self = shift(@_);
     427    my $info_name = shift(@_);
     428    my $info_default = shift(@_);
     429    my $info_valid_types_hash_ref = shift(@_);
     430   
     431    my @info_list = split(/\//, $self->{$info_name});
     432
     433    my $first = 1;
     434    foreach my $metadata_group (@{$self->{'metadata_groups'}}) {
     435    my $info_elem = shift(@info_list);
     436    if (defined ($info_elem) && (!defined $info_valid_types_hash_ref || defined $info_valid_types_hash_ref->{$info_elem})) {
     437        $self->{$metadata_group .".$info_name"} = $info_elem;
     438    } else {
     439        # its empty or an invalid entry
     440        my $new_info_elem;
     441        if ($first) {
     442        $new_info_elem = $info_default;
     443        } else {
     444        # get the value we had at first
     445        $new_info_elem = $self->{@{$self->{'metadata_groups'}}[0] . ".$info_name"};
     446        }
     447        $self->{$metadata_group .".$info_name"} = $new_info_elem;
     448        if (defined $info_elem) {
     449        print STDERR "List Error: $info_elem is not a valid value for $info_name, changing it to $new_info_elem\n";
     450       
     451        }
     452    }
     453    $first = 0;
     454    }
     455
     456}
    338457
    339458# Called for each document in the collection
     
    347466    return;
    348467    }
     468   
    349469    $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()} = 1;
     470   
    350471    # check against filter here
    351472    if ($self->{'filter_metadata'}) {
    352     #print STDERR "filtering documents on $self->{'filter_metadata'}\n";
    353473    my $meta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'filter_metadata'});
    354474    return unless defined $meta;
    355475    if ($self->{'filter_regex'} ne "" && $meta !~ /$self->{'filter_regex'}/) {
    356         #print STDERR "doesn't match regex\n";
     476        print STDERR "doc $doc_obj doesn't pass filtering requirement\n" if ($self->{'verbosity'} > 3);
    357477        return;
    358 
    359478    }
    360479    }
     
    381500
    382501    my @metadata_groups = @{$self->{'metadata_groups'}};
    383 
    384    
     502   
    385503    # Only classify the section if it has a value for one of the metadata elements in the first group
    386504    my $classify_section = 0;
     
    391509    my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
    392510    my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element); 
    393    
    394511    # Remove prefix/suffix if requested
    395512    if (defined ($first_metadata_group_element_value)) {
     
    402519        }
    403520    }
     521
     522    $first_metadata_group_element_value  = &sorttools::format_metadata_for_sorting($first_metadata_group, $first_metadata_group_element_value, $doc_obj) unless $self->{'no_metadata_formatting'};
    404523    if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
    405         # This section must be included in the classifier
     524        # This section must be included in the classifier as we have found a value
    406525        $classify_section = 1;
    407526        last;
    408527    }
    409528    }
    410 
     529   
    411530    # We're not classifying this section because it doesn't have the required metadata
    412531    return if (!$classify_section);
     
    428547        my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
    429548        foreach my $metadata_value (@metadata_values) {
     549            #print STDERR "working with value $metadata_value\n";
    430550            # Strip leading and trailing whitespace
    431551            $metadata_value =~ s/^\s*//;
     
    444564            # text-transform if stored uppercase). 2 CSS text-transforms have been added to core.css
    445565            my $lc_metadata_value = lc($metadata_value);
    446            
    447             # We are already working with unicode aware strings at this
    448             # stage, so we no longer need to convert from utf8 to unicode
    449             #my $metadata_value_unicode_string = $metadata_value; # $self->convert_utf8_string_to_unicode_string($metadata_value);
    450 
    451             # Add the metadata value into the list for this combination of metadata group and section
     566            $lc_metadata_value  = &sorttools::format_metadata_for_sorting($real_metadata_element, $lc_metadata_value, $doc_obj) unless $self->{'no_metadata_formatting'};
     567            #print STDERR "formatted value = $lc_metadata_value\n";
     568   
     569            # Add the metadata value into the list for this combination of metadata group
     570            # and section - if we have some non-whitespace chars
    452571            # text that we have some non-whitespace chars
    453572            if ($lc_metadata_value =~ /\S/) {
     
    455574            push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $lc_metadata_value);
    456575           
    457            
    458             # add the actual value into the stored values so we can remember the case
    459             if (!$self->{'standardize_capitalization'}) {
    460             if (defined $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}) {
    461                 $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}++;
    462             } else {
    463                 $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value} = 1;
     576            # add the actual value into the stored values so we can remember the case
     577            if (!$self->{'standardize_capitalization'}) {
     578                if (defined $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}) {
     579                $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}++;
     580                } else {
     581                $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value} = 1;
     582                }
    464583            }
    465             }
    466584            last if ($self->{$metadata_group . ".metadata_selection_mode"} eq "firstvalue");
    467585            }
     
    508626   
    509627    my $metadata_group = $metadata_groups[0];
    510    
    511628    if (!defined($self->{$metadata_group . ".list"})) {
    512629    print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
     
    516633    # Create a mapping from metadata value to OID
    517634    my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
    518     my %metadata_value_to_OIDs_hash = ();   
     635    my %metadata_value_to_OIDs_hash = ();
     636    my %numeric_metadata_value_to_OIDs_hash = ();
    519637    foreach my $OID (@OIDs)
    520638    {
     
    524642        foreach my $metadata_value (@metadata_values)
    525643        {
    526         push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
     644        if ($metadata_value =~ /^[0-9]/) {
     645            push(@{$numeric_metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
     646        } else {
     647            push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
     648        }
    527649        }
    528650    }
    529651    }
    530652    #print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
     653    #print STDERR "Number of distinct numeric values: " . scalar(keys %numeric_metadata_value_to_OIDs_hash) . "\n";
    531654
    532655    # Partition the values (if necessary)
    533656    my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
    534657    my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
    535     if ($partition_type_within_level =~ /^per_letter$/i) {
    536     # Generate one hlist for each letter
    537     my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
    538     my %metadata_value_to_OIDs_subhash = ();
    539 
    540     my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
    541     foreach my $metadata_value (@sortedmetadata_values) {
    542         my $metadata_valuepartition = substr($metadata_value, 0, 1);
    543 
    544         # Is this the start of a new partition?
    545         if ($metadata_valuepartition ne $lastpartition) {
    546         $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
    547         %metadata_value_to_OIDs_subhash = ();
    548         $lastpartition = $metadata_valuepartition;
    549         }
    550 
    551         $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};       
    552     }
    553 
    554     # Don't forget to add the last partition
    555     $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
    556 
    557     # The partitions are stored in an HList
    558     $classifier_node->{'childtype'} = "HList";
    559     }
    560     elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
    561     # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size
    562     # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible
    563     # If a letter has more items than specified, split into several hlists.
    564     # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
    565     my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};       
    566     my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
    567     my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};
     658    my $partition_sort_mode_within_level = $self->{$metadata_group . ".partition_sort_mode_within_level"};
     659    my $bookshelf_type_within_level = $self->{$metadata_group. ".bookshelf_type"};
     660   
     661   
     662    #############################################
     663    ### DO THE NUMBERS IF THEY ARE TO COME FIRST
     664    #############################################
     665    if ($self->{'numbers_first'} && keys(%numeric_metadata_value_to_OIDs_hash)) {
     666    $self->partition_numeric_values(\@metadata_groups, $classifier_node, \%numeric_metadata_value_to_OIDs_hash);
     667    }
     668
     669    ############################################
     670    #  DO THE LETTERS
     671    ############################################
     672    if (keys(%metadata_value_to_OIDs_hash)){ # make sure we have some values
     673    if ($partition_type_within_level =~ /^per_letter$/i) {
     674        $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $partition_sort_mode_within_level, \%metadata_value_to_OIDs_hash);
     675    }
     676    elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
     677        $self->split_approximate_size(\@metadata_groups, $classifier_node, $partition_size_within_level, $partition_sort_mode_within_level, $bookshelf_type_within_level, \%metadata_value_to_OIDs_hash, $self->{'partition_name_length'});
     678       
     679    }
     680    elsif ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
     681        $self->split_constant_size(\@metadata_groups, $classifier_node, $partition_size_within_level, $partition_sort_mode_within_level, \%metadata_value_to_OIDs_hash, $self->{'partition_name_length'});
     682    }
     683       
     684    # Otherwise just add all the values to a VList
     685    else {
     686        $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
     687    }
     688    }
     689
     690    ###########################################
     691    ### DO THE NUMBERS IF THEY ARE TO COME LAST
     692    ###########################################
     693    if (!$self->{'numbers_first'} && keys(%numeric_metadata_value_to_OIDs_hash)) {
     694    $self->partition_numeric_values(\@metadata_groups, $classifier_node, \%numeric_metadata_value_to_OIDs_hash);
     695    }
     696}
     697
     698sub partition_numeric_values
     699{
     700    my $self = shift(@_);
     701    my @metadata_groups = @{shift(@_)};
     702    my $classifier_node = shift(@_);
     703    my $numeric_metadata_value_to_OIDs_hash_ref = shift(@_);
     704
     705    my $metadata_group = $metadata_groups[0];
     706    my $numeric_partition_type_within_level = $self->{$metadata_group . ".numeric_partition_type_within_level"};
     707    my $numeric_partition_size_within_level = $self->{$metadata_group . ".numeric_partition_size_within_level"};
     708    my $numeric_partition_sort_mode_within_level = $self->{$metadata_group . ".numeric_partition_sort_mode_within_level"};
     709    my $numeric_partition_name_length_within_level = $self->{$metadata_group . ".numeric_partition_name_length_within_level"};
     710    my $bookshelf_type_within_level = $self->{$metadata_group. ".bookshelf_type"};
     711   
     712    if ($numeric_partition_type_within_level eq "single_partition") {
     713    $self->add_hlist_partition(\@metadata_groups, $classifier_node, "0-9", $numeric_metadata_value_to_OIDs_hash_ref);
     714    }
     715    elsif ($numeric_partition_type_within_level eq "per_digit") {
     716    $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $numeric_partition_sort_mode_within_level, $numeric_metadata_value_to_OIDs_hash_ref, 1, 1);
     717    }
     718    elsif ($numeric_partition_type_within_level eq "per_number") {
     719    # each different number is a bucket
     720    $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $numeric_partition_sort_mode_within_level, $numeric_metadata_value_to_OIDs_hash_ref, 1, $numeric_partition_name_length_within_level);
     721    }
     722    elsif ($numeric_partition_type_within_level eq "constant_size" && scalar(keys %$numeric_metadata_value_to_OIDs_hash_ref) > $numeric_partition_size_within_level) {
     723    # Generate hlists of a certain size
    568724   
    569     # Separate values by their first letter, each form a bucket, like the per_letter partition type
    570     my $last_partition = substr($sortedmetadata_values[0], 0, 1);
    571     my %partition_buckets = ();
    572     my @metadata_values_in_bucket = ();
    573     my $num_items_in_bucket = 0;
    574     foreach my $metadata_value (@sortedmetadata_values) {
    575         my $metadata_valuepartition = substr($metadata_value, 0, 1);
    576         if ($metadata_valuepartition ne $last_partition) {
    577         my @temp_array = @metadata_values_in_bucket;
    578         # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
    579         my %partition_info = ();       
    580         $partition_info{'metadata_values'} = \@temp_array;
    581         $partition_info{'size'} = $num_items_in_bucket;     
    582         $partition_buckets{$last_partition} = \%partition_info;     
     725    $self->split_constant_size(\@metadata_groups, $classifier_node, $numeric_partition_size_within_level, $numeric_partition_sort_mode_within_level, $numeric_metadata_value_to_OIDs_hash_ref, $numeric_partition_name_length_within_level, 1);
     726    } elsif ($numeric_partition_type_within_level eq "approximate_size" && scalar(keys %$numeric_metadata_value_to_OIDs_hash_ref) > $numeric_partition_size_within_level) {
     727    $self->split_approximate_size(\@metadata_groups, $classifier_node, $numeric_partition_size_within_level, $numeric_partition_sort_mode_within_level, $bookshelf_type_within_level, $numeric_metadata_value_to_OIDs_hash_ref, $numeric_partition_name_length_within_level, 1);
     728    }
     729    # Otherwise just add all the values to a VList
     730    else {
     731    $self->add_vlist(\@metadata_groups, $classifier_node, $numeric_metadata_value_to_OIDs_hash_ref);
     732    }
     733   
     734}
     735
     736sub split_approximate_size
     737{
     738
     739    my $self = shift(@_);
     740    my @metadata_groups = @{shift(@_)};
     741    my $classifier_node = shift(@_);
     742    my $partition_size = shift(@_);
     743    my $sort_mode = shift(@_);
     744    my $bookshelf_type = shift(@_);
     745    my $metadata_value_to_OIDs_hash_ref = shift(@_);
     746    my $partition_name_length = shift(@_);
     747    my $is_numeric = shift(@_);
     748
     749    # Generate hlist based on the first letter of the metadata value (like per_letter), or based on
     750    # numbers, but also with restriction on the partition size
     751    # If a partition has fewer items than specified by the "partition_size_within_level", then group them together if possible
     752    # If a partition has more items than specified, split into several hlists.
     753    # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
     754
     755    my @sortedmetadata_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref));
     756   
     757    # Separate values by their first letter, each form a bucket, like the per_letter partition type
     758    my $last_partition = $self->generate_partition_name($sortedmetadata_values[0], $partition_name_length, $is_numeric);
     759   
     760    my @partition_buckets = ();
     761    my @metadata_values_in_bucket = ();
     762    my $num_items_in_bucket = 0;
     763
     764    foreach my $metadata_value (@sortedmetadata_values) {
     765    my $metadata_valuepartition = $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric);
     766    if ($metadata_valuepartition ne $last_partition) {
     767        my @temp_array = @metadata_values_in_bucket;
     768        # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
     769        my %partition_info = ();       
     770        $partition_info{'metadata_values'} = \@temp_array;
     771        $partition_info{'size'} = $num_items_in_bucket;
     772        $partition_info{'name'} = $last_partition;
     773        push (@partition_buckets, \%partition_info);
     774       
     775        @metadata_values_in_bucket = ($metadata_value);
     776        $num_items_in_bucket = ($bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : scalar(@metadata_values_in_bucket));
     777        $last_partition = $metadata_valuepartition;
     778    } else {
     779        $num_items_in_bucket += ($bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1); #scalar(@metadata_values_in_bucket);
     780        push (@metadata_values_in_bucket, $metadata_value);
     781    }
     782    }
     783    # Last one
     784    my %partition_info = ();
     785    $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
     786    $partition_info{'size'} = $num_items_in_bucket;
     787    $partition_info{'name'} = $last_partition;
     788   
     789    push (@partition_buckets, \%partition_info);
     790
     791    # now go through the array of buckets, and merge small buckets
     792    my @new_partition_buckets = ();
     793    for (my $i = 0; $i < scalar(@partition_buckets) - 1; $i++) {
     794   
     795    my $this_bucket = $partition_buckets[$i]; #->{'name'};
     796    my $next_bucket = $partition_buckets[$i+1]; #->{'name'};
     797           
     798    my $items_in_partition = $this_bucket->{'size'};               
     799   
     800    if ($items_in_partition < $partition_size ) {
     801        my $items_in_next_partition = $next_bucket->{'size'};
     802        if ($items_in_partition + $items_in_next_partition <= $partition_size ) {
     803        # merge this bucket into the next bucket
     804        foreach my $metadata_value_to_merge (@{$this_bucket->{'metadata_values'}}) {
     805            push(@{$next_bucket->{'metadata_values'}}, $metadata_value_to_merge);
     806        }           
     807        $next_bucket->{'size'} += $items_in_partition;
    583808       
    584         @metadata_values_in_bucket = ($metadata_value);
    585         $num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
    586         $last_partition = $metadata_valuepartition;
    587809        } else {
    588         $num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
    589         push (@metadata_values_in_bucket, $metadata_value);
    590         }
    591     }
    592     # Last one
    593     my %partition_info = ();
    594     $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
    595     $partition_info{'size'} = $num_items_in_bucket;
    596     $partition_buckets{$last_partition} = \%partition_info;
     810        # remember this bucket
     811        push (@new_partition_buckets, $this_bucket);
     812        }
     813    } else {
     814        # remember this bucket
     815        push (@new_partition_buckets, $this_bucket);
     816    }
     817    }
     818    # add in the last bucket
     819    my $last_bucket = $partition_buckets[scalar(@partition_buckets) - 1];
     820    push (@new_partition_buckets, $last_bucket);
     821   
     822    # Add partitions to the main list, but divide big bucket into several
     823    my $last_partition_end = "";
     824    my $partition_start = "";
     825    foreach my $partition (@new_partition_buckets) {
     826    my @metadata_values = $self->sort_metadata_values_array($sort_mode, @{$partition->{'metadata_values'}});
     827    my $items_in_partition = $partition->{'size'};
     828    $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $partition_name_length, $is_numeric);
     829   
     830    if ($items_in_partition > $partition_size) {
     831        my $items_done = 0;
     832        my %metadata_values_to_OIDs_subhashes = ();
     833        for (my $i = 0; $i < scalar(@metadata_values); $i++) {
     834        my $metadata_value = $metadata_values[$i];
     835        # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
     836        my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1;
    597837       
    598     my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));   
    599     for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) {
    600         my $partition = $partition_keys[$i];
    601         my $items_in_partition = $partition_buckets{$partition}->{'size'};             
    602         # Merge small buckets together, but keep the numeric bucket apart
    603         if ($items_in_partition < $partition_size_within_level) {
    604         my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'};
    605         if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level
    606             && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)
    607              || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) {
    608             foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) {
    609             push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge);
    610             }           
    611             $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition;
    612             delete $partition_buckets{$partition};                     
    613         }
    614         }
    615     }
    616     @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
    617    
    618     # Add partitions, and divide big bucket into several
    619     my $last_partition_end = "";
    620     my $partition_start = "";
    621     foreach my $partition (@partition_keys) {
    622         my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}});
    623         my $items_in_partition = $partition_buckets{$partition}->{'size'};
    624         $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"});
    625        
    626         if ($items_in_partition > $partition_size_within_level) {   
    627         my $items_done = 0;
    628         my %metadata_values_to_OIDs_subhashes = ();
    629         for (my $i = 0; $i < scalar(@metadata_values); $i++) {
    630             my $metadata_value = $metadata_values[$i];
    631             # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
    632             my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;
    633 
    634             my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"});
    635             my $partitionname = $partition_start;
    636             if ($partitionend ne $partition_start) {
    637             $partitionname = $partitionname . "-" . $partitionend;
    638             }
    639            
    640             # Start a new partition
    641             if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) {
    642             $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
    643             $last_partition_end = $partitionend;           
    644             $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
    645             $items_done = 0;
    646             %metadata_values_to_OIDs_subhashes = ();
    647             }
    648            
    649             # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
    650                     if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) {
    651             my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
    652             # Get the number of partitions needed for this value
    653             my $num_splits = int($items_for_this_md_value / $partition_size_within_level);
    654             $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits);
    655 
    656             my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}};
    657             for (my $i = 0; $i < $num_splits; $i++) {
    658                 my %OIDs_subhashes_for_this_value = ();
    659                 my @OIDs_for_this_partition = ();
    660                 for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) {
    661                 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
    662                 }
    663                
    664                 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
    665                 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) {
    666                 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
    667                 $items_done += scalar(@OIDs_for_this_partition);
    668                 next;
    669                 }
    670                
    671                 # Add an HList for this bucket
    672                 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
    673                 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
    674                 $last_partition_end = $partitionname_for_this_value;
    675             }
    676             next;
    677                     }
    678                        
    679             $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
    680             $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;         
    681            
    682             # The last partition
    683             if($i == scalar(@metadata_values) - 1) {
    684             $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
    685             }
    686         }       
    687         }
    688         else {
    689         # The easier case, just add a partition
    690         my %metadata_values_to_OIDs_subhashes = ();
    691         for (my $i = 0; $i < scalar(@metadata_values); $i++) {
    692             my $metadata_value = $metadata_values[$i];
    693             $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};           
    694         }
    695         my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
    696         my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"});
     838        my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $partition_name_length, $is_numeric);
    697839        my $partitionname = $partition_start;
    698840        if ($partitionend ne $partition_start) {
    699841            $partitionname = $partitionname . "-" . $partitionend;
    700842        }
    701         $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);           
    702         $last_partition_end = $partitionend;       
    703         }
    704     }   
     843       
     844        if ($items_done + $items_for_this_md_value > $partition_size && $items_done != 0) {
     845            # Start a new partition
     846            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
     847            $last_partition_end = $partitionend;           
     848            $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $is_numeric);
     849            $items_done = 0;
     850            %metadata_values_to_OIDs_subhashes = ();
     851        }
     852       
     853        # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
     854        if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size) {
     855           
     856            my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $is_numeric);
     857            # Get the number of partitions needed for this value
     858            my $num_splits = int($items_for_this_md_value / $partition_size);
     859            $num_splits++ if ($items_for_this_md_value / $partition_size > $num_splits);
     860           
     861            my @OIDs_for_this_value = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
     862            for (my $i = 0; $i < $num_splits; $i++) {
     863            my %OIDs_subhashes_for_this_value = ();
     864            my @OIDs_for_this_partition = ();
     865            for (my $d = $i * $partition_size; $d < (($i+1) * $partition_size > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size); $d++) {
     866                push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
     867            }
     868           
     869            # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
     870            if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size) {
     871                $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
     872                $items_done += scalar(@OIDs_for_this_partition);
     873                next;
     874            }
     875           
     876            # Add an HList for this bucket
     877            $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
     878            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
     879            $last_partition_end = $partitionname_for_this_value;
     880            }
     881            next;
     882        }
     883                       
     884        $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
     885        $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;         
     886       
     887        # The last partition
     888        if($i == scalar(@metadata_values) - 1) {
     889            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
     890        }
     891        }       
     892    } # end if items in partition > partition size
     893    else {
     894        # The easier case, just add a partition
     895        my %metadata_values_to_OIDs_subhashes = ();
     896        for (my $i = 0; $i < scalar(@metadata_values); $i++) {
     897        my $metadata_value = $metadata_values[$i];
     898        $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value};         
     899        }
     900        my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
     901        my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $partition_name_length, $is_numeric);
     902        my $partitionname = $partition_start;
     903        if ($partitionend ne $partition_start) {
     904        $partitionname = $partitionname . "-" . $partitionend;
     905        }
    705906   
    706     # The partitions are stored in an HList
    707     $classifier_node->{'childtype'} = "HList";
    708 
    709     } # end approximate_size
    710     else {
    711     # Generate hlists of a certain size
    712     if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
    713         my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
    714         my $itemsdone = 0;
    715         my %metadata_value_to_OIDs_subhash = ();
    716         my $lastpartitionend = "";
    717         my $partitionstart;
    718         foreach my $metadata_value (@sortedmetadata_values) {
    719         $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
    720         $itemsdone++;
    721         my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
    722 
    723         # Is this the start of a new partition?
    724         if ($itemsinpartition == 1) {
    725             $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
    726         }
    727 
    728         # Is this the end of the partition?
    729         if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) {
    730             my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
    731             my $partitionname = $partitionstart;
    732             if ($partitionend ne $partitionstart) {
    733             $partitionname = $partitionname . "-" . $partitionend;
    734             }
    735 
    736             $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
    737             %metadata_value_to_OIDs_subhash = ();
    738             $lastpartitionend = $partitionend;
    739         }
    740         }
    741 
    742         # The partitions are stored in an HList
    743         $classifier_node->{'childtype'} = "HList";
    744     }
    745 
    746     # Otherwise just add all the values to a VList
    747     else {
    748         $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
    749     }
    750     }
    751 }
    752 
     907        $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);           
     908        $last_partition_end = $partitionend;       
     909    }
     910    }   
     911   
     912    # The partitions are stored in an HList
     913    $classifier_node->{'childtype'} = "HList";
     914
     915}
     916
     917sub split_constant_size
     918{
     919    my $self = shift(@_);
     920    my @metadata_groups = @{shift(@_)};
     921    my $classifier_node = shift(@_);
     922    my $partition_size = shift(@_);
     923    my $sort_mode = shift(@_);
     924    my $metadata_value_to_OIDs_hash_ref = shift(@_);
     925    my $partition_name_length = shift(@_);
     926    my $is_numeric = shift(@_);
     927
     928    my @sortedmetadata_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref));
     929    my $itemsdone = 0;
     930    my $numitems = scalar(@sortedmetadata_values);
     931    my %metadata_value_to_OIDs_subhash = ();
     932    my $lastpartitionend = "";
     933    my $partitionstart;
     934    foreach my $metadata_value (@sortedmetadata_values) {
     935    $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
     936    $itemsdone++;
     937    my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
     938
     939    # Is this the start of a new partition?
     940    if ($itemsinpartition == 1) {
     941        $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $partition_name_length, $is_numeric);
     942    }
     943
     944    # Is this the end of the partition?
     945    if ($itemsinpartition == $partition_size || $itemsdone == $numitems) {
     946        my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $partition_name_length, $is_numeric);
     947        my $partitionname = $partitionstart;
     948        if ($partitionend ne $partitionstart) {
     949        $partitionname = $partitionname . "-" . $partitionend;
     950        }
     951
     952        $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
     953        %metadata_value_to_OIDs_subhash = ();
     954        $lastpartitionend = $partitionend;
     955    }
     956    }
     957
     958    # The partitions are stored in an HList
     959    $classifier_node->{'childtype'} = "HList";
     960
     961}
     962
     963sub split_per_letter_or_digit
     964{
     965    my $self = shift(@_);
     966    my @metadata_groups = @{shift(@_)};
     967    my $classifier_node = shift(@_);
     968    my $sort_mode = shift(@_);
     969    my $metadata_value_to_OIDs_hash_ref = shift(@_);
     970    my $is_numeric = shift(@_);
     971    my $numeric_partition_length = shift(@_);
     972
     973    if (not defined $is_numeric) {
     974    $is_numeric = 0;
     975    }
     976    if ($is_numeric && not defined($numeric_partition_length)) {
     977    $numeric_partition_length = 1;
     978    }
     979    # Generate one hlist for each letter
     980    my @sortedmetadata_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref));
     981    my %metadata_value_to_OIDs_subhash = ();
     982
     983    my $lastpartition = $self->generate_partition_name($sortedmetadata_values[0], $numeric_partition_length, $is_numeric);
     984    foreach my $metadata_value (@sortedmetadata_values) {
     985   
     986    my $metadata_valuepartition =  $self->generate_partition_name($metadata_value, $numeric_partition_length, $is_numeric);
     987
     988    # Is this the start of a new partition?
     989    if ($metadata_valuepartition ne $lastpartition) {
     990        $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
     991        %metadata_value_to_OIDs_subhash = ();
     992        $lastpartition = $metadata_valuepartition;
     993    }
     994   
     995    $metadata_value_to_OIDs_subhash{$metadata_value} = %$metadata_value_to_OIDs_hash_ref{$metadata_value};     
     996    }
     997   
     998    # Don't forget to add the last partition
     999    $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
     1000   
     1001    # The partitions are stored in an HList
     1002    $classifier_node->{'childtype'} = "HList";
     1003
     1004}
     1005
     1006sub generate_partition_name
     1007{
     1008    my $self = shift(@_);
     1009    my $mvalue = shift(@_);
     1010    my $numeric_partition_length = shift(@_);
     1011    my $is_numeric = shift(@_);
     1012
     1013    if (!$is_numeric || $numeric_partition_length == 1 ) {
     1014    return substr($mvalue, 0, 1);
     1015    }
     1016    if ($numeric_partition_length == -1) {
     1017    my ($all_digits) = $mvalue =~ /^([0-9]+)/;
     1018    return $all_digits;
     1019    }
     1020    my ($some_digits) = $mvalue =~ /^([0-9]{1,$numeric_partition_length})/;
     1021    return $some_digits;
     1022}
    7531023
    7541024sub generate_partition_start
     
    7581028    my $lastpartitionend = shift(@_);
    7591029    my $partition_name_length = shift(@_);
    760 
     1030    my $is_numeric = shift(@_);
     1031
     1032    if ($is_numeric) {
     1033    return $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric);
     1034    }
     1035   
    7611036    if ($partition_name_length) {
    7621037    return substr($metadata_value, 0, $partition_name_length);
     
    7821057    my $partitionstart = shift(@_);
    7831058    my $partition_name_length = shift(@_);
    784 
     1059    my $is_numeric = shift(@_);
     1060
     1061    if ($is_numeric) {
     1062    return $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric);
     1063    }
    7851064    if ($partition_name_length) {
    7861065    return substr($metadata_value, 0, $partition_name_length);
     
    8341113    $classifier_node->{'mdtype'} = $metadata_group;
    8351114
     1115    my $sort_type = $self->{$metadata_group .".metadata_sort_mode_within_level"};
    8361116    # Create an entry in the vlist for each value
    837     foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
     1117    foreach my $metadata_value ($self->sort_metadata_values_array($sort_type, keys(%{$metadata_value_to_OIDs_hash_ref})))
    8381118    {
    8391119    my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
     
    8561136    else {
    8571137        my $metadata_value_display = $self->get_metadata_value_display($metadata_group, $metadata_value);
    858         # Note that we don't need to convert from unicode-aware strings
    859         # to utf8 here, as that is handled elsewhere in the code
    860         my %child_classifier_node = ( 'Title' => $metadata_value_display, # 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
     1138        my %child_classifier_node = ( 'Title' => $metadata_value_display,
    8611139                      'childtype' => "VList",
    8621140                      'mdtype' => $metadata_group,
    8631141                      'contains' => [] );
    8641142
    865         #@OIDs = $self->sort_leaf_items(\@OIDs);
    8661143        # If there are metadata elements remaining, recursively apply the process
    8671144        if (@metadata_groups > 0) {
     
    9321209    my $self = shift(@_);
    9331210    my @OIDs = @{shift(@_)};
    934 #    my $classifier_node = shift(@_);
    935    
     1211
    9361212    # Sort leaf nodes and add to list
    9371213    my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
    9381214    foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) {
    9391215    my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
     1216    my $sort_type = $self->{$sort_leaf_nodes_usingmetaelem . ".metadata_sort_mode_within_level"};
    9401217    # Force a stable sort (Perl 5.6's sort isn't stable)
    9411218    # !! The [0] bits aren't ideal (multiple metadata values) !!
     
    9431220      if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]}) && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]}))
    9441221      {
    945         $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
     1222          if ($sort_type eq "numeric") {
     1223          $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] <=> $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
     1224          } elsif ($sort_type eq "alphabetic") {
     1225          $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
     1226          } else {
     1227          ncmp($OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0], $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0]);
     1228          }
    9461229      }
    9471230      else
     
    9521235    }
    9531236    if ($self->{'reverse_sort_leaf_nodes'}) {
    954     #print STDERR "reversing\n";
    9551237    return reverse @OIDs;
    9561238    }
     
    9591241
    9601242
    961 
    9621243sub sort_metadata_values_array
    9631244{
    9641245    my $self = shift(@_);
     1246    my ($sort_mode) = shift(@_);
    9651247    my @metadata_values = @_;
    9661248
    967     if ($self->{'unicode_collator'}) {
    968     return $self->{'unicode_collator'}->sort(@metadata_values);
    969     }
    970     else {
    971     return sort { $self->alpha_numeric_cmp($a,$b) }(@metadata_values);
    972     }
    973 }
    974 
     1249    if ($sort_mode eq "unicode") {
     1250    if ($self->{'unicode_collator'}) {
     1251        return $self->{'unicode_collator'}->sort(@metadata_values);
     1252    }
     1253    # the collator wasn't loaded, fall back on default
     1254    $sort_mode = "alphanumeric";
     1255    }
     1256    if ($sort_mode eq "numeric") {
     1257    return sort {$a <=> $b} @metadata_values;
     1258    }
     1259    if ($sort_mode eq "alphabetic") {
     1260    return sort {$a cmp $b} @metadata_values;
     1261    }
     1262    # natural sort
     1263    return nsort(@metadata_values);
     1264}
     1265
     1266# we are not using this any more. Using nsort instead
    9751267# $a and $b args automatically passed in and shouldn't be declared
    9761268sub alpha_numeric_cmp
Note: See TracChangeset for help on using the changeset viewer.