Changeset 33452
- Timestamp:
- 2019-09-03T13:15:48+12:00 (5 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/classify/List.pm
r31767 r33452 9 9 # University of Waikato, New Zealand. 10 10 # 11 # Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ12 11 # 13 12 # Copyright (C) 2005 New Zealand Digital Library Project … … 37 36 38 37 use BaseClassifier; 39 38 use Sort::Naturally; 40 39 use strict; 41 40 … … 45 44 } 46 45 47 my $meta_select_type_list = 46 47 my $metadata_selection_mode_list = 48 48 [ 49 49 { 'name' => "firstvalue", … … 53 53 { 'name' => "allvalues", 54 54 'desc' => "{List.metadata_selection.allvalues}"} ]; 55 my $valid_meta_select_types = { 'firstvalue' => 1, 55 56 my $metadata_selection_mode_default = "firstvalidmetadata"; 57 58 my $valid_metadata_selection_modes = { 'firstvalue' => 1, 56 59 'firstvalidmetadata' => 1, 57 60 'allvalues' => 1 }; 61 62 my $metadata_sort_mode_list = 63 [ 64 { 'name' => "unicode", 65 'desc' => "{List.metadata_sort.unicode}"}, 66 { 'name' => "alphabetic", 67 'desc' => "{List.metadata_sort.alphabetic}"}, 68 { 'name' => "alphanumeric", 69 'desc' => "{List.metadata_sort.alphanumeric}"} ]; 70 71 my $metadata_sort_mode_default = "alphanumeric"; 72 73 my $valid_metadata_sort_modes = { 'unicode' => 1, 74 'alphabetic' => 1, 75 'alphanumeric' => 1}; 58 76 my $partition_type_list = 59 77 [ { 'name' => "per_letter", … … 66 84 'desc' => "{List.level_partition.none}" } ]; 67 85 68 # following used to check types later on 86 my $partition_type_default = "per_letter"; 87 69 88 my $valid_partition_types = { 'per_letter' => 1, 70 89 'constant_size' => 1, 71 'per_letter_fixed_size' => 1,72 90 'approximate_size' => 1, 73 91 'none' => 1}; 92 93 my $partition_size_default = 30; 94 95 my $numeric_partition_type_list = 96 [ { 'name' => "per_digit", 97 'desc' => "{List.level_partition.per_digit}" }, 98 { 'name' => "per_number", 99 'desc' => "{List.level_partition.per_number}" }, 100 { 'name' => "single_partition", 101 'desc' => "{List.level_partition_single}" }, 102 { 'name' => "approximate_size", 103 'desc' => "{List.level_partition.approximate_size_numeric}"}, 104 { 'name' => "constant_size", 105 'desc' => "{List.level_partition.constant_size}" }, 106 { 'name' => "none", 107 'desc' => "{List.level_partition.none}" } ]; 108 109 my $numeric_partition_type_default = "single_partition"; 110 111 my $valid_numeric_partition_types = { 'per_digit' => 1, 112 'per_number' => 1, 113 'constant_size' => 1, 114 'single_partition' => 1, 115 'approximate_size' => 1, 116 'none' =>1 }; 117 118 my $numeric_partition_size_default = 30; 119 120 my $numeric_partition_name_length_default = "-1"; # use the full number 74 121 75 122 my $bookshelf_type_list = … … 81 128 'desc' => "{List.bookshelf_type.never}" } ]; 82 129 130 my $bookshelf_type_default = "never"; 131 my $sort_leaf_nodes_using_default = "Title"; 83 132 my $arguments = 84 133 [ { 'name' => "metadata", … … 86 135 'type' => "metadata", 87 136 'reqd' => "yes" }, 88 { 'name' => "metadata_selection_mode", 137 138 { 'name' => "metadata_selection_mode_within_level", 89 139 'desc' => "{List.metadata_selection_mode}", 90 140 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/') 91 'list' => $meta_select_type_list, 92 'deft' => "firstvalidmetadata" }, 93 # The interesting options 141 'list' => $metadata_selection_mode_list, 142 'deft' => $metadata_selection_mode_default }, 143 144 { 'name' => "metadata_sort_mode_within_level", 145 'desc' => "{List.metadata_sort_mode}", 146 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/') 147 'list' => $metadata_sort_mode_list, 148 'deft' => $metadata_sort_mode_default }, 149 94 150 { 'name' => "bookshelf_type", 95 151 'desc' => "{List.bookshelf_type}", 96 152 'type' => "enum", 97 153 'list' => $bookshelf_type_list, 98 'deft' => "never" }, 154 'deft' => $bookshelf_type_default }, 155 99 156 { 'name' => "classify_sections", 100 157 'desc' => "{List.classify_sections}", 101 158 'type' => "flag" }, 159 102 160 { 'name' => "partition_type_within_level", 103 161 'desc' => "{List.partition_type_within_level}", 104 162 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/') 105 163 'list' => $partition_type_list, 106 'deft' => "per_letter" }, 164 'deft' => $partition_type_default }, 165 107 166 { 'name' => "partition_size_within_level", 108 167 'desc' => "{List.partition_size_within_level}", 109 'type' => "string" }, # Must be string because multiple values can be specified (separated by '/') 168 'type' => "string", # Must be string because multiple values can be specified (separated by '/') 169 'deft' => $partition_size_default}, 170 110 171 { 'name' => "partition_name_length", 111 172 'desc' => "{List.partition_name_length}", 112 173 'type' => "string" }, 174 175 {'name' => "partition_sort_mode_within_level", 176 'desc' => "{List.partition_sort_mode_within_level}", 177 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/') 178 'list' => $metadata_sort_mode_list, 179 'deft' => $metadata_sort_mode_default }, 180 181 { 'name' => "numeric_partition_type_within_level", 182 'desc' => "{List.numeric_partition_type_within_level}", 183 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/') 184 'list' => $numeric_partition_type_list, 185 'deft' => $numeric_partition_type_default }, 186 187 {'name' => "numeric_partition_sort_mode_within_level", 188 'desc' => "{List.numeric_partition_sort_mode_within_level}", 189 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/') 190 'list' => $metadata_sort_mode_list, 191 'deft' => $metadata_sort_mode_default }, 192 193 { 'name' => "numeric_partition_size_within_level", 194 'desc' => "{List.numeric_partition_size_within_level}", 195 'type' => "string", # Must be string because multiple values can be specified (separated by '/') 196 'deft' => $numeric_partition_size_default}, 197 198 { 'name' => "numeric_partition_name_length_within_level", 199 'desc' => "{List.numeric_partition_name_length_within_level}", 200 'type' => "string", 201 'deft' => $numeric_partition_name_length_default }, 202 203 { 'name' => "numbers_first", 204 'desc' => "{List.numbers_first", 205 'type' => 'flag'}, 206 113 207 { 'name' => "sort_leaf_nodes_using", 114 208 'desc' => "{List.sort_leaf_nodes_using}", 115 209 'type' => "metadata", 116 'deft' => "Title" }, 210 'deft' => $sort_leaf_nodes_using_default }, 211 212 { 'name' => "sort_leaf_nodes_sort_mode", 213 'desc' => "{List.sort_leaf_nodes_sort_mode}", 214 'type' => "enum", 215 'list' => $metadata_sort_mode_list, 216 'deft' => $metadata_sort_mode_default }, 217 117 218 { 'name' => "reverse_sort_leaf_nodes", 118 219 'desc' => "{List.reverse_sort_leaf_nodes}", 119 220 'type' => "flag"}, 221 120 222 { 'name' => "sort_using_unicode_collation", 121 'desc' => "{List. sort_using_unicode_collation}",223 'desc' => "{List.metadata_sort.unicode} {List.sort_using_unicode_collation}", 122 224 'type' => "flag" }, 225 123 226 { 'name' => "use_hlist_for", 124 227 'desc' => "{List.use_hlist_for}", 125 228 'type' => "string" }, 229 126 230 {'name' => "filter_metadata", 127 231 'desc' => "{List.filter_metadata}", 128 232 'type' => "metadata"}, 233 129 234 {'name' => "filter_regex", 130 235 'desc' => "{List.filter_regex}", 131 236 'type' => "regexp"}, 237 132 238 { 'name' => "standardize_capitalization", 133 239 'desc' => "{List.standardize_capitalization}", 134 240 'type' => "flag"}, 241 135 242 { 'name' => "removeprefix", 136 243 'desc' => "{BasClas.removeprefix}", 137 244 'type' => "regexp" }, 245 138 246 { 'name' => "removesuffix", 139 247 'desc' => "{BasClas.removesuffix}", … … 158 266 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists); 159 267 268 bless $self, $class; 269 160 270 if ($self->{'info_only'}) { 161 271 # don't worry about any options etc 162 return bless $self, $class;272 return $self; 163 273 } 164 274 … … 179 289 180 290 # meta selection mode for each level 181 if (!$self->{'metadata_selection_mode'}) { 182 foreach my $metadata_group (@metadata_groups) { 183 $self->{$metadata_group . ".metadata_selection_mode"} = "firstvalidmetadata"; 184 } 291 $self->set_metadata_groups_info_per_level("metadata_selection_mode", $metadata_selection_mode_default, $valid_metadata_selection_modes); 292 293 # meta sort mode for each level 294 if ($self->{'sort_using_unicode_collation'}) { 295 print STDERR "WARNING: sort_using_unicode_collation is set, setting metadata_sort_mode_within_level to unicode for all levels, regardless of current setting\n"; 296 $self->{'metadata_sort_mode_within_level'} = "unicode"; 297 $metadata_sort_mode_default = "unicode"; 185 298 } else { 186 my @metadata_selection_mode_list = split(/\//, $self->{'metadata_selection_mode'}); 187 foreach my $metadata_group (@metadata_groups) { 188 my $meta_select_mode = shift(@metadata_selection_mode_list); 189 if (defined($meta_select_mode) && defined $valid_meta_select_types->{$meta_select_mode}) { 190 $self->{$metadata_group . ".metadata_selection_mode"} = $meta_select_mode; 191 } else { 192 $self->{$metadata_group . ".metadata_selection_mode"} = "firstvalidmetadata"; 193 } 194 } 195 } 299 if ($self->{'metadata_sort_mode_within_level'} =~ /unicode/) { 300 $self->{'sort_using_unicode_collation'} = 1; 301 } 302 } 303 304 305 $self->set_metadata_groups_info_per_level('metadata_sort_mode_within_level', $metadata_sort_mode_default, $valid_metadata_sort_modes); 306 196 307 # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last) 197 308 foreach my $metadata_group (@metadata_groups) { … … 210 321 } 211 322 212 # How the items are grouped into partitions (default: no partition)323 # How the items are grouped into partitions (default: by letter) 213 324 # for each level (metadata group), separated by '/' 214 if (!$self->{"partition_type_within_level"}) { 215 foreach my $metadata_group (@metadata_groups) { 216 $self->{$metadata_group . ".partition_type_within_level"} = "none"; 217 } 218 } else { 219 my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'}); 220 221 my $first = 1; 222 foreach my $metadata_group (@metadata_groups) { 223 my $partition_type_within_levelelem = shift(@partition_type_within_levellist); 224 if (defined($partition_type_within_levelelem) && $partition_type_within_levelelem eq "per_letter_fixed_size") { 225 print STDERR "per letter fixed size, changing to approximate size\n"; 226 $partition_type_within_levelelem = "approximate_size"; 227 } 228 if (defined($partition_type_within_levelelem) && defined $valid_partition_types->{$partition_type_within_levelelem}) { 229 $self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem; 230 } 231 else { 232 if ($first) { 233 $self->{$metadata_group . ".partition_type_within_level"} = "none"; 234 $first = 0; 235 } else { 236 $self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"}; 237 } 238 if (defined($partition_type_within_levelelem)) { 239 # ie invalid entry 240 print STDERR "invalid partition type for level $metadata_group: $partition_type_within_levelelem, defaulting to ". $self->{$metadata_group . ".partition_type_within_level"} ."\n"; 241 } 242 } 243 } 244 } 245 246 # The number of items in each partition 247 if (!$self->{'partition_size_within_level'}) { 248 # Default: 20 249 foreach my $metadata_group (@metadata_groups) { 250 $self->{$metadata_group . ".partition_size_within_level"} = 20; 251 } 252 } 253 else { 254 my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'}); 255 256 # Assign values based on the partition_size_within_level parameter 257 foreach my $metadata_group (@metadata_groups) { 258 my $partition_size_within_levelelem = shift(@partition_size_within_levellist); 259 if (defined($partition_size_within_levelelem)) { 260 $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem; 261 } 262 else { 263 $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"}; 264 } 265 } 266 } 267 325 326 $self->set_metadata_groups_info_per_level("partition_type_within_level", $partition_type_default, $valid_partition_types); 327 328 $self->set_metadata_groups_info_per_level("numeric_partition_type_within_level", $numeric_partition_type_default, $valid_numeric_partition_types); 329 330 $self->set_metadata_groups_info_per_level("partition_size_within_level", $partition_size_default); 331 $self->set_metadata_groups_info_per_level("numeric_partition_size_within_level", $numeric_partition_size_default); 332 333 $self->set_metadata_groups_info_per_level('partition_sort_mode_within_level', $metadata_sort_mode_default, $valid_metadata_sort_modes); 334 335 $self->set_metadata_groups_info_per_level('numeric_partition_sort_mode_within_level', $metadata_sort_mode_default, $valid_metadata_sort_modes); 336 337 $self->set_metadata_groups_info_per_level("numeric_partition_name_length_within_level", $numeric_partition_name_length_default); 338 268 339 # The removeprefix and removesuffix expressions 269 340 if ($self->{'removeprefix'}) { … … 303 374 304 375 # The metadata elements to use to sort the leaf nodes (default: Title) 305 my @sort_leaf_nodes_using_metadata_groups = ( "Title" ); 306 if ($self->{'sort_leaf_nodes_using'}) { 307 @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'}); 308 } 376 my @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'}); 309 377 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups; 378 310 379 foreach my $sort_group (@sort_leaf_nodes_using_metadata_groups) { 311 380 # set metadata_select_type, if not already set - might be already set if the same group was used in -metadata 312 381 if (!defined $self->{$sort_group . ".metadata_selection_mode"}) { 313 $self->{$sort_group . ".metadata_selection_mode"} = "firstvalue"; 314 } 315 } 382 $self->{$sort_group . ".metadata_selection_mode"} = $metadata_selection_mode_default; 383 } 384 385 } 386 387 my @leaf_nodes_sort_modes = split (/\|/, $self->{'sort_leaf_nodes_sort_mode'}); 388 foreach my $sort_group (@sort_leaf_nodes_using_metadata_groups) { 389 my $leaf_sort_mode = shift(@leaf_nodes_sort_modes); 390 if (!defined $self->{$sort_group . ".metadata_sort_mode_within_level"}) { 391 if (defined $leaf_sort_mode && defined $valid_metadata_sort_modes->{$leaf_sort_mode}) { 392 $self->{$sort_group . ".metadata_sort_mode_within_level"} = $leaf_sort_mode; 393 } 394 else { 395 $self->{$sort_group . ".metadata_sort_mode_within_level"} = $metadata_sort_mode_default; 396 } 397 } 398 399 } 400 316 401 # Create an instance of the Unicode::Collate object if better Unicode sorting is desired 317 402 if ($self->{'sort_using_unicode_collation'}) { … … 327 412 # A hash for all the doc ids that we have seen, so we don't classify something twice 328 413 $self->{'all_doc_OIDs'} = {}; 329 return bless $self, $class; 414 #return bless $self, $class; 415 return $self; 330 416 } 331 417 … … 336 422 } 337 423 424 sub set_metadata_groups_info_per_level 425 { 426 my $self = shift(@_); 427 my $info_name = shift(@_); 428 my $info_default = shift(@_); 429 my $info_valid_types_hash_ref = shift(@_); 430 431 my @info_list = split(/\//, $self->{$info_name}); 432 433 my $first = 1; 434 foreach my $metadata_group (@{$self->{'metadata_groups'}}) { 435 my $info_elem = shift(@info_list); 436 if (defined ($info_elem) && (!defined $info_valid_types_hash_ref || defined $info_valid_types_hash_ref->{$info_elem})) { 437 $self->{$metadata_group .".$info_name"} = $info_elem; 438 } else { 439 # its empty or an invalid entry 440 my $new_info_elem; 441 if ($first) { 442 $new_info_elem = $info_default; 443 } else { 444 # get the value we had at first 445 $new_info_elem = $self->{@{$self->{'metadata_groups'}}[0] . ".$info_name"}; 446 } 447 $self->{$metadata_group .".$info_name"} = $new_info_elem; 448 if (defined $info_elem) { 449 print STDERR "List Error: $info_elem is not a valid value for $info_name, changing it to $new_info_elem\n"; 450 451 } 452 } 453 $first = 0; 454 } 455 456 } 338 457 339 458 # Called for each document in the collection … … 347 466 return; 348 467 } 468 349 469 $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()} = 1; 470 350 471 # check against filter here 351 472 if ($self->{'filter_metadata'}) { 352 #print STDERR "filtering documents on $self->{'filter_metadata'}\n";353 473 my $meta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'filter_metadata'}); 354 474 return unless defined $meta; 355 475 if ($self->{'filter_regex'} ne "" && $meta !~ /$self->{'filter_regex'}/) { 356 #print STDERR "doesn't match regex\n";476 print STDERR "doc $doc_obj doesn't pass filtering requirement\n" if ($self->{'verbosity'} > 3); 357 477 return; 358 359 478 } 360 479 } … … 381 500 382 501 my @metadata_groups = @{$self->{'metadata_groups'}}; 383 384 502 385 503 # Only classify the section if it has a value for one of the metadata elements in the first group 386 504 my $classify_section = 0; … … 391 509 my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element); 392 510 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element); 393 394 511 # Remove prefix/suffix if requested 395 512 if (defined ($first_metadata_group_element_value)) { … … 402 519 } 403 520 } 521 522 $first_metadata_group_element_value = &sorttools::format_metadata_for_sorting($first_metadata_group, $first_metadata_group_element_value, $doc_obj) unless $self->{'no_metadata_formatting'}; 404 523 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") { 405 # This section must be included in the classifier 524 # This section must be included in the classifier as we have found a value 406 525 $classify_section = 1; 407 526 last; 408 527 } 409 528 } 410 529 411 530 # We're not classifying this section because it doesn't have the required metadata 412 531 return if (!$classify_section); … … 428 547 my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)}; 429 548 foreach my $metadata_value (@metadata_values) { 549 #print STDERR "working with value $metadata_value\n"; 430 550 # Strip leading and trailing whitespace 431 551 $metadata_value =~ s/^\s*//; … … 444 564 # text-transform if stored uppercase). 2 CSS text-transforms have been added to core.css 445 565 my $lc_metadata_value = lc($metadata_value); 446 447 # We are already working with unicode aware strings at this 448 # stage, so we no longer need to convert from utf8 to unicode 449 #my $metadata_value_unicode_string = $metadata_value; # $self->convert_utf8_string_to_unicode_string($metadata_value); 450 451 # Add the metadata value into the list for this combination of metadata group and section 566 $lc_metadata_value = &sorttools::format_metadata_for_sorting($real_metadata_element, $lc_metadata_value, $doc_obj) unless $self->{'no_metadata_formatting'}; 567 #print STDERR "formatted value = $lc_metadata_value\n"; 568 569 # Add the metadata value into the list for this combination of metadata group 570 # and section - if we have some non-whitespace chars 452 571 # text that we have some non-whitespace chars 453 572 if ($lc_metadata_value =~ /\S/) { … … 455 574 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $lc_metadata_value); 456 575 457 458 # add the actual value into the stored values so we can remember the case459 if (!$self->{'standardize_capitalization'}) {460 if (defined $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}) {461 $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}++;462 } else {463 $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value} = 1;576 # add the actual value into the stored values so we can remember the case 577 if (!$self->{'standardize_capitalization'}) { 578 if (defined $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}) { 579 $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}++; 580 } else { 581 $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value} = 1; 582 } 464 583 } 465 }466 584 last if ($self->{$metadata_group . ".metadata_selection_mode"} eq "firstvalue"); 467 585 } … … 508 626 509 627 my $metadata_group = $metadata_groups[0]; 510 511 628 if (!defined($self->{$metadata_group . ".list"})) { 512 629 print STDERR "Warning: No metadata values assigned to $metadata_group.\n"; … … 516 633 # Create a mapping from metadata value to OID 517 634 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"}; 518 my %metadata_value_to_OIDs_hash = (); 635 my %metadata_value_to_OIDs_hash = (); 636 my %numeric_metadata_value_to_OIDs_hash = (); 519 637 foreach my $OID (@OIDs) 520 638 { … … 524 642 foreach my $metadata_value (@metadata_values) 525 643 { 526 push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID); 644 if ($metadata_value =~ /^[0-9]/) { 645 push(@{$numeric_metadata_value_to_OIDs_hash{$metadata_value}}, $OID); 646 } else { 647 push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID); 648 } 527 649 } 528 650 } 529 651 } 530 652 #print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n"; 653 #print STDERR "Number of distinct numeric values: " . scalar(keys %numeric_metadata_value_to_OIDs_hash) . "\n"; 531 654 532 655 # Partition the values (if necessary) 533 656 my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"}; 534 657 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"}; 535 if ($partition_type_within_level =~ /^per_letter$/i) { 536 # Generate one hlist for each letter 537 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash)); 538 my %metadata_value_to_OIDs_subhash = (); 539 540 my $lastpartition = substr($sortedmetadata_values[0], 0, 1); 541 foreach my $metadata_value (@sortedmetadata_values) { 542 my $metadata_valuepartition = substr($metadata_value, 0, 1); 543 544 # Is this the start of a new partition? 545 if ($metadata_valuepartition ne $lastpartition) { 546 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash); 547 %metadata_value_to_OIDs_subhash = (); 548 $lastpartition = $metadata_valuepartition; 549 } 550 551 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value}; 552 } 553 554 # Don't forget to add the last partition 555 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash); 556 557 # The partitions are stored in an HList 558 $classifier_node->{'childtype'} = "HList"; 559 } 560 elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) { 561 # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size 562 # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible 563 # If a letter has more items than specified, split into several hlists. 564 # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise) 565 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"}; 566 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash)); 567 my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"}; 658 my $partition_sort_mode_within_level = $self->{$metadata_group . ".partition_sort_mode_within_level"}; 659 my $bookshelf_type_within_level = $self->{$metadata_group. ".bookshelf_type"}; 660 661 662 ############################################# 663 ### DO THE NUMBERS IF THEY ARE TO COME FIRST 664 ############################################# 665 if ($self->{'numbers_first'} && keys(%numeric_metadata_value_to_OIDs_hash)) { 666 $self->partition_numeric_values(\@metadata_groups, $classifier_node, \%numeric_metadata_value_to_OIDs_hash); 667 } 668 669 ############################################ 670 # DO THE LETTERS 671 ############################################ 672 if (keys(%metadata_value_to_OIDs_hash)){ # make sure we have some values 673 if ($partition_type_within_level =~ /^per_letter$/i) { 674 $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $partition_sort_mode_within_level, \%metadata_value_to_OIDs_hash); 675 } 676 elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) { 677 $self->split_approximate_size(\@metadata_groups, $classifier_node, $partition_size_within_level, $partition_sort_mode_within_level, $bookshelf_type_within_level, \%metadata_value_to_OIDs_hash, $self->{'partition_name_length'}); 678 679 } 680 elsif ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) { 681 $self->split_constant_size(\@metadata_groups, $classifier_node, $partition_size_within_level, $partition_sort_mode_within_level, \%metadata_value_to_OIDs_hash, $self->{'partition_name_length'}); 682 } 683 684 # Otherwise just add all the values to a VList 685 else { 686 $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash); 687 } 688 } 689 690 ########################################### 691 ### DO THE NUMBERS IF THEY ARE TO COME LAST 692 ########################################### 693 if (!$self->{'numbers_first'} && keys(%numeric_metadata_value_to_OIDs_hash)) { 694 $self->partition_numeric_values(\@metadata_groups, $classifier_node, \%numeric_metadata_value_to_OIDs_hash); 695 } 696 } 697 698 sub partition_numeric_values 699 { 700 my $self = shift(@_); 701 my @metadata_groups = @{shift(@_)}; 702 my $classifier_node = shift(@_); 703 my $numeric_metadata_value_to_OIDs_hash_ref = shift(@_); 704 705 my $metadata_group = $metadata_groups[0]; 706 my $numeric_partition_type_within_level = $self->{$metadata_group . ".numeric_partition_type_within_level"}; 707 my $numeric_partition_size_within_level = $self->{$metadata_group . ".numeric_partition_size_within_level"}; 708 my $numeric_partition_sort_mode_within_level = $self->{$metadata_group . ".numeric_partition_sort_mode_within_level"}; 709 my $numeric_partition_name_length_within_level = $self->{$metadata_group . ".numeric_partition_name_length_within_level"}; 710 my $bookshelf_type_within_level = $self->{$metadata_group. ".bookshelf_type"}; 711 712 if ($numeric_partition_type_within_level eq "single_partition") { 713 $self->add_hlist_partition(\@metadata_groups, $classifier_node, "0-9", $numeric_metadata_value_to_OIDs_hash_ref); 714 } 715 elsif ($numeric_partition_type_within_level eq "per_digit") { 716 $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $numeric_partition_sort_mode_within_level, $numeric_metadata_value_to_OIDs_hash_ref, 1, 1); 717 } 718 elsif ($numeric_partition_type_within_level eq "per_number") { 719 # each different number is a bucket 720 $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $numeric_partition_sort_mode_within_level, $numeric_metadata_value_to_OIDs_hash_ref, 1, $numeric_partition_name_length_within_level); 721 } 722 elsif ($numeric_partition_type_within_level eq "constant_size" && scalar(keys %$numeric_metadata_value_to_OIDs_hash_ref) > $numeric_partition_size_within_level) { 723 # Generate hlists of a certain size 568 724 569 # Separate values by their first letter, each form a bucket, like the per_letter partition type 570 my $last_partition = substr($sortedmetadata_values[0], 0, 1); 571 my %partition_buckets = (); 572 my @metadata_values_in_bucket = (); 573 my $num_items_in_bucket = 0; 574 foreach my $metadata_value (@sortedmetadata_values) { 575 my $metadata_valuepartition = substr($metadata_value, 0, 1); 576 if ($metadata_valuepartition ne $last_partition) { 577 my @temp_array = @metadata_values_in_bucket; 578 # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values 579 my %partition_info = (); 580 $partition_info{'metadata_values'} = \@temp_array; 581 $partition_info{'size'} = $num_items_in_bucket; 582 $partition_buckets{$last_partition} = \%partition_info; 725 $self->split_constant_size(\@metadata_groups, $classifier_node, $numeric_partition_size_within_level, $numeric_partition_sort_mode_within_level, $numeric_metadata_value_to_OIDs_hash_ref, $numeric_partition_name_length_within_level, 1); 726 } elsif ($numeric_partition_type_within_level eq "approximate_size" && scalar(keys %$numeric_metadata_value_to_OIDs_hash_ref) > $numeric_partition_size_within_level) { 727 $self->split_approximate_size(\@metadata_groups, $classifier_node, $numeric_partition_size_within_level, $numeric_partition_sort_mode_within_level, $bookshelf_type_within_level, $numeric_metadata_value_to_OIDs_hash_ref, $numeric_partition_name_length_within_level, 1); 728 } 729 # Otherwise just add all the values to a VList 730 else { 731 $self->add_vlist(\@metadata_groups, $classifier_node, $numeric_metadata_value_to_OIDs_hash_ref); 732 } 733 734 } 735 736 sub split_approximate_size 737 { 738 739 my $self = shift(@_); 740 my @metadata_groups = @{shift(@_)}; 741 my $classifier_node = shift(@_); 742 my $partition_size = shift(@_); 743 my $sort_mode = shift(@_); 744 my $bookshelf_type = shift(@_); 745 my $metadata_value_to_OIDs_hash_ref = shift(@_); 746 my $partition_name_length = shift(@_); 747 my $is_numeric = shift(@_); 748 749 # Generate hlist based on the first letter of the metadata value (like per_letter), or based on 750 # numbers, but also with restriction on the partition size 751 # If a partition has fewer items than specified by the "partition_size_within_level", then group them together if possible 752 # If a partition has more items than specified, split into several hlists. 753 # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise) 754 755 my @sortedmetadata_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref)); 756 757 # Separate values by their first letter, each form a bucket, like the per_letter partition type 758 my $last_partition = $self->generate_partition_name($sortedmetadata_values[0], $partition_name_length, $is_numeric); 759 760 my @partition_buckets = (); 761 my @metadata_values_in_bucket = (); 762 my $num_items_in_bucket = 0; 763 764 foreach my $metadata_value (@sortedmetadata_values) { 765 my $metadata_valuepartition = $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric); 766 if ($metadata_valuepartition ne $last_partition) { 767 my @temp_array = @metadata_values_in_bucket; 768 # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values 769 my %partition_info = (); 770 $partition_info{'metadata_values'} = \@temp_array; 771 $partition_info{'size'} = $num_items_in_bucket; 772 $partition_info{'name'} = $last_partition; 773 push (@partition_buckets, \%partition_info); 774 775 @metadata_values_in_bucket = ($metadata_value); 776 $num_items_in_bucket = ($bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : scalar(@metadata_values_in_bucket)); 777 $last_partition = $metadata_valuepartition; 778 } else { 779 $num_items_in_bucket += ($bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1); #scalar(@metadata_values_in_bucket); 780 push (@metadata_values_in_bucket, $metadata_value); 781 } 782 } 783 # Last one 784 my %partition_info = (); 785 $partition_info{'metadata_values'} = \@metadata_values_in_bucket; 786 $partition_info{'size'} = $num_items_in_bucket; 787 $partition_info{'name'} = $last_partition; 788 789 push (@partition_buckets, \%partition_info); 790 791 # now go through the array of buckets, and merge small buckets 792 my @new_partition_buckets = (); 793 for (my $i = 0; $i < scalar(@partition_buckets) - 1; $i++) { 794 795 my $this_bucket = $partition_buckets[$i]; #->{'name'}; 796 my $next_bucket = $partition_buckets[$i+1]; #->{'name'}; 797 798 my $items_in_partition = $this_bucket->{'size'}; 799 800 if ($items_in_partition < $partition_size ) { 801 my $items_in_next_partition = $next_bucket->{'size'}; 802 if ($items_in_partition + $items_in_next_partition <= $partition_size ) { 803 # merge this bucket into the next bucket 804 foreach my $metadata_value_to_merge (@{$this_bucket->{'metadata_values'}}) { 805 push(@{$next_bucket->{'metadata_values'}}, $metadata_value_to_merge); 806 } 807 $next_bucket->{'size'} += $items_in_partition; 583 808 584 @metadata_values_in_bucket = ($metadata_value);585 $num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);586 $last_partition = $metadata_valuepartition;587 809 } else { 588 $num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket); 589 push (@metadata_values_in_bucket, $metadata_value); 590 } 591 } 592 # Last one 593 my %partition_info = (); 594 $partition_info{'metadata_values'} = \@metadata_values_in_bucket; 595 $partition_info{'size'} = $num_items_in_bucket; 596 $partition_buckets{$last_partition} = \%partition_info; 810 # remember this bucket 811 push (@new_partition_buckets, $this_bucket); 812 } 813 } else { 814 # remember this bucket 815 push (@new_partition_buckets, $this_bucket); 816 } 817 } 818 # add in the last bucket 819 my $last_bucket = $partition_buckets[scalar(@partition_buckets) - 1]; 820 push (@new_partition_buckets, $last_bucket); 821 822 # Add partitions to the main list, but divide big bucket into several 823 my $last_partition_end = ""; 824 my $partition_start = ""; 825 foreach my $partition (@new_partition_buckets) { 826 my @metadata_values = $self->sort_metadata_values_array($sort_mode, @{$partition->{'metadata_values'}}); 827 my $items_in_partition = $partition->{'size'}; 828 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $partition_name_length, $is_numeric); 829 830 if ($items_in_partition > $partition_size) { 831 my $items_done = 0; 832 my %metadata_values_to_OIDs_subhashes = (); 833 for (my $i = 0; $i < scalar(@metadata_values); $i++) { 834 my $metadata_value = $metadata_values[$i]; 835 # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values 836 my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1; 597 837 598 my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets)); 599 for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) { 600 my $partition = $partition_keys[$i]; 601 my $items_in_partition = $partition_buckets{$partition}->{'size'}; 602 # Merge small buckets together, but keep the numeric bucket apart 603 if ($items_in_partition < $partition_size_within_level) { 604 my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'}; 605 if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level 606 && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/) 607 || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) { 608 foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) { 609 push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge); 610 } 611 $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition; 612 delete $partition_buckets{$partition}; 613 } 614 } 615 } 616 @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets)); 617 618 # Add partitions, and divide big bucket into several 619 my $last_partition_end = ""; 620 my $partition_start = ""; 621 foreach my $partition (@partition_keys) { 622 my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}}); 623 my $items_in_partition = $partition_buckets{$partition}->{'size'}; 624 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"}); 625 626 if ($items_in_partition > $partition_size_within_level) { 627 my $items_done = 0; 628 my %metadata_values_to_OIDs_subhashes = (); 629 for (my $i = 0; $i < scalar(@metadata_values); $i++) { 630 my $metadata_value = $metadata_values[$i]; 631 # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values 632 my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1; 633 634 my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"}); 635 my $partitionname = $partition_start; 636 if ($partitionend ne $partition_start) { 637 $partitionname = $partitionname . "-" . $partitionend; 638 } 639 640 # Start a new partition 641 if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) { 642 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes); 643 $last_partition_end = $partitionend; 644 $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"}); 645 $items_done = 0; 646 %metadata_values_to_OIDs_subhashes = (); 647 } 648 649 # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions 650 if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) { 651 my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"}); 652 # Get the number of partitions needed for this value 653 my $num_splits = int($items_for_this_md_value / $partition_size_within_level); 654 $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits); 655 656 my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}}; 657 for (my $i = 0; $i < $num_splits; $i++) { 658 my %OIDs_subhashes_for_this_value = (); 659 my @OIDs_for_this_partition = (); 660 for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) { 661 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]); 662 } 663 664 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values 665 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) { 666 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition; 667 $items_done += scalar(@OIDs_for_this_partition); 668 next; 669 } 670 671 # Add an HList for this bucket 672 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition; 673 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value); 674 $last_partition_end = $partitionname_for_this_value; 675 } 676 next; 677 } 678 679 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value}; 680 $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1; 681 682 # The last partition 683 if($i == scalar(@metadata_values) - 1) { 684 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes); 685 } 686 } 687 } 688 else { 689 # The easier case, just add a partition 690 my %metadata_values_to_OIDs_subhashes = (); 691 for (my $i = 0; $i < scalar(@metadata_values); $i++) { 692 my $metadata_value = $metadata_values[$i]; 693 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value}; 694 } 695 my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1]; 696 my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"}); 838 my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $partition_name_length, $is_numeric); 697 839 my $partitionname = $partition_start; 698 840 if ($partitionend ne $partition_start) { 699 841 $partitionname = $partitionname . "-" . $partitionend; 700 842 } 701 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes); 702 $last_partition_end = $partitionend; 703 } 704 } 843 844 if ($items_done + $items_for_this_md_value > $partition_size && $items_done != 0) { 845 # Start a new partition 846 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes); 847 $last_partition_end = $partitionend; 848 $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $is_numeric); 849 $items_done = 0; 850 %metadata_values_to_OIDs_subhashes = (); 851 } 852 853 # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions 854 if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size) { 855 856 my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $is_numeric); 857 # Get the number of partitions needed for this value 858 my $num_splits = int($items_for_this_md_value / $partition_size); 859 $num_splits++ if ($items_for_this_md_value / $partition_size > $num_splits); 860 861 my @OIDs_for_this_value = $metadata_value_to_OIDs_hash_ref->{$metadata_value}; 862 for (my $i = 0; $i < $num_splits; $i++) { 863 my %OIDs_subhashes_for_this_value = (); 864 my @OIDs_for_this_partition = (); 865 for (my $d = $i * $partition_size; $d < (($i+1) * $partition_size > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size); $d++) { 866 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]); 867 } 868 869 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values 870 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size) { 871 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition; 872 $items_done += scalar(@OIDs_for_this_partition); 873 next; 874 } 875 876 # Add an HList for this bucket 877 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition; 878 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value); 879 $last_partition_end = $partitionname_for_this_value; 880 } 881 next; 882 } 883 884 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value}; 885 $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1; 886 887 # The last partition 888 if($i == scalar(@metadata_values) - 1) { 889 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes); 890 } 891 } 892 } # end if items in partition > partition size 893 else { 894 # The easier case, just add a partition 895 my %metadata_values_to_OIDs_subhashes = (); 896 for (my $i = 0; $i < scalar(@metadata_values); $i++) { 897 my $metadata_value = $metadata_values[$i]; 898 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value}; 899 } 900 my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1]; 901 my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $partition_name_length, $is_numeric); 902 my $partitionname = $partition_start; 903 if ($partitionend ne $partition_start) { 904 $partitionname = $partitionname . "-" . $partitionend; 905 } 705 906 706 # The partitions are stored in an HList 707 $classifier_node->{'childtype'} = "HList"; 708 709 } # end approximate_size 710 else { 711 # Generate hlists of a certain size 712 if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) { 713 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash)); 714 my $itemsdone = 0; 715 my %metadata_value_to_OIDs_subhash = (); 716 my $lastpartitionend = ""; 717 my $partitionstart; 718 foreach my $metadata_value (@sortedmetadata_values) { 719 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value}; 720 $itemsdone++; 721 my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash); 722 723 # Is this the start of a new partition? 724 if ($itemsinpartition == 1) { 725 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"}); 726 } 727 728 # Is this the end of the partition? 729 if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) { 730 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"}); 731 my $partitionname = $partitionstart; 732 if ($partitionend ne $partitionstart) { 733 $partitionname = $partitionname . "-" . $partitionend; 734 } 735 736 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash); 737 %metadata_value_to_OIDs_subhash = (); 738 $lastpartitionend = $partitionend; 739 } 740 } 741 742 # The partitions are stored in an HList 743 $classifier_node->{'childtype'} = "HList"; 744 } 745 746 # Otherwise just add all the values to a VList 747 else { 748 $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash); 749 } 750 } 751 } 752 907 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes); 908 $last_partition_end = $partitionend; 909 } 910 } 911 912 # The partitions are stored in an HList 913 $classifier_node->{'childtype'} = "HList"; 914 915 } 916 917 sub split_constant_size 918 { 919 my $self = shift(@_); 920 my @metadata_groups = @{shift(@_)}; 921 my $classifier_node = shift(@_); 922 my $partition_size = shift(@_); 923 my $sort_mode = shift(@_); 924 my $metadata_value_to_OIDs_hash_ref = shift(@_); 925 my $partition_name_length = shift(@_); 926 my $is_numeric = shift(@_); 927 928 my @sortedmetadata_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref)); 929 my $itemsdone = 0; 930 my $numitems = scalar(@sortedmetadata_values); 931 my %metadata_value_to_OIDs_subhash = (); 932 my $lastpartitionend = ""; 933 my $partitionstart; 934 foreach my $metadata_value (@sortedmetadata_values) { 935 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value}; 936 $itemsdone++; 937 my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash); 938 939 # Is this the start of a new partition? 940 if ($itemsinpartition == 1) { 941 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $partition_name_length, $is_numeric); 942 } 943 944 # Is this the end of the partition? 945 if ($itemsinpartition == $partition_size || $itemsdone == $numitems) { 946 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $partition_name_length, $is_numeric); 947 my $partitionname = $partitionstart; 948 if ($partitionend ne $partitionstart) { 949 $partitionname = $partitionname . "-" . $partitionend; 950 } 951 952 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash); 953 %metadata_value_to_OIDs_subhash = (); 954 $lastpartitionend = $partitionend; 955 } 956 } 957 958 # The partitions are stored in an HList 959 $classifier_node->{'childtype'} = "HList"; 960 961 } 962 963 sub split_per_letter_or_digit 964 { 965 my $self = shift(@_); 966 my @metadata_groups = @{shift(@_)}; 967 my $classifier_node = shift(@_); 968 my $sort_mode = shift(@_); 969 my $metadata_value_to_OIDs_hash_ref = shift(@_); 970 my $is_numeric = shift(@_); 971 my $numeric_partition_length = shift(@_); 972 973 if (not defined $is_numeric) { 974 $is_numeric = 0; 975 } 976 if ($is_numeric && not defined($numeric_partition_length)) { 977 $numeric_partition_length = 1; 978 } 979 # Generate one hlist for each letter 980 my @sortedmetadata_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref)); 981 my %metadata_value_to_OIDs_subhash = (); 982 983 my $lastpartition = $self->generate_partition_name($sortedmetadata_values[0], $numeric_partition_length, $is_numeric); 984 foreach my $metadata_value (@sortedmetadata_values) { 985 986 my $metadata_valuepartition = $self->generate_partition_name($metadata_value, $numeric_partition_length, $is_numeric); 987 988 # Is this the start of a new partition? 989 if ($metadata_valuepartition ne $lastpartition) { 990 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash); 991 %metadata_value_to_OIDs_subhash = (); 992 $lastpartition = $metadata_valuepartition; 993 } 994 995 $metadata_value_to_OIDs_subhash{$metadata_value} = %$metadata_value_to_OIDs_hash_ref{$metadata_value}; 996 } 997 998 # Don't forget to add the last partition 999 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash); 1000 1001 # The partitions are stored in an HList 1002 $classifier_node->{'childtype'} = "HList"; 1003 1004 } 1005 1006 sub generate_partition_name 1007 { 1008 my $self = shift(@_); 1009 my $mvalue = shift(@_); 1010 my $numeric_partition_length = shift(@_); 1011 my $is_numeric = shift(@_); 1012 1013 if (!$is_numeric || $numeric_partition_length == 1 ) { 1014 return substr($mvalue, 0, 1); 1015 } 1016 if ($numeric_partition_length == -1) { 1017 my ($all_digits) = $mvalue =~ /^([0-9]+)/; 1018 return $all_digits; 1019 } 1020 my ($some_digits) = $mvalue =~ /^([0-9]{1,$numeric_partition_length})/; 1021 return $some_digits; 1022 } 753 1023 754 1024 sub generate_partition_start … … 758 1028 my $lastpartitionend = shift(@_); 759 1029 my $partition_name_length = shift(@_); 760 1030 my $is_numeric = shift(@_); 1031 1032 if ($is_numeric) { 1033 return $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric); 1034 } 1035 761 1036 if ($partition_name_length) { 762 1037 return substr($metadata_value, 0, $partition_name_length); … … 782 1057 my $partitionstart = shift(@_); 783 1058 my $partition_name_length = shift(@_); 784 1059 my $is_numeric = shift(@_); 1060 1061 if ($is_numeric) { 1062 return $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric); 1063 } 785 1064 if ($partition_name_length) { 786 1065 return substr($metadata_value, 0, $partition_name_length); … … 834 1113 $classifier_node->{'mdtype'} = $metadata_group; 835 1114 1115 my $sort_type = $self->{$metadata_group .".metadata_sort_mode_within_level"}; 836 1116 # Create an entry in the vlist for each value 837 foreach my $metadata_value ($self->sort_metadata_values_array( keys(%{$metadata_value_to_OIDs_hash_ref})))1117 foreach my $metadata_value ($self->sort_metadata_values_array($sort_type, keys(%{$metadata_value_to_OIDs_hash_ref}))) 838 1118 { 839 1119 my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}; … … 856 1136 else { 857 1137 my $metadata_value_display = $self->get_metadata_value_display($metadata_group, $metadata_value); 858 # Note that we don't need to convert from unicode-aware strings 859 # to utf8 here, as that is handled elsewhere in the code 860 my %child_classifier_node = ( 'Title' => $metadata_value_display, # 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value), 1138 my %child_classifier_node = ( 'Title' => $metadata_value_display, 861 1139 'childtype' => "VList", 862 1140 'mdtype' => $metadata_group, 863 1141 'contains' => [] ); 864 1142 865 #@OIDs = $self->sort_leaf_items(\@OIDs);866 1143 # If there are metadata elements remaining, recursively apply the process 867 1144 if (@metadata_groups > 0) { … … 932 1209 my $self = shift(@_); 933 1210 my @OIDs = @{shift(@_)}; 934 # my $classifier_node = shift(@_); 935 1211 936 1212 # Sort leaf nodes and add to list 937 1213 my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}}; 938 1214 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) { 939 1215 my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"}; 1216 my $sort_type = $self->{$sort_leaf_nodes_usingmetaelem . ".metadata_sort_mode_within_level"}; 940 1217 # Force a stable sort (Perl 5.6's sort isn't stable) 941 1218 # !! The [0] bits aren't ideal (multiple metadata values) !! … … 943 1220 if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]}) && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]})) 944 1221 { 945 $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0]; 1222 if ($sort_type eq "numeric") { 1223 $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] <=> $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0]; 1224 } elsif ($sort_type eq "alphabetic") { 1225 $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0]; 1226 } else { 1227 ncmp($OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0], $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0]); 1228 } 946 1229 } 947 1230 else … … 952 1235 } 953 1236 if ($self->{'reverse_sort_leaf_nodes'}) { 954 #print STDERR "reversing\n";955 1237 return reverse @OIDs; 956 1238 } … … 959 1241 960 1242 961 962 1243 sub sort_metadata_values_array 963 1244 { 964 1245 my $self = shift(@_); 1246 my ($sort_mode) = shift(@_); 965 1247 my @metadata_values = @_; 966 1248 967 if ($self->{'unicode_collator'}) { 968 return $self->{'unicode_collator'}->sort(@metadata_values); 969 } 970 else { 971 return sort { $self->alpha_numeric_cmp($a,$b) }(@metadata_values); 972 } 973 } 974 1249 if ($sort_mode eq "unicode") { 1250 if ($self->{'unicode_collator'}) { 1251 return $self->{'unicode_collator'}->sort(@metadata_values); 1252 } 1253 # the collator wasn't loaded, fall back on default 1254 $sort_mode = "alphanumeric"; 1255 } 1256 if ($sort_mode eq "numeric") { 1257 return sort {$a <=> $b} @metadata_values; 1258 } 1259 if ($sort_mode eq "alphabetic") { 1260 return sort {$a cmp $b} @metadata_values; 1261 } 1262 # natural sort 1263 return nsort(@metadata_values); 1264 } 1265 1266 # we are not using this any more. Using nsort instead 975 1267 # $a and $b args automatically passed in and shouldn't be declared 976 1268 sub alpha_numeric_cmp
Note:
See TracChangeset
for help on using the changeset viewer.