Changeset 18619
- Timestamp:
- 2009-03-04T16:49:04+13:00 (15 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/perllib/classify/List.pm
r18572 r18619 48 48 'desc' => "{List.level_partition.per_letter}" }, 49 49 { 'name' => "constant_size", 50 'desc' => "{List.level_partition.constant_size}" }, 50 'desc' => "{List.level_partition.constant_size}" }, 51 { 'name' => "per_letter_fixed_size", 52 'desc' => "{List.level_partition.per_letter_fixed_size}"}, 51 53 { 'name' => "none", 52 54 'desc' => "{List.level_partition.none}" } ]; 53 55 56 my $bookshelf_type_list = 57 [ { 'name' => "always", 58 'desc' => "{List.bookshelf_type.always}" }, 59 { 'name' => "duplicate_only", 60 'desc' => "{List.bookshelf_type.duplicate_only}" }, 61 { 'name' => "never", 62 'desc' => "{List.bookshelf_type.never}" } ]; 54 63 55 64 my $arguments = … … 60 69 61 70 # The interesting options 62 { 'name' => "always_bookshelf_last_level", 63 'desc' => "{GenericList.always_bookshelf_last_level}", 64 'type' => "flag" }, 71 { 'name' => "bookshelf_type", 72 'desc' => "{List.bookshelf_types}", 73 'type' => "enum", 74 'list' => $bookshelf_type_list, 75 'deft' => "duplicate_only" }, 65 76 { 'name' => "classify_sections", 66 77 'desc' => "{GenericList.classify_sections}", … … 68 79 { 'name' => "partition_type_within_level", 69 80 'desc' => "{GenericList.partition_type_within_level}", 70 'type' => "enum", 71 'list' => $partition_type_list, 81 'type' => "string", 72 82 'deft' => "none" }, 73 83 { 'name' => "partition_size_within_level", … … 88 98 { 'name' => "use_hlist_for", 89 99 'desc' => "{GenericList.use_hlist_for}", 90 'type' => "string" } ]; 100 'type' => "string" }, 101 { 'name' => "removeprefix", 102 'desc' => "{BasClas.removeprefix}", 103 'type' => "regexp" }, 104 { 'name' => "removesuffix", 105 'desc' => "{BasClas.removesuffix}", 106 'type' => "regexp" } ]; 91 107 92 108 my $options = { 'name' => "List", … … 127 143 } 128 144 129 # Whether to group single items into a bookshelf (must be truefor all metadata fields except the last)145 # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last) 130 146 foreach my $metadata_group (@metadata_groups) { 131 $self->{$metadata_group . ".always_bookshelf"} = "t"; 132 } 133 if (!$self->{'always_bookshelf_last_level'}) { 134 # Default: leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2) 135 my $last_metadata_group = $metadata_groups[$#metadata_groups]; 136 $self->{$last_metadata_group . ".always_bookshelf"} = "f"; 137 } 138 147 $self->{$metadata_group . ".bookshelf_type"} = "always"; 148 } 149 my $last_metadata_group = $metadata_groups[$#metadata_groups]; 150 # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2) 151 $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'}; 152 139 153 # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist) 140 154 foreach my $metadata_group (@metadata_groups) { … … 145 159 } 146 160 147 # How the items are grouped into partitions (default: no partition) 161 # How the items are grouped into partitions (default: no partition) 162 # for each level (metadata group), separated by '/' 148 163 if (!$self->{"partition_type_within_level"}) { 149 $self->{"partition_type_within_level"} = "none"; 164 foreach my $metadata_group (@metadata_groups) { 165 $self->{$metadata_group . ".partition_type_within_level"} = "none"; 166 } 167 } else { 168 my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'}); 169 foreach my $metadata_group (@metadata_groups) { 170 my $partition_type_within_levelelem = shift(@partition_type_within_levellist); 171 if (defined($partition_type_within_levelelem)) { 172 $self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem; 173 } 174 else { 175 $self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"}; 176 } 177 } 150 178 } 151 179 … … 168 196 else { 169 197 $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"}; 198 } 199 } 200 } 201 202 # The removeprefix and removesuffix expressions 203 if ($self->{'removeprefix'}) { 204 # If there are more than one expressions, use '' to quote each experession and '/' to separate 205 my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'}); 206 207 foreach my $metadata_group (@metadata_groups) { 208 my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist); 209 if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") { 210 # Remove the other ' at the beginning and the end if there is any 211 $removeprefix_expr_within_levelelem =~ s/^'//; 212 $removeprefix_expr_within_levelelem =~ s/'$//; 213 # Remove the extra ^ at the beginning 214 $removeprefix_expr_within_levelelem =~ s/^\^//; 215 $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem; 216 } else { 217 $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"}; 218 } 219 } 220 } 221 if ($self->{'removesuffix'}) { 222 my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'}); 223 224 foreach my $metadata_group (@metadata_groups) { 225 my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist); 226 if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") { 227 $removesuffix_expr_within_levelelem =~ s/^'//; 228 $removesuffix_expr_within_levelelem =~ s/'$//; 229 # Remove the extra $ at the end 230 $removesuffix_expr_within_levelelem =~ s/\$$//; 231 $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem; 232 } else { 233 $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"}; 170 234 } 171 235 } … … 233 297 my $first_metadata_group = $metadata_groups[0]; 234 298 foreach my $first_metadata_group_element (split(/\;/, $first_metadata_group)) { 235 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $first_metadata_group_element); 299 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $first_metadata_group_element); 300 301 # Remove prefix/suffix if requested 302 my $remove_prefix_expr = $self->{$first_metadata_group_element . ".remove_prefix_expr"}; 303 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") { 304 $first_metadata_group_element_value =~ s/^$remove_prefix_expr//; 305 } 306 my $remove_suffix_expr = $self->{$first_metadata_group_element . ".remove_suffix_expr"}; 307 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") { 308 $first_metadata_group_element_value =~ s/$remove_suffix_expr$//; 309 } 310 236 311 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") { 237 312 # This section must be included in the classifier 238 313 $classify_section = 1; 239 314 last; 240 } 315 } 241 316 } 242 317 243 318 # We're not classifying this section because it doesn't have the required metadata 244 319 return if (!$classify_section); 245 246 320 247 321 if (($edit_mode eq "delete") || ($edit_mode eq "update")) { … … 260 334 # Take care not to do a metadata group more than once 261 335 unless ($metadata_groups_done{$metadata_group}) { 262 foreach my $metadata_element (split(/\;/, $metadata_group)) { 336 foreach my $metadata_element (split(/\;/, $metadata_group)) { 337 my $remove_prefix_expr = $self->{$metadata_element . ".remove_prefix_expr"}; 338 my $remove_suffix_expr = $self->{$metadata_element . ".remove_suffix_expr"}; 263 339 my @metadata_values = @{$doc_obj->get_metadata($section, $metadata_element)}; 264 340 foreach my $metadata_value (@metadata_values) { … … 266 342 $metadata_value =~ s/^\s*//; 267 343 $metadata_value =~ s/\s*$//; 344 345 # Remove prefix/suffix if requested 346 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") { 347 $metadata_value =~ s/^$remove_prefix_expr//; 348 } 349 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") { 350 $metadata_value =~ s/$remove_suffix_expr$//; 351 } 268 352 269 353 # Convert the metadata value from a UTF-8 string to a Unicode string … … 327 411 # Create a mapping from metadata value to OID 328 412 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"}; 329 my %metadata_value_to_OIDs_hash = (); 413 my %metadata_value_to_OIDs_hash = (); 330 414 foreach my $OID (@OIDs) 331 415 { … … 342 426 343 427 # Partition the values (if necessary) 344 my $partition_type_within_level = $self->{ "partition_type_within_level"};428 my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"}; 345 429 if ($partition_type_within_level =~ /^per_letter$/i) { 346 430 # Generate one hlist for each letter … … 368 452 $classifier_node->{'childtype'} = "HList"; 369 453 } 370 454 elsif ($partition_type_within_level =~ /^per_letter_fixed_size$/i) { 455 # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size 456 # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible 457 # If a letter has more items than specified, split into several hlists. 458 # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise) 459 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"}; 460 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash)); 461 my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"}; 462 463 # Separate values by their first letter, each form a bucket, like the per_letter partition type 464 my $last_partition = substr($sortedmetadata_values[0], 0, 1); 465 my %partition_buckets = (); 466 my @metadata_values_in_bucket = (); 467 my $num_items_in_bucket = 0; 468 foreach my $metadata_value (@sortedmetadata_values) { 469 my $metadata_valuepartition = substr($metadata_value, 0, 1); 470 if ($metadata_valuepartition ne $last_partition) { 471 my @temp_array = @metadata_values_in_bucket; 472 # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values 473 my %partition_info = (); 474 $partition_info{'metadata_values'} = \@temp_array; 475 $partition_info{'size'} = $num_items_in_bucket; 476 $partition_buckets{$last_partition} = \%partition_info; 477 478 @metadata_values_in_bucket = ($metadata_value); 479 $num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket); 480 $last_partition = $metadata_valuepartition; 481 } else { 482 $num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket); 483 push (@metadata_values_in_bucket, $metadata_value); 484 } 485 } 486 # Last one 487 my %partition_info = (); 488 $partition_info{'metadata_values'} = \@metadata_values_in_bucket; 489 $partition_info{'size'} = $num_items_in_bucket; 490 $partition_buckets{$last_partition} = \%partition_info; 491 492 my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets)); 493 for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) { 494 my $partition = $partition_keys[$i]; 495 my $items_in_partition = $partition_buckets{$partition}->{'size'}; 496 # Merge small buckets together, but keep the numeric bucket apart 497 if ($items_in_partition < $partition_size_within_level) { 498 my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'}; 499 if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level 500 && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/) 501 || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) { 502 foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) { 503 push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge); 504 } 505 $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition; 506 delete $partition_buckets{$partition}; 507 } 508 } 509 } 510 @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets)); 511 512 # Add partitions, and divide big bucket into several 513 my $last_partition_end = ""; 514 my $partition_start = ""; 515 foreach my $partition (@partition_keys) { 516 my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}}); 517 my $items_in_partition = $partition_buckets{$partition}->{'size'}; 518 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"}); 519 520 if ($items_in_partition > $partition_size_within_level) { 521 my $items_done = 0; 522 my %metadata_values_to_OIDs_subhashes = (); 523 for (my $i = 0; $i < scalar(@metadata_values); $i++) { 524 my $metadata_value = $metadata_values[$i]; 525 # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values 526 my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1; 527 528 my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"}); 529 my $partitionname = $partition_start; 530 if ($partitionend ne $partition_start) { 531 $partitionname = $partitionname . "-" . $partitionend; 532 } 533 534 # Start a new partition 535 if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) { 536 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes); 537 $last_partition_end = $partitionend; 538 $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"}); 539 $items_done = 0; 540 %metadata_values_to_OIDs_subhashes = (); 541 } 542 543 # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions 544 if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) { 545 my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"}); 546 # Get the number of partitions needed for this value 547 my $num_splits = int($items_for_this_md_value / $partition_size_within_level); 548 $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits); 549 550 my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}}; 551 for (my $i = 0; $i < $num_splits; $i++) { 552 my %OIDs_subhashes_for_this_value = (); 553 my @OIDs_for_this_partition = (); 554 for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) { 555 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]); 556 } 557 558 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values 559 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) { 560 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition; 561 $items_done += scalar(@OIDs_for_this_partition); 562 next; 563 } 564 565 # Add an HList for this bucket 566 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition; 567 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value); 568 $last_partition_end = $partitionname_for_this_value; 569 } 570 next; 571 } 572 573 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value}; 574 $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1; 575 576 # The last partition 577 if($i == scalar(@metadata_values) - 1) { 578 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes); 579 } 580 } 581 } 582 else { 583 # The easier case, just add a partition 584 my %metadata_values_to_OIDs_subhashes = (); 585 for (my $i = 0; $i < scalar(@metadata_values); $i++) { 586 my $metadata_value = $metadata_values[$i]; 587 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value}; 588 } 589 my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1]; 590 my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"}); 591 my $partitionname = $partition_start; 592 if ($partitionend ne $partition_start) { 593 $partitionname = $partitionname . "-" . $partitionend; 594 } 595 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes); 596 $last_partition_end = $partitionend; 597 } 598 } 599 } 371 600 else { 372 601 # Generate hlists of a certain size … … 525 754 my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}; 526 755 527 # If there is only one item and ' always_bookshelf' is false, add the item to the list528 if (@OIDs == 1 && $self->{$metadata_group . ". always_bookshelf"} eq "f") {756 # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list 757 if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") { 529 758 my $OID = $OIDs[0]; 530 759 … … 540 769 } 541 770 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset }); 542 } 543 771 } 772 # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values 773 elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") { 774 $self->add_sorted_leaf_items(\@OIDs, $classifier_node); 775 } 544 776 # Otherwise create a sublist (bookshelf) for the metadata value 545 else { 777 else { 546 778 my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value), 547 779 'childtype' => "VList", … … 556 788 # Otherwise just add the documents as children of this list 557 789 else { 558 # Sort the leaf nodes by the metadata elements specified with -sort_leaf_nodes_using 559 my @sort_leaf_nodes_usingmetadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}}; 560 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_usingmetadata_groups) { 561 my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"}; 562 # Force a stable sort (Perl 5.6's sort isn't stable) 563 # !! The [0] bits aren't ideal (multiple metadata values) !! 564 @OIDs = @OIDs[ sort { $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0] || $a <=> $b; } 0..$#OIDs ]; 565 } 566 567 foreach my $OID (@OIDs) { 568 push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID }); 569 } 790 $self->add_sorted_leaf_items(\@OIDs, \%child_classifier_node); 570 791 } 571 792 … … 576 797 } 577 798 799 sub add_sorted_leaf_items 800 { 801 my $self = shift(@_); 802 my @OIDs = @{shift(@_)}; 803 my $classifier_node = shift(@_); 804 805 # Sort leaf nodes and add to list 806 my @sort_leaf_nodes_usingmetadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}}; 807 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_usingmetadata_groups) { 808 my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"}; 809 # Force a stable sort (Perl 5.6's sort isn't stable) 810 # !! The [0] bits aren't ideal (multiple metadata values) !! 811 @OIDs = @OIDs[ sort { $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0] || $a <=> $b; } 0..$#OIDs ]; 812 } 813 814 foreach my $OID (@OIDs) { 815 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID }); 816 } 817 } 818 578 819 579 820 sub sort_metadata_values_array
Note:
See TracChangeset
for help on using the changeset viewer.