source: main/trunk/greenstone2/perllib/classify/List.pm

Last change on this file was 38882, checked in by kjdon, 4 weeks ago

fine tuning on generating buckets. if one side (alpha or numeric) has made buckets, then the other side must - don't want to end up with HList with some buckets and some documents in it

  • Property svn:keywords set to Author Date Id Revision
File size: 70.9 KB
Line 
1###########################################################################
2#
3# List.pm -- A general and flexible list classifier with most of
4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11#
12# Copyright (C) 2005 New Zealand Digital Library Project
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28# TO DO: - Remove punctuation from metadata values before sorting.
29# - Add an AZCompactList-style hlist option?
30#
31###########################################################################
32
33
34package List;
35
36
37use BaseClassifier;
38use Sort::Naturally;
39use strict;
40
41
# Establish inheritance at compile time: List derives from BaseClassifier.
BEGIN {
    @List::ISA = qw(BaseClassifier);
}
45
46
# ---------------------------------------------------------------------------
# Allowed values (name + description resource key) for the enumerated options,
# their defaults, and lookup hashes used to validate user-supplied values.
# Each $valid_* hash is derived from the names in the corresponding list.
# ---------------------------------------------------------------------------

# How metadata values are selected within one level
my $metadata_selection_mode_list = [
    { name => 'firstvalue',         desc => '{List.metadata_selection.firstvalue}' },
    { name => 'firstvalidmetadata', desc => '{List.metadata_selection.firstvalidmetadata}' },
    { name => 'allvalues',          desc => '{List.metadata_selection.allvalues}' },
];

my $metadata_selection_mode_default = 'firstvalidmetadata';

my $valid_metadata_selection_modes =
    { map { ( $_->{'name'} => 1 ) } @{$metadata_selection_mode_list} };

# How metadata values are sorted
my $metadata_sort_mode_list = [
    { name => 'unicode',      desc => '{List.metadata_sort.unicode}' },
    { name => 'alphabetic',   desc => '{List.metadata_sort.alphabetic}' },
    { name => 'alphanumeric', desc => '{List.metadata_sort.alphanumeric}' },
];

my $metadata_sort_mode_default = 'alphanumeric';

my $valid_metadata_sort_modes =
    { map { ( $_->{'name'} => 1 ) } @{$metadata_sort_mode_list} };

# How non-numeric values are partitioned
my $partition_type_list = [
    { name => 'per_letter',       desc => '{List.level_partition.per_letter}' },
    { name => 'approximate_size', desc => '{List.level_partition.approximate_size}' },
    { name => 'constant_size',    desc => '{List.level_partition.constant_size}' },
    { name => 'all_values',       desc => '{List.level_partition.all_values}' },
    { name => 'none',             desc => '{List.level_partition.none}' },
];

my $partition_type_default = 'per_letter';

my $valid_partition_types =
    { map { ( $_->{'name'} => 1 ) } @{$partition_type_list} };

my $partition_size_default = 20;

# How values starting with a digit are partitioned
my $numeric_partition_type_list = [
    { name => 'per_digit',        desc => '{List.level_partition.per_digit}' },
    { name => 'per_number',       desc => '{List.level_partition.per_number}' },
    { name => 'single_partition', desc => '{List.level_partition.single}' },
    { name => 'approximate_size', desc => '{List.level_partition.approximate_size_numeric}' },
    { name => 'constant_size',    desc => '{List.level_partition.constant_size}' },
    { name => 'all_values',       desc => '{List.level_partition.all_values}' },
    { name => 'none',             desc => '{List.level_partition.none}' },
];

my $numeric_partition_type_default = 'single_partition';

my $valid_numeric_partition_types =
    { map { ( $_->{'name'} => 1 ) } @{$numeric_partition_type_list} };

my $numeric_partition_size_default = 20;

my $numeric_partition_name_length_default = '-1'; # use the full number

# Whether items are grouped into bookshelves
my $bookshelf_type_list = [
    { name => 'always',         desc => '{List.bookshelf_type.always}' },
    { name => 'duplicate_only', desc => '{List.bookshelf_type.duplicate_only}' },
    { name => 'never',          desc => '{List.bookshelf_type.never}' },
];

my $bookshelf_type_default = 'never';
my $sort_leaf_nodes_using_default = 'Title';
# Option descriptors for this classifier. The "*_within_level" options that
# accept one value per level (separated by '/') are declared as enumstring or
# string rather than enum so that the combined value is accepted.
my $arguments = [
    {
        name => 'metadata',
        desc => '{List.metadata}',
        type => 'metadata',
        reqd => 'yes',
    },
    {
        name => 'metadata_selection_mode_within_level',
        desc => '{List.metadata_selection_mode_within_level}',
        type => 'enumstring', # Must be enumstring because multiple values can be specified (separated by '/')
        list => $metadata_selection_mode_list,
        deft => $metadata_selection_mode_default,
    },
    {
        name => 'metadata_sort_mode_within_level',
        desc => '{List.metadata_sort_mode_within_level}',
        type => 'enumstring', # Must be enumstring because multiple values can be specified (separated by '/')
        list => $metadata_sort_mode_list,
        deft => $metadata_sort_mode_default,
    },
    {
        name => 'bookshelf_type',
        desc => '{List.bookshelf_type}',
        type => 'enum',
        list => $bookshelf_type_list,
        deft => $bookshelf_type_default,
    },
    {
        name => 'classify_sections',
        desc => '{List.classify_sections}',
        type => 'flag',
    },
    {
        name => 'partition_type_within_level',
        desc => '{List.partition_type_within_level}',
        type => 'enumstring', # Must be enumstring because multiple values can be specified (separated by '/')
        list => $partition_type_list,
        deft => $partition_type_default,
    },
    {
        name => 'partition_size_within_level',
        desc => '{List.partition_size_within_level}',
        type => 'string', # Must be string because multiple values can be specified (separated by '/')
        deft => $partition_size_default,
    },
    {
        name => 'partition_name_length',
        desc => '{List.partition_name_length}',
        type => 'string',
    },
    {
        name => 'max_partition_name_length',
        desc => '{List.max_partition_name_length}',
        type => 'string',
        deft => '3',
    },
    {
        name => 'partition_sort_mode_within_level',
        desc => '{List.partition_sort_mode_within_level}',
        type => 'enumstring', # Must be enumstring because multiple values can be specified (separated by '/')
        list => $metadata_sort_mode_list,
        deft => $metadata_sort_mode_default,
    },
    {
        name => 'numeric_partition_type_within_level',
        desc => '{List.numeric_partition_type_within_level}',
        type => 'enumstring', # Must be enumstring because multiple values can be specified (separated by '/')
        list => $numeric_partition_type_list,
        deft => $numeric_partition_type_default,
    },
    {
        name => 'numeric_partition_size_within_level',
        desc => '{List.numeric_partition_size_within_level}',
        type => 'string', # Must be string because multiple values can be specified (separated by '/')
        deft => $numeric_partition_size_default,
    },
    {
        name => 'numeric_partition_name_length_within_level',
        desc => '{List.numeric_partition_name_length_within_level}',
        type => 'string',
        deft => $numeric_partition_name_length_default,
    },
    {
        name => 'numeric_partition_sort_mode_within_level',
        desc => '{List.numeric_partition_sort_mode_within_level}',
        type => 'enumstring', # Must be enumstring because multiple values can be specified (separated by '/')
        list => $metadata_sort_mode_list,
        deft => $metadata_sort_mode_default,
    },
    {
        name => 'numbers_first',
        desc => '{List.numbers_first}',
        type => 'flag',
    },
    {
        name => 'sort_leaf_nodes_using',
        desc => '{List.sort_leaf_nodes_using}',
        type => 'metadata',
        deft => $sort_leaf_nodes_using_default,
    },
    {
        name => 'sort_leaf_nodes_sort_mode',
        desc => '{List.sort_leaf_nodes_sort_mode}',
        type => 'enum',
        list => $metadata_sort_mode_list,
        deft => $metadata_sort_mode_default,
    },
    {
        name => 'reverse_sort_leaf_nodes',
        desc => '{List.reverse_sort_leaf_nodes}',
        type => 'flag',
    },
    {
        name => 'sort_leaf_nodes_removeprefix',
        desc => '{List.sort_leaf_removeprefix}',
        type => 'regexp',
    },
    {
        name => 'sort_leaf_nodes_removesuffix',
        desc => '{List.sort_leaf_nodes_removesuffix}',
        type => 'regexp',
    },
    {
        name => 'sort_using_unicode_collation',
        desc => '{List.metadata_sort.unicode} {List.sort_using_unicode_collation}',
        type => 'flag',
    },
    {
        name => 'filter_metadata',
        desc => '{List.filter_metadata}',
        type => 'metadata',
    },
    {
        name => 'filter_regex',
        desc => '{List.filter_regex}',
        type => 'regexp',
    },
    {
        name => 'use_formatted_metadata_for_bookshelf_display',
        desc => '{List.use_formatted_metadata_for_bookshelf_display}',
        type => 'flag',
    },
    {
        name => 'removeprefix',
        desc => '{BasClas.removeprefix}',
        type => 'regexp',
    },
    {
        name => 'removesuffix',
        desc => '{BasClas.removesuffix}',
        type => 'regexp',
    },
];

# Classifier description record pushed onto the "OptList" by new().
my $options = {
    name     => 'List',
    desc     => '{List.desc}',
    abstract => 'no',
    inherits => 'yes',
    args     => $arguments,
};
270
271
# Constructor.
#
# Arguments (after the class name):
#   $classifierslist  - array ref; this class name is appended to it
#   $inputargs        - passed through unchanged to BaseClassifier->new
#   $hashArgOptLists  - hash ref; this classifier's argument descriptors and
#                       option record are appended to its "ArgList"/"OptList"
#
# Returns the blessed classifier object. When 'info_only' is set on the
# object returned by the base constructor, no option processing is done.
# Dies if no -metadata value was supplied.
#
# Per-level settings are stored on $self under keys of the form
# "<metadata group>.<option name>".
sub new
{
    my ($class) = shift(@_);
    my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
    push(@$classifierslist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = BaseClassifier->new($classifierslist, $inputargs, $hashArgOptLists);

    bless $self, $class;

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return $self;
    }

    # The metadata elements to use (required)
    if (!$self->{'metadata'}) {
        die "Error: No metadata fields specified for List.\n";
    }

    my @metadata_groups = split(/[\/]/, $self->{'metadata'});
    $self->{'metadata_groups'} = \@metadata_groups;

    # The classifier button name (default: the first metadata element specified)
    if (!$self->{'buttonname'}) {
        my $first_metadata_group = $metadata_groups[0];
        my $first_metadata_element = (split(/[\;|,\/]/, $first_metadata_group))[0];
        $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
    }

    # meta selection mode for each level
    $self->set_metadata_groups_info_per_level("metadata_selection_mode_within_level", $metadata_selection_mode_default, $valid_metadata_selection_modes);

    # meta sort mode for each level.
    # The fallback sort mode is kept in a per-instance variable: overwriting
    # the file-scoped $metadata_sort_mode_default here would silently change
    # the fallback for every List classifier constructed afterwards.
    my $sort_mode_default = $metadata_sort_mode_default;
    if ($self->{'sort_using_unicode_collation'}) {
        print STDERR "WARNING: sort_using_unicode_collation is set, setting metadata_sort_mode_within_level to unicode for all levels, regardless of current setting\n";
        $self->{'metadata_sort_mode_within_level'} = "unicode";
        $sort_mode_default = "unicode";
    }
    elsif (defined $self->{'metadata_sort_mode_within_level'}
           && $self->{'metadata_sort_mode_within_level'} =~ /unicode/) {
        # a unicode sort was requested for at least one level, so a collator is needed
        $self->{'sort_using_unicode_collation'} = 1;
    }

    $self->set_metadata_groups_info_per_level('metadata_sort_mode_within_level', $sort_mode_default, $valid_metadata_sort_modes);

    # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
    foreach my $metadata_group (@metadata_groups) {
        $self->{$metadata_group . ".bookshelf_type"} = "always";
    }
    # The last level uses the value of the -bookshelf_type option
    my $last_metadata_group = $metadata_groups[$#metadata_groups];
    $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};

    # How the items are grouped into partitions (default: by letter)
    # for each level (metadata group), separated by '/'
    $self->set_metadata_groups_info_per_level("partition_type_within_level", $partition_type_default, $valid_partition_types);
    $self->set_metadata_groups_info_per_level("numeric_partition_type_within_level", $numeric_partition_type_default, $valid_numeric_partition_types);

    # now check whether a level was none - need to set the equivalent level in the other half also to none
    foreach my $metadata_group (@metadata_groups) {
        if ($self->{$metadata_group . ".partition_type_within_level"} eq "none" || $self->{$metadata_group . ".numeric_partition_type_within_level"} eq "none") {

            print STDERR "WARNING: one of -partition_type_within_level or -numeric_partition_type_within_level was set to 'none' for level $metadata_group, overriding current value of both these options to 'none'\n";

            $self->{$metadata_group . ".partition_type_within_level"} = "none";
            $self->{$metadata_group . ".numeric_partition_type_within_level"} = "none";
        }
    }

    $self->set_metadata_groups_info_per_level("partition_size_within_level", $partition_size_default);
    $self->set_metadata_groups_info_per_level("numeric_partition_size_within_level", $numeric_partition_size_default);

    $self->set_metadata_groups_info_per_level('partition_sort_mode_within_level', $sort_mode_default, $valid_metadata_sort_modes);

    $self->set_metadata_groups_info_per_level('numeric_partition_sort_mode_within_level', $sort_mode_default, $valid_metadata_sort_modes);

    $self->set_metadata_groups_info_per_level("numeric_partition_name_length_within_level", $numeric_partition_name_length_default);

    # The removeprefix and removesuffix expressions.
    # If there are more than one expressions, use '' to quote each expression and '/' to separate.
    # The extra ^ at the beginning of a prefix (or $ at the end of a suffix) is
    # removed; the anchors are added again where the expressions are applied.
    $self->_set_remove_exprs_per_level('removeprefix', ".remove_prefix_expr", qr/^\^/);
    $self->_set_remove_exprs_per_level('removesuffix', ".remove_suffix_expr", qr/\$$/);

    # The metadata elements to use to sort the leaf nodes (default: Title)
    my @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
    $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;

    my @leaf_nodes_sort_modes = split(/\|/, $self->{'sort_leaf_nodes_sort_mode'});
    foreach my $sort_group (@sort_leaf_nodes_using_metadata_groups) {
        # set metadata_select_type, if not already set - might be already set if the same group was used in -metadata
        if (!defined $self->{$sort_group . ".metadata_selection_mode_within_level"}) {
            $self->{$sort_group . ".metadata_selection_mode_within_level"} = $metadata_selection_mode_default;
        }

        # likewise the sort mode: one mode is consumed per sort group, whether or not it is used
        my $leaf_sort_mode = shift(@leaf_nodes_sort_modes);
        if (!defined $self->{$sort_group . ".metadata_sort_mode_within_level"}) {
            if (defined $leaf_sort_mode && defined $valid_metadata_sort_modes->{$leaf_sort_mode}) {
                $self->{$sort_group . ".metadata_sort_mode_within_level"} = $leaf_sort_mode;
            }
            else {
                $self->{$sort_group . ".metadata_sort_mode_within_level"} = $sort_mode_default;
            }
        }
    }

    # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
    if ($self->{'sort_using_unicode_collation'}) {
        # To use this you first need to download the allkeys.txt file from
        # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
        # Unicode/Collate directory.
        require Unicode::Collate;
        $self->{'unicode_collator'} = Unicode::Collate->new();
    }

    # An empty array for the document/section OIDs that we are classifying
    $self->{'OIDs'} = [];
    # A hash for all the doc ids that we have seen, so we don't classify something twice
    $self->{'all_doc_OIDs'} = {};
    return $self;
}

# Private helper for new(): distributes the quoted, '/'-separated expressions
# of option $option_name over the metadata groups, storing each under
# "<group>$key_suffix". $anchor_re matches the redundant anchor to strip from
# an expression. A level with no (or an empty) expression reuses whatever was
# stored for the first level. Does nothing when the option is unset.
sub _set_remove_exprs_per_level
{
    my ($self, $option_name, $key_suffix, $anchor_re) = @_;

    return unless $self->{$option_name};

    my @metadata_groups = @{$self->{'metadata_groups'}};
    my @exprs = split(/'\/'/, $self->{$option_name});

    foreach my $metadata_group (@metadata_groups) {
        my $expr = shift(@exprs);

        if (defined($expr) && $expr ne "") {
            # Remove the other ' at the beginning and the end if there is any
            $expr =~ s/^'//;
            $expr =~ s/'$//;
            $expr =~ s/$anchor_re//;
            $self->{$metadata_group . $key_suffix} = $expr;
        }
        else {
            $self->{$metadata_group . $key_suffix} = $self->{$metadata_groups[0] . $key_suffix};
        }
    }
    return;
}
434
435
# Initialisation hook; this classifier has no per-build state to set up here.
sub init
{
    return;
}
440
# Splits the '/'-separated value of option $info_name into one value per
# metadata group (level) and stores each as $self->{"<group>.<info_name>"}.
#
# Parameters:
#   $info_name                 - name of the option to distribute
#   $info_default              - value used for the first level when its entry
#                                is missing, empty or invalid
#   $info_valid_types_hash_ref - optional hash ref of acceptable values; when
#                                omitted any non-empty value is accepted
#
# A later level whose entry is missing, empty or invalid takes the value
# stored for the first level. Invalid (non-empty) entries are reported on
# STDERR; empty or missing entries are defaulted silently.
sub set_metadata_groups_info_per_level
{
    my ($self, $info_name, $info_default, $info_valid_types_hash_ref) = @_;

    my @info_list = ();
    if (defined $self->{$info_name}) {
        @info_list = split(/\//, $self->{$info_name});
    }
    else {
        print STDERR "List Error: no values were set for option $info_name\n";
    }

    my @metadata_groups = @{$self->{'metadata_groups'}};
    my $first = 1;
    foreach my $metadata_group (@metadata_groups) {
        my $info_elem = shift(@info_list);

        # An empty element (e.g. the first one in "/10") counts as "not given",
        # whether or not a validity hash was supplied.
        my $is_given = (defined($info_elem) && $info_elem ne "");
        my $is_valid = ($is_given
                        && (!defined $info_valid_types_hash_ref
                            || defined $info_valid_types_hash_ref->{$info_elem}));

        if ($is_valid) {
            $self->{$metadata_group . ".$info_name"} = $info_elem;
        }
        else {
            # its empty or an invalid entry: the first level falls back to the
            # default, later levels to the value we had at first
            my $new_info_elem = $first
                ? $info_default
                : $self->{$metadata_groups[0] . ".$info_name"};
            $self->{$metadata_group . ".$info_name"} = $new_info_elem;
            if ($is_given) {
                print STDERR "List Error: $info_elem is not a valid value for $info_name, changing it to $new_info_elem\n";
            }
        }
        $first = 0;
    }
    return;
}
477
# Invoked once per document in the collection.
#
# A document whose OID has been seen before is skipped with a warning. When
# -filter_metadata is set, the document is also skipped if it has no value for
# that element in its top section, or if -filter_regex is non-empty and the
# value does not match it. Documents that pass are handed to
# classify_section(): every section after the top one when
# -classify_sections is set, otherwise just the top section.
sub classify
{
    my ($self, $doc_obj) = @_;

    my $doc_OID = $doc_obj->get_OID();
    my $seen_OIDs = $self->{'all_doc_OIDs'};

    if (defined $seen_OIDs->{$doc_OID}) {
        print STDERR "Warning, List classifier has already seen document ".$doc_OID.", not classifying again\n";
        return;
    }
    $seen_OIDs->{$doc_OID} = 1;

    # check against filter here
    my $filter_field = $self->{'filter_metadata'};
    if ($filter_field) {
        my $filter_value = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $filter_field);
        return if (!defined $filter_value);

        my $filter_regex = $self->{'filter_regex'};
        unless ($filter_regex eq "" || $filter_value =~ /$filter_regex/) {
            print STDERR "doc $doc_obj doesn't pass filtering requirement. Not classifying.\n" if ($self->{'verbosity'} > 3);
            return;
        }
    }

    # if we get here, we have passed the test for filtering
    if (!$self->{'classify_sections'}) {
        # Just classify the top document section
        $self->classify_section($doc_obj, $doc_OID, $doc_obj->get_top_section());
    }
    else {
        # "-classify_sections" is set: classify every section below the top one
        my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
        while (defined $section) {
            $self->classify_section($doc_obj, $doc_OID . ".$section", $section);
            $section = $doc_obj->get_next_section($section);
        }
    }

}
515
# Records the metadata values of one document section.
#
# Parameters:
#   $doc_obj     - the document object
#   $section_OID - identifier under which the section is recorded
#   $section     - the section within $doc_obj to read metadata from
#
# The section is only classified if one of the elements of the first metadata
# group yields a non-empty value (after prefix/suffix removal and, unless
# 'no_metadata_formatting' is set, sort formatting). In that case:
#   - $section_OID is appended to $self->{'OIDs'};
#   - for every metadata group (including the leaf sort groups) the formatted
#     values are appended to $self->{"<group>.list"}->{$section_OID};
#   - unless 'use_formatted_metadata_for_bookshelf_display' is set, the
#     unformatted value is counted in
#     $self->{"<group>.actualvalues"}->{<formatted>}->{<unformatted>}.
sub classify_section
{
    my $self = shift(@_);
    my ($doc_obj, $section_OID, $section) = @_;

    my @metadata_groups = @{$self->{'metadata_groups'}};

    # Only classify the section if it has a value for one of the metadata elements in the first group
    my $classify_section = 0;
    my $first_metadata_group = $metadata_groups[0];
    my $first_remove_prefix_expr = $self->{$first_metadata_group . ".remove_prefix_expr"};
    my $first_remove_suffix_expr = $self->{$first_metadata_group . ".remove_suffix_expr"};
    foreach my $first_metadata_group_element (split(/\;|,/, $first_metadata_group)) {
        my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
        my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element);
        # Remove prefix/suffix if requested
        if (defined($first_metadata_group_element_value)) {
            if (defined $first_remove_prefix_expr && $first_remove_prefix_expr ne "") {
                $first_metadata_group_element_value =~ s/^$first_remove_prefix_expr//;
            }

            if (defined $first_remove_suffix_expr && $first_remove_suffix_expr ne "") {
                $first_metadata_group_element_value =~ s/$first_remove_suffix_expr$//;
            }
        }

        # Format using the individual element name (not the whole group string,
        # which may list several elements), as is done for the values stored below.
        $first_metadata_group_element_value = &sorttools::format_metadata_for_sorting($real_first_metadata_group_element, $first_metadata_group_element_value, $doc_obj, $self->{'casefold_metadata_for_sorting'}, $self->{'accentfold_metadata_for_sorting'}) unless $self->{'no_metadata_formatting'};
        if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
            # This section must be included in the classifier as we have found a value
            $classify_section = 1;
            last;
        }
    }

    # We're not classifying this section because it doesn't have the required metadata
    return if (!$classify_section);

    # Otherwise, include this section in the classifier
    push(@{$self->{'OIDs'}}, $section_OID);

    # Create a hash for the metadata values of each metadata element we're interested in
    my %metadata_groups_done = ();
    foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
        # Take care not to do a metadata group more than once
        next if ($metadata_groups_done{$metadata_group});

        my $selection_mode = $self->{$metadata_group . ".metadata_selection_mode_within_level"};
        my $remove_prefix_expr = $self->{$metadata_group . ".remove_prefix_expr"};
        my $remove_suffix_expr = $self->{$metadata_group . ".remove_suffix_expr"};
        foreach my $metadata_element (split(/\;|,/, $metadata_group)) {
            my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);

            # work on a copy of the values so the document's own metadata is not modified
            my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
            foreach my $metadata_value (@metadata_values) {
                # Strip leading and trailing whitespace
                $metadata_value =~ s/^\s*//;
                $metadata_value =~ s/\s*$//;

                # Remove prefix/suffix if requested
                if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
                    $metadata_value =~ s/^$remove_prefix_expr//;
                }
                if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
                    $metadata_value =~ s/$remove_suffix_expr$//;
                }

                # Values are no longer lowercased here: case folding is a user
                # option and is handled by format_metadata_for_sorting
                my $formatted_metadata_value = $metadata_value;
                $formatted_metadata_value = &sorttools::format_metadata_for_sorting($real_metadata_element, $formatted_metadata_value, $doc_obj, $self->{'casefold_metadata_for_sorting'}, $self->{'accentfold_metadata_for_sorting'}) unless $self->{'no_metadata_formatting'};

                # Add the metadata value into the list for this combination of metadata group
                # and section - if we have some non-whitespace chars
                next unless ($formatted_metadata_value =~ /\S/);

                push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $formatted_metadata_value);

                # add the actual value into the stored values so we can remember the case
                if (!$self->{'use_formatted_metadata_for_bookshelf_display'}) {
                    $self->{$metadata_group . ".actualvalues"}->{$formatted_metadata_value}->{$metadata_value}++;
                }
                last if ($selection_mode eq "firstvalue");
            } # foreach metadatavalue
            last if ((@metadata_values > 0) && $selection_mode =~ /^(firstvalue|firstvalidmetadata)$/);
        } # foreach metadata element

        $metadata_groups_done{$metadata_group} = 1;
    }
}
614
615
# Builds and returns (as a hash ref) the classification hierarchy for the
# sections collected so far. The root is an "Invisible" node titled with the
# button name; add_level() fills in its contents, one level per metadata group.
sub get_classify_info
{
    my ($self) = @_;

    # Work on copies of the metadata groups to classify by and of the
    # OIDs of the sections to include in the classifier
    my @groups = @{$self->{'metadata_groups'}};
    my @section_OIDs = @{$self->{'OIDs'}};

    # The root node of the classification hierarchy
    my $root_node = {
        'thistype'  => "Invisible",
        'childtype' => "VList",
        'Title'     => $self->{'buttonname'},
        'contains'  => [],
        'mdtype'    => $groups[0],
    };

    # Recursively create the classification hierarchy, one level for each metadata group
    $self->add_level(\@groups, \@section_OIDs, $root_node);

    return $root_node;
}
638
# Decides whether the given level will be split into partitions (buckets).
#
# With approximate_size or constant_size partitioning, a side (alphabetic or
# numeric values) that has fewer items than its partition size would normally
# get no bucket. If only one side had buckets, documents would sit next to
# buckets in the same HList, which does not display nicely, so as soon as one
# side will have partitions we force them on the other side too.
#
# Parameters: the metadata group (level), a hash ref mapping non-numeric
# values to OID lists, and the same for numeric values.
# Returns 1 if partitions are to be made for this level, 0 otherwise.
sub will_we_have_partitions
{
    my ($self, $metadata_group, $metadata_value_to_OIDs_hash_ref, $numeric_metadata_value_to_OIDs_hash_ref) = @_;

    my $alpha_type   = $self->{$metadata_group . ".partition_type_within_level"};
    my $numeric_type = $self->{$metadata_group . ".numeric_partition_type_within_level"};

    # if this is none, then numeric will also be none, so only need to check one of them
    return 0 if ($alpha_type eq "none");

    # these partition types make partitions on their own side, so both sides get them
    return 1 if ($alpha_type =~ /^(per_letter|all_values)$/);
    return 1 if ($numeric_type =~ /^(single_partition|per_digit|all_values)/);

    # we are using approximate/constant size buckets - count the items to see
    # if either side will be making buckets
    my $alpha_size     = $self->{$metadata_group . ".partition_size_within_level"};
    my $numeric_size   = $self->{$metadata_group . ".numeric_partition_size_within_level"};
    my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};

    my ($alpha_count, $numeric_count) = (0, 0);
    if ($bookshelf_type =~ /^(always|duplicate_only)$/) {
        # each individual value will be a bookshelf or a single item
        $alpha_count   = scalar(keys %{$metadata_value_to_OIDs_hash_ref});
        $numeric_count = scalar(keys %{$numeric_metadata_value_to_OIDs_hash_ref});
    }
    else {
        # no bookshelves: every OID under every value is an entry of its own
        $alpha_count   += scalar(@{$_}) foreach (values %{$metadata_value_to_OIDs_hash_ref});
        $numeric_count += scalar(@{$_}) foreach (values %{$numeric_metadata_value_to_OIDs_hash_ref});
    }

    # approximate_size allows a side to exceed its size by a tolerance before splitting
    my $alpha_limit = $alpha_size;
    if ($alpha_type eq "approximate_size") {
        $alpha_limit += ($alpha_size > 40 ? 10 : int($alpha_size/4));
    }
    my $numeric_limit = $numeric_size;
    if ($numeric_type eq "approximate_size") {
        $numeric_limit += ($numeric_size > 40 ? 10 : int($numeric_size/4));
    }

    # a partition on either side forces partitions on the other side
    return 1 if ($alpha_count > $alpha_limit);
    return 1 if ($numeric_count > $numeric_limit);
    return 0;
}
# Adds one level of the hierarchy below $classifier_node.
#
# Parameters:
#   array ref of the remaining metadata groups (the first is this level's)
#   array ref of the OIDs to place at this level
#   $classifier_node - the node (hash ref) to add to
#
# The values recorded for the OIDs are separated into those starting with a
# digit and the rest. The non-numeric values are partitioned according to the
# level's partition type; the numeric values are handed to
# partition_numeric_values(), before or after the others depending on
# 'numbers_first'. Warns and does nothing if no values were recorded for the
# level's metadata group.
sub add_level
{
    my ($self, $metadata_groups_ref, $OIDs_ref, $classifier_node) = @_;
    my @metadata_groups = @{$metadata_groups_ref};
    my @OIDs = @{$OIDs_ref};

    my $metadata_group = $metadata_groups[0];
    my $values_by_OID = $self->{$metadata_group . ".list"};
    if (!defined($values_by_OID)) {
        print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
        return;
    }

    # Create a mapping from metadata value to OIDs, keeping values that
    # start with a digit apart from the rest
    my %alpha_value_to_OIDs = ();
    my %numeric_value_to_OIDs = ();
    foreach my $OID (@OIDs) {
        my $OID_values = $values_by_OID->{$OID};
        next unless ($OID_values);

        foreach my $value (@{$OID_values}) {
            my $target = ($value =~ /^[0-9]/) ? \%numeric_value_to_OIDs : \%alpha_value_to_OIDs;
            push(@{$target->{$value}}, $OID);
        }
    }

    # Settings for partitioning the values (if necessary)
    my $type      = $self->{$metadata_group . ".partition_type_within_level"};
    my $size      = $self->{$metadata_group . ".partition_size_within_level"};
    my $sort_mode = $self->{$metadata_group . ".partition_sort_mode_within_level"};
    my $bookshelf = $self->{$metadata_group . ".bookshelf_type"};

    my ($has_partitions) = $self->will_we_have_partitions($metadata_group, \%alpha_value_to_OIDs, \%numeric_value_to_OIDs);

    my $have_numeric_values = scalar(keys(%numeric_value_to_OIDs));

    # Do the numbers now if they are to come first
    if ($have_numeric_values && $self->{'numbers_first'}) {
        $self->partition_numeric_values(\@metadata_groups, $classifier_node, \%numeric_value_to_OIDs, $has_partitions);
    }

    # Do the letters - making sure we have some values
    if (keys(%alpha_value_to_OIDs)) {
        if ($type =~ /^per_letter$/i) {
            $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $sort_mode, \%alpha_value_to_OIDs);
        }
        elsif ($has_partitions && $type =~ /^approximate_size$/i) {
            $self->split_approximate_size(\@metadata_groups, $classifier_node, $size, $sort_mode, $bookshelf, \%alpha_value_to_OIDs, $self->{'partition_name_length'}, $self->{'max_partition_name_length'});
        }
        elsif ($has_partitions && $type =~ /^constant_size$/i) {
            $self->split_constant_size(\@metadata_groups, $classifier_node, $size, $sort_mode, $bookshelf, \%alpha_value_to_OIDs, $self->{'partition_name_length'}, $self->{'max_partition_name_length'});
        }
        else {
            # Otherwise just add all the values to a list; for all_values the
            # node's children become an HList and this level always uses bookshelves
            if ($type =~ /^all_values$/i) {
                $classifier_node->{'childtype'} = "HList";
                $self->{$metadata_group . ".bookshelf_type"} = "always";
            }
            $self->add_vlist(\@metadata_groups, $classifier_node, \%alpha_value_to_OIDs);
        }
    }

    # Do the numbers now if they are to come last
    if ($have_numeric_values && !$self->{'numbers_first'}) {
        $self->partition_numeric_values(\@metadata_groups, $classifier_node, \%numeric_value_to_OIDs, $has_partitions);
    }
}
777
# Add the numeric metadata values of the current level to $classifier_node,
# bucketed according to the "<group>.numeric_partition_*_within_level"
# settings stored on $self.
#
# Arguments:
#   \@metadata_groups - metadata groups; the first entry selects which
#                       settings are read from $self
#   $classifier_node  - the node the buckets/values are added to
#   \%numeric_metadata_value_to_OIDs_hash - metadata value => list of OIDs
#   $has_partitions   - flag supplied by the caller; the two size based
#                       partition types are only applied when it is true
sub partition_numeric_values
{
    my ($self, $metadata_groups_ref, $classifier_node, $numeric_metadata_value_to_OIDs_hash_ref, $has_partitions) = @_;
    my @metadata_groups = @{$metadata_groups_ref};

    # All the settings are keyed on the metadata group of this level
    my $metadata_group = $metadata_groups[0];
    my $type        = $self->{$metadata_group . ".numeric_partition_type_within_level"};
    my $size        = $self->{$metadata_group . ".numeric_partition_size_within_level"};
    my $sort_mode   = $self->{$metadata_group . ".numeric_partition_sort_mode_within_level"};
    my $name_length = $self->{$metadata_group . ".numeric_partition_name_length_within_level"};
    my $bookshelf_type_within_level = $self->{$metadata_group. ".bookshelf_type"};

    if ($type eq "single_partition") {
        # Everything goes into one bucket labelled "0-9"
        $self->add_hlist_partition(\@metadata_groups, $classifier_node, "0-9", $numeric_metadata_value_to_OIDs_hash_ref);
    }
    elsif ($type eq "per_digit" || $type eq "per_number") {
        # per_digit buckets on the first digit only; per_number buckets on
        # as many leading digits as the configured name length asks for
        my $bucket_length = ($type eq "per_digit") ? 1 : $name_length;
        $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $sort_mode, $numeric_metadata_value_to_OIDs_hash_ref, 1, $bucket_length);
    }
    elsif ($type eq "constant_size" && $has_partitions) {
        # Generate hlists of a certain size. The numeric name length is used
        # as both the fixed and the maximum partition name length.
        $self->split_constant_size(\@metadata_groups, $classifier_node, $size, $sort_mode, $bookshelf_type_within_level, $numeric_metadata_value_to_OIDs_hash_ref, $name_length, $name_length, 1);
    }
    elsif ($type eq "approximate_size" && $has_partitions) {
        $self->split_approximate_size(\@metadata_groups, $classifier_node, $size, $sort_mode, $bookshelf_type_within_level, $numeric_metadata_value_to_OIDs_hash_ref, $name_length, $name_length, 1);
    }
    else {
        # No buckets: just add all the values to a VList. For "all_values"
        # the node becomes an HList and bookshelves are forced on.
        if ($type =~ /^all_values$/i) {
            $classifier_node->{'childtype'} = "HList";
            $self->{$metadata_group. ".bookshelf_type"} = "always";
        }

        $self->add_vlist(\@metadata_groups, $classifier_node, $numeric_metadata_value_to_OIDs_hash_ref);
    }

}
821
822
823
# Partition the metadata values into HList buckets of roughly
# $partition_size items each.
#
# The values are first grouped by their partition name (first letter, or
# leading digits for numeric values), like the per_letter partition type.
# A group with fewer items than $partition_size is merged into the following
# group when the combined size stays within $partition_size + tolerance.
# A group with more items than that is split into several partitions.
# One "item" is a document when $bookshelf_type is "never", and a distinct
# metadata value otherwise.
#
# Arguments:
#   \@metadata_groups, $classifier_node - passed through to add_hlist_partition
#   $partition_size            - target number of items per partition
#   $sort_mode                 - passed to sort_metadata_values_array
#   $bookshelf_type            - decides what counts as an item (see above)
#   \%metadata_value_to_OIDs   - metadata value => list of OIDs
#   $partition_name_length, $max_partition_name_length, $is_numeric
#                              - passed to the generate_partition_* helpers
#
# Sets the childtype of $classifier_node to "HList".
sub split_approximate_size
{

    my $self = shift(@_);
    my @metadata_groups = @{shift(@_)};
    my $classifier_node = shift(@_);
    my $partition_size = shift(@_);
    my $sort_mode = shift(@_);
    my $bookshelf_type = shift(@_);
    my $metadata_value_to_OIDs_hash_ref = shift(@_);
    my $partition_name_length = shift(@_);
    my $max_partition_name_length = shift(@_);
    my $is_numeric = shift(@_);

    # How far over $partition_size a partition may go before it is split
    my $tolerance = ($partition_size > 40 ? 10 : int($partition_size/4)); # should this be an option??

    my @sortedmetadata_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref));

    ##########################################################
    # Stage 1: one bucket per partition name (A, B, C, ...)
    ##########################################################
    my $last_partition = $self->generate_partition_name($sortedmetadata_values[0], $partition_name_length, $is_numeric);

    my @partition_buckets = ();
    my @metadata_values_in_bucket = ();
    my $num_items_in_bucket = 0;

    foreach my $metadata_value (@sortedmetadata_values) {
        my $metadata_valuepartition = $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric);
        my $items_for_this_md_value = ($bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1);

        if ($metadata_valuepartition ne $last_partition) {
            # Cache the values that belong to the finished bucket, and the
            # number of items in it (not necessarily the number of values)
            push (@partition_buckets, { 'metadata_values' => [ @metadata_values_in_bucket ],
                                        'size' => $num_items_in_bucket,
                                        'name' => $last_partition });

            @metadata_values_in_bucket = ($metadata_value);
            $num_items_in_bucket = $items_for_this_md_value;
            $last_partition = $metadata_valuepartition;
        } else {
            $num_items_in_bucket += $items_for_this_md_value;
            push (@metadata_values_in_bucket, $metadata_value);
        }
    }
    # Last one
    push (@partition_buckets, { 'metadata_values' => [ @metadata_values_in_bucket ],
                                'size' => $num_items_in_bucket,
                                'name' => $last_partition });

    ##########################################################
    # Stage 2: merge small buckets into the bucket after them
    ##########################################################
    my @new_partition_buckets = ();
    for (my $i = 0; $i < scalar(@partition_buckets) - 1; $i++) {

        my $this_bucket = $partition_buckets[$i];
        my $next_bucket = $partition_buckets[$i+1];

        my $items_in_partition = $this_bucket->{'size'};

        if ($items_in_partition < $partition_size
            && $items_in_partition + $next_bucket->{'size'} <= $partition_size+$tolerance) {
            # merge this bucket into the next bucket (the values are
            # re-sorted in stage 3, so appending is fine)
            push(@{$next_bucket->{'metadata_values'}}, @{$this_bucket->{'metadata_values'}});
            $next_bucket->{'size'} += $items_in_partition;
        } else {
            # remember this bucket
            push (@new_partition_buckets, $this_bucket);
        }
    }
    # add in the last bucket
    push (@new_partition_buckets, $partition_buckets[scalar(@partition_buckets) - 1]);

    ##########################################################
    # Stage 3: add partitions to the main list, but divide big
    # buckets into several
    ##########################################################
    my $last_partition_end = "";
    my $partition_start = "";
    my $partition_end = "";
    my $partition_name = "";
    foreach my $partition (@new_partition_buckets) {
        my @metadata_values = $self->sort_metadata_values_array($sort_mode, @{$partition->{'metadata_values'}});
        my $items_in_partition = $partition->{'size'};
        $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $partition_name_length, $max_partition_name_length, $is_numeric);

        if ($items_in_partition <= $partition_size+$tolerance) {
            # we can just add the partition as is
            my %metadata_values_to_OIDs_subhashes = ();
            foreach my $metadata_value (@metadata_values) {
                $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
            }
            my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
            $partition_end = $self->generate_partition_end($last_metadata_value, $partition_start, $partition_name_length, $max_partition_name_length, $is_numeric);
            $partition_name = $partition_start;
            if ($partition_end ne $partition_start) {
                $partition_name = $partition_name . "-" . $partition_end;
            }
            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partition_name, \%metadata_values_to_OIDs_subhashes);
            $last_partition_end = $partition_end;
        } else {
            # we have too many items, need to split the partition.
            # Work out how many pieces, then spread the items evenly
            # (rounding the piece size up).
            my $num_splits = int($items_in_partition/$partition_size);
            $num_splits++ if (($items_in_partition - $partition_size*$num_splits) > $tolerance);

            my $this_partition_size = int($items_in_partition/$num_splits);
            if ($this_partition_size < $items_in_partition/$num_splits) { $this_partition_size++ };

            # regenerate the start, noticing that we are a split
            $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $partition_name_length, $max_partition_name_length, $is_numeric, 1);
            my $items_done = 0;
            my %metadata_values_to_OIDs_subhashes = ();
            for (my $i = 0; $i < scalar(@metadata_values); $i++) {
                my $metadata_value = $metadata_values[$i];
                # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
                my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1;

                if ($items_done >= $this_partition_size) {
                    # what we have stored already is enough for a partition
                    # store this partition
                    $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partition_name, \%metadata_values_to_OIDs_subhashes);
                    $last_partition_end = $partition_end;
                    $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $max_partition_name_length, $is_numeric, 1);
                    $items_done = 0;
                    %metadata_values_to_OIDs_subhashes = ();
                }
                elsif ($items_done == 0 && $i > 0) {
                    # The previous value was split across partitions and its
                    # OIDs ended exactly on a partition boundary, so this
                    # value opens a new partition: label it from this value
                    # rather than from the previous one.
                    $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $max_partition_name_length, $is_numeric, 1);
                }
                $partition_end = $self->generate_partition_end($metadata_value, $partition_start, $partition_name_length, $max_partition_name_length, $is_numeric);
                $partition_name = $partition_start;
                if ($partition_end ne $partition_start) {
                    $partition_name = $partition_name . "-" . $partition_end;
                }

                if ($items_done + $items_for_this_md_value <= $this_partition_size) {
                    # store all the values for the current metadata to the
                    # current partition
                    $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
                    $items_done += $items_for_this_md_value;

                } else {
                    # we only want to add some of the values: hand out the
                    # OIDs one at a time, emitting a partition whenever the
                    # current one is full
                    my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
                    my @OIDs_for_this_partition = ();
                    for (my $j = 0; $j < $items_for_this_md_value; $j++) {
                        push (@OIDs_for_this_partition, $OIDs_for_this_value[$j]);
                        $items_done++;

                        if ($items_done >= $this_partition_size) {
                            # add the partition. A copy of the OID list is
                            # handed over because the working array is
                            # emptied and reused straight afterwards.
                            $metadata_values_to_OIDs_subhashes{$metadata_value} = [ @OIDs_for_this_partition ];

                            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partition_name, \%metadata_values_to_OIDs_subhashes);
                            $last_partition_end = $partition_end;
                            # The rest of this value continues in a new
                            # partition that starts where this one ended.
                            $partition_start = $partition_end;
                            # Refresh the title too; otherwise the following
                            # partitions of this value (and a trailing one)
                            # would repeat the "start-end" title just used.
                            $partition_name = $partition_start;
                            $items_done = 0;
                            %metadata_values_to_OIDs_subhashes = ();
                            @OIDs_for_this_partition = ();
                        }

                    } # for each OID in the list
                    # at the end, have we got some ids not added to a partition?
                    # add them into the subhash
                    if (scalar(@OIDs_for_this_partition)) {
                        $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
                    }
                }
            } # end foreach metadata value
            # The last partition?
            if ($items_done > 0) {
                $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partition_name, \%metadata_values_to_OIDs_subhashes);
            }

        } # end if items in partition > partition size

    } # end for each partition bucket

    # The partitions are stored in an HList
    $classifier_node->{'childtype'} = "HList";

}
1021
1022
# Earlier implementation of split_approximate_size, kept for reference.
# NOTE(review): nothing in the code visible around this sub calls it -
# confirm before relying on it or removing it.
#
# It groups the values by partition name, merges a small group into the one
# after it when the combined size fits in $partition_size (no tolerance),
# and splits groups that are larger than $partition_size. One "item" is a
# document when $bookshelf_type is "never", and a distinct metadata value
# otherwise.
#
# Unlike split_approximate_size this sub takes no max_partition_name_length
# argument. The generate_partition_start/generate_partition_end helpers now
# expect one (before $is_numeric), so it is read from
# $self->{'max_partition_name_length'} and passed on; without it the
# $is_numeric flag ended up in the maximum-length position.
sub split_approximate_size_ORIG
{

    my $self = shift(@_);
    my @metadata_groups = @{shift(@_)};
    my $classifier_node = shift(@_);
    my $partition_size = shift(@_);
    my $sort_mode = shift(@_);
    my $bookshelf_type = shift(@_);
    my $metadata_value_to_OIDs_hash_ref = shift(@_);
    my $partition_name_length = shift(@_);
    my $is_numeric = shift(@_);

    # presumably the same option the current alphabetic callers pass to
    # split_approximate_size - verify if this sub is ever revived
    my $max_partition_name_length = $self->{'max_partition_name_length'};

    my @sortedmetadata_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref));

    # Separate values by their first letter, each form a bucket, like the per_letter partition type
    my $last_partition = $self->generate_partition_name($sortedmetadata_values[0], $partition_name_length, $is_numeric);

    my @partition_buckets = ();
    my @metadata_values_in_bucket = ();
    my $num_items_in_bucket = 0;

    foreach my $metadata_value (@sortedmetadata_values) {
        my $metadata_valuepartition = $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric);
        my $items_for_this_md_value = ($bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1);

        if ($metadata_valuepartition ne $last_partition) {
            # Cache the values that belong to the finished bucket, and the
            # number of items in it (not necessarily the number of values)
            push (@partition_buckets, { 'metadata_values' => [ @metadata_values_in_bucket ],
                                        'size' => $num_items_in_bucket,
                                        'name' => $last_partition });

            @metadata_values_in_bucket = ($metadata_value);
            $num_items_in_bucket = $items_for_this_md_value;
            $last_partition = $metadata_valuepartition;
        } else {
            $num_items_in_bucket += $items_for_this_md_value;
            push (@metadata_values_in_bucket, $metadata_value);
        }
    }
    # Last one
    push (@partition_buckets, { 'metadata_values' => [ @metadata_values_in_bucket ],
                                'size' => $num_items_in_bucket,
                                'name' => $last_partition });

    # now go through the array of buckets, and merge small buckets
    my @new_partition_buckets = ();
    for (my $i = 0; $i < scalar(@partition_buckets) - 1; $i++) {

        my $this_bucket = $partition_buckets[$i];
        my $next_bucket = $partition_buckets[$i+1];

        my $items_in_partition = $this_bucket->{'size'};

        if ($items_in_partition < $partition_size
            && $items_in_partition + $next_bucket->{'size'} <= $partition_size) {
            # merge this bucket into the next bucket
            push(@{$next_bucket->{'metadata_values'}}, @{$this_bucket->{'metadata_values'}});
            $next_bucket->{'size'} += $items_in_partition;
        } else {
            # remember this bucket
            push (@new_partition_buckets, $this_bucket);
        }
    }
    # add in the last bucket
    push (@new_partition_buckets, $partition_buckets[scalar(@partition_buckets) - 1]);

    # Add partitions to the main list, but divide big bucket into several
    my $last_partition_end = "";
    my $partition_start = "";
    my $partition_end = "";
    my $partition_name = "";
    foreach my $partition (@new_partition_buckets) {
        my @metadata_values = $self->sort_metadata_values_array($sort_mode, @{$partition->{'metadata_values'}});
        my $items_in_partition = $partition->{'size'};
        $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $partition_name_length, $max_partition_name_length, $is_numeric);

        if ($items_in_partition <= $partition_size) {
            # we can just add the partition as is
            my %metadata_values_to_OIDs_subhashes = ();
            foreach my $metadata_value (@metadata_values) {
                $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
            }
            my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
            $partition_end = $self->generate_partition_end($last_metadata_value, $partition_start, $partition_name_length, $max_partition_name_length, $is_numeric);
            $partition_name = $partition_start;
            if ($partition_end ne $partition_start) {
                $partition_name = $partition_name . "-" . $partition_end;
            }
            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partition_name, \%metadata_values_to_OIDs_subhashes);
            $last_partition_end = $partition_end;
        } else {
            # we have too many items, need to split the partition
            my $items_done = 0;
            my %metadata_values_to_OIDs_subhashes = ();
            for (my $i = 0; $i < scalar(@metadata_values); $i++) {
                my $metadata_value = $metadata_values[$i];
                # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
                my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1;

                if ($items_done + $items_for_this_md_value > $partition_size && $items_done != 0) {
                    # Save the stored items into a partition
                    $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partition_name, \%metadata_values_to_OIDs_subhashes);
                    $last_partition_end = $partition_end;
                    $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $max_partition_name_length, $is_numeric);
                    $items_done = 0;
                    %metadata_values_to_OIDs_subhashes = ();
                }

                # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
                if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size) {

                    my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $max_partition_name_length, $is_numeric);
                    # Get the number of partitions needed for this value (rounding up)
                    my $num_splits = int($items_for_this_md_value / $partition_size);
                    $num_splits++ if ($items_for_this_md_value / $partition_size > $num_splits);
                    my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
                    for (my $split_index = 0; $split_index < $num_splits; $split_index++) {
                        my %OIDs_subhashes_for_this_value = ();
                        my @OIDs_for_this_partition = ();

                        # Take the next $partition_size OIDs, or whatever is left
                        my $slice_end = ($split_index+1) * $partition_size;
                        $slice_end = $items_for_this_md_value if ($slice_end > $items_for_this_md_value);
                        for (my $d = $split_index * $partition_size; $d < $slice_end; $d++) {
                            push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
                        }

                        # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
                        if ($split_index == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size) {
                            $partition_start = $partitionname_for_this_value;
                            $partition_name = $partition_start;
                            $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
                            $items_done += scalar(@OIDs_for_this_partition);
                            $last_partition_end = $partitionname_for_this_value;
                        } else {

                            # Add an HList for this bucket
                            $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
                            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
                            $last_partition_end = $partitionname_for_this_value;
                        }
                    }
                } else {

                    $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
                    $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;
                    $partition_end = $self->generate_partition_end($metadata_value, $partition_start, $partition_name_length, $max_partition_name_length, $is_numeric);
                    $partition_name = $partition_start;
                    if ($partition_end ne $partition_start) {
                        $partition_name = $partition_name . "-" . $partition_end;
                    }

                }

                # The last partition
                if ($i == scalar(@metadata_values) - 1 && $items_done > 0) {
                    $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partition_name, \%metadata_values_to_OIDs_subhashes);
                }

            }
        } # end if items in partition > partition size

    }

    # The partitions are stored in an HList
    $classifier_node->{'childtype'} = "HList";

}
1208
1209
# Partition the metadata values into HList buckets that each hold exactly
# $partition_size items (the final bucket may hold fewer).
#
# One "item" is a document when $bookshelf_type is "never", and a distinct
# metadata value otherwise. When documents are counted, the OIDs of a single
# metadata value may be spread over several consecutive partitions.
#
# Arguments:
#   \@metadata_groups, $classifier_node - passed through to add_hlist_partition
#   $partition_size          - number of items per partition
#   $sort_mode               - passed to sort_metadata_values_array
#   $bookshelf_type          - decides what counts as an item (see above)
#   \%metadata_value_to_OIDs - metadata value => list of OIDs
#   $partition_name_length, $max_partition_name_length, $is_numeric
#                            - passed to the generate_partition_* helpers
#
# Sets the childtype of $classifier_node to "HList".
sub split_constant_size
{
    my $self = shift(@_);
    my @metadata_groups = @{shift(@_)};
    my $classifier_node = shift(@_);
    my $partition_size = shift(@_);
    my $sort_mode = shift(@_);
    my $bookshelf_type = shift(@_);
    my $metadata_value_to_OIDs_hash_ref = shift(@_);
    my $partition_name_length = shift(@_);
    my $max_partition_name_length = shift(@_);
    my $is_numeric = shift(@_);

    my @sortedmetadata_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref));
    my $items_in_partition = 0;
    my %metadata_value_to_OIDs_subhash = ();
    my $lastpartitionend = "";
    my $partitionstart;

    # Emit the partition collected so far. Its title is "start", or
    # "start-end" when the end label (taken from the given metadata value)
    # differs from the start label. Afterwards the collection state is reset
    # ready for the next partition.
    my $close_partition = sub {
        my ($end_metadata_value) = @_;

        my $partitionend = $self->generate_partition_end($end_metadata_value, $partitionstart, $partition_name_length, $max_partition_name_length, $is_numeric);
        my $partitionname = $partitionstart;
        if ($partitionend ne $partitionstart) {
            $partitionname = $partitionname . "-" . $partitionend;
        }

        $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
        %metadata_value_to_OIDs_subhash = ();
        $items_in_partition = 0;
        $lastpartitionend = $partitionend;
    };

    foreach my $metadata_value (@sortedmetadata_values) {
        if ($items_in_partition == 0) {
            # a new partition, set the name
            $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $partition_name_length, $max_partition_name_length, $is_numeric);
        }
        my $numitems_for_this_value = ($bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1);
        if ($items_in_partition + $numitems_for_this_value <= $partition_size) {
            # add all the current values into the temporary list
            $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
            $items_in_partition += $numitems_for_this_value;
        } elsif ($items_in_partition < $partition_size) {
            # only want to add some of the values into temporary list
            # note, we only get here if bookshelf type is never
            my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
            @OIDs = $self->sort_leaf_items(\@OIDs);

            # top up the current partition, which is then full
            my $num_items_needed = $partition_size - $items_in_partition;
            my @slice = splice(@OIDs, 0, $num_items_needed);
            $metadata_value_to_OIDs_subhash{$metadata_value} = \@slice;
            $close_partition->($metadata_value);

            # can we get more partitions from this metadata value?
            while (scalar(@OIDs) >= $partition_size) {
                my @full_slice = splice(@OIDs, 0, $partition_size);
                $metadata_value_to_OIDs_subhash{$metadata_value} = \@full_slice;
                $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $partition_name_length, $max_partition_name_length, $is_numeric);
                $close_partition->($metadata_value);
            }

            # whatever is left over starts the next partition
            if (scalar(@OIDs) > 0) {
                $metadata_value_to_OIDs_subhash{$metadata_value} = \@OIDs;
                $items_in_partition = scalar(@OIDs);
                $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $partition_name_length, $max_partition_name_length, $is_numeric);
            }
        }

        if ($items_in_partition == $partition_size) {
            # its the end of a partition
            $close_partition->($metadata_value);
        }
    } # foreach metadata value

    if ($items_in_partition > 0) {
        # we have to add the last partition; it ends with the last value
        $close_partition->($sortedmetadata_values[-1]);
    }

    # The partitions are stored in an HList
    $classifier_node->{'childtype'} = "HList";

}
1315
# Create one HList partition per partition name, where the name of a value
# comes from generate_partition_name (its first character, or its leading
# digits when $is_numeric is set).
#
# Arguments:
#   \@metadata_groups, $classifier_node - passed through to add_hlist_partition
#   $sort_mode                - passed to sort_metadata_values_array
#   \%metadata_value_to_OIDs  - metadata value => list of OIDs
#   $is_numeric               - optional, defaults to 0
#   $numeric_partition_length - optional; defaults to 1 for numeric values
#
# Sets the childtype of $classifier_node to "HList".
sub split_per_letter_or_digit
{
    my ($self, $metadata_groups_ref, $classifier_node, $sort_mode, $metadata_value_to_OIDs_hash_ref, $is_numeric, $numeric_partition_length) = @_;
    my @metadata_groups = @{$metadata_groups_ref};

    # Fill in the optional arguments
    $is_numeric = 0 unless (defined $is_numeric);
    $numeric_partition_length = 1 if ($is_numeric && !defined($numeric_partition_length));

    my @sorted_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref));

    # Walk the sorted values, collecting them until the partition name
    # changes, at which point the collected values become one partition
    my %values_in_partition = ();
    my $current_partition = $self->generate_partition_name($sorted_values[0], $numeric_partition_length, $is_numeric);
    foreach my $value (@sorted_values) {
        my $value_partition = $self->generate_partition_name($value, $numeric_partition_length, $is_numeric);

        if ($value_partition ne $current_partition) {
            # This value starts a new partition: emit the finished one
            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $current_partition, \%values_in_partition);
            %values_in_partition = ();
            $current_partition = $value_partition;
        }

        $values_in_partition{$value} = $metadata_value_to_OIDs_hash_ref->{$value};
    }

    # The values collected after the last change of name still need adding
    $self->add_hlist_partition(\@metadata_groups, $classifier_node, $current_partition, \%values_in_partition);

    # The partitions are stored in an HList
    $classifier_node->{'childtype'} = "HList";

}
1357
# Work out the name of the partition a metadata value belongs to.
#
# - Non-numeric values, and numeric values with a partition length of 1,
#   are named after their first character.
# - A partition length of -1 uses all the leading digits of the value.
# - Any other partition length uses at most that many leading digits.
#
# Returns undef when digits are asked for but the value does not start
# with one.
sub generate_partition_name
{
    my ($self, $mvalue, $numeric_partition_length, $is_numeric) = @_;

    # The single character case
    unless ($is_numeric && $numeric_partition_length != 1) {
        return substr($mvalue, 0, 1);
    }

    # The leading digits case
    my $leading_digits;
    if ($numeric_partition_length == -1) {
        ($leading_digits) = $mvalue =~ /^([0-9]+)/;
    }
    else {
        ($leading_digits) = $mvalue =~ /^([0-9]{1,$numeric_partition_length})/;
    }
    return $leading_digits;
}
1375
# Generate the label for the start of a partition whose first metadata value
# is $metadata_value.
#
# - Numeric partitions are labelled by generate_partition_name.
# - With a fixed $partition_name_length, the label is simply that many
#   leading characters of the value.
# - Otherwise the label starts at one character (two when $is_split is set)
#   and grows, up to $max_partition_name_length characters, until it sorts
#   after $lastpartitionend, the end label of the previous partition.
sub generate_partition_start
{
    my ($self, $metadata_value, $lastpartitionend, $partition_name_length, $max_partition_name_length, $is_numeric, $is_split) = @_;

    if ($is_numeric) {
        return $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric);
    }

    $is_split = 0 unless (defined $is_split);

    # A fixed name length has been asked for, so just use that length
    if ($partition_name_length) {
        return substr($metadata_value, 0, $partition_name_length);
    }

    # The initial length, capped at the maximum
    my $label_length = 1 + $is_split;
    if ($label_length > $max_partition_name_length) {
        $label_length = $max_partition_name_length;
    }
    my $partitionstart = substr($metadata_value, 0, $label_length);

    # Lengthen the label while it does not sort after the previous end label
    while ($partitionstart le $lastpartitionend) {
        $label_length += 1;
        last if ($label_length > $max_partition_name_length);
        $partitionstart = substr($metadata_value, 0, $label_length);
    }

    return $partitionstart;
}
1408
1409
# Work out the label for the end of a partition.
#
# Arguments (after $self):
#   $metadata_value            - last metadata value in the partition
#   $partitionstart            - start label already chosen for the partition
#   $partition_name_length     - fixed label length; if true, it is always used
#   $max_partition_name_length - accepted for symmetry with
#                                generate_partition_start; not used here
#   $is_numeric                - true to delegate to generate_partition_name
#
# When no fixed length is given, the end label has the same length as the
# start label.
sub generate_partition_end
{
    my ($self, $metadata_value, $partitionstart, $partition_name_length,
        $max_partition_name_length, $is_numeric) = @_;

    # Numeric values are named purely from their leading digits
    return $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric)
        if $is_numeric;

    # Use the fixed length if there is one, otherwise mirror the start label's length
    my $end_label_length = $partition_name_length
        ? $partition_name_length
        : length($partitionstart);

    return substr($metadata_value, 0, $end_label_length);
}
1440
1441
# Create one partition (bucket) node and append it to the given classifier node.
#
# Arguments (after $self):
#   array ref of metadata groups, passed on to add_vlist
#   $classifier_node                 - parent node; the partition is pushed
#                                      onto its 'contains' list
#   $partitionname                   - title of the partition
#   $metadata_value_to_OIDs_hash_ref - metadata value -> OIDs for this partition
#
# Note that the title is not converted from a unicode-aware string to utf8
# here, as that is handled elsewhere in the code.
sub add_hlist_partition
{
    my ($self, $metadata_groups_ref, $classifier_node, $partitionname,
        $metadata_value_to_OIDs_hash_ref) = @_;

    # Take a copy so the caller's list of groups is left alone
    my @metadata_groups = @{$metadata_groups_ref};

    # The partition itself is a VList of its values
    my $partition_node = {
        'Title'     => $partitionname,
        'childtype' => "VList",
        'contains'  => [],
    };

    # Fill the partition, then hang it off the parent
    $self->add_vlist(\@metadata_groups, $partition_node, $metadata_value_to_OIDs_hash_ref);
    push(@{$classifier_node->{'contains'}}, $partition_node);
}
1461
1462
# Fill a classifier node with one entry per metadata value at this level.
#
# Arguments (after $self):
#   array ref of metadata groups - the first is this level's group, any
#                                  remaining ones are deeper levels
#   $classifier_node                 - node to fill; its 'mdtype' is set and
#                                      entries are pushed onto 'contains'
#   $metadata_value_to_OIDs_hash_ref - metadata value -> array ref of OIDs
#
# Depending on the group's 'bookshelf_type' setting each value becomes either
# plain document entries or a bookshelf (sub-VList).  Bookshelves are filled
# recursively through add_level when deeper groups remain.
sub add_vlist
{
    my ($self, $metadata_groups_ref, $classifier_node, $metadata_value_to_OIDs_hash_ref) = @_;

    # Split off this level's group; whatever is left belongs to deeper levels
    my ($metadata_group, @remaining_groups) = @{$metadata_groups_ref};
    $classifier_node->{'mdtype'} = $metadata_group;

    # Builds the {OID, offset} record used for every document entry
    my $make_leaf = sub {
        my ($OID, $metadata_value) = @_;
        my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
        return { 'OID' => $OID, 'offset' => $offset };
    };

    my $sort_type = $self->{$metadata_group . ".metadata_sort_mode_within_level"};
    my @sorted_values = $self->sort_metadata_values_array($sort_type, keys(%{$metadata_value_to_OIDs_hash_ref}));

    foreach my $metadata_value (@sorted_values) {
        my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
        my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};

        # A lone item is listed directly unless bookshelves are always wanted
        # (ie. 'bookshelf_type' is never or duplicate_only)
        if (@OIDs == 1 && $bookshelf_type ne "always") {
            push(@{$classifier_node->{'contains'}}, $make_leaf->($OIDs[0], $metadata_value));
            next;
        }

        # 'never' means every item is listed directly, duplicated values included
        if ($bookshelf_type eq "never") {
            foreach my $OID ($self->sort_leaf_items(\@OIDs)) {
                push(@{$classifier_node->{'contains'}}, $make_leaf->($OID, $metadata_value));
            }
            next;
        }

        # Everything else gets a bookshelf (sublist) for the metadata value
        my $bookshelf = {
            'Title'     => $self->get_metadata_value_display($metadata_group, $metadata_value),
            'childtype' => "VList",
            'mdtype'    => $metadata_group,
            'contains'  => [],
        };

        my @shelf_leaf_OIDs = ();
        if (@remaining_groups > 0) {
            # Documents with metadata in the next group go down a level;
            # the others stay as plain entries on this bookshelf
            my $next_level_values_of = $self->{$remaining_groups[0] . ".list"};
            my @next_level_OIDs = ();
            foreach my $OID (@OIDs) {
                if ($next_level_values_of->{$OID}) {
                    push(@next_level_OIDs, $OID);
                }
                else {
                    push(@shelf_leaf_OIDs, $OID);
                }
            }

            # Recurse first, so sub-levels appear before this level's own documents
            $self->add_level(\@remaining_groups, \@next_level_OIDs, $bookshelf);
            @shelf_leaf_OIDs = $self->sort_leaf_items(\@shelf_leaf_OIDs);
        }
        else {
            # No deeper levels: every document is a plain entry on the bookshelf
            @shelf_leaf_OIDs = $self->sort_leaf_items(\@OIDs);
        }

        foreach my $OID (@shelf_leaf_OIDs) {
            push(@{$bookshelf->{'contains'}}, $make_leaf->($OID, $metadata_value));
        }

        # Add the bookshelf to the list
        push(@{$classifier_node->{'contains'}}, $bookshelf);
    }
}
1544
# Find the position of a metadata value within a document's list of values
# for the given metadata group.
#
# Arguments (after $self):
#   $metadata_group - group whose "<group>.list" table is consulted
#   $OID            - document identifier
#   $metadata_value - value to look for (compared as a string)
#
# Returns the index of the first matching value, or 0 if there is no match.
sub metadata_offset
{
    my ($self, $metadata_group, $OID, $metadata_value) = @_;

    my $values_for_OID_hash_ref = $self->{$metadata_group . ".list"};
    my @values_for_OID = @{$values_for_OID_hash_ref->{$OID}};

    foreach my $position (0 .. $#values_for_OID) {
        return $position if $values_for_OID[$position] eq $metadata_value;
    }

    # Not found: fall back to the first value
    return 0;
}
1562
# Sort a list of document OIDs by the configured leaf-sorting metadata groups.
#
# Arguments (after $self):
#   array ref of OIDs (copied, the caller's array is not modified)
#
# Reads from $self:
#   'sort_leaf_nodes_using_metadata_groups' - array ref of groups; the first
#                                             is the most significant key
#   "<group>.list"                          - OID -> array ref of values
#   "<group>.metadata_sort_mode_within_level" - "numeric", "alphabetic", or
#                                             anything else for a natural sort
#   'reverse_sort_leaf_nodes'               - if true, the result is reversed
#
# Returns the sorted list of OIDs.
sub sort_leaf_items
{
    my $self = shift(@_);
    my @OIDs = @{shift(@_)};

    # Sort by the least significant group first.  Each pass is stable, so the
    # ordering from earlier passes survives among items with equal keys.
    foreach my $sort_group (reverse @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
        my $OID_to_metadata_values_hash_ref = $self->{$sort_group . ".list"};
        my $sort_type = $self->{$sort_group . ".metadata_sort_mode_within_level"};

        # Sort positions rather than OIDs, so that the current position can
        # be used as a tie-break.
        # !! The [0] bits aren't ideal (multiple metadata values) !!
        @OIDs = @OIDs[ sort {
            my $values_a = $OID_to_metadata_values_hash_ref->{$OIDs[$a]};
            my $values_b = $OID_to_metadata_values_hash_ref->{$OIDs[$b]};

            my $order = 0;
            if (defined($values_a) && defined($values_b)) {
                if ($sort_type eq "numeric") {
                    $order = $values_a->[0] <=> $values_b->[0];
                }
                elsif ($sort_type eq "alphabetic") {
                    $order = $values_a->[0] cmp $values_b->[0];
                }
                else {
                    $order = ncmp($values_a->[0], $values_b->[0]);
                }
            }

            # Equal keys (or a document with no value for this group) keep
            # their current relative position.  This is what makes the sort
            # stable whatever algorithm Perl's sort uses internally.
            $order || ($a <=> $b);
        } 0..$#OIDs ];
    }

    if ($self->{'reverse_sort_leaf_nodes'}) {
        return reverse @OIDs;
    }
    return @OIDs;
}
1598
1599
# Sort a list of metadata values according to the given sort mode.
#
# Arguments (after $self):
#   $sort_mode - "unicode", "numeric", "alphabetic"; anything else gives a
#                natural sort
#   remaining arguments - the metadata values to sort
#
# "unicode" uses $self->{'unicode_collator'} when one has been loaded, and
# falls back on the natural sort otherwise.
#
# Returns the sorted list.
sub sort_metadata_values_array
{
    my ($self, $sort_mode, @metadata_values) = @_;

    if ($sort_mode eq "unicode") {
        my $collator = $self->{'unicode_collator'};
        return $collator->sort(@metadata_values) if $collator;

        # the collator wasn't loaded, fall back on the default (natural) sort
        return nsort(@metadata_values);
    }
    elsif ($sort_mode eq "numeric") {
        return sort { $a <=> $b } @metadata_values;
    }
    elsif ($sort_mode eq "alphabetic") {
        return sort { $a cmp $b } @metadata_values;
    }

    # natural sort
    return nsort(@metadata_values);
}
1622
1623
1624# we are not using this any more. Using nsort instead
# The two strings to compare are passed as explicit arguments (after $self); sort's $a and $b are not read
# Compare two strings, giving precedence to any leading numbers.
#
# Arguments (after $self): the two strings to compare.
#
# If both strings start with a number (digits, optionally followed by a
# decimal part) and those numbers differ, the numbers decide the order.
# In every other case the strings are compared with cmp.
#
# Returns -1, 0 or 1.
sub alpha_numeric_cmp
{
    my ($self, $aStr, $bStr) = @_;

    # Pull off the leading number of each string (undef if there isn't one)
    my ($leading_number_a) = $aStr =~ m/^(\d+(\.\d+)?)/;
    my ($leading_number_b) = $bStr =~ m/^(\d+(\.\d+)?)/;

    if (defined($leading_number_a) && defined($leading_number_b)
        && $leading_number_a != $leading_number_b)
    {
        return ($leading_number_a <=> $leading_number_b);
    }

    return ($aStr cmp $bStr);
}
1645
1646
# Choose the text to show as a bookshelf title for a metadata value.
#
# Arguments (after $self):
#   $metadata_group - group whose "<group>.actualvalues" table is consulted
#   $metadata_value - the (formatted) metadata value of the bookshelf
#
# If 'use_formatted_metadata_for_bookshelf_display' is set, the value is
# returned unchanged.  Otherwise the entry of "<group>.actualvalues" for this
# value is treated as a hash of candidate display strings to numbers
# (presumably occurrence counts - confirm where the table is built), and the
# candidate with the highest number is returned.  Ties go to the candidate
# that sorts first, so the result does not depend on hash ordering and stays
# the same from one build to the next.
#
# Returns "" if there are no candidates with a number greater than zero.
sub get_metadata_value_display {
    my $self = shift(@_);
    my ($metadata_group, $metadata_value) = @_;
    return $metadata_value if $self->{'use_formatted_metadata_for_bookshelf_display'};

    # A value with no recorded actual forms simply has no candidates
    my $actual_values_hash = $self->{$metadata_group . ".actualvalues"}->{$metadata_value} || {};

    my $display_value = "";
    my $max_count = 0;
    # Sorted iteration + strict '>' means the first of any tied candidates wins
    foreach my $v (sort keys %$actual_values_hash) {
        if ($actual_values_hash->{$v} > $max_count) {
            $display_value = $v;
            $max_count = $actual_values_hash->{$v};
        }
    }
    return $display_value;
}
16621;
Note: See TracBrowser for help on using the repository browser.