source: main/trunk/greenstone2/perllib/classify/List.pm@ 33460

Last change on this file since 33460 was 33460, checked in by kjdon, 5 years ago

fixed up some typos. removed use_hlist_for option. This is very hard to understand and actually only works if partition_type is none, and bookshelf_type is always - puts the bookshelves into hlist instead of vlist. instead, I have added all_values partition type. This just makes each value become a partition. Might be useful for Dates, or other short classification id.

  • Property svn:keywords set to Author Date Id Revision
File size: 52.9 KB
Line 
1###########################################################################
2#
3# List.pm -- A general and flexible list classifier with most of
4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11#
12# Copyright (C) 2005 New Zealand Digital Library Project
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28# TO DO: - Remove punctuation from metadata values before sorting.
29# - Add an AZCompactList-style hlist option?
30#
31###########################################################################
32
33
34package List;
35
36
37use BaseClassifier;
38use Sort::Naturally;
39use strict;
40
41
42sub BEGIN {
43 @List::ISA = ('BaseClassifier');
44}
45
46
47my $metadata_selection_mode_list =
48 [
49 { 'name' => "firstvalue",
50 'desc' => "{List.metadata_selection.firstvalue}"},
51 { 'name' => "firstvalidmetadata",
52 'desc' => "{List.metadata_selection.firstvalidmetadata}"},
53 { 'name' => "allvalues",
54 'desc' => "{List.metadata_selection.allvalues}"} ];
55
56my $metadata_selection_mode_default = "firstvalidmetadata";
57
58my $valid_metadata_selection_modes = { 'firstvalue' => 1,
59 'firstvalidmetadata' => 1,
60 'allvalues' => 1 };
61
62my $metadata_sort_mode_list =
63 [
64 { 'name' => "unicode",
65 'desc' => "{List.metadata_sort.unicode}"},
66 { 'name' => "alphabetic",
67 'desc' => "{List.metadata_sort.alphabetic}"},
68 { 'name' => "alphanumeric",
69 'desc' => "{List.metadata_sort.alphanumeric}"} ];
70
71my $metadata_sort_mode_default = "alphanumeric";
72
73my $valid_metadata_sort_modes = { 'unicode' => 1,
74 'alphabetic' => 1,
75 'alphanumeric' => 1};
76my $partition_type_list =
77 [ { 'name' => "per_letter",
78 'desc' => "{List.level_partition.per_letter}" },
79 { 'name' => "approximate_size",
80 'desc' => "{List.level_partition.approximate_size}"},
81 { 'name' => "constant_size",
82 'desc' => "{List.level_partition.constant_size}" },
83 { 'name' => "all_values",
84 'desc' => "{List.level_partition.all_values}" },
85 { 'name' => "none",
86 'desc' => "{List.level_partition.none}" } ];
87
88my $partition_type_default = "per_letter";
89
90my $valid_partition_types = { 'per_letter' => 1,
91 'constant_size' => 1,
92 'approximate_size' => 1,
93 'all_values' => 1,
94 'none' => 1};
95
96my $partition_size_default = 30;
97
98my $numeric_partition_type_list =
99 [ { 'name' => "per_digit",
100 'desc' => "{List.level_partition.per_digit}" },
101 { 'name' => "per_number",
102 'desc' => "{List.level_partition.per_number}" },
103 { 'name' => "single_partition",
104 'desc' => "{List.level_partition.single}" },
105 { 'name' => "approximate_size",
106 'desc' => "{List.level_partition.approximate_size_numeric}"},
107 { 'name' => "constant_size",
108 'desc' => "{List.level_partition.constant_size}" },
109 { 'name' => "all_values",
110 'desc' => "{List.level_partition.all_values}" },
111 { 'name' => "none",
112 'desc' => "{List.level_partition.none}" } ];
113
114my $numeric_partition_type_default = "single_partition";
115
116my $valid_numeric_partition_types = { 'per_digit' => 1,
117 'per_number' => 1,
118 'constant_size' => 1,
119 'single_partition' => 1,
120 'approximate_size' => 1,
121 'all_values' => 1,
122 'none' =>1 };
123
124my $numeric_partition_size_default = 30;
125
126my $numeric_partition_name_length_default = "-1"; # use the full number
127
128my $bookshelf_type_list =
129 [ { 'name' => "always",
130 'desc' => "{List.bookshelf_type.always}" },
131 { 'name' => "duplicate_only",
132 'desc' => "{List.bookshelf_type.duplicate_only}" },
133 { 'name' => "never",
134 'desc' => "{List.bookshelf_type.never}" } ];
135
136my $bookshelf_type_default = "never";
137my $sort_leaf_nodes_using_default = "Title";
138my $arguments =
139 [ { 'name' => "metadata",
140 'desc' => "{List.metadata}",
141 'type' => "metadata",
142 'reqd' => "yes" },
143
144 { 'name' => "metadata_selection_mode_within_level",
145 'desc' => "{List.metadata_selection_mode_within_level}",
146 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
147 'list' => $metadata_selection_mode_list,
148 'deft' => $metadata_selection_mode_default },
149
150 { 'name' => "metadata_sort_mode_within_level",
151 'desc' => "{List.metadata_sort_mode_within_level}",
152 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
153 'list' => $metadata_sort_mode_list,
154 'deft' => $metadata_sort_mode_default },
155
156 { 'name' => "bookshelf_type",
157 'desc' => "{List.bookshelf_type}",
158 'type' => "enum",
159 'list' => $bookshelf_type_list,
160 'deft' => $bookshelf_type_default },
161
162 { 'name' => "classify_sections",
163 'desc' => "{List.classify_sections}",
164 'type' => "flag" },
165
166 { 'name' => "partition_type_within_level",
167 'desc' => "{List.partition_type_within_level}",
168 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
169 'list' => $partition_type_list,
170 'deft' => $partition_type_default },
171
172 { 'name' => "partition_size_within_level",
173 'desc' => "{List.partition_size_within_level}",
174 'type' => "string", # Must be string because multiple values can be specified (separated by '/')
175 'deft' => $partition_size_default},
176
177 { 'name' => "partition_name_length",
178 'desc' => "{List.partition_name_length}",
179 'type' => "string" },
180
181 {'name' => "partition_sort_mode_within_level",
182 'desc' => "{List.partition_sort_mode_within_level}",
183 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
184 'list' => $metadata_sort_mode_list,
185 'deft' => $metadata_sort_mode_default },
186
187 { 'name' => "numeric_partition_type_within_level",
188 'desc' => "{List.numeric_partition_type_within_level}",
189 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
190 'list' => $numeric_partition_type_list,
191 'deft' => $numeric_partition_type_default },
192
193 {'name' => "numeric_partition_sort_mode_within_level",
194 'desc' => "{List.numeric_partition_sort_mode_within_level}",
195 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
196 'list' => $metadata_sort_mode_list,
197 'deft' => $metadata_sort_mode_default },
198
199 { 'name' => "numeric_partition_size_within_level",
200 'desc' => "{List.numeric_partition_size_within_level}",
201 'type' => "string", # Must be string because multiple values can be specified (separated by '/')
202 'deft' => $numeric_partition_size_default},
203
204 { 'name' => "numeric_partition_name_length_within_level",
205 'desc' => "{List.numeric_partition_name_length_within_level}",
206 'type' => "string",
207 'deft' => $numeric_partition_name_length_default },
208
209 { 'name' => "numbers_first",
210 'desc' => "{List.numbers_first}",
211 'type' => 'flag'},
212
213 { 'name' => "sort_leaf_nodes_using",
214 'desc' => "{List.sort_leaf_nodes_using}",
215 'type' => "metadata",
216 'deft' => $sort_leaf_nodes_using_default },
217
218 { 'name' => "sort_leaf_nodes_sort_mode",
219 'desc' => "{List.sort_leaf_nodes_sort_mode}",
220 'type' => "enum",
221 'list' => $metadata_sort_mode_list,
222 'deft' => $metadata_sort_mode_default },
223
224 { 'name' => "reverse_sort_leaf_nodes",
225 'desc' => "{List.reverse_sort_leaf_nodes}",
226 'type' => "flag"},
227
228 { 'name' => "sort_using_unicode_collation",
229 'desc' => "{List.metadata_sort.unicode} {List.sort_using_unicode_collation}",
230 'type' => "flag" },
231
232 {'name' => "filter_metadata",
233 'desc' => "{List.filter_metadata}",
234 'type' => "metadata"},
235
236 {'name' => "filter_regex",
237 'desc' => "{List.filter_regex}",
238 'type' => "regexp"},
239
240 { 'name' => "standardize_capitalization",
241 'desc' => "{List.standardize_capitalization}",
242 'type' => "flag"},
243
244 { 'name' => "removeprefix",
245 'desc' => "{BasClas.removeprefix}",
246 'type' => "regexp" },
247
248 { 'name' => "removesuffix",
249 'desc' => "{BasClas.removesuffix}",
250 'type' => "regexp" } ];
251
252my $options = { 'name' => "List",
253 'desc' => "{List.desc}",
254 'abstract' => "no",
255 'inherits' => "yes",
256 'args' => $arguments };
257
258
259sub new
260{
261 my ($class) = shift(@_);
262 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
263 push(@$classifierslist, $class);
264
265 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
266 push(@{$hashArgOptLists->{"OptList"}}, $options);
267
268 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
269
270 bless $self, $class;
271
272 if ($self->{'info_only'}) {
273 # don't worry about any options etc
274 return $self;
275 }
276
277 # The metadata elements to use (required)
278 if (!$self->{'metadata'}) {
279 die "Error: No metadata fields specified for List.\n";
280 }
281
282 my @metadata_groups = split(/[\/]/, $self->{'metadata'});
283 $self->{'metadata_groups'} = \@metadata_groups;
284
285 # The classifier button name (default: the first metadata element specified)
286 if (!$self->{'buttonname'}) {
287 my $first_metadata_group = $metadata_groups[0];
288 my $first_metadata_element = (split(/[\;|,\/]/, $first_metadata_group))[0];
289 $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
290 }
291
292 # meta selection mode for each level
293 $self->set_metadata_groups_info_per_level("metadata_selection_mode_within_level", $metadata_selection_mode_default, $valid_metadata_selection_modes);
294
295 # meta sort mode for each level
296 if ($self->{'sort_using_unicode_collation'}) {
297 print STDERR "WARNING: sort_using_unicode_collation is set, setting metadata_sort_mode_within_level to unicode for all levels, regardless of current setting\n";
298 $self->{'metadata_sort_mode_within_level'} = "unicode";
299 $metadata_sort_mode_default = "unicode";
300 } else {
301 if ($self->{'metadata_sort_mode_within_level'} =~ /unicode/) {
302 $self->{'sort_using_unicode_collation'} = 1;
303 }
304 }
305
306
307 $self->set_metadata_groups_info_per_level('metadata_sort_mode_within_level', $metadata_sort_mode_default, $valid_metadata_sort_modes);
308
309 # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
310 foreach my $metadata_group (@metadata_groups) {
311 $self->{$metadata_group . ".bookshelf_type"} = "always";
312 }
313 my $last_metadata_group = $metadata_groups[$#metadata_groups];
314 # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
315 $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
316
317 # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
318 foreach my $metadata_group (@metadata_groups) {
319 $self->{$metadata_group . ".list_type"} = "VList";
320 }
321 foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
322 $self->{$metadata_group . ".list_type"} = "HList";
323 }
324
325 # How the items are grouped into partitions (default: by letter)
326 # for each level (metadata group), separated by '/'
327
328 $self->set_metadata_groups_info_per_level("partition_type_within_level", $partition_type_default, $valid_partition_types);
329
330 $self->set_metadata_groups_info_per_level("numeric_partition_type_within_level", $numeric_partition_type_default, $valid_numeric_partition_types);
331
332 # now check whether a level was none - need to set the equivalent level in the other half also to none
333 foreach my $metadata_group (@metadata_groups) {
334 if ($self->{$metadata_group . ".partition_type_within_level"} eq "none" || $self->{$metadata_group . ".numeric_partition_type_within_level"} eq "none") {
335
336 print STDERR "WARNING: one of -partition_type_within_level or -numeric_partition_type_within_level was set to 'none' for level $metadata_group, overriding current value of both these options to 'none'\n";
337
338 $self->{$metadata_group . ".partition_type_within_level"} = "none";
339 $self->{$metadata_group . ".numeric_partition_type_within_level"} = "none";
340 }
341 }
342
343 $self->set_metadata_groups_info_per_level("partition_size_within_level", $partition_size_default);
344 $self->set_metadata_groups_info_per_level("numeric_partition_size_within_level", $numeric_partition_size_default);
345
346 $self->set_metadata_groups_info_per_level('partition_sort_mode_within_level', $metadata_sort_mode_default, $valid_metadata_sort_modes);
347
348 $self->set_metadata_groups_info_per_level('numeric_partition_sort_mode_within_level', $metadata_sort_mode_default, $valid_metadata_sort_modes);
349
350 $self->set_metadata_groups_info_per_level("numeric_partition_name_length_within_level", $numeric_partition_name_length_default);
351
352 # The removeprefix and removesuffix expressions
353 if ($self->{'removeprefix'}) {
354 # If there are more than one expressions, use '' to quote each experession and '/' to separate
355 my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
356
357 foreach my $metadata_group (@metadata_groups) {
358 my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
359 if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
360 # Remove the other ' at the beginning and the end if there is any
361 $removeprefix_expr_within_levelelem =~ s/^'//;
362 $removeprefix_expr_within_levelelem =~ s/'$//;
363 # Remove the extra ^ at the beginning
364 $removeprefix_expr_within_levelelem =~ s/^\^//;
365 $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
366 } else {
367 $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
368 }
369 }
370 }
371 if ($self->{'removesuffix'}) {
372 my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
373
374 foreach my $metadata_group (@metadata_groups) {
375 my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
376 if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
377 $removesuffix_expr_within_levelelem =~ s/^'//;
378 $removesuffix_expr_within_levelelem =~ s/'$//;
379 # Remove the extra $ at the end
380 $removesuffix_expr_within_levelelem =~ s/\$$//;
381 $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
382 } else {
383 $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
384 }
385 }
386 }
387
388 # The metadata elements to use to sort the leaf nodes (default: Title)
389 my @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
390 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
391
392 foreach my $sort_group (@sort_leaf_nodes_using_metadata_groups) {
393 # set metadata_select_type, if not already set - might be already set if the same group was used in -metadata
394 if (!defined $self->{$sort_group . ".metadata_selection_mode_within_level"}) {
395 $self->{$sort_group . ".metadata_selection_mode_within_level"} = $metadata_selection_mode_default;
396 }
397
398 }
399
400 my @leaf_nodes_sort_modes = split (/\|/, $self->{'sort_leaf_nodes_sort_mode'});
401 foreach my $sort_group (@sort_leaf_nodes_using_metadata_groups) {
402 my $leaf_sort_mode = shift(@leaf_nodes_sort_modes);
403 if (!defined $self->{$sort_group . ".metadata_sort_mode_within_level"}) {
404 if (defined $leaf_sort_mode && defined $valid_metadata_sort_modes->{$leaf_sort_mode}) {
405 $self->{$sort_group . ".metadata_sort_mode_within_level"} = $leaf_sort_mode;
406 }
407 else {
408 $self->{$sort_group . ".metadata_sort_mode_within_level"} = $metadata_sort_mode_default;
409 }
410 }
411
412 }
413
414 # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
415 if ($self->{'sort_using_unicode_collation'}) {
416 # To use this you first need to download the allkeys.txt file from
417 # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
418 # Unicode/Collate directory.
419 require Unicode::Collate;
420 $self->{'unicode_collator'} = Unicode::Collate->new();
421 }
422
423 # An empty array for the document/section OIDs that we are classifying
424 $self->{'OIDs'} = [];
425 # A hash for all the doc ids that we have seen, so we don't classify something twice
426 $self->{'all_doc_OIDs'} = {};
427 #return bless $self, $class;
428 return $self;
429}
430
431
432sub init
433{
434 # Nothing to do...
435}
436
437sub set_metadata_groups_info_per_level
438{
439 my $self = shift(@_);
440 my $info_name = shift(@_);
441 my $info_default = shift(@_);
442 my $info_valid_types_hash_ref = shift(@_);
443
444 if (!defined $self->{$info_name}) {
445 print STDERR "**** undefined $info_name\n";
446 }
447 my @info_list = split(/\//, $self->{$info_name});
448
449 my $first = 1;
450 foreach my $metadata_group (@{$self->{'metadata_groups'}}) {
451 my $info_elem = shift(@info_list);
452 if (defined ($info_elem) && (!defined $info_valid_types_hash_ref || defined $info_valid_types_hash_ref->{$info_elem})) {
453 $self->{$metadata_group .".$info_name"} = $info_elem;
454 } else {
455 # its empty or an invalid entry
456 my $new_info_elem;
457 if ($first) {
458 $new_info_elem = $info_default;
459 } else {
460 # get the value we had at first
461 $new_info_elem = $self->{@{$self->{'metadata_groups'}}[0] . ".$info_name"};
462 }
463 $self->{$metadata_group .".$info_name"} = $new_info_elem;
464 if (defined $info_elem) {
465 print STDERR "List Error: $info_elem is not a valid value for $info_name, changing it to $new_info_elem\n";
466
467 }
468 }
469 $first = 0;
470 }
471
472}
473
474# Called for each document in the collection
475sub classify
476{
477 my $self = shift(@_);
478 my ($doc_obj) = @_;
479
480 if (defined $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()}) {
481 print STDERR "Warning, List classifier has already seen document ".$doc_obj->get_OID().", not classifying again\n";
482 return;
483 }
484
485 $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()} = 1;
486
487 # check against filter here
488 if ($self->{'filter_metadata'}) {
489 my $meta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'filter_metadata'});
490 return unless defined $meta;
491 if ($self->{'filter_regex'} ne "" && $meta !~ /$self->{'filter_regex'}/) {
492 print STDERR "doc $doc_obj doesn't pass filtering requirement\n" if ($self->{'verbosity'} > 3);
493 return;
494 }
495 }
496 # if we get here, we have passed the test for filtering
497 # If "-classify_sections" is set, classify every section of the document
498 if ($self->{'classify_sections'}) {
499 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
500 while (defined $section) {
501 $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section);
502 $section = $doc_obj->get_next_section($section);
503 }
504 }
505 # Otherwise just classify the top document section
506 else {
507 $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section());
508 }
509
510}
511
512sub classify_section
513{
514 my $self = shift(@_);
515 my ($doc_obj,$section_OID,$section) = @_;
516
517 my @metadata_groups = @{$self->{'metadata_groups'}};
518
519 # Only classify the section if it has a value for one of the metadata elements in the first group
520 my $classify_section = 0;
521 my $first_metadata_group = $metadata_groups[0];
522 my $remove_prefix_expr = $self->{$first_metadata_group . ".remove_prefix_expr"};
523 my $remove_suffix_expr = $self->{$first_metadata_group . ".remove_suffix_expr"};
524 foreach my $first_metadata_group_element (split(/\;|,/, $first_metadata_group)) {
525 my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
526 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element);
527 # Remove prefix/suffix if requested
528 if (defined ($first_metadata_group_element_value)) {
529 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
530 $first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
531 }
532
533 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
534 $first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
535 }
536 }
537
538 $first_metadata_group_element_value = &sorttools::format_metadata_for_sorting($first_metadata_group, $first_metadata_group_element_value, $doc_obj) unless $self->{'no_metadata_formatting'};
539 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
540 # This section must be included in the classifier as we have found a value
541 $classify_section = 1;
542 last;
543 }
544 }
545
546 # We're not classifying this section because it doesn't have the required metadata
547 return if (!$classify_section);
548
549 # Otherwise, include this section in the classifier
550
551 push(@{$self->{'OIDs'}}, $section_OID);
552
553 # Create a hash for the metadata values of each metadata element we're interested in
554 my %metadata_groups_done = ();
555 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
556 # Take care not to do a metadata group more than once
557 unless ($metadata_groups_done{$metadata_group}) {
558 my $remove_prefix_expr = $self->{$metadata_group . ".remove_prefix_expr"};
559 my $remove_suffix_expr = $self->{$metadata_group . ".remove_suffix_expr"};
560 foreach my $metadata_element (split(/\;|,/, $metadata_group)) {
561 my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);
562
563 my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
564 foreach my $metadata_value (@metadata_values) {
565 #print STDERR "working with value $metadata_value\n";
566 # Strip leading and trailing whitespace
567 $metadata_value =~ s/^\s*//;
568 $metadata_value =~ s/\s*$//;
569
570 # Remove prefix/suffix if requested
571 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
572 $metadata_value =~ s/^$remove_prefix_expr//;
573 }
574 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
575 $metadata_value =~ s/$remove_suffix_expr$//;
576 }
577
578 # lowercase metadata both for sorting meta (d/D under D), and to allow CSS to be able to
579 # text-transform the stored lowercase values as capitalize or uppercase (can't CSS
580 # text-transform if stored uppercase). 2 CSS text-transforms have been added to core.css
581 my $lc_metadata_value = lc($metadata_value);
582 $lc_metadata_value = &sorttools::format_metadata_for_sorting($real_metadata_element, $lc_metadata_value, $doc_obj) unless $self->{'no_metadata_formatting'};
583 #print STDERR "formatted value = $lc_metadata_value\n";
584
585 # Add the metadata value into the list for this combination of metadata group
586 # and section - if we have some non-whitespace chars
587 # text that we have some non-whitespace chars
588 if ($lc_metadata_value =~ /\S/) {
589
590 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $lc_metadata_value);
591
592 # add the actual value into the stored values so we can remember the case
593 if (!$self->{'standardize_capitalization'}) {
594 if (defined $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}) {
595 $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}++;
596 } else {
597 $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value} = 1;
598 }
599 }
600 last if ($self->{$metadata_group . ".metadata_selection_mode_within_level"} eq "firstvalue");
601 }
602 } # foreach metadatavalue
603 last if ((@metadata_values > 0) && $self->{$metadata_group . ".metadata_selection_mode_within_level"} =~ /^(firstvalue|firstvalidmetadata)$/ );
604 } # foreach metadata element
605
606 $metadata_groups_done{$metadata_group} = 1;
607 }
608 }
609}
610
611
612sub get_classify_info
613{
614 my $self = shift(@_);
615
616 # The metadata groups to classify by
617 my @metadata_groups = @{$self->{'metadata_groups'}};
618 my $first_metadata_group = $metadata_groups[0];
619
620 # The OID values of the documents to include in the classifier
621 my @OIDs = @{$self->{'OIDs'}};
622
623 # Create the root node of the classification hierarchy
624 my %classifier_node = ( 'thistype' => "Invisible",
625 'childtype' => $self->{$first_metadata_group . ".list_type"},
626 'Title' => $self->{'buttonname'},
627 'contains' => [],
628 'mdtype' => $first_metadata_group );
629
630 # Recursively create the classification hierarchy, one level for each metadata group
631 $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
632 return \%classifier_node;
633}
634
635
636sub add_level
637{
638 my $self = shift(@_);
639 my @metadata_groups = @{shift(@_)};
640 my @OIDs = @{shift(@_)};
641 my $classifier_node = shift(@_);
642
643 my $metadata_group = $metadata_groups[0];
644 if (!defined($self->{$metadata_group . ".list"})) {
645 print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
646 return;
647 }
648
649 # Create a mapping from metadata value to OID
650 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
651 my %metadata_value_to_OIDs_hash = ();
652 my %numeric_metadata_value_to_OIDs_hash = ();
653 foreach my $OID (@OIDs)
654 {
655 if ($OID_to_metadata_values_hash_ref->{$OID})
656 {
657 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
658 foreach my $metadata_value (@metadata_values)
659 {
660 if ($metadata_value =~ /^[0-9]/) {
661 push(@{$numeric_metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
662 } else {
663 push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
664 }
665 }
666 }
667 }
668 #print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
669 #print STDERR "Number of distinct numeric values: " . scalar(keys %numeric_metadata_value_to_OIDs_hash) . "\n";
670
671 # Partition the values (if necessary)
672 my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
673 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
674 my $partition_sort_mode_within_level = $self->{$metadata_group . ".partition_sort_mode_within_level"};
675 my $bookshelf_type_within_level = $self->{$metadata_group. ".bookshelf_type"};
676
677
678 #############################################
679 ### DO THE NUMBERS IF THEY ARE TO COME FIRST
680 #############################################
681 if ($self->{'numbers_first'} && keys(%numeric_metadata_value_to_OIDs_hash)) {
682 $self->partition_numeric_values(\@metadata_groups, $classifier_node, \%numeric_metadata_value_to_OIDs_hash);
683 }
684
685 ############################################
686 # DO THE LETTERS
687 ############################################
688 if (keys(%metadata_value_to_OIDs_hash)){ # make sure we have some values
689 if ($partition_type_within_level =~ /^per_letter$/i) {
690 $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $partition_sort_mode_within_level, \%metadata_value_to_OIDs_hash);
691 }
692 elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
693 $self->split_approximate_size(\@metadata_groups, $classifier_node, $partition_size_within_level, $partition_sort_mode_within_level, $bookshelf_type_within_level, \%metadata_value_to_OIDs_hash, $self->{'partition_name_length'});
694
695 }
696 elsif ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
697 $self->split_constant_size(\@metadata_groups, $classifier_node, $partition_size_within_level, $partition_sort_mode_within_level, \%metadata_value_to_OIDs_hash, $self->{'partition_name_length'});
698 }
699
700 # Otherwise just add all the values to a VList
701 else {
702 if ($partition_type_within_level =~ /^all_values$/i) {
703 $classifier_node->{'childtype'} = "HList";
704 $self->{$metadata_group. ".bookshelf_type"} = "always";
705 }
706 $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
707 }
708 }
709
710 ###########################################
711 ### DO THE NUMBERS IF THEY ARE TO COME LAST
712 ###########################################
713 if (!$self->{'numbers_first'} && keys(%numeric_metadata_value_to_OIDs_hash)) {
714 $self->partition_numeric_values(\@metadata_groups, $classifier_node, \%numeric_metadata_value_to_OIDs_hash);
715 }
716}
717
718sub partition_numeric_values
719{
720 my $self = shift(@_);
721 my @metadata_groups = @{shift(@_)};
722 my $classifier_node = shift(@_);
723 my $numeric_metadata_value_to_OIDs_hash_ref = shift(@_);
724
725 my $metadata_group = $metadata_groups[0];
726 my $numeric_partition_type_within_level = $self->{$metadata_group . ".numeric_partition_type_within_level"};
727 my $numeric_partition_size_within_level = $self->{$metadata_group . ".numeric_partition_size_within_level"};
728 my $numeric_partition_sort_mode_within_level = $self->{$metadata_group . ".numeric_partition_sort_mode_within_level"};
729 my $numeric_partition_name_length_within_level = $self->{$metadata_group . ".numeric_partition_name_length_within_level"};
730 my $bookshelf_type_within_level = $self->{$metadata_group. ".bookshelf_type"};
731
732 if ($numeric_partition_type_within_level eq "single_partition") {
733 $self->add_hlist_partition(\@metadata_groups, $classifier_node, "0-9", $numeric_metadata_value_to_OIDs_hash_ref);
734 }
735 elsif ($numeric_partition_type_within_level eq "per_digit") {
736 $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $numeric_partition_sort_mode_within_level, $numeric_metadata_value_to_OIDs_hash_ref, 1, 1);
737 }
738 elsif ($numeric_partition_type_within_level eq "per_number") {
739 # each different number is a bucket
740 $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $numeric_partition_sort_mode_within_level, $numeric_metadata_value_to_OIDs_hash_ref, 1, $numeric_partition_name_length_within_level);
741 }
742 elsif ($numeric_partition_type_within_level eq "constant_size" && scalar(keys %$numeric_metadata_value_to_OIDs_hash_ref) > $numeric_partition_size_within_level) {
743 # Generate hlists of a certain size
744
745 $self->split_constant_size(\@metadata_groups, $classifier_node, $numeric_partition_size_within_level, $numeric_partition_sort_mode_within_level, $numeric_metadata_value_to_OIDs_hash_ref, $numeric_partition_name_length_within_level, 1);
746 } elsif ($numeric_partition_type_within_level eq "approximate_size" && scalar(keys %$numeric_metadata_value_to_OIDs_hash_ref) > $numeric_partition_size_within_level) {
747 $self->split_approximate_size(\@metadata_groups, $classifier_node, $numeric_partition_size_within_level, $numeric_partition_sort_mode_within_level, $bookshelf_type_within_level, $numeric_metadata_value_to_OIDs_hash_ref, $numeric_partition_name_length_within_level, 1);
748 }
749 # Otherwise just add all the values to a VList
750 else {
751 if ($numeric_partition_type_within_level =~ /^all_values$/i) {
752 $classifier_node->{'childtype'} = "HList";
753 $self->{$metadata_group. ".bookshelf_type"} = "always";
754 }
755
756 $self->add_vlist(\@metadata_groups, $classifier_node, $numeric_metadata_value_to_OIDs_hash_ref);
757 }
758
759}
760
761sub split_approximate_size
762{
763
764 my $self = shift(@_);
765 my @metadata_groups = @{shift(@_)};
766 my $classifier_node = shift(@_);
767 my $partition_size = shift(@_);
768 my $sort_mode = shift(@_);
769 my $bookshelf_type = shift(@_);
770 my $metadata_value_to_OIDs_hash_ref = shift(@_);
771 my $partition_name_length = shift(@_);
772 my $is_numeric = shift(@_);
773
774 # Generate hlist based on the first letter of the metadata value (like per_letter), or based on
775 # numbers, but also with restriction on the partition size
776 # If a partition has fewer items than specified by the "partition_size_within_level", then group them together if possible
777 # If a partition has more items than specified, split into several hlists.
778 # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
779
780 my @sortedmetadata_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref));
781
782 # Separate values by their first letter, each form a bucket, like the per_letter partition type
783 my $last_partition = $self->generate_partition_name($sortedmetadata_values[0], $partition_name_length, $is_numeric);
784
785 my @partition_buckets = ();
786 my @metadata_values_in_bucket = ();
787 my $num_items_in_bucket = 0;
788
789 foreach my $metadata_value (@sortedmetadata_values) {
790 my $metadata_valuepartition = $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric);
791 if ($metadata_valuepartition ne $last_partition) {
792 my @temp_array = @metadata_values_in_bucket;
793 # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
794 my %partition_info = ();
795 $partition_info{'metadata_values'} = \@temp_array;
796 $partition_info{'size'} = $num_items_in_bucket;
797 $partition_info{'name'} = $last_partition;
798 push (@partition_buckets, \%partition_info);
799
800 @metadata_values_in_bucket = ($metadata_value);
801 $num_items_in_bucket = ($bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : scalar(@metadata_values_in_bucket));
802 $last_partition = $metadata_valuepartition;
803 } else {
804 $num_items_in_bucket += ($bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1); #scalar(@metadata_values_in_bucket);
805 push (@metadata_values_in_bucket, $metadata_value);
806 }
807 }
808 # Last one
809 my %partition_info = ();
810 $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
811 $partition_info{'size'} = $num_items_in_bucket;
812 $partition_info{'name'} = $last_partition;
813
814 push (@partition_buckets, \%partition_info);
815
816 # now go through the array of buckets, and merge small buckets
817 my @new_partition_buckets = ();
818 for (my $i = 0; $i < scalar(@partition_buckets) - 1; $i++) {
819
820 my $this_bucket = $partition_buckets[$i]; #->{'name'};
821 my $next_bucket = $partition_buckets[$i+1]; #->{'name'};
822
823 my $items_in_partition = $this_bucket->{'size'};
824
825 if ($items_in_partition < $partition_size ) {
826 my $items_in_next_partition = $next_bucket->{'size'};
827 if ($items_in_partition + $items_in_next_partition <= $partition_size ) {
828 # merge this bucket into the next bucket
829 foreach my $metadata_value_to_merge (@{$this_bucket->{'metadata_values'}}) {
830 push(@{$next_bucket->{'metadata_values'}}, $metadata_value_to_merge);
831 }
832 $next_bucket->{'size'} += $items_in_partition;
833
834 } else {
835 # remember this bucket
836 push (@new_partition_buckets, $this_bucket);
837 }
838 } else {
839 # remember this bucket
840 push (@new_partition_buckets, $this_bucket);
841 }
842 }
843 # add in the last bucket
844 my $last_bucket = $partition_buckets[scalar(@partition_buckets) - 1];
845 push (@new_partition_buckets, $last_bucket);
846
847 # Add partitions to the main list, but divide big bucket into several
848 my $last_partition_end = "";
849 my $partition_start = "";
850 foreach my $partition (@new_partition_buckets) {
851 my @metadata_values = $self->sort_metadata_values_array($sort_mode, @{$partition->{'metadata_values'}});
852 my $items_in_partition = $partition->{'size'};
853 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $partition_name_length, $is_numeric);
854
855 if ($items_in_partition > $partition_size) {
856 my $items_done = 0;
857 my %metadata_values_to_OIDs_subhashes = ();
858 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
859 my $metadata_value = $metadata_values[$i];
860 # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
861 my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1;
862
863 my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $partition_name_length, $is_numeric);
864 my $partitionname = $partition_start;
865 if ($partitionend ne $partition_start) {
866 $partitionname = $partitionname . "-" . $partitionend;
867 }
868
869 if ($items_done + $items_for_this_md_value > $partition_size && $items_done != 0) {
870 # Start a new partition
871 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
872 $last_partition_end = $partitionend;
873 $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $is_numeric);
874 $items_done = 0;
875 %metadata_values_to_OIDs_subhashes = ();
876 }
877
878 # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
879 if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size) {
880
881 my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $is_numeric);
882 # Get the number of partitions needed for this value
883 my $num_splits = int($items_for_this_md_value / $partition_size);
884 $num_splits++ if ($items_for_this_md_value / $partition_size > $num_splits);
885
886 my @OIDs_for_this_value = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
887 for (my $i = 0; $i < $num_splits; $i++) {
888 my %OIDs_subhashes_for_this_value = ();
889 my @OIDs_for_this_partition = ();
890 for (my $d = $i * $partition_size; $d < (($i+1) * $partition_size > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size); $d++) {
891 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
892 }
893
894 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
895 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size) {
896 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
897 $items_done += scalar(@OIDs_for_this_partition);
898 next;
899 }
900
901 # Add an HList for this bucket
902 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
903 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
904 $last_partition_end = $partitionname_for_this_value;
905 }
906 next;
907 }
908
909 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
910 $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;
911
912 # The last partition
913 if($i == scalar(@metadata_values) - 1) {
914 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
915 }
916 }
917 } # end if items in partition > partition size
918 else {
919 # The easier case, just add a partition
920 my %metadata_values_to_OIDs_subhashes = ();
921 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
922 my $metadata_value = $metadata_values[$i];
923 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
924 }
925 my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
926 my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $partition_name_length, $is_numeric);
927 my $partitionname = $partition_start;
928 if ($partitionend ne $partition_start) {
929 $partitionname = $partitionname . "-" . $partitionend;
930 }
931
932 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
933 $last_partition_end = $partitionend;
934 }
935 }
936
937 # The partitions are stored in an HList
938 $classifier_node->{'childtype'} = "HList";
939
940}
941
942sub split_constant_size
943{
944 my $self = shift(@_);
945 my @metadata_groups = @{shift(@_)};
946 my $classifier_node = shift(@_);
947 my $partition_size = shift(@_);
948 my $sort_mode = shift(@_);
949 my $metadata_value_to_OIDs_hash_ref = shift(@_);
950 my $partition_name_length = shift(@_);
951 my $is_numeric = shift(@_);
952
953 my @sortedmetadata_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref));
954 my $itemsdone = 0;
955 my $numitems = scalar(@sortedmetadata_values);
956 my %metadata_value_to_OIDs_subhash = ();
957 my $lastpartitionend = "";
958 my $partitionstart;
959 foreach my $metadata_value (@sortedmetadata_values) {
960 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
961 $itemsdone++;
962 my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
963
964 # Is this the start of a new partition?
965 if ($itemsinpartition == 1) {
966 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $partition_name_length, $is_numeric);
967 }
968
969 # Is this the end of the partition?
970 if ($itemsinpartition == $partition_size || $itemsdone == $numitems) {
971 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $partition_name_length, $is_numeric);
972 my $partitionname = $partitionstart;
973 if ($partitionend ne $partitionstart) {
974 $partitionname = $partitionname . "-" . $partitionend;
975 }
976
977 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
978 %metadata_value_to_OIDs_subhash = ();
979 $lastpartitionend = $partitionend;
980 }
981 }
982
983 # The partitions are stored in an HList
984 $classifier_node->{'childtype'} = "HList";
985
986}
987
988sub split_per_letter_or_digit
989{
990 my $self = shift(@_);
991 my @metadata_groups = @{shift(@_)};
992 my $classifier_node = shift(@_);
993 my $sort_mode = shift(@_);
994 my $metadata_value_to_OIDs_hash_ref = shift(@_);
995 my $is_numeric = shift(@_);
996 my $numeric_partition_length = shift(@_);
997
998 if (not defined $is_numeric) {
999 $is_numeric = 0;
1000 }
1001 if ($is_numeric && not defined($numeric_partition_length)) {
1002 $numeric_partition_length = 1;
1003 }
1004 # Generate one hlist for each letter
1005 my @sortedmetadata_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref));
1006 my %metadata_value_to_OIDs_subhash = ();
1007
1008 my $lastpartition = $self->generate_partition_name($sortedmetadata_values[0], $numeric_partition_length, $is_numeric);
1009 foreach my $metadata_value (@sortedmetadata_values) {
1010
1011 my $metadata_valuepartition = $self->generate_partition_name($metadata_value, $numeric_partition_length, $is_numeric);
1012
1013 # Is this the start of a new partition?
1014 if ($metadata_valuepartition ne $lastpartition) {
1015 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
1016 %metadata_value_to_OIDs_subhash = ();
1017 $lastpartition = $metadata_valuepartition;
1018 }
1019
1020 $metadata_value_to_OIDs_subhash{$metadata_value} = %$metadata_value_to_OIDs_hash_ref{$metadata_value};
1021 }
1022
1023 # Don't forget to add the last partition
1024 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
1025
1026 # The partitions are stored in an HList
1027 $classifier_node->{'childtype'} = "HList";
1028
1029}
1030
1031sub generate_partition_name
1032{
1033 my $self = shift(@_);
1034 my $mvalue = shift(@_);
1035 my $numeric_partition_length = shift(@_);
1036 my $is_numeric = shift(@_);
1037
1038 if (!$is_numeric || $numeric_partition_length == 1 ) {
1039 return substr($mvalue, 0, 1);
1040 }
1041 if ($numeric_partition_length == -1) {
1042 my ($all_digits) = $mvalue =~ /^([0-9]+)/;
1043 return $all_digits;
1044 }
1045 my ($some_digits) = $mvalue =~ /^([0-9]{1,$numeric_partition_length})/;
1046 return $some_digits;
1047}
1048
1049sub generate_partition_start
1050{
1051 my $self = shift(@_);
1052 my $metadata_value = shift(@_);
1053 my $lastpartitionend = shift(@_);
1054 my $partition_name_length = shift(@_);
1055 my $is_numeric = shift(@_);
1056
1057 if ($is_numeric) {
1058 return $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric);
1059 }
1060
1061 if ($partition_name_length) {
1062 return substr($metadata_value, 0, $partition_name_length);
1063 }
1064
1065 my $partitionstart = substr($metadata_value, 0, 1);
1066 if ($partitionstart le $lastpartitionend) {
1067 $partitionstart = substr($metadata_value, 0, 2);
1068 # Give up after three characters
1069 if ($partitionstart le $lastpartitionend) {
1070 $partitionstart = substr($metadata_value, 0, 3);
1071 }
1072 }
1073
1074 return $partitionstart;
1075}
1076
1077
1078sub generate_partition_end
1079{
1080 my $self = shift(@_);
1081 my $metadata_value = shift(@_);
1082 my $partitionstart = shift(@_);
1083 my $partition_name_length = shift(@_);
1084 my $is_numeric = shift(@_);
1085
1086 if ($is_numeric) {
1087 return $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric);
1088 }
1089 if ($partition_name_length) {
1090 return substr($metadata_value, 0, $partition_name_length);
1091 }
1092
1093 my $partitionend = substr($metadata_value, 0, length($partitionstart));
1094 if ($partitionend gt $partitionstart) {
1095 $partitionend = substr($metadata_value, 0, 1);
1096 if ($partitionend le $partitionstart) {
1097 $partitionend = substr($metadata_value, 0, 2);
1098 # Give up after three characters
1099 if ($partitionend le $partitionstart) {
1100 $partitionend = substr($metadata_value, 0, 3);
1101 }
1102 }
1103 }
1104
1105 return $partitionend;
1106}
1107
1108
1109sub add_hlist_partition
1110{
1111 my $self = shift(@_);
1112 my @metadata_groups = @{shift(@_)};
1113 my $classifier_node = shift(@_);
1114 my $partitionname = shift(@_);
1115 my $metadata_value_to_OIDs_hash_ref = shift(@_);
1116
1117 # Create an hlist partition
1118 # Note that we don't need to convert from unicode-aware strings
1119 # to utf8 here, as that is handled elsewhere in the code
1120 my %child_classifier_node = ( 'Title' => $partitionname, #'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
1121 'childtype' => "VList",
1122 'contains' => [] );
1123
1124 # Add the children to the hlist partition
1125 $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
1126 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
1127}
1128
1129
1130sub add_vlist
1131{
1132 my $self = shift(@_);
1133 my @metadata_groups = @{shift(@_)};
1134 my $classifier_node = shift(@_);
1135 my $metadata_value_to_OIDs_hash_ref = shift(@_);
1136
1137 my $metadata_group = shift(@metadata_groups);
1138 $classifier_node->{'mdtype'} = $metadata_group;
1139
1140 my $sort_type = $self->{$metadata_group .".metadata_sort_mode_within_level"};
1141 # Create an entry in the vlist for each value
1142 foreach my $metadata_value ($self->sort_metadata_values_array($sort_type, keys(%{$metadata_value_to_OIDs_hash_ref})))
1143 {
1144 my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
1145 # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
1146 if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
1147 my $OID = $OIDs[0];
1148 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
1149 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
1150 }
1151 # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
1152 elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
1153 @OIDs = $self->sort_leaf_items(\@OIDs);
1154 foreach my $OID (@OIDs) {
1155 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
1156 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
1157 }
1158
1159 }
1160 # Otherwise create a sublist (bookshelf) for the metadata value
1161 else {
1162 my $metadata_value_display = $self->get_metadata_value_display($metadata_group, $metadata_value);
1163 my %child_classifier_node = ( 'Title' => $metadata_value_display,
1164 'childtype' => "VList",
1165 'mdtype' => $metadata_group,
1166 'contains' => [] );
1167
1168 # If there are metadata elements remaining, recursively apply the process
1169 if (@metadata_groups > 0) {
1170 my $next_metadata_group = $metadata_groups[0];
1171 $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
1172
1173 # separate metadata into those that below in the next/sub-metadata_group
1174 # and those that below at the current level's metadata_group
1175
1176 my $OID_to_metadata_values_hash_ref = $self->{$next_metadata_group . ".list"};
1177 my @current_level_OIDs = ();
1178 my @next_level_OIDs = ();
1179 foreach my $OID (@OIDs)
1180 {
1181 if ($OID_to_metadata_values_hash_ref->{$OID}) {
1182 push(@next_level_OIDs, $OID);
1183 } else {
1184 push(@current_level_OIDs, $OID);
1185 }
1186 }
1187 # recursively process those docs belonging to the sub-metadata_group
1188 $self->add_level(\@metadata_groups, \@next_level_OIDs, \%child_classifier_node);
1189
1190 # For those docs that don't belong in the sub/next_metadata_group, but which belong
1191 # at this level, just add the documents as children of this list at the current level
1192 @current_level_OIDs = $self->sort_leaf_items(\@current_level_OIDs);
1193 foreach my $current_level_OID (@current_level_OIDs) {
1194 my $offset = $self->metadata_offset($metadata_group, $current_level_OID, $metadata_value);
1195 push(@{$child_classifier_node{'contains'}}, { 'OID' => $current_level_OID , 'offset' => $offset });
1196 }
1197 }
1198 # Otherwise just add the documents as children of this list
1199 else {
1200 @OIDs = $self->sort_leaf_items(\@OIDs);
1201 foreach my $OID (@OIDs) {
1202 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
1203 push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
1204 }
1205
1206 }
1207
1208 # Add the sublist to the list
1209 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
1210 }
1211 }
1212}
1213
1214sub metadata_offset
1215{
1216 my $self = shift(@_);
1217 my $metadata_group = shift(@_);
1218 my $OID = shift(@_);
1219 my $metadata_value = shift(@_);
1220
1221 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
1222 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
1223 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
1224 if ($metadata_value eq $metadata_values[$i]) {
1225 return $i;
1226 }
1227 }
1228
1229 return 0;
1230}
1231
1232sub sort_leaf_items
1233{
1234 my $self = shift(@_);
1235 my @OIDs = @{shift(@_)};
1236
1237 # Sort leaf nodes and add to list
1238 my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
1239 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) {
1240 my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
1241 my $sort_type = $self->{$sort_leaf_nodes_usingmetaelem . ".metadata_sort_mode_within_level"};
1242 # Force a stable sort (Perl 5.6's sort isn't stable)
1243 # !! The [0] bits aren't ideal (multiple metadata values) !!
1244 @OIDs = @OIDs[ sort {
1245 if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]}) && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]}))
1246 {
1247 if ($sort_type eq "numeric") {
1248 $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] <=> $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
1249 } elsif ($sort_type eq "alphabetic") {
1250 $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
1251 } else {
1252 ncmp($OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0], $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0]);
1253 }
1254 }
1255 else
1256 {
1257 $a <=> $b;
1258 }
1259 } 0..$#OIDs ];
1260 }
1261 if ($self->{'reverse_sort_leaf_nodes'}) {
1262 return reverse @OIDs;
1263 }
1264 return @OIDs;
1265}
1266
1267
1268sub sort_metadata_values_array
1269{
1270 my $self = shift(@_);
1271 my ($sort_mode) = shift(@_);
1272 my @metadata_values = @_;
1273
1274 if ($sort_mode eq "unicode") {
1275 if ($self->{'unicode_collator'}) {
1276 return $self->{'unicode_collator'}->sort(@metadata_values);
1277 }
1278 # the collator wasn't loaded, fall back on default
1279 $sort_mode = "alphanumeric";
1280 }
1281 if ($sort_mode eq "numeric") {
1282 return sort {$a <=> $b} @metadata_values;
1283 }
1284 if ($sort_mode eq "alphabetic") {
1285 return sort {$a cmp $b} @metadata_values;
1286 }
1287 # natural sort
1288 return nsort(@metadata_values);
1289}
1290
1291# we are not using this any more. Using nsort instead
1292# $a and $b args automatically passed in and shouldn't be declared
1293sub alpha_numeric_cmp
1294{
1295 my $self = shift (@_);
1296 my ($aStr, $bStr) = @_;
1297 if ($aStr =~ m/^(\d+(\.\d+)?)/)
1298 {
1299 my $val_a = $1;
1300 if ($bStr =~ m/^(\d+(\.\d+)?)/)
1301 {
1302 my $val_b = $1;
1303 if ($val_a != $val_b)
1304 {
1305 return ($val_a <=> $val_b);
1306 }
1307 }
1308 }
1309
1310 return ($aStr cmp $bStr);
1311}
1312
1313
1314sub get_metadata_value_display {
1315 my $self = shift(@_);
1316 my ($metadata_group, $metadata_value) = @_;
1317 return $metadata_value if $self->{'standardize_capitalization'};
1318 my $actual_values_hash = $self->{$metadata_group . ".actualvalues"}->{$metadata_value};
1319 my $display_value ="";
1320 my $max_count=0;
1321 foreach my $v (keys %$actual_values_hash) {
1322 if ($actual_values_hash->{$v} > $max_count) {
1323 $display_value = $v;
1324 $max_count = $actual_values_hash->{$v};
1325 }
1326 }
1327 return $display_value;
1328}
13291;
Note: See TracBrowser for help on using the repository browser.