source: main/trunk/greenstone2/perllib/classify/List.pm@ 33452

Last change on this file since 33452 was 33452, checked in by kjdon, 5 years ago

revamp of list classifier. More precise handling of numeric metadata values. Can now specify what happens to numeric values separately to what happens with word values. eg can have classifier A,B,C,D etc, then a single 0-9 bucket at the end (or the start). numeric values can be sorted lexically (10 < 9) or numerically (9 < 10). this can be different for sorting the partitions, and sorting the values inside the partitions. numeric values can be partitioned based on the first digit, or by the whole number, or a set number of digits - eg 4 to get a date list if partitioning yyyymmdd dates.

  • Property svn:keywords set to Author Date Id Revision
File size: 51.6 KB
Line 
1###########################################################################
2#
3# List.pm -- A general and flexible list classifier with most of
4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11#
12# Copyright (C) 2005 New Zealand Digital Library Project
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28# TO DO: - Remove punctuation from metadata values before sorting.
29# - Add an AZCompactList-style hlist option?
30#
31###########################################################################
32
33
34package List;
35
36
37use BaseClassifier;
38use Sort::Naturally;
39use strict;
40
41
42sub BEGIN {
43 @List::ISA = ('BaseClassifier');
44}
45
46
47my $metadata_selection_mode_list =
48 [
49 { 'name' => "firstvalue",
50 'desc' => "{List.metadata_selection.firstvalue}"},
51 { 'name' => "firstvalidmetadata",
52 'desc' => "{List.metadata_selection.firstvalidmetadata}"},
53 { 'name' => "allvalues",
54 'desc' => "{List.metadata_selection.allvalues}"} ];
55
56my $metadata_selection_mode_default = "firstvalidmetadata";
57
58my $valid_metadata_selection_modes = { 'firstvalue' => 1,
59 'firstvalidmetadata' => 1,
60 'allvalues' => 1 };
61
62my $metadata_sort_mode_list =
63 [
64 { 'name' => "unicode",
65 'desc' => "{List.metadata_sort.unicode}"},
66 { 'name' => "alphabetic",
67 'desc' => "{List.metadata_sort.alphabetic}"},
68 { 'name' => "alphanumeric",
69 'desc' => "{List.metadata_sort.alphanumeric}"} ];
70
71my $metadata_sort_mode_default = "alphanumeric";
72
73my $valid_metadata_sort_modes = { 'unicode' => 1,
74 'alphabetic' => 1,
75 'alphanumeric' => 1};
76my $partition_type_list =
77 [ { 'name' => "per_letter",
78 'desc' => "{List.level_partition.per_letter}" },
79 { 'name' => "approximate_size",
80 'desc' => "{List.level_partition.approximate_size}"},
81 { 'name' => "constant_size",
82 'desc' => "{List.level_partition.constant_size}" },
83 { 'name' => "none",
84 'desc' => "{List.level_partition.none}" } ];
85
86my $partition_type_default = "per_letter";
87
88my $valid_partition_types = { 'per_letter' => 1,
89 'constant_size' => 1,
90 'approximate_size' => 1,
91 'none' => 1};
92
93my $partition_size_default = 30;
94
95my $numeric_partition_type_list =
96 [ { 'name' => "per_digit",
97 'desc' => "{List.level_partition.per_digit}" },
98 { 'name' => "per_number",
99 'desc' => "{List.level_partition.per_number}" },
100 { 'name' => "single_partition",
101 'desc' => "{List.level_partition_single}" },
102 { 'name' => "approximate_size",
103 'desc' => "{List.level_partition.approximate_size_numeric}"},
104 { 'name' => "constant_size",
105 'desc' => "{List.level_partition.constant_size}" },
106 { 'name' => "none",
107 'desc' => "{List.level_partition.none}" } ];
108
109my $numeric_partition_type_default = "single_partition";
110
111my $valid_numeric_partition_types = { 'per_digit' => 1,
112 'per_number' => 1,
113 'constant_size' => 1,
114 'single_partition' => 1,
115 'approximate_size' => 1,
116 'none' =>1 };
117
118my $numeric_partition_size_default = 30;
119
120my $numeric_partition_name_length_default = "-1"; # use the full number
121
122my $bookshelf_type_list =
123 [ { 'name' => "always",
124 'desc' => "{List.bookshelf_type.always}" },
125 { 'name' => "duplicate_only",
126 'desc' => "{List.bookshelf_type.duplicate_only}" },
127 { 'name' => "never",
128 'desc' => "{List.bookshelf_type.never}" } ];
129
130my $bookshelf_type_default = "never";
131my $sort_leaf_nodes_using_default = "Title";
132my $arguments =
133 [ { 'name' => "metadata",
134 'desc' => "{List.metadata}",
135 'type' => "metadata",
136 'reqd' => "yes" },
137
138 { 'name' => "metadata_selection_mode_within_level",
139 'desc' => "{List.metadata_selection_mode}",
140 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
141 'list' => $metadata_selection_mode_list,
142 'deft' => $metadata_selection_mode_default },
143
144 { 'name' => "metadata_sort_mode_within_level",
145 'desc' => "{List.metadata_sort_mode}",
146 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
147 'list' => $metadata_sort_mode_list,
148 'deft' => $metadata_sort_mode_default },
149
150 { 'name' => "bookshelf_type",
151 'desc' => "{List.bookshelf_type}",
152 'type' => "enum",
153 'list' => $bookshelf_type_list,
154 'deft' => $bookshelf_type_default },
155
156 { 'name' => "classify_sections",
157 'desc' => "{List.classify_sections}",
158 'type' => "flag" },
159
160 { 'name' => "partition_type_within_level",
161 'desc' => "{List.partition_type_within_level}",
162 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
163 'list' => $partition_type_list,
164 'deft' => $partition_type_default },
165
166 { 'name' => "partition_size_within_level",
167 'desc' => "{List.partition_size_within_level}",
168 'type' => "string", # Must be string because multiple values can be specified (separated by '/')
169 'deft' => $partition_size_default},
170
171 { 'name' => "partition_name_length",
172 'desc' => "{List.partition_name_length}",
173 'type' => "string" },
174
175 {'name' => "partition_sort_mode_within_level",
176 'desc' => "{List.partition_sort_mode_within_level}",
177 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
178 'list' => $metadata_sort_mode_list,
179 'deft' => $metadata_sort_mode_default },
180
181 { 'name' => "numeric_partition_type_within_level",
182 'desc' => "{List.numeric_partition_type_within_level}",
183 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
184 'list' => $numeric_partition_type_list,
185 'deft' => $numeric_partition_type_default },
186
187 {'name' => "numeric_partition_sort_mode_within_level",
188 'desc' => "{List.numeric_partition_sort_mode_within_level}",
189 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
190 'list' => $metadata_sort_mode_list,
191 'deft' => $metadata_sort_mode_default },
192
193 { 'name' => "numeric_partition_size_within_level",
194 'desc' => "{List.numeric_partition_size_within_level}",
195 'type' => "string", # Must be string because multiple values can be specified (separated by '/')
196 'deft' => $numeric_partition_size_default},
197
198 { 'name' => "numeric_partition_name_length_within_level",
199 'desc' => "{List.numeric_partition_name_length_within_level}",
200 'type' => "string",
201 'deft' => $numeric_partition_name_length_default },
202
203 { 'name' => "numbers_first",
204 'desc' => "{List.numbers_first",
205 'type' => 'flag'},
206
207 { 'name' => "sort_leaf_nodes_using",
208 'desc' => "{List.sort_leaf_nodes_using}",
209 'type' => "metadata",
210 'deft' => $sort_leaf_nodes_using_default },
211
212 { 'name' => "sort_leaf_nodes_sort_mode",
213 'desc' => "{List.sort_leaf_nodes_sort_mode}",
214 'type' => "enum",
215 'list' => $metadata_sort_mode_list,
216 'deft' => $metadata_sort_mode_default },
217
218 { 'name' => "reverse_sort_leaf_nodes",
219 'desc' => "{List.reverse_sort_leaf_nodes}",
220 'type' => "flag"},
221
222 { 'name' => "sort_using_unicode_collation",
223 'desc' => "{List.metadata_sort.unicode} {List.sort_using_unicode_collation}",
224 'type' => "flag" },
225
226 { 'name' => "use_hlist_for",
227 'desc' => "{List.use_hlist_for}",
228 'type' => "string" },
229
230 {'name' => "filter_metadata",
231 'desc' => "{List.filter_metadata}",
232 'type' => "metadata"},
233
234 {'name' => "filter_regex",
235 'desc' => "{List.filter_regex}",
236 'type' => "regexp"},
237
238 { 'name' => "standardize_capitalization",
239 'desc' => "{List.standardize_capitalization}",
240 'type' => "flag"},
241
242 { 'name' => "removeprefix",
243 'desc' => "{BasClas.removeprefix}",
244 'type' => "regexp" },
245
246 { 'name' => "removesuffix",
247 'desc' => "{BasClas.removesuffix}",
248 'type' => "regexp" } ];
249
250my $options = { 'name' => "List",
251 'desc' => "{List.desc}",
252 'abstract' => "no",
253 'inherits' => "yes",
254 'args' => $arguments };
255
256
257sub new
258{
259 my ($class) = shift(@_);
260 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
261 push(@$classifierslist, $class);
262
263 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
264 push(@{$hashArgOptLists->{"OptList"}}, $options);
265
266 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
267
268 bless $self, $class;
269
270 if ($self->{'info_only'}) {
271 # don't worry about any options etc
272 return $self;
273 }
274
275 # The metadata elements to use (required)
276 if (!$self->{'metadata'}) {
277 die "Error: No metadata fields specified for List.\n";
278 }
279
280 my @metadata_groups = split(/[\/]/, $self->{'metadata'});
281 $self->{'metadata_groups'} = \@metadata_groups;
282
283 # The classifier button name (default: the first metadata element specified)
284 if (!$self->{'buttonname'}) {
285 my $first_metadata_group = $metadata_groups[0];
286 my $first_metadata_element = (split(/[\;|,\/]/, $first_metadata_group))[0];
287 $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
288 }
289
290 # meta selection mode for each level
291 $self->set_metadata_groups_info_per_level("metadata_selection_mode", $metadata_selection_mode_default, $valid_metadata_selection_modes);
292
293 # meta sort mode for each level
294 if ($self->{'sort_using_unicode_collation'}) {
295 print STDERR "WARNING: sort_using_unicode_collation is set, setting metadata_sort_mode_within_level to unicode for all levels, regardless of current setting\n";
296 $self->{'metadata_sort_mode_within_level'} = "unicode";
297 $metadata_sort_mode_default = "unicode";
298 } else {
299 if ($self->{'metadata_sort_mode_within_level'} =~ /unicode/) {
300 $self->{'sort_using_unicode_collation'} = 1;
301 }
302 }
303
304
305 $self->set_metadata_groups_info_per_level('metadata_sort_mode_within_level', $metadata_sort_mode_default, $valid_metadata_sort_modes);
306
307 # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
308 foreach my $metadata_group (@metadata_groups) {
309 $self->{$metadata_group . ".bookshelf_type"} = "always";
310 }
311 my $last_metadata_group = $metadata_groups[$#metadata_groups];
312 # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
313 $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
314
315 # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
316 foreach my $metadata_group (@metadata_groups) {
317 $self->{$metadata_group . ".list_type"} = "VList";
318 }
319 foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
320 $self->{$metadata_group . ".list_type"} = "HList";
321 }
322
323 # How the items are grouped into partitions (default: by letter)
324 # for each level (metadata group), separated by '/'
325
326 $self->set_metadata_groups_info_per_level("partition_type_within_level", $partition_type_default, $valid_partition_types);
327
328 $self->set_metadata_groups_info_per_level("numeric_partition_type_within_level", $numeric_partition_type_default, $valid_numeric_partition_types);
329
330 $self->set_metadata_groups_info_per_level("partition_size_within_level", $partition_size_default);
331 $self->set_metadata_groups_info_per_level("numeric_partition_size_within_level", $numeric_partition_size_default);
332
333 $self->set_metadata_groups_info_per_level('partition_sort_mode_within_level', $metadata_sort_mode_default, $valid_metadata_sort_modes);
334
335 $self->set_metadata_groups_info_per_level('numeric_partition_sort_mode_within_level', $metadata_sort_mode_default, $valid_metadata_sort_modes);
336
337 $self->set_metadata_groups_info_per_level("numeric_partition_name_length_within_level", $numeric_partition_name_length_default);
338
339 # The removeprefix and removesuffix expressions
340 if ($self->{'removeprefix'}) {
341 # If there are more than one expressions, use '' to quote each experession and '/' to separate
342 my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
343
344 foreach my $metadata_group (@metadata_groups) {
345 my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
346 if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
347 # Remove the other ' at the beginning and the end if there is any
348 $removeprefix_expr_within_levelelem =~ s/^'//;
349 $removeprefix_expr_within_levelelem =~ s/'$//;
350 # Remove the extra ^ at the beginning
351 $removeprefix_expr_within_levelelem =~ s/^\^//;
352 $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
353 } else {
354 $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
355 }
356 }
357 }
358 if ($self->{'removesuffix'}) {
359 my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
360
361 foreach my $metadata_group (@metadata_groups) {
362 my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
363 if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
364 $removesuffix_expr_within_levelelem =~ s/^'//;
365 $removesuffix_expr_within_levelelem =~ s/'$//;
366 # Remove the extra $ at the end
367 $removesuffix_expr_within_levelelem =~ s/\$$//;
368 $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
369 } else {
370 $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
371 }
372 }
373 }
374
375 # The metadata elements to use to sort the leaf nodes (default: Title)
376 my @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
377 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
378
379 foreach my $sort_group (@sort_leaf_nodes_using_metadata_groups) {
380 # set metadata_select_type, if not already set - might be already set if the same group was used in -metadata
381 if (!defined $self->{$sort_group . ".metadata_selection_mode"}) {
382 $self->{$sort_group . ".metadata_selection_mode"} = $metadata_selection_mode_default;
383 }
384
385 }
386
387 my @leaf_nodes_sort_modes = split (/\|/, $self->{'sort_leaf_nodes_sort_mode'});
388 foreach my $sort_group (@sort_leaf_nodes_using_metadata_groups) {
389 my $leaf_sort_mode = shift(@leaf_nodes_sort_modes);
390 if (!defined $self->{$sort_group . ".metadata_sort_mode_within_level"}) {
391 if (defined $leaf_sort_mode && defined $valid_metadata_sort_modes->{$leaf_sort_mode}) {
392 $self->{$sort_group . ".metadata_sort_mode_within_level"} = $leaf_sort_mode;
393 }
394 else {
395 $self->{$sort_group . ".metadata_sort_mode_within_level"} = $metadata_sort_mode_default;
396 }
397 }
398
399 }
400
401 # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
402 if ($self->{'sort_using_unicode_collation'}) {
403 # To use this you first need to download the allkeys.txt file from
404 # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
405 # Unicode/Collate directory.
406 require Unicode::Collate;
407 $self->{'unicode_collator'} = Unicode::Collate->new();
408 }
409
410 # An empty array for the document/section OIDs that we are classifying
411 $self->{'OIDs'} = [];
412 # A hash for all the doc ids that we have seen, so we don't classify something twice
413 $self->{'all_doc_OIDs'} = {};
414 #return bless $self, $class;
415 return $self;
416}
417
418
419sub init
420{
421 # Nothing to do...
422}
423
424sub set_metadata_groups_info_per_level
425{
426 my $self = shift(@_);
427 my $info_name = shift(@_);
428 my $info_default = shift(@_);
429 my $info_valid_types_hash_ref = shift(@_);
430
431 my @info_list = split(/\//, $self->{$info_name});
432
433 my $first = 1;
434 foreach my $metadata_group (@{$self->{'metadata_groups'}}) {
435 my $info_elem = shift(@info_list);
436 if (defined ($info_elem) && (!defined $info_valid_types_hash_ref || defined $info_valid_types_hash_ref->{$info_elem})) {
437 $self->{$metadata_group .".$info_name"} = $info_elem;
438 } else {
439 # its empty or an invalid entry
440 my $new_info_elem;
441 if ($first) {
442 $new_info_elem = $info_default;
443 } else {
444 # get the value we had at first
445 $new_info_elem = $self->{@{$self->{'metadata_groups'}}[0] . ".$info_name"};
446 }
447 $self->{$metadata_group .".$info_name"} = $new_info_elem;
448 if (defined $info_elem) {
449 print STDERR "List Error: $info_elem is not a valid value for $info_name, changing it to $new_info_elem\n";
450
451 }
452 }
453 $first = 0;
454 }
455
456}
457
458# Called for each document in the collection
459sub classify
460{
461 my $self = shift(@_);
462 my ($doc_obj) = @_;
463
464 if (defined $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()}) {
465 print STDERR "Warning, List classifier has already seen document ".$doc_obj->get_OID().", not classifying again\n";
466 return;
467 }
468
469 $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()} = 1;
470
471 # check against filter here
472 if ($self->{'filter_metadata'}) {
473 my $meta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'filter_metadata'});
474 return unless defined $meta;
475 if ($self->{'filter_regex'} ne "" && $meta !~ /$self->{'filter_regex'}/) {
476 print STDERR "doc $doc_obj doesn't pass filtering requirement\n" if ($self->{'verbosity'} > 3);
477 return;
478 }
479 }
480 # if we get here, we have passed the test for filtering
481 # If "-classify_sections" is set, classify every section of the document
482 if ($self->{'classify_sections'}) {
483 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
484 while (defined $section) {
485 $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section);
486 $section = $doc_obj->get_next_section($section);
487 }
488 }
489 # Otherwise just classify the top document section
490 else {
491 $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section());
492 }
493
494}
495
496sub classify_section
497{
498 my $self = shift(@_);
499 my ($doc_obj,$section_OID,$section) = @_;
500
501 my @metadata_groups = @{$self->{'metadata_groups'}};
502
503 # Only classify the section if it has a value for one of the metadata elements in the first group
504 my $classify_section = 0;
505 my $first_metadata_group = $metadata_groups[0];
506 my $remove_prefix_expr = $self->{$first_metadata_group . ".remove_prefix_expr"};
507 my $remove_suffix_expr = $self->{$first_metadata_group . ".remove_suffix_expr"};
508 foreach my $first_metadata_group_element (split(/\;|,/, $first_metadata_group)) {
509 my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
510 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element);
511 # Remove prefix/suffix if requested
512 if (defined ($first_metadata_group_element_value)) {
513 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
514 $first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
515 }
516
517 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
518 $first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
519 }
520 }
521
522 $first_metadata_group_element_value = &sorttools::format_metadata_for_sorting($first_metadata_group, $first_metadata_group_element_value, $doc_obj) unless $self->{'no_metadata_formatting'};
523 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
524 # This section must be included in the classifier as we have found a value
525 $classify_section = 1;
526 last;
527 }
528 }
529
530 # We're not classifying this section because it doesn't have the required metadata
531 return if (!$classify_section);
532
533 # Otherwise, include this section in the classifier
534
535 push(@{$self->{'OIDs'}}, $section_OID);
536
537 # Create a hash for the metadata values of each metadata element we're interested in
538 my %metadata_groups_done = ();
539 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
540 # Take care not to do a metadata group more than once
541 unless ($metadata_groups_done{$metadata_group}) {
542 my $remove_prefix_expr = $self->{$metadata_group . ".remove_prefix_expr"};
543 my $remove_suffix_expr = $self->{$metadata_group . ".remove_suffix_expr"};
544 foreach my $metadata_element (split(/\;|,/, $metadata_group)) {
545 my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);
546
547 my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
548 foreach my $metadata_value (@metadata_values) {
549 #print STDERR "working with value $metadata_value\n";
550 # Strip leading and trailing whitespace
551 $metadata_value =~ s/^\s*//;
552 $metadata_value =~ s/\s*$//;
553
554 # Remove prefix/suffix if requested
555 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
556 $metadata_value =~ s/^$remove_prefix_expr//;
557 }
558 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
559 $metadata_value =~ s/$remove_suffix_expr$//;
560 }
561
562 # lowercase metadata both for sorting meta (d/D under D), and to allow CSS to be able to
563 # text-transform the stored lowercase values as capitalize or uppercase (can't CSS
564 # text-transform if stored uppercase). 2 CSS text-transforms have been added to core.css
565 my $lc_metadata_value = lc($metadata_value);
566 $lc_metadata_value = &sorttools::format_metadata_for_sorting($real_metadata_element, $lc_metadata_value, $doc_obj) unless $self->{'no_metadata_formatting'};
567 #print STDERR "formatted value = $lc_metadata_value\n";
568
569 # Add the metadata value into the list for this combination of metadata group
570 # and section - if we have some non-whitespace chars
571 # text that we have some non-whitespace chars
572 if ($lc_metadata_value =~ /\S/) {
573
574 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $lc_metadata_value);
575
576 # add the actual value into the stored values so we can remember the case
577 if (!$self->{'standardize_capitalization'}) {
578 if (defined $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}) {
579 $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}++;
580 } else {
581 $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value} = 1;
582 }
583 }
584 last if ($self->{$metadata_group . ".metadata_selection_mode"} eq "firstvalue");
585 }
586 } # foreach metadatavalue
587 last if ((@metadata_values > 0) && $self->{$metadata_group . ".metadata_selection_mode"} =~ /^(firstvalue|firstvalidmetadata)$/ );
588 } # foreach metadata element
589
590 $metadata_groups_done{$metadata_group} = 1;
591 }
592 }
593}
594
595
596sub get_classify_info
597{
598 my $self = shift(@_);
599
600 # The metadata groups to classify by
601 my @metadata_groups = @{$self->{'metadata_groups'}};
602 my $first_metadata_group = $metadata_groups[0];
603
604 # The OID values of the documents to include in the classifier
605 my @OIDs = @{$self->{'OIDs'}};
606
607 # Create the root node of the classification hierarchy
608 my %classifier_node = ( 'thistype' => "Invisible",
609 'childtype' => $self->{$first_metadata_group . ".list_type"},
610 'Title' => $self->{'buttonname'},
611 'contains' => [],
612 'mdtype' => $first_metadata_group );
613
614 # Recursively create the classification hierarchy, one level for each metadata group
615 $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
616 return \%classifier_node;
617}
618
619
620sub add_level
621{
622 my $self = shift(@_);
623 my @metadata_groups = @{shift(@_)};
624 my @OIDs = @{shift(@_)};
625 my $classifier_node = shift(@_);
626
627 my $metadata_group = $metadata_groups[0];
628 if (!defined($self->{$metadata_group . ".list"})) {
629 print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
630 return;
631 }
632
633 # Create a mapping from metadata value to OID
634 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
635 my %metadata_value_to_OIDs_hash = ();
636 my %numeric_metadata_value_to_OIDs_hash = ();
637 foreach my $OID (@OIDs)
638 {
639 if ($OID_to_metadata_values_hash_ref->{$OID})
640 {
641 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
642 foreach my $metadata_value (@metadata_values)
643 {
644 if ($metadata_value =~ /^[0-9]/) {
645 push(@{$numeric_metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
646 } else {
647 push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
648 }
649 }
650 }
651 }
652 #print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
653 #print STDERR "Number of distinct numeric values: " . scalar(keys %numeric_metadata_value_to_OIDs_hash) . "\n";
654
655 # Partition the values (if necessary)
656 my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
657 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
658 my $partition_sort_mode_within_level = $self->{$metadata_group . ".partition_sort_mode_within_level"};
659 my $bookshelf_type_within_level = $self->{$metadata_group. ".bookshelf_type"};
660
661
662 #############################################
663 ### DO THE NUMBERS IF THEY ARE TO COME FIRST
664 #############################################
665 if ($self->{'numbers_first'} && keys(%numeric_metadata_value_to_OIDs_hash)) {
666 $self->partition_numeric_values(\@metadata_groups, $classifier_node, \%numeric_metadata_value_to_OIDs_hash);
667 }
668
669 ############################################
670 # DO THE LETTERS
671 ############################################
672 if (keys(%metadata_value_to_OIDs_hash)){ # make sure we have some values
673 if ($partition_type_within_level =~ /^per_letter$/i) {
674 $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $partition_sort_mode_within_level, \%metadata_value_to_OIDs_hash);
675 }
676 elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
677 $self->split_approximate_size(\@metadata_groups, $classifier_node, $partition_size_within_level, $partition_sort_mode_within_level, $bookshelf_type_within_level, \%metadata_value_to_OIDs_hash, $self->{'partition_name_length'});
678
679 }
680 elsif ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
681 $self->split_constant_size(\@metadata_groups, $classifier_node, $partition_size_within_level, $partition_sort_mode_within_level, \%metadata_value_to_OIDs_hash, $self->{'partition_name_length'});
682 }
683
684 # Otherwise just add all the values to a VList
685 else {
686 $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
687 }
688 }
689
690 ###########################################
691 ### DO THE NUMBERS IF THEY ARE TO COME LAST
692 ###########################################
693 if (!$self->{'numbers_first'} && keys(%numeric_metadata_value_to_OIDs_hash)) {
694 $self->partition_numeric_values(\@metadata_groups, $classifier_node, \%numeric_metadata_value_to_OIDs_hash);
695 }
696}
697
698sub partition_numeric_values
699{
700 my $self = shift(@_);
701 my @metadata_groups = @{shift(@_)};
702 my $classifier_node = shift(@_);
703 my $numeric_metadata_value_to_OIDs_hash_ref = shift(@_);
704
705 my $metadata_group = $metadata_groups[0];
706 my $numeric_partition_type_within_level = $self->{$metadata_group . ".numeric_partition_type_within_level"};
707 my $numeric_partition_size_within_level = $self->{$metadata_group . ".numeric_partition_size_within_level"};
708 my $numeric_partition_sort_mode_within_level = $self->{$metadata_group . ".numeric_partition_sort_mode_within_level"};
709 my $numeric_partition_name_length_within_level = $self->{$metadata_group . ".numeric_partition_name_length_within_level"};
710 my $bookshelf_type_within_level = $self->{$metadata_group. ".bookshelf_type"};
711
712 if ($numeric_partition_type_within_level eq "single_partition") {
713 $self->add_hlist_partition(\@metadata_groups, $classifier_node, "0-9", $numeric_metadata_value_to_OIDs_hash_ref);
714 }
715 elsif ($numeric_partition_type_within_level eq "per_digit") {
716 $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $numeric_partition_sort_mode_within_level, $numeric_metadata_value_to_OIDs_hash_ref, 1, 1);
717 }
718 elsif ($numeric_partition_type_within_level eq "per_number") {
719 # each different number is a bucket
720 $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $numeric_partition_sort_mode_within_level, $numeric_metadata_value_to_OIDs_hash_ref, 1, $numeric_partition_name_length_within_level);
721 }
722 elsif ($numeric_partition_type_within_level eq "constant_size" && scalar(keys %$numeric_metadata_value_to_OIDs_hash_ref) > $numeric_partition_size_within_level) {
723 # Generate hlists of a certain size
724
725 $self->split_constant_size(\@metadata_groups, $classifier_node, $numeric_partition_size_within_level, $numeric_partition_sort_mode_within_level, $numeric_metadata_value_to_OIDs_hash_ref, $numeric_partition_name_length_within_level, 1);
726 } elsif ($numeric_partition_type_within_level eq "approximate_size" && scalar(keys %$numeric_metadata_value_to_OIDs_hash_ref) > $numeric_partition_size_within_level) {
727 $self->split_approximate_size(\@metadata_groups, $classifier_node, $numeric_partition_size_within_level, $numeric_partition_sort_mode_within_level, $bookshelf_type_within_level, $numeric_metadata_value_to_OIDs_hash_ref, $numeric_partition_name_length_within_level, 1);
728 }
729 # Otherwise just add all the values to a VList
730 else {
731 $self->add_vlist(\@metadata_groups, $classifier_node, $numeric_metadata_value_to_OIDs_hash_ref);
732 }
733
734}
735
736sub split_approximate_size
737{
738
739 my $self = shift(@_);
740 my @metadata_groups = @{shift(@_)};
741 my $classifier_node = shift(@_);
742 my $partition_size = shift(@_);
743 my $sort_mode = shift(@_);
744 my $bookshelf_type = shift(@_);
745 my $metadata_value_to_OIDs_hash_ref = shift(@_);
746 my $partition_name_length = shift(@_);
747 my $is_numeric = shift(@_);
748
749 # Generate hlist based on the first letter of the metadata value (like per_letter), or based on
750 # numbers, but also with restriction on the partition size
751 # If a partition has fewer items than specified by the "partition_size_within_level", then group them together if possible
752 # If a partition has more items than specified, split into several hlists.
753 # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
754
755 my @sortedmetadata_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref));
756
757 # Separate values by their first letter, each form a bucket, like the per_letter partition type
758 my $last_partition = $self->generate_partition_name($sortedmetadata_values[0], $partition_name_length, $is_numeric);
759
760 my @partition_buckets = ();
761 my @metadata_values_in_bucket = ();
762 my $num_items_in_bucket = 0;
763
764 foreach my $metadata_value (@sortedmetadata_values) {
765 my $metadata_valuepartition = $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric);
766 if ($metadata_valuepartition ne $last_partition) {
767 my @temp_array = @metadata_values_in_bucket;
768 # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
769 my %partition_info = ();
770 $partition_info{'metadata_values'} = \@temp_array;
771 $partition_info{'size'} = $num_items_in_bucket;
772 $partition_info{'name'} = $last_partition;
773 push (@partition_buckets, \%partition_info);
774
775 @metadata_values_in_bucket = ($metadata_value);
776 $num_items_in_bucket = ($bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : scalar(@metadata_values_in_bucket));
777 $last_partition = $metadata_valuepartition;
778 } else {
779 $num_items_in_bucket += ($bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1); #scalar(@metadata_values_in_bucket);
780 push (@metadata_values_in_bucket, $metadata_value);
781 }
782 }
783 # Last one
784 my %partition_info = ();
785 $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
786 $partition_info{'size'} = $num_items_in_bucket;
787 $partition_info{'name'} = $last_partition;
788
789 push (@partition_buckets, \%partition_info);
790
791 # now go through the array of buckets, and merge small buckets
792 my @new_partition_buckets = ();
793 for (my $i = 0; $i < scalar(@partition_buckets) - 1; $i++) {
794
795 my $this_bucket = $partition_buckets[$i]; #->{'name'};
796 my $next_bucket = $partition_buckets[$i+1]; #->{'name'};
797
798 my $items_in_partition = $this_bucket->{'size'};
799
800 if ($items_in_partition < $partition_size ) {
801 my $items_in_next_partition = $next_bucket->{'size'};
802 if ($items_in_partition + $items_in_next_partition <= $partition_size ) {
803 # merge this bucket into the next bucket
804 foreach my $metadata_value_to_merge (@{$this_bucket->{'metadata_values'}}) {
805 push(@{$next_bucket->{'metadata_values'}}, $metadata_value_to_merge);
806 }
807 $next_bucket->{'size'} += $items_in_partition;
808
809 } else {
810 # remember this bucket
811 push (@new_partition_buckets, $this_bucket);
812 }
813 } else {
814 # remember this bucket
815 push (@new_partition_buckets, $this_bucket);
816 }
817 }
818 # add in the last bucket
819 my $last_bucket = $partition_buckets[scalar(@partition_buckets) - 1];
820 push (@new_partition_buckets, $last_bucket);
821
822 # Add partitions to the main list, but divide big bucket into several
823 my $last_partition_end = "";
824 my $partition_start = "";
825 foreach my $partition (@new_partition_buckets) {
826 my @metadata_values = $self->sort_metadata_values_array($sort_mode, @{$partition->{'metadata_values'}});
827 my $items_in_partition = $partition->{'size'};
828 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $partition_name_length, $is_numeric);
829
830 if ($items_in_partition > $partition_size) {
831 my $items_done = 0;
832 my %metadata_values_to_OIDs_subhashes = ();
833 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
834 my $metadata_value = $metadata_values[$i];
835 # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
836 my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1;
837
838 my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $partition_name_length, $is_numeric);
839 my $partitionname = $partition_start;
840 if ($partitionend ne $partition_start) {
841 $partitionname = $partitionname . "-" . $partitionend;
842 }
843
844 if ($items_done + $items_for_this_md_value > $partition_size && $items_done != 0) {
845 # Start a new partition
846 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
847 $last_partition_end = $partitionend;
848 $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $is_numeric);
849 $items_done = 0;
850 %metadata_values_to_OIDs_subhashes = ();
851 }
852
853 # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
854 if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size) {
855
856 my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $is_numeric);
857 # Get the number of partitions needed for this value
858 my $num_splits = int($items_for_this_md_value / $partition_size);
859 $num_splits++ if ($items_for_this_md_value / $partition_size > $num_splits);
860
861 my @OIDs_for_this_value = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
862 for (my $i = 0; $i < $num_splits; $i++) {
863 my %OIDs_subhashes_for_this_value = ();
864 my @OIDs_for_this_partition = ();
865 for (my $d = $i * $partition_size; $d < (($i+1) * $partition_size > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size); $d++) {
866 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
867 }
868
869 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
870 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size) {
871 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
872 $items_done += scalar(@OIDs_for_this_partition);
873 next;
874 }
875
876 # Add an HList for this bucket
877 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
878 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
879 $last_partition_end = $partitionname_for_this_value;
880 }
881 next;
882 }
883
884 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
885 $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;
886
887 # The last partition
888 if($i == scalar(@metadata_values) - 1) {
889 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
890 }
891 }
892 } # end if items in partition > partition size
893 else {
894 # The easier case, just add a partition
895 my %metadata_values_to_OIDs_subhashes = ();
896 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
897 my $metadata_value = $metadata_values[$i];
898 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
899 }
900 my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
901 my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $partition_name_length, $is_numeric);
902 my $partitionname = $partition_start;
903 if ($partitionend ne $partition_start) {
904 $partitionname = $partitionname . "-" . $partitionend;
905 }
906
907 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
908 $last_partition_end = $partitionend;
909 }
910 }
911
912 # The partitions are stored in an HList
913 $classifier_node->{'childtype'} = "HList";
914
915}
916
917sub split_constant_size
918{
919 my $self = shift(@_);
920 my @metadata_groups = @{shift(@_)};
921 my $classifier_node = shift(@_);
922 my $partition_size = shift(@_);
923 my $sort_mode = shift(@_);
924 my $metadata_value_to_OIDs_hash_ref = shift(@_);
925 my $partition_name_length = shift(@_);
926 my $is_numeric = shift(@_);
927
928 my @sortedmetadata_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref));
929 my $itemsdone = 0;
930 my $numitems = scalar(@sortedmetadata_values);
931 my %metadata_value_to_OIDs_subhash = ();
932 my $lastpartitionend = "";
933 my $partitionstart;
934 foreach my $metadata_value (@sortedmetadata_values) {
935 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
936 $itemsdone++;
937 my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
938
939 # Is this the start of a new partition?
940 if ($itemsinpartition == 1) {
941 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $partition_name_length, $is_numeric);
942 }
943
944 # Is this the end of the partition?
945 if ($itemsinpartition == $partition_size || $itemsdone == $numitems) {
946 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $partition_name_length, $is_numeric);
947 my $partitionname = $partitionstart;
948 if ($partitionend ne $partitionstart) {
949 $partitionname = $partitionname . "-" . $partitionend;
950 }
951
952 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
953 %metadata_value_to_OIDs_subhash = ();
954 $lastpartitionend = $partitionend;
955 }
956 }
957
958 # The partitions are stored in an HList
959 $classifier_node->{'childtype'} = "HList";
960
961}
962
963sub split_per_letter_or_digit
964{
965 my $self = shift(@_);
966 my @metadata_groups = @{shift(@_)};
967 my $classifier_node = shift(@_);
968 my $sort_mode = shift(@_);
969 my $metadata_value_to_OIDs_hash_ref = shift(@_);
970 my $is_numeric = shift(@_);
971 my $numeric_partition_length = shift(@_);
972
973 if (not defined $is_numeric) {
974 $is_numeric = 0;
975 }
976 if ($is_numeric && not defined($numeric_partition_length)) {
977 $numeric_partition_length = 1;
978 }
979 # Generate one hlist for each letter
980 my @sortedmetadata_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref));
981 my %metadata_value_to_OIDs_subhash = ();
982
983 my $lastpartition = $self->generate_partition_name($sortedmetadata_values[0], $numeric_partition_length, $is_numeric);
984 foreach my $metadata_value (@sortedmetadata_values) {
985
986 my $metadata_valuepartition = $self->generate_partition_name($metadata_value, $numeric_partition_length, $is_numeric);
987
988 # Is this the start of a new partition?
989 if ($metadata_valuepartition ne $lastpartition) {
990 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
991 %metadata_value_to_OIDs_subhash = ();
992 $lastpartition = $metadata_valuepartition;
993 }
994
995 $metadata_value_to_OIDs_subhash{$metadata_value} = %$metadata_value_to_OIDs_hash_ref{$metadata_value};
996 }
997
998 # Don't forget to add the last partition
999 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
1000
1001 # The partitions are stored in an HList
1002 $classifier_node->{'childtype'} = "HList";
1003
1004}
1005
1006sub generate_partition_name
1007{
1008 my $self = shift(@_);
1009 my $mvalue = shift(@_);
1010 my $numeric_partition_length = shift(@_);
1011 my $is_numeric = shift(@_);
1012
1013 if (!$is_numeric || $numeric_partition_length == 1 ) {
1014 return substr($mvalue, 0, 1);
1015 }
1016 if ($numeric_partition_length == -1) {
1017 my ($all_digits) = $mvalue =~ /^([0-9]+)/;
1018 return $all_digits;
1019 }
1020 my ($some_digits) = $mvalue =~ /^([0-9]{1,$numeric_partition_length})/;
1021 return $some_digits;
1022}
1023
1024sub generate_partition_start
1025{
1026 my $self = shift(@_);
1027 my $metadata_value = shift(@_);
1028 my $lastpartitionend = shift(@_);
1029 my $partition_name_length = shift(@_);
1030 my $is_numeric = shift(@_);
1031
1032 if ($is_numeric) {
1033 return $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric);
1034 }
1035
1036 if ($partition_name_length) {
1037 return substr($metadata_value, 0, $partition_name_length);
1038 }
1039
1040 my $partitionstart = substr($metadata_value, 0, 1);
1041 if ($partitionstart le $lastpartitionend) {
1042 $partitionstart = substr($metadata_value, 0, 2);
1043 # Give up after three characters
1044 if ($partitionstart le $lastpartitionend) {
1045 $partitionstart = substr($metadata_value, 0, 3);
1046 }
1047 }
1048
1049 return $partitionstart;
1050}
1051
1052
1053sub generate_partition_end
1054{
1055 my $self = shift(@_);
1056 my $metadata_value = shift(@_);
1057 my $partitionstart = shift(@_);
1058 my $partition_name_length = shift(@_);
1059 my $is_numeric = shift(@_);
1060
1061 if ($is_numeric) {
1062 return $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric);
1063 }
1064 if ($partition_name_length) {
1065 return substr($metadata_value, 0, $partition_name_length);
1066 }
1067
1068 my $partitionend = substr($metadata_value, 0, length($partitionstart));
1069 if ($partitionend gt $partitionstart) {
1070 $partitionend = substr($metadata_value, 0, 1);
1071 if ($partitionend le $partitionstart) {
1072 $partitionend = substr($metadata_value, 0, 2);
1073 # Give up after three characters
1074 if ($partitionend le $partitionstart) {
1075 $partitionend = substr($metadata_value, 0, 3);
1076 }
1077 }
1078 }
1079
1080 return $partitionend;
1081}
1082
1083
1084sub add_hlist_partition
1085{
1086 my $self = shift(@_);
1087 my @metadata_groups = @{shift(@_)};
1088 my $classifier_node = shift(@_);
1089 my $partitionname = shift(@_);
1090 my $metadata_value_to_OIDs_hash_ref = shift(@_);
1091
1092 # Create an hlist partition
1093 # Note that we don't need to convert from unicode-aware strings
1094 # to utf8 here, as that is handled elsewhere in the code
1095 my %child_classifier_node = ( 'Title' => $partitionname, #'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
1096 'childtype' => "VList",
1097 'contains' => [] );
1098
1099 # Add the children to the hlist partition
1100 $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
1101 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
1102}
1103
1104
1105sub add_vlist
1106{
1107 my $self = shift(@_);
1108 my @metadata_groups = @{shift(@_)};
1109 my $classifier_node = shift(@_);
1110 my $metadata_value_to_OIDs_hash_ref = shift(@_);
1111
1112 my $metadata_group = shift(@metadata_groups);
1113 $classifier_node->{'mdtype'} = $metadata_group;
1114
1115 my $sort_type = $self->{$metadata_group .".metadata_sort_mode_within_level"};
1116 # Create an entry in the vlist for each value
1117 foreach my $metadata_value ($self->sort_metadata_values_array($sort_type, keys(%{$metadata_value_to_OIDs_hash_ref})))
1118 {
1119 my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
1120 # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
1121 if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
1122 my $OID = $OIDs[0];
1123 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
1124 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
1125 }
1126 # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
1127 elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
1128 @OIDs = $self->sort_leaf_items(\@OIDs);
1129 foreach my $OID (@OIDs) {
1130 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
1131 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
1132 }
1133
1134 }
1135 # Otherwise create a sublist (bookshelf) for the metadata value
1136 else {
1137 my $metadata_value_display = $self->get_metadata_value_display($metadata_group, $metadata_value);
1138 my %child_classifier_node = ( 'Title' => $metadata_value_display,
1139 'childtype' => "VList",
1140 'mdtype' => $metadata_group,
1141 'contains' => [] );
1142
1143 # If there are metadata elements remaining, recursively apply the process
1144 if (@metadata_groups > 0) {
1145 my $next_metadata_group = $metadata_groups[0];
1146 $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
1147
1148 # separate metadata into those that below in the next/sub-metadata_group
1149 # and those that below at the current level's metadata_group
1150
1151 my $OID_to_metadata_values_hash_ref = $self->{$next_metadata_group . ".list"};
1152 my @current_level_OIDs = ();
1153 my @next_level_OIDs = ();
1154 foreach my $OID (@OIDs)
1155 {
1156 if ($OID_to_metadata_values_hash_ref->{$OID}) {
1157 push(@next_level_OIDs, $OID);
1158 } else {
1159 push(@current_level_OIDs, $OID);
1160 }
1161 }
1162 # recursively process those docs belonging to the sub-metadata_group
1163 $self->add_level(\@metadata_groups, \@next_level_OIDs, \%child_classifier_node);
1164
1165 # For those docs that don't belong in the sub/next_metadata_group, but which belong
1166 # at this level, just add the documents as children of this list at the current level
1167 @current_level_OIDs = $self->sort_leaf_items(\@current_level_OIDs);
1168 foreach my $current_level_OID (@current_level_OIDs) {
1169 my $offset = $self->metadata_offset($metadata_group, $current_level_OID, $metadata_value);
1170 push(@{$child_classifier_node{'contains'}}, { 'OID' => $current_level_OID , 'offset' => $offset });
1171 }
1172 }
1173 # Otherwise just add the documents as children of this list
1174 else {
1175 @OIDs = $self->sort_leaf_items(\@OIDs);
1176 foreach my $OID (@OIDs) {
1177 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
1178 push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
1179 }
1180
1181 }
1182
1183 # Add the sublist to the list
1184 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
1185 }
1186 }
1187}
1188
1189sub metadata_offset
1190{
1191 my $self = shift(@_);
1192 my $metadata_group = shift(@_);
1193 my $OID = shift(@_);
1194 my $metadata_value = shift(@_);
1195
1196 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
1197 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
1198 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
1199 if ($metadata_value eq $metadata_values[$i]) {
1200 return $i;
1201 }
1202 }
1203
1204 return 0;
1205}
1206
1207sub sort_leaf_items
1208{
1209 my $self = shift(@_);
1210 my @OIDs = @{shift(@_)};
1211
1212 # Sort leaf nodes and add to list
1213 my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
1214 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) {
1215 my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
1216 my $sort_type = $self->{$sort_leaf_nodes_usingmetaelem . ".metadata_sort_mode_within_level"};
1217 # Force a stable sort (Perl 5.6's sort isn't stable)
1218 # !! The [0] bits aren't ideal (multiple metadata values) !!
1219 @OIDs = @OIDs[ sort {
1220 if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]}) && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]}))
1221 {
1222 if ($sort_type eq "numeric") {
1223 $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] <=> $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
1224 } elsif ($sort_type eq "alphabetic") {
1225 $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
1226 } else {
1227 ncmp($OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0], $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0]);
1228 }
1229 }
1230 else
1231 {
1232 $a <=> $b;
1233 }
1234 } 0..$#OIDs ];
1235 }
1236 if ($self->{'reverse_sort_leaf_nodes'}) {
1237 return reverse @OIDs;
1238 }
1239 return @OIDs;
1240}
1241
1242
1243sub sort_metadata_values_array
1244{
1245 my $self = shift(@_);
1246 my ($sort_mode) = shift(@_);
1247 my @metadata_values = @_;
1248
1249 if ($sort_mode eq "unicode") {
1250 if ($self->{'unicode_collator'}) {
1251 return $self->{'unicode_collator'}->sort(@metadata_values);
1252 }
1253 # the collator wasn't loaded, fall back on default
1254 $sort_mode = "alphanumeric";
1255 }
1256 if ($sort_mode eq "numeric") {
1257 return sort {$a <=> $b} @metadata_values;
1258 }
1259 if ($sort_mode eq "alphabetic") {
1260 return sort {$a cmp $b} @metadata_values;
1261 }
1262 # natural sort
1263 return nsort(@metadata_values);
1264}
1265
1266# we are not using this any more. Using nsort instead
1267# $a and $b args automatically passed in and shouldn't be declared
1268sub alpha_numeric_cmp
1269{
1270 my $self = shift (@_);
1271 my ($aStr, $bStr) = @_;
1272 if ($aStr =~ m/^(\d+(\.\d+)?)/)
1273 {
1274 my $val_a = $1;
1275 if ($bStr =~ m/^(\d+(\.\d+)?)/)
1276 {
1277 my $val_b = $1;
1278 if ($val_a != $val_b)
1279 {
1280 return ($val_a <=> $val_b);
1281 }
1282 }
1283 }
1284
1285 return ($aStr cmp $bStr);
1286}
1287
1288
1289sub get_metadata_value_display {
1290 my $self = shift(@_);
1291 my ($metadata_group, $metadata_value) = @_;
1292 return $metadata_value if $self->{'standardize_capitalization'};
1293 my $actual_values_hash = $self->{$metadata_group . ".actualvalues"}->{$metadata_value};
1294 my $display_value ="";
1295 my $max_count=0;
1296 foreach my $v (keys %$actual_values_hash) {
1297 if ($actual_values_hash->{$v} > $max_count) {
1298 $display_value = $v;
1299 $max_count = $actual_values_hash->{$v};
1300 }
1301 }
1302 return $display_value;
1303}
13041;
Note: See TracBrowser for help on using the repository browser.