source: main/trunk/greenstone2/perllib/classify/List.pm@ 33454

Last change on this file since 33454 was 33454, checked in by kjdon, 5 years ago

updated metadata_selection_mode to be metadata_selection_mode_within_level - I had changed the param name, but not teh code to match. if either partition_type or numeric_partition_type is none, then set the other one for the same level to be none. Can't have half the values partitioned and not the other half

  • Property svn:keywords set to Author Date Id Revision
File size: 52.4 KB
Line 
1###########################################################################
2#
3# List.pm -- A general and flexible list classifier with most of
4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11#
12# Copyright (C) 2005 New Zealand Digital Library Project
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28# TO DO: - Remove punctuation from metadata values before sorting.
29# - Add an AZCompactList-style hlist option?
30#
31###########################################################################
32
33
34package List;
35
36
37use BaseClassifier;
38use Sort::Naturally;
39use strict;
40
41
42sub BEGIN {
43 @List::ISA = ('BaseClassifier');
44}
45
46
47my $metadata_selection_mode_list =
48 [
49 { 'name' => "firstvalue",
50 'desc' => "{List.metadata_selection.firstvalue}"},
51 { 'name' => "firstvalidmetadata",
52 'desc' => "{List.metadata_selection.firstvalidmetadata}"},
53 { 'name' => "allvalues",
54 'desc' => "{List.metadata_selection.allvalues}"} ];
55
56my $metadata_selection_mode_default = "firstvalidmetadata";
57
58my $valid_metadata_selection_modes = { 'firstvalue' => 1,
59 'firstvalidmetadata' => 1,
60 'allvalues' => 1 };
61
62my $metadata_sort_mode_list =
63 [
64 { 'name' => "unicode",
65 'desc' => "{List.metadata_sort.unicode}"},
66 { 'name' => "alphabetic",
67 'desc' => "{List.metadata_sort.alphabetic}"},
68 { 'name' => "alphanumeric",
69 'desc' => "{List.metadata_sort.alphanumeric}"} ];
70
71my $metadata_sort_mode_default = "alphanumeric";
72
73my $valid_metadata_sort_modes = { 'unicode' => 1,
74 'alphabetic' => 1,
75 'alphanumeric' => 1};
76my $partition_type_list =
77 [ { 'name' => "per_letter",
78 'desc' => "{List.level_partition.per_letter}" },
79 { 'name' => "approximate_size",
80 'desc' => "{List.level_partition.approximate_size}"},
81 { 'name' => "constant_size",
82 'desc' => "{List.level_partition.constant_size}" },
83 { 'name' => "none",
84 'desc' => "{List.level_partition.none}" } ];
85
86my $partition_type_default = "per_letter";
87
88my $valid_partition_types = { 'per_letter' => 1,
89 'constant_size' => 1,
90 'approximate_size' => 1,
91 'none' => 1};
92
93my $partition_size_default = 30;
94
95my $numeric_partition_type_list =
96 [ { 'name' => "per_digit",
97 'desc' => "{List.level_partition.per_digit}" },
98 { 'name' => "per_number",
99 'desc' => "{List.level_partition.per_number}" },
100 { 'name' => "single_partition",
101 'desc' => "{List.level_partition_single}" },
102 { 'name' => "approximate_size",
103 'desc' => "{List.level_partition.approximate_size_numeric}"},
104 { 'name' => "constant_size",
105 'desc' => "{List.level_partition.constant_size}" },
106 { 'name' => "none",
107 'desc' => "{List.level_partition.none}" } ];
108
109my $numeric_partition_type_default = "single_partition";
110
111my $valid_numeric_partition_types = { 'per_digit' => 1,
112 'per_number' => 1,
113 'constant_size' => 1,
114 'single_partition' => 1,
115 'approximate_size' => 1,
116 'none' =>1 };
117
118my $numeric_partition_size_default = 30;
119
120my $numeric_partition_name_length_default = "-1"; # use the full number
121
122my $bookshelf_type_list =
123 [ { 'name' => "always",
124 'desc' => "{List.bookshelf_type.always}" },
125 { 'name' => "duplicate_only",
126 'desc' => "{List.bookshelf_type.duplicate_only}" },
127 { 'name' => "never",
128 'desc' => "{List.bookshelf_type.never}" } ];
129
130my $bookshelf_type_default = "never";
131my $sort_leaf_nodes_using_default = "Title";
132my $arguments =
133 [ { 'name' => "metadata",
134 'desc' => "{List.metadata}",
135 'type' => "metadata",
136 'reqd' => "yes" },
137
138 { 'name' => "metadata_selection_mode_within_level",
139 'desc' => "{List.metadata_selection_mode_within_level}",
140 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
141 'list' => $metadata_selection_mode_list,
142 'deft' => $metadata_selection_mode_default },
143
144 { 'name' => "metadata_sort_mode_within_level",
145 'desc' => "{List.metadata_sort_mode_within_level}",
146 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
147 'list' => $metadata_sort_mode_list,
148 'deft' => $metadata_sort_mode_default },
149
150 { 'name' => "bookshelf_type",
151 'desc' => "{List.bookshelf_type}",
152 'type' => "enum",
153 'list' => $bookshelf_type_list,
154 'deft' => $bookshelf_type_default },
155
156 { 'name' => "classify_sections",
157 'desc' => "{List.classify_sections}",
158 'type' => "flag" },
159
160 { 'name' => "partition_type_within_level",
161 'desc' => "{List.partition_type_within_level}",
162 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
163 'list' => $partition_type_list,
164 'deft' => $partition_type_default },
165
166 { 'name' => "partition_size_within_level",
167 'desc' => "{List.partition_size_within_level}",
168 'type' => "string", # Must be string because multiple values can be specified (separated by '/')
169 'deft' => $partition_size_default},
170
171 { 'name' => "partition_name_length",
172 'desc' => "{List.partition_name_length}",
173 'type' => "string" },
174
175 {'name' => "partition_sort_mode_within_level",
176 'desc' => "{List.partition_sort_mode_within_level}",
177 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
178 'list' => $metadata_sort_mode_list,
179 'deft' => $metadata_sort_mode_default },
180
181 { 'name' => "numeric_partition_type_within_level",
182 'desc' => "{List.numeric_partition_type_within_level}",
183 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
184 'list' => $numeric_partition_type_list,
185 'deft' => $numeric_partition_type_default },
186
187 {'name' => "numeric_partition_sort_mode_within_level",
188 'desc' => "{List.numeric_partition_sort_mode_within_level}",
189 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
190 'list' => $metadata_sort_mode_list,
191 'deft' => $metadata_sort_mode_default },
192
193 { 'name' => "numeric_partition_size_within_level",
194 'desc' => "{List.numeric_partition_size_within_level}",
195 'type' => "string", # Must be string because multiple values can be specified (separated by '/')
196 'deft' => $numeric_partition_size_default},
197
198 { 'name' => "numeric_partition_name_length_within_level",
199 'desc' => "{List.numeric_partition_name_length_within_level}",
200 'type' => "string",
201 'deft' => $numeric_partition_name_length_default },
202
203 { 'name' => "numbers_first",
204 'desc' => "{List.numbers_first",
205 'type' => 'flag'},
206
207 { 'name' => "sort_leaf_nodes_using",
208 'desc' => "{List.sort_leaf_nodes_using}",
209 'type' => "metadata",
210 'deft' => $sort_leaf_nodes_using_default },
211
212 { 'name' => "sort_leaf_nodes_sort_mode",
213 'desc' => "{List.sort_leaf_nodes_sort_mode}",
214 'type' => "enum",
215 'list' => $metadata_sort_mode_list,
216 'deft' => $metadata_sort_mode_default },
217
218 { 'name' => "reverse_sort_leaf_nodes",
219 'desc' => "{List.reverse_sort_leaf_nodes}",
220 'type' => "flag"},
221
222 { 'name' => "sort_using_unicode_collation",
223 'desc' => "{List.metadata_sort.unicode} {List.sort_using_unicode_collation}",
224 'type' => "flag" },
225
226 { 'name' => "use_hlist_for",
227 'desc' => "{List.use_hlist_for}",
228 'type' => "string" },
229
230 {'name' => "filter_metadata",
231 'desc' => "{List.filter_metadata}",
232 'type' => "metadata"},
233
234 {'name' => "filter_regex",
235 'desc' => "{List.filter_regex}",
236 'type' => "regexp"},
237
238 { 'name' => "standardize_capitalization",
239 'desc' => "{List.standardize_capitalization}",
240 'type' => "flag"},
241
242 { 'name' => "removeprefix",
243 'desc' => "{BasClas.removeprefix}",
244 'type' => "regexp" },
245
246 { 'name' => "removesuffix",
247 'desc' => "{BasClas.removesuffix}",
248 'type' => "regexp" } ];
249
250my $options = { 'name' => "List",
251 'desc' => "{List.desc}",
252 'abstract' => "no",
253 'inherits' => "yes",
254 'args' => $arguments };
255
256
257sub new
258{
259 my ($class) = shift(@_);
260 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
261 push(@$classifierslist, $class);
262
263 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
264 push(@{$hashArgOptLists->{"OptList"}}, $options);
265
266 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
267
268 bless $self, $class;
269
270 if ($self->{'info_only'}) {
271 # don't worry about any options etc
272 return $self;
273 }
274
275 # The metadata elements to use (required)
276 if (!$self->{'metadata'}) {
277 die "Error: No metadata fields specified for List.\n";
278 }
279
280 my @metadata_groups = split(/[\/]/, $self->{'metadata'});
281 $self->{'metadata_groups'} = \@metadata_groups;
282
283 # The classifier button name (default: the first metadata element specified)
284 if (!$self->{'buttonname'}) {
285 my $first_metadata_group = $metadata_groups[0];
286 my $first_metadata_element = (split(/[\;|,\/]/, $first_metadata_group))[0];
287 $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
288 }
289
290 # meta selection mode for each level
291 $self->set_metadata_groups_info_per_level("metadata_selection_mode_within_level", $metadata_selection_mode_default, $valid_metadata_selection_modes);
292
293 # meta sort mode for each level
294 if ($self->{'sort_using_unicode_collation'}) {
295 print STDERR "WARNING: sort_using_unicode_collation is set, setting metadata_sort_mode_within_level to unicode for all levels, regardless of current setting\n";
296 $self->{'metadata_sort_mode_within_level'} = "unicode";
297 $metadata_sort_mode_default = "unicode";
298 } else {
299 if ($self->{'metadata_sort_mode_within_level'} =~ /unicode/) {
300 $self->{'sort_using_unicode_collation'} = 1;
301 }
302 }
303
304
305 $self->set_metadata_groups_info_per_level('metadata_sort_mode_within_level', $metadata_sort_mode_default, $valid_metadata_sort_modes);
306
307 # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
308 foreach my $metadata_group (@metadata_groups) {
309 $self->{$metadata_group . ".bookshelf_type"} = "always";
310 }
311 my $last_metadata_group = $metadata_groups[$#metadata_groups];
312 # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
313 $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
314
315 # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
316 foreach my $metadata_group (@metadata_groups) {
317 $self->{$metadata_group . ".list_type"} = "VList";
318 }
319 foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
320 $self->{$metadata_group . ".list_type"} = "HList";
321 }
322
323 # How the items are grouped into partitions (default: by letter)
324 # for each level (metadata group), separated by '/'
325
326 $self->set_metadata_groups_info_per_level("partition_type_within_level", $partition_type_default, $valid_partition_types);
327
328 $self->set_metadata_groups_info_per_level("numeric_partition_type_within_level", $numeric_partition_type_default, $valid_numeric_partition_types);
329
330 # now check whether a level was none - need to set the equivalent level in the other half also to none
331 foreach my $metadata_group (@metadata_groups) {
332 if ($self->{$metadata_group . ".partition_type_within_level"} eq "none" || $self->{$metadata_group . ".numeric_partition_type_within_level"} eq "none") {
333
334 print STDERR "WARNING: one of -partition_type_within_level or -numeric_partition_type_within_level was set to 'none' for level $metadata_group, overriding current value of both these options to 'none'\n";
335
336 $self->{$metadata_group . ".partition_type_within_level"} = "none";
337 $self->{$metadata_group . ".numeric_partition_type_within_level"} = "none";
338 }
339 }
340
341 $self->set_metadata_groups_info_per_level("partition_size_within_level", $partition_size_default);
342 $self->set_metadata_groups_info_per_level("numeric_partition_size_within_level", $numeric_partition_size_default);
343
344 $self->set_metadata_groups_info_per_level('partition_sort_mode_within_level', $metadata_sort_mode_default, $valid_metadata_sort_modes);
345
346 $self->set_metadata_groups_info_per_level('numeric_partition_sort_mode_within_level', $metadata_sort_mode_default, $valid_metadata_sort_modes);
347
348 $self->set_metadata_groups_info_per_level("numeric_partition_name_length_within_level", $numeric_partition_name_length_default);
349
350 # The removeprefix and removesuffix expressions
351 if ($self->{'removeprefix'}) {
352 # If there are more than one expressions, use '' to quote each experession and '/' to separate
353 my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
354
355 foreach my $metadata_group (@metadata_groups) {
356 my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
357 if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
358 # Remove the other ' at the beginning and the end if there is any
359 $removeprefix_expr_within_levelelem =~ s/^'//;
360 $removeprefix_expr_within_levelelem =~ s/'$//;
361 # Remove the extra ^ at the beginning
362 $removeprefix_expr_within_levelelem =~ s/^\^//;
363 $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
364 } else {
365 $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
366 }
367 }
368 }
369 if ($self->{'removesuffix'}) {
370 my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
371
372 foreach my $metadata_group (@metadata_groups) {
373 my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
374 if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
375 $removesuffix_expr_within_levelelem =~ s/^'//;
376 $removesuffix_expr_within_levelelem =~ s/'$//;
377 # Remove the extra $ at the end
378 $removesuffix_expr_within_levelelem =~ s/\$$//;
379 $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
380 } else {
381 $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
382 }
383 }
384 }
385
386 # The metadata elements to use to sort the leaf nodes (default: Title)
387 my @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
388 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
389
390 foreach my $sort_group (@sort_leaf_nodes_using_metadata_groups) {
391 # set metadata_select_type, if not already set - might be already set if the same group was used in -metadata
392 if (!defined $self->{$sort_group . ".metadata_selection_mode_within_level"}) {
393 $self->{$sort_group . ".metadata_selection_mode_within_level"} = $metadata_selection_mode_default;
394 }
395
396 }
397
398 my @leaf_nodes_sort_modes = split (/\|/, $self->{'sort_leaf_nodes_sort_mode'});
399 foreach my $sort_group (@sort_leaf_nodes_using_metadata_groups) {
400 my $leaf_sort_mode = shift(@leaf_nodes_sort_modes);
401 if (!defined $self->{$sort_group . ".metadata_sort_mode_within_level"}) {
402 if (defined $leaf_sort_mode && defined $valid_metadata_sort_modes->{$leaf_sort_mode}) {
403 $self->{$sort_group . ".metadata_sort_mode_within_level"} = $leaf_sort_mode;
404 }
405 else {
406 $self->{$sort_group . ".metadata_sort_mode_within_level"} = $metadata_sort_mode_default;
407 }
408 }
409
410 }
411
412 # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
413 if ($self->{'sort_using_unicode_collation'}) {
414 # To use this you first need to download the allkeys.txt file from
415 # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
416 # Unicode/Collate directory.
417 require Unicode::Collate;
418 $self->{'unicode_collator'} = Unicode::Collate->new();
419 }
420
421 # An empty array for the document/section OIDs that we are classifying
422 $self->{'OIDs'} = [];
423 # A hash for all the doc ids that we have seen, so we don't classify something twice
424 $self->{'all_doc_OIDs'} = {};
425 #return bless $self, $class;
426 return $self;
427}
428
429
430sub init
431{
432 # Nothing to do...
433}
434
435sub set_metadata_groups_info_per_level
436{
437 my $self = shift(@_);
438 my $info_name = shift(@_);
439 my $info_default = shift(@_);
440 my $info_valid_types_hash_ref = shift(@_);
441
442 if (!defined $self->{$info_name}) {
443 print STDERR "**** undefined $info_name\n";
444 }
445 my @info_list = split(/\//, $self->{$info_name});
446
447 my $first = 1;
448 foreach my $metadata_group (@{$self->{'metadata_groups'}}) {
449 my $info_elem = shift(@info_list);
450 if (defined ($info_elem) && (!defined $info_valid_types_hash_ref || defined $info_valid_types_hash_ref->{$info_elem})) {
451 $self->{$metadata_group .".$info_name"} = $info_elem;
452 } else {
453 # its empty or an invalid entry
454 my $new_info_elem;
455 if ($first) {
456 $new_info_elem = $info_default;
457 } else {
458 # get the value we had at first
459 $new_info_elem = $self->{@{$self->{'metadata_groups'}}[0] . ".$info_name"};
460 }
461 $self->{$metadata_group .".$info_name"} = $new_info_elem;
462 if (defined $info_elem) {
463 print STDERR "List Error: $info_elem is not a valid value for $info_name, changing it to $new_info_elem\n";
464
465 }
466 }
467 $first = 0;
468 }
469
470}
471
472# Called for each document in the collection
473sub classify
474{
475 my $self = shift(@_);
476 my ($doc_obj) = @_;
477
478 if (defined $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()}) {
479 print STDERR "Warning, List classifier has already seen document ".$doc_obj->get_OID().", not classifying again\n";
480 return;
481 }
482
483 $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()} = 1;
484
485 # check against filter here
486 if ($self->{'filter_metadata'}) {
487 my $meta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'filter_metadata'});
488 return unless defined $meta;
489 if ($self->{'filter_regex'} ne "" && $meta !~ /$self->{'filter_regex'}/) {
490 print STDERR "doc $doc_obj doesn't pass filtering requirement\n" if ($self->{'verbosity'} > 3);
491 return;
492 }
493 }
494 # if we get here, we have passed the test for filtering
495 # If "-classify_sections" is set, classify every section of the document
496 if ($self->{'classify_sections'}) {
497 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
498 while (defined $section) {
499 $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section);
500 $section = $doc_obj->get_next_section($section);
501 }
502 }
503 # Otherwise just classify the top document section
504 else {
505 $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section());
506 }
507
508}
509
510sub classify_section
511{
512 my $self = shift(@_);
513 my ($doc_obj,$section_OID,$section) = @_;
514
515 my @metadata_groups = @{$self->{'metadata_groups'}};
516
517 # Only classify the section if it has a value for one of the metadata elements in the first group
518 my $classify_section = 0;
519 my $first_metadata_group = $metadata_groups[0];
520 my $remove_prefix_expr = $self->{$first_metadata_group . ".remove_prefix_expr"};
521 my $remove_suffix_expr = $self->{$first_metadata_group . ".remove_suffix_expr"};
522 foreach my $first_metadata_group_element (split(/\;|,/, $first_metadata_group)) {
523 my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
524 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element);
525 # Remove prefix/suffix if requested
526 if (defined ($first_metadata_group_element_value)) {
527 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
528 $first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
529 }
530
531 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
532 $first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
533 }
534 }
535
536 $first_metadata_group_element_value = &sorttools::format_metadata_for_sorting($first_metadata_group, $first_metadata_group_element_value, $doc_obj) unless $self->{'no_metadata_formatting'};
537 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
538 # This section must be included in the classifier as we have found a value
539 $classify_section = 1;
540 last;
541 }
542 }
543
544 # We're not classifying this section because it doesn't have the required metadata
545 return if (!$classify_section);
546
547 # Otherwise, include this section in the classifier
548
549 push(@{$self->{'OIDs'}}, $section_OID);
550
551 # Create a hash for the metadata values of each metadata element we're interested in
552 my %metadata_groups_done = ();
553 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
554 # Take care not to do a metadata group more than once
555 unless ($metadata_groups_done{$metadata_group}) {
556 my $remove_prefix_expr = $self->{$metadata_group . ".remove_prefix_expr"};
557 my $remove_suffix_expr = $self->{$metadata_group . ".remove_suffix_expr"};
558 foreach my $metadata_element (split(/\;|,/, $metadata_group)) {
559 my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);
560
561 my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
562 foreach my $metadata_value (@metadata_values) {
563 #print STDERR "working with value $metadata_value\n";
564 # Strip leading and trailing whitespace
565 $metadata_value =~ s/^\s*//;
566 $metadata_value =~ s/\s*$//;
567
568 # Remove prefix/suffix if requested
569 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
570 $metadata_value =~ s/^$remove_prefix_expr//;
571 }
572 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
573 $metadata_value =~ s/$remove_suffix_expr$//;
574 }
575
576 # lowercase metadata both for sorting meta (d/D under D), and to allow CSS to be able to
577 # text-transform the stored lowercase values as capitalize or uppercase (can't CSS
578 # text-transform if stored uppercase). 2 CSS text-transforms have been added to core.css
579 my $lc_metadata_value = lc($metadata_value);
580 $lc_metadata_value = &sorttools::format_metadata_for_sorting($real_metadata_element, $lc_metadata_value, $doc_obj) unless $self->{'no_metadata_formatting'};
581 #print STDERR "formatted value = $lc_metadata_value\n";
582
583 # Add the metadata value into the list for this combination of metadata group
584 # and section - if we have some non-whitespace chars
585 # text that we have some non-whitespace chars
586 if ($lc_metadata_value =~ /\S/) {
587
588 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $lc_metadata_value);
589
590 # add the actual value into the stored values so we can remember the case
591 if (!$self->{'standardize_capitalization'}) {
592 if (defined $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}) {
593 $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}++;
594 } else {
595 $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value} = 1;
596 }
597 }
598 last if ($self->{$metadata_group . ".metadata_selection_mode_within_level"} eq "firstvalue");
599 }
600 } # foreach metadatavalue
601 last if ((@metadata_values > 0) && $self->{$metadata_group . ".metadata_selection_mode_within_level"} =~ /^(firstvalue|firstvalidmetadata)$/ );
602 } # foreach metadata element
603
604 $metadata_groups_done{$metadata_group} = 1;
605 }
606 }
607}
608
609
610sub get_classify_info
611{
612 my $self = shift(@_);
613
614 # The metadata groups to classify by
615 my @metadata_groups = @{$self->{'metadata_groups'}};
616 my $first_metadata_group = $metadata_groups[0];
617
618 # The OID values of the documents to include in the classifier
619 my @OIDs = @{$self->{'OIDs'}};
620
621 # Create the root node of the classification hierarchy
622 my %classifier_node = ( 'thistype' => "Invisible",
623 'childtype' => $self->{$first_metadata_group . ".list_type"},
624 'Title' => $self->{'buttonname'},
625 'contains' => [],
626 'mdtype' => $first_metadata_group );
627
628 # Recursively create the classification hierarchy, one level for each metadata group
629 $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
630 return \%classifier_node;
631}
632
633
634sub add_level
635{
636 my $self = shift(@_);
637 my @metadata_groups = @{shift(@_)};
638 my @OIDs = @{shift(@_)};
639 my $classifier_node = shift(@_);
640
641 my $metadata_group = $metadata_groups[0];
642 if (!defined($self->{$metadata_group . ".list"})) {
643 print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
644 return;
645 }
646
647 # Create a mapping from metadata value to OID
648 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
649 my %metadata_value_to_OIDs_hash = ();
650 my %numeric_metadata_value_to_OIDs_hash = ();
651 foreach my $OID (@OIDs)
652 {
653 if ($OID_to_metadata_values_hash_ref->{$OID})
654 {
655 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
656 foreach my $metadata_value (@metadata_values)
657 {
658 if ($metadata_value =~ /^[0-9]/) {
659 push(@{$numeric_metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
660 } else {
661 push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
662 }
663 }
664 }
665 }
666 #print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
667 #print STDERR "Number of distinct numeric values: " . scalar(keys %numeric_metadata_value_to_OIDs_hash) . "\n";
668
669 # Partition the values (if necessary)
670 my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
671 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
672 my $partition_sort_mode_within_level = $self->{$metadata_group . ".partition_sort_mode_within_level"};
673 my $bookshelf_type_within_level = $self->{$metadata_group. ".bookshelf_type"};
674
675
676 #############################################
677 ### DO THE NUMBERS IF THEY ARE TO COME FIRST
678 #############################################
679 if ($self->{'numbers_first'} && keys(%numeric_metadata_value_to_OIDs_hash)) {
680 $self->partition_numeric_values(\@metadata_groups, $classifier_node, \%numeric_metadata_value_to_OIDs_hash);
681 }
682
683 ############################################
684 # DO THE LETTERS
685 ############################################
686 if (keys(%metadata_value_to_OIDs_hash)){ # make sure we have some values
687 if ($partition_type_within_level =~ /^per_letter$/i) {
688 $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $partition_sort_mode_within_level, \%metadata_value_to_OIDs_hash);
689 }
690 elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
691 $self->split_approximate_size(\@metadata_groups, $classifier_node, $partition_size_within_level, $partition_sort_mode_within_level, $bookshelf_type_within_level, \%metadata_value_to_OIDs_hash, $self->{'partition_name_length'});
692
693 }
694 elsif ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
695 $self->split_constant_size(\@metadata_groups, $classifier_node, $partition_size_within_level, $partition_sort_mode_within_level, \%metadata_value_to_OIDs_hash, $self->{'partition_name_length'});
696 }
697
698 # Otherwise just add all the values to a VList
699 else {
700 $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
701 }
702 }
703
704 ###########################################
705 ### DO THE NUMBERS IF THEY ARE TO COME LAST
706 ###########################################
707 if (!$self->{'numbers_first'} && keys(%numeric_metadata_value_to_OIDs_hash)) {
708 $self->partition_numeric_values(\@metadata_groups, $classifier_node, \%numeric_metadata_value_to_OIDs_hash);
709 }
710}
711
712sub partition_numeric_values
713{
714 my $self = shift(@_);
715 my @metadata_groups = @{shift(@_)};
716 my $classifier_node = shift(@_);
717 my $numeric_metadata_value_to_OIDs_hash_ref = shift(@_);
718
719 my $metadata_group = $metadata_groups[0];
720 my $numeric_partition_type_within_level = $self->{$metadata_group . ".numeric_partition_type_within_level"};
721 my $numeric_partition_size_within_level = $self->{$metadata_group . ".numeric_partition_size_within_level"};
722 my $numeric_partition_sort_mode_within_level = $self->{$metadata_group . ".numeric_partition_sort_mode_within_level"};
723 my $numeric_partition_name_length_within_level = $self->{$metadata_group . ".numeric_partition_name_length_within_level"};
724 my $bookshelf_type_within_level = $self->{$metadata_group. ".bookshelf_type"};
725
726 if ($numeric_partition_type_within_level eq "single_partition") {
727 $self->add_hlist_partition(\@metadata_groups, $classifier_node, "0-9", $numeric_metadata_value_to_OIDs_hash_ref);
728 }
729 elsif ($numeric_partition_type_within_level eq "per_digit") {
730 $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $numeric_partition_sort_mode_within_level, $numeric_metadata_value_to_OIDs_hash_ref, 1, 1);
731 }
732 elsif ($numeric_partition_type_within_level eq "per_number") {
733 # each different number is a bucket
734 $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $numeric_partition_sort_mode_within_level, $numeric_metadata_value_to_OIDs_hash_ref, 1, $numeric_partition_name_length_within_level);
735 }
736 elsif ($numeric_partition_type_within_level eq "constant_size" && scalar(keys %$numeric_metadata_value_to_OIDs_hash_ref) > $numeric_partition_size_within_level) {
737 # Generate hlists of a certain size
738
739 $self->split_constant_size(\@metadata_groups, $classifier_node, $numeric_partition_size_within_level, $numeric_partition_sort_mode_within_level, $numeric_metadata_value_to_OIDs_hash_ref, $numeric_partition_name_length_within_level, 1);
740 } elsif ($numeric_partition_type_within_level eq "approximate_size" && scalar(keys %$numeric_metadata_value_to_OIDs_hash_ref) > $numeric_partition_size_within_level) {
741 $self->split_approximate_size(\@metadata_groups, $classifier_node, $numeric_partition_size_within_level, $numeric_partition_sort_mode_within_level, $bookshelf_type_within_level, $numeric_metadata_value_to_OIDs_hash_ref, $numeric_partition_name_length_within_level, 1);
742 }
743 # Otherwise just add all the values to a VList
744 else {
745 $self->add_vlist(\@metadata_groups, $classifier_node, $numeric_metadata_value_to_OIDs_hash_ref);
746 }
747
748}
749
750sub split_approximate_size
751{
752
753 my $self = shift(@_);
754 my @metadata_groups = @{shift(@_)};
755 my $classifier_node = shift(@_);
756 my $partition_size = shift(@_);
757 my $sort_mode = shift(@_);
758 my $bookshelf_type = shift(@_);
759 my $metadata_value_to_OIDs_hash_ref = shift(@_);
760 my $partition_name_length = shift(@_);
761 my $is_numeric = shift(@_);
762
763 # Generate hlist based on the first letter of the metadata value (like per_letter), or based on
764 # numbers, but also with restriction on the partition size
765 # If a partition has fewer items than specified by the "partition_size_within_level", then group them together if possible
766 # If a partition has more items than specified, split into several hlists.
767 # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
768
769 my @sortedmetadata_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref));
770
771 # Separate values by their first letter, each form a bucket, like the per_letter partition type
772 my $last_partition = $self->generate_partition_name($sortedmetadata_values[0], $partition_name_length, $is_numeric);
773
774 my @partition_buckets = ();
775 my @metadata_values_in_bucket = ();
776 my $num_items_in_bucket = 0;
777
778 foreach my $metadata_value (@sortedmetadata_values) {
779 my $metadata_valuepartition = $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric);
780 if ($metadata_valuepartition ne $last_partition) {
781 my @temp_array = @metadata_values_in_bucket;
782 # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
783 my %partition_info = ();
784 $partition_info{'metadata_values'} = \@temp_array;
785 $partition_info{'size'} = $num_items_in_bucket;
786 $partition_info{'name'} = $last_partition;
787 push (@partition_buckets, \%partition_info);
788
789 @metadata_values_in_bucket = ($metadata_value);
790 $num_items_in_bucket = ($bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : scalar(@metadata_values_in_bucket));
791 $last_partition = $metadata_valuepartition;
792 } else {
793 $num_items_in_bucket += ($bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1); #scalar(@metadata_values_in_bucket);
794 push (@metadata_values_in_bucket, $metadata_value);
795 }
796 }
797 # Last one
798 my %partition_info = ();
799 $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
800 $partition_info{'size'} = $num_items_in_bucket;
801 $partition_info{'name'} = $last_partition;
802
803 push (@partition_buckets, \%partition_info);
804
805 # now go through the array of buckets, and merge small buckets
806 my @new_partition_buckets = ();
807 for (my $i = 0; $i < scalar(@partition_buckets) - 1; $i++) {
808
809 my $this_bucket = $partition_buckets[$i]; #->{'name'};
810 my $next_bucket = $partition_buckets[$i+1]; #->{'name'};
811
812 my $items_in_partition = $this_bucket->{'size'};
813
814 if ($items_in_partition < $partition_size ) {
815 my $items_in_next_partition = $next_bucket->{'size'};
816 if ($items_in_partition + $items_in_next_partition <= $partition_size ) {
817 # merge this bucket into the next bucket
818 foreach my $metadata_value_to_merge (@{$this_bucket->{'metadata_values'}}) {
819 push(@{$next_bucket->{'metadata_values'}}, $metadata_value_to_merge);
820 }
821 $next_bucket->{'size'} += $items_in_partition;
822
823 } else {
824 # remember this bucket
825 push (@new_partition_buckets, $this_bucket);
826 }
827 } else {
828 # remember this bucket
829 push (@new_partition_buckets, $this_bucket);
830 }
831 }
832 # add in the last bucket
833 my $last_bucket = $partition_buckets[scalar(@partition_buckets) - 1];
834 push (@new_partition_buckets, $last_bucket);
835
836 # Add partitions to the main list, but divide big bucket into several
837 my $last_partition_end = "";
838 my $partition_start = "";
839 foreach my $partition (@new_partition_buckets) {
840 my @metadata_values = $self->sort_metadata_values_array($sort_mode, @{$partition->{'metadata_values'}});
841 my $items_in_partition = $partition->{'size'};
842 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $partition_name_length, $is_numeric);
843
844 if ($items_in_partition > $partition_size) {
845 my $items_done = 0;
846 my %metadata_values_to_OIDs_subhashes = ();
847 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
848 my $metadata_value = $metadata_values[$i];
849 # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
850 my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1;
851
852 my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $partition_name_length, $is_numeric);
853 my $partitionname = $partition_start;
854 if ($partitionend ne $partition_start) {
855 $partitionname = $partitionname . "-" . $partitionend;
856 }
857
858 if ($items_done + $items_for_this_md_value > $partition_size && $items_done != 0) {
859 # Start a new partition
860 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
861 $last_partition_end = $partitionend;
862 $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $is_numeric);
863 $items_done = 0;
864 %metadata_values_to_OIDs_subhashes = ();
865 }
866
867 # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
868 if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size) {
869
870 my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $is_numeric);
871 # Get the number of partitions needed for this value
872 my $num_splits = int($items_for_this_md_value / $partition_size);
873 $num_splits++ if ($items_for_this_md_value / $partition_size > $num_splits);
874
875 my @OIDs_for_this_value = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
876 for (my $i = 0; $i < $num_splits; $i++) {
877 my %OIDs_subhashes_for_this_value = ();
878 my @OIDs_for_this_partition = ();
879 for (my $d = $i * $partition_size; $d < (($i+1) * $partition_size > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size); $d++) {
880 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
881 }
882
883 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
884 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size) {
885 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
886 $items_done += scalar(@OIDs_for_this_partition);
887 next;
888 }
889
890 # Add an HList for this bucket
891 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
892 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
893 $last_partition_end = $partitionname_for_this_value;
894 }
895 next;
896 }
897
898 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
899 $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;
900
901 # The last partition
902 if($i == scalar(@metadata_values) - 1) {
903 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
904 }
905 }
906 } # end if items in partition > partition size
907 else {
908 # The easier case, just add a partition
909 my %metadata_values_to_OIDs_subhashes = ();
910 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
911 my $metadata_value = $metadata_values[$i];
912 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
913 }
914 my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
915 my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $partition_name_length, $is_numeric);
916 my $partitionname = $partition_start;
917 if ($partitionend ne $partition_start) {
918 $partitionname = $partitionname . "-" . $partitionend;
919 }
920
921 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
922 $last_partition_end = $partitionend;
923 }
924 }
925
926 # The partitions are stored in an HList
927 $classifier_node->{'childtype'} = "HList";
928
929}
930
931sub split_constant_size
932{
933 my $self = shift(@_);
934 my @metadata_groups = @{shift(@_)};
935 my $classifier_node = shift(@_);
936 my $partition_size = shift(@_);
937 my $sort_mode = shift(@_);
938 my $metadata_value_to_OIDs_hash_ref = shift(@_);
939 my $partition_name_length = shift(@_);
940 my $is_numeric = shift(@_);
941
942 my @sortedmetadata_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref));
943 my $itemsdone = 0;
944 my $numitems = scalar(@sortedmetadata_values);
945 my %metadata_value_to_OIDs_subhash = ();
946 my $lastpartitionend = "";
947 my $partitionstart;
948 foreach my $metadata_value (@sortedmetadata_values) {
949 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
950 $itemsdone++;
951 my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
952
953 # Is this the start of a new partition?
954 if ($itemsinpartition == 1) {
955 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $partition_name_length, $is_numeric);
956 }
957
958 # Is this the end of the partition?
959 if ($itemsinpartition == $partition_size || $itemsdone == $numitems) {
960 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $partition_name_length, $is_numeric);
961 my $partitionname = $partitionstart;
962 if ($partitionend ne $partitionstart) {
963 $partitionname = $partitionname . "-" . $partitionend;
964 }
965
966 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
967 %metadata_value_to_OIDs_subhash = ();
968 $lastpartitionend = $partitionend;
969 }
970 }
971
972 # The partitions are stored in an HList
973 $classifier_node->{'childtype'} = "HList";
974
975}
976
977sub split_per_letter_or_digit
978{
979 my $self = shift(@_);
980 my @metadata_groups = @{shift(@_)};
981 my $classifier_node = shift(@_);
982 my $sort_mode = shift(@_);
983 my $metadata_value_to_OIDs_hash_ref = shift(@_);
984 my $is_numeric = shift(@_);
985 my $numeric_partition_length = shift(@_);
986
987 if (not defined $is_numeric) {
988 $is_numeric = 0;
989 }
990 if ($is_numeric && not defined($numeric_partition_length)) {
991 $numeric_partition_length = 1;
992 }
993 # Generate one hlist for each letter
994 my @sortedmetadata_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref));
995 my %metadata_value_to_OIDs_subhash = ();
996
997 my $lastpartition = $self->generate_partition_name($sortedmetadata_values[0], $numeric_partition_length, $is_numeric);
998 foreach my $metadata_value (@sortedmetadata_values) {
999
1000 my $metadata_valuepartition = $self->generate_partition_name($metadata_value, $numeric_partition_length, $is_numeric);
1001
1002 # Is this the start of a new partition?
1003 if ($metadata_valuepartition ne $lastpartition) {
1004 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
1005 %metadata_value_to_OIDs_subhash = ();
1006 $lastpartition = $metadata_valuepartition;
1007 }
1008
1009 $metadata_value_to_OIDs_subhash{$metadata_value} = %$metadata_value_to_OIDs_hash_ref{$metadata_value};
1010 }
1011
1012 # Don't forget to add the last partition
1013 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
1014
1015 # The partitions are stored in an HList
1016 $classifier_node->{'childtype'} = "HList";
1017
1018}
1019
1020sub generate_partition_name
1021{
1022 my $self = shift(@_);
1023 my $mvalue = shift(@_);
1024 my $numeric_partition_length = shift(@_);
1025 my $is_numeric = shift(@_);
1026
1027 if (!$is_numeric || $numeric_partition_length == 1 ) {
1028 return substr($mvalue, 0, 1);
1029 }
1030 if ($numeric_partition_length == -1) {
1031 my ($all_digits) = $mvalue =~ /^([0-9]+)/;
1032 return $all_digits;
1033 }
1034 my ($some_digits) = $mvalue =~ /^([0-9]{1,$numeric_partition_length})/;
1035 return $some_digits;
1036}
1037
1038sub generate_partition_start
1039{
1040 my $self = shift(@_);
1041 my $metadata_value = shift(@_);
1042 my $lastpartitionend = shift(@_);
1043 my $partition_name_length = shift(@_);
1044 my $is_numeric = shift(@_);
1045
1046 if ($is_numeric) {
1047 return $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric);
1048 }
1049
1050 if ($partition_name_length) {
1051 return substr($metadata_value, 0, $partition_name_length);
1052 }
1053
1054 my $partitionstart = substr($metadata_value, 0, 1);
1055 if ($partitionstart le $lastpartitionend) {
1056 $partitionstart = substr($metadata_value, 0, 2);
1057 # Give up after three characters
1058 if ($partitionstart le $lastpartitionend) {
1059 $partitionstart = substr($metadata_value, 0, 3);
1060 }
1061 }
1062
1063 return $partitionstart;
1064}
1065
1066
1067sub generate_partition_end
1068{
1069 my $self = shift(@_);
1070 my $metadata_value = shift(@_);
1071 my $partitionstart = shift(@_);
1072 my $partition_name_length = shift(@_);
1073 my $is_numeric = shift(@_);
1074
1075 if ($is_numeric) {
1076 return $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric);
1077 }
1078 if ($partition_name_length) {
1079 return substr($metadata_value, 0, $partition_name_length);
1080 }
1081
1082 my $partitionend = substr($metadata_value, 0, length($partitionstart));
1083 if ($partitionend gt $partitionstart) {
1084 $partitionend = substr($metadata_value, 0, 1);
1085 if ($partitionend le $partitionstart) {
1086 $partitionend = substr($metadata_value, 0, 2);
1087 # Give up after three characters
1088 if ($partitionend le $partitionstart) {
1089 $partitionend = substr($metadata_value, 0, 3);
1090 }
1091 }
1092 }
1093
1094 return $partitionend;
1095}
1096
1097
1098sub add_hlist_partition
1099{
1100 my $self = shift(@_);
1101 my @metadata_groups = @{shift(@_)};
1102 my $classifier_node = shift(@_);
1103 my $partitionname = shift(@_);
1104 my $metadata_value_to_OIDs_hash_ref = shift(@_);
1105
1106 # Create an hlist partition
1107 # Note that we don't need to convert from unicode-aware strings
1108 # to utf8 here, as that is handled elsewhere in the code
1109 my %child_classifier_node = ( 'Title' => $partitionname, #'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
1110 'childtype' => "VList",
1111 'contains' => [] );
1112
1113 # Add the children to the hlist partition
1114 $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
1115 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
1116}
1117
1118
1119sub add_vlist
1120{
1121 my $self = shift(@_);
1122 my @metadata_groups = @{shift(@_)};
1123 my $classifier_node = shift(@_);
1124 my $metadata_value_to_OIDs_hash_ref = shift(@_);
1125
1126 my $metadata_group = shift(@metadata_groups);
1127 $classifier_node->{'mdtype'} = $metadata_group;
1128
1129 my $sort_type = $self->{$metadata_group .".metadata_sort_mode_within_level"};
1130 # Create an entry in the vlist for each value
1131 foreach my $metadata_value ($self->sort_metadata_values_array($sort_type, keys(%{$metadata_value_to_OIDs_hash_ref})))
1132 {
1133 my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
1134 # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
1135 if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
1136 my $OID = $OIDs[0];
1137 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
1138 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
1139 }
1140 # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
1141 elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
1142 @OIDs = $self->sort_leaf_items(\@OIDs);
1143 foreach my $OID (@OIDs) {
1144 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
1145 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
1146 }
1147
1148 }
1149 # Otherwise create a sublist (bookshelf) for the metadata value
1150 else {
1151 my $metadata_value_display = $self->get_metadata_value_display($metadata_group, $metadata_value);
1152 my %child_classifier_node = ( 'Title' => $metadata_value_display,
1153 'childtype' => "VList",
1154 'mdtype' => $metadata_group,
1155 'contains' => [] );
1156
1157 # If there are metadata elements remaining, recursively apply the process
1158 if (@metadata_groups > 0) {
1159 my $next_metadata_group = $metadata_groups[0];
1160 $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
1161
1162 # separate metadata into those that below in the next/sub-metadata_group
1163 # and those that below at the current level's metadata_group
1164
1165 my $OID_to_metadata_values_hash_ref = $self->{$next_metadata_group . ".list"};
1166 my @current_level_OIDs = ();
1167 my @next_level_OIDs = ();
1168 foreach my $OID (@OIDs)
1169 {
1170 if ($OID_to_metadata_values_hash_ref->{$OID}) {
1171 push(@next_level_OIDs, $OID);
1172 } else {
1173 push(@current_level_OIDs, $OID);
1174 }
1175 }
1176 # recursively process those docs belonging to the sub-metadata_group
1177 $self->add_level(\@metadata_groups, \@next_level_OIDs, \%child_classifier_node);
1178
1179 # For those docs that don't belong in the sub/next_metadata_group, but which belong
1180 # at this level, just add the documents as children of this list at the current level
1181 @current_level_OIDs = $self->sort_leaf_items(\@current_level_OIDs);
1182 foreach my $current_level_OID (@current_level_OIDs) {
1183 my $offset = $self->metadata_offset($metadata_group, $current_level_OID, $metadata_value);
1184 push(@{$child_classifier_node{'contains'}}, { 'OID' => $current_level_OID , 'offset' => $offset });
1185 }
1186 }
1187 # Otherwise just add the documents as children of this list
1188 else {
1189 @OIDs = $self->sort_leaf_items(\@OIDs);
1190 foreach my $OID (@OIDs) {
1191 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
1192 push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
1193 }
1194
1195 }
1196
1197 # Add the sublist to the list
1198 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
1199 }
1200 }
1201}
1202
1203sub metadata_offset
1204{
1205 my $self = shift(@_);
1206 my $metadata_group = shift(@_);
1207 my $OID = shift(@_);
1208 my $metadata_value = shift(@_);
1209
1210 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
1211 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
1212 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
1213 if ($metadata_value eq $metadata_values[$i]) {
1214 return $i;
1215 }
1216 }
1217
1218 return 0;
1219}
1220
1221sub sort_leaf_items
1222{
1223 my $self = shift(@_);
1224 my @OIDs = @{shift(@_)};
1225
1226 # Sort leaf nodes and add to list
1227 my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
1228 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) {
1229 my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
1230 my $sort_type = $self->{$sort_leaf_nodes_usingmetaelem . ".metadata_sort_mode_within_level"};
1231 # Force a stable sort (Perl 5.6's sort isn't stable)
1232 # !! The [0] bits aren't ideal (multiple metadata values) !!
1233 @OIDs = @OIDs[ sort {
1234 if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]}) && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]}))
1235 {
1236 if ($sort_type eq "numeric") {
1237 $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] <=> $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
1238 } elsif ($sort_type eq "alphabetic") {
1239 $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
1240 } else {
1241 ncmp($OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0], $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0]);
1242 }
1243 }
1244 else
1245 {
1246 $a <=> $b;
1247 }
1248 } 0..$#OIDs ];
1249 }
1250 if ($self->{'reverse_sort_leaf_nodes'}) {
1251 return reverse @OIDs;
1252 }
1253 return @OIDs;
1254}
1255
1256
1257sub sort_metadata_values_array
1258{
1259 my $self = shift(@_);
1260 my ($sort_mode) = shift(@_);
1261 my @metadata_values = @_;
1262
1263 if ($sort_mode eq "unicode") {
1264 if ($self->{'unicode_collator'}) {
1265 return $self->{'unicode_collator'}->sort(@metadata_values);
1266 }
1267 # the collator wasn't loaded, fall back on default
1268 $sort_mode = "alphanumeric";
1269 }
1270 if ($sort_mode eq "numeric") {
1271 return sort {$a <=> $b} @metadata_values;
1272 }
1273 if ($sort_mode eq "alphabetic") {
1274 return sort {$a cmp $b} @metadata_values;
1275 }
1276 # natural sort
1277 return nsort(@metadata_values);
1278}
1279
1280# we are not using this any more. Using nsort instead
1281# $a and $b args automatically passed in and shouldn't be declared
1282sub alpha_numeric_cmp
1283{
1284 my $self = shift (@_);
1285 my ($aStr, $bStr) = @_;
1286 if ($aStr =~ m/^(\d+(\.\d+)?)/)
1287 {
1288 my $val_a = $1;
1289 if ($bStr =~ m/^(\d+(\.\d+)?)/)
1290 {
1291 my $val_b = $1;
1292 if ($val_a != $val_b)
1293 {
1294 return ($val_a <=> $val_b);
1295 }
1296 }
1297 }
1298
1299 return ($aStr cmp $bStr);
1300}
1301
1302
1303sub get_metadata_value_display {
1304 my $self = shift(@_);
1305 my ($metadata_group, $metadata_value) = @_;
1306 return $metadata_value if $self->{'standardize_capitalization'};
1307 my $actual_values_hash = $self->{$metadata_group . ".actualvalues"}->{$metadata_value};
1308 my $display_value ="";
1309 my $max_count=0;
1310 foreach my $v (keys %$actual_values_hash) {
1311 if ($actual_values_hash->{$v} > $max_count) {
1312 $display_value = $v;
1313 $max_count = $actual_values_hash->{$v};
1314 }
1315 }
1316 return $display_value;
1317}
13181;
Note: See TracBrowser for help on using the repository browser.