source: main/trunk/greenstone2/perllib/classify/List.pm@ 33899

Last change on this file since 33899 was 33899, checked in by kjdon, 4 years ago

pass in new casefold and accentfold options (BaseClassifier) to format_metadata_for_sorting

  • Property svn:keywords set to Author Date Id Revision
File size: 55.9 KB
RevLine 
[10398]1###########################################################################
2#
[18568]3# List.pm -- A general and flexible list classifier with most of
[10398]4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11#
12# Copyright (C) 2005 New Zealand Digital Library Project
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
[13741]28# TO DO: - Remove punctuation from metadata values before sorting.
29# - Add an AZCompactList-style hlist option?
[10398]30#
31###########################################################################
32
[31576]33
[18568]34package List;
[10398]35
36
[17209]37use BaseClassifier;
[33452]38use Sort::Naturally;
[10398]39use strict;
40
41
42sub BEGIN {
[18568]43 @List::ISA = ('BaseClassifier');
[10398]44}
45
[33452]46
47my $metadata_selection_mode_list =
[29094]48 [
49 { 'name' => "firstvalue",
50 'desc' => "{List.metadata_selection.firstvalue}"},
51 { 'name' => "firstvalidmetadata",
52 'desc' => "{List.metadata_selection.firstvalidmetadata}"},
53 { 'name' => "allvalues",
54 'desc' => "{List.metadata_selection.allvalues}"} ];
[33452]55
56my $metadata_selection_mode_default = "firstvalidmetadata";
57
58my $valid_metadata_selection_modes = { 'firstvalue' => 1,
[29094]59 'firstvalidmetadata' => 1,
60 'allvalues' => 1 };
[33452]61
62my $metadata_sort_mode_list =
63 [
64 { 'name' => "unicode",
65 'desc' => "{List.metadata_sort.unicode}"},
66 { 'name' => "alphabetic",
67 'desc' => "{List.metadata_sort.alphabetic}"},
68 { 'name' => "alphanumeric",
69 'desc' => "{List.metadata_sort.alphanumeric}"} ];
70
71my $metadata_sort_mode_default = "alphanumeric";
72
73my $valid_metadata_sort_modes = { 'unicode' => 1,
74 'alphabetic' => 1,
75 'alphanumeric' => 1};
[18572]76my $partition_type_list =
77 [ { 'name' => "per_letter",
78 'desc' => "{List.level_partition.per_letter}" },
[20865]79 { 'name' => "approximate_size",
80 'desc' => "{List.level_partition.approximate_size}"},
[18572]81 { 'name' => "constant_size",
[33460]82 'desc' => "{List.level_partition.constant_size}" },
83 { 'name' => "all_values",
84 'desc' => "{List.level_partition.all_values}" },
[18572]85 { 'name' => "none",
86 'desc' => "{List.level_partition.none}" } ];
[10398]87
[33452]88my $partition_type_default = "per_letter";
89
[20825]90my $valid_partition_types = { 'per_letter' => 1,
91 'constant_size' => 1,
[20865]92 'approximate_size' => 1,
[33460]93 'all_values' => 1,
[20825]94 'none' => 1};
95
[33490]96my $partition_size_default = 20;
[33452]97
98my $numeric_partition_type_list =
99 [ { 'name' => "per_digit",
100 'desc' => "{List.level_partition.per_digit}" },
101 { 'name' => "per_number",
102 'desc' => "{List.level_partition.per_number}" },
103 { 'name' => "single_partition",
[33460]104 'desc' => "{List.level_partition.single}" },
[33452]105 { 'name' => "approximate_size",
106 'desc' => "{List.level_partition.approximate_size_numeric}"},
107 { 'name' => "constant_size",
108 'desc' => "{List.level_partition.constant_size}" },
[33460]109 { 'name' => "all_values",
110 'desc' => "{List.level_partition.all_values}" },
[33452]111 { 'name' => "none",
112 'desc' => "{List.level_partition.none}" } ];
113
114my $numeric_partition_type_default = "single_partition";
115
116my $valid_numeric_partition_types = { 'per_digit' => 1,
117 'per_number' => 1,
118 'constant_size' => 1,
119 'single_partition' => 1,
120 'approximate_size' => 1,
[33460]121 'all_values' => 1,
[33452]122 'none' =>1 };
123
[33490]124my $numeric_partition_size_default = 20;
[33452]125
126my $numeric_partition_name_length_default = "-1"; # use the full number
127
[18619]128my $bookshelf_type_list =
129 [ { 'name' => "always",
130 'desc' => "{List.bookshelf_type.always}" },
131 { 'name' => "duplicate_only",
132 'desc' => "{List.bookshelf_type.duplicate_only}" },
133 { 'name' => "never",
[20008]134 'desc' => "{List.bookshelf_type.never}" } ];
[18619]135
[33452]136my $bookshelf_type_default = "never";
137my $sort_leaf_nodes_using_default = "Title";
[10398]138my $arguments =
139 [ { 'name' => "metadata",
[19234]140 'desc' => "{List.metadata}",
[10398]141 'type' => "metadata",
142 'reqd' => "yes" },
[33452]143
144 { 'name' => "metadata_selection_mode_within_level",
[33454]145 'desc' => "{List.metadata_selection_mode_within_level}",
[29094]146 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
[33452]147 'list' => $metadata_selection_mode_list,
148 'deft' => $metadata_selection_mode_default },
149
150 { 'name' => "metadata_sort_mode_within_level",
[33454]151 'desc' => "{List.metadata_sort_mode_within_level}",
[33452]152 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
153 'list' => $metadata_sort_mode_list,
154 'deft' => $metadata_sort_mode_default },
155
[18619]156 { 'name' => "bookshelf_type",
[19234]157 'desc' => "{List.bookshelf_type}",
[18619]158 'type' => "enum",
159 'list' => $bookshelf_type_list,
[33452]160 'deft' => $bookshelf_type_default },
161
[10498]162 { 'name' => "classify_sections",
[19234]163 'desc' => "{List.classify_sections}",
[10498]164 'type' => "flag" },
[33452]165
[10498]166 { 'name' => "partition_type_within_level",
[19234]167 'desc' => "{List.partition_type_within_level}",
[20679]168 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
[20008]169 'list' => $partition_type_list,
[33452]170 'deft' => $partition_type_default },
171
[10498]172 { 'name' => "partition_size_within_level",
[19234]173 'desc' => "{List.partition_size_within_level}",
[33452]174 'type' => "string", # Must be string because multiple values can be specified (separated by '/')
175 'deft' => $partition_size_default},
176
[14084]177 { 'name' => "partition_name_length",
[19234]178 'desc' => "{List.partition_name_length}",
[14084]179 'type' => "string" },
[33452]180
181 {'name' => "partition_sort_mode_within_level",
182 'desc' => "{List.partition_sort_mode_within_level}",
183 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
184 'list' => $metadata_sort_mode_list,
185 'deft' => $metadata_sort_mode_default },
186
187 { 'name' => "numeric_partition_type_within_level",
188 'desc' => "{List.numeric_partition_type_within_level}",
189 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
190 'list' => $numeric_partition_type_list,
191 'deft' => $numeric_partition_type_default },
[33479]192
[33452]193 { 'name' => "numeric_partition_size_within_level",
194 'desc' => "{List.numeric_partition_size_within_level}",
195 'type' => "string", # Must be string because multiple values can be specified (separated by '/')
196 'deft' => $numeric_partition_size_default},
197
198 { 'name' => "numeric_partition_name_length_within_level",
199 'desc' => "{List.numeric_partition_name_length_within_level}",
200 'type' => "string",
201 'deft' => $numeric_partition_name_length_default },
[33479]202
203 {'name' => "numeric_partition_sort_mode_within_level",
204 'desc' => "{List.numeric_partition_sort_mode_within_level}",
205 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
206 'list' => $metadata_sort_mode_list,
207 'deft' => $metadata_sort_mode_default },
208
[33452]209 { 'name' => "numbers_first",
[33460]210 'desc' => "{List.numbers_first}",
[33452]211 'type' => 'flag'},
212
[10498]213 { 'name' => "sort_leaf_nodes_using",
[19234]214 'desc' => "{List.sort_leaf_nodes_using}",
[10398]215 'type' => "metadata",
[33452]216 'deft' => $sort_leaf_nodes_using_default },
217
218 { 'name' => "sort_leaf_nodes_sort_mode",
219 'desc' => "{List.sort_leaf_nodes_sort_mode}",
220 'type' => "enum",
221 'list' => $metadata_sort_mode_list,
222 'deft' => $metadata_sort_mode_default },
223
[23302]224 { 'name' => "reverse_sort_leaf_nodes",
225 'desc' => "{List.reverse_sort_leaf_nodes}",
226 'type' => "flag"},
[33452]227
[13551]228 { 'name' => "sort_using_unicode_collation",
[33452]229 'desc' => "{List.metadata_sort.unicode} {List.sort_using_unicode_collation}",
[13551]230 'type' => "flag" },
[33899]231
[26267]232 {'name' => "filter_metadata",
233 'desc' => "{List.filter_metadata}",
234 'type' => "metadata"},
[33452]235
[26267]236 {'name' => "filter_regex",
237 'desc' => "{List.filter_regex}",
238 'type' => "regexp"},
[33452]239
[33482]240 { 'name' => "use_formatted_metadata_for_bookshelf_display",
241 'desc' => "{List.use_formatted_metadata_for_bookshelf_display}",
[27098]242 'type' => "flag"},
[33452]243
[18619]244 { 'name' => "removeprefix",
245 'desc' => "{BasClas.removeprefix}",
246 'type' => "regexp" },
[33452]247
[18619]248 { 'name' => "removesuffix",
249 'desc' => "{BasClas.removesuffix}",
250 'type' => "regexp" } ];
[10398]251
[18568]252my $options = { 'name' => "List",
[19234]253 'desc' => "{List.desc}",
[10502]254 'abstract' => "no",
[18572]255 'inherits' => "yes",
[10398]256 'args' => $arguments };
257
258
259sub new
260{
261 my ($class) = shift(@_);
262 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
263 push(@$classifierslist, $class);
264
[17209]265 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
266 push(@{$hashArgOptLists->{"OptList"}}, $options);
[10398]267
[17209]268 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
[10398]269
[33452]270 bless $self, $class;
271
[10398]272 if ($self->{'info_only'}) {
273 # don't worry about any options etc
[33452]274 return $self;
[10398]275 }
276
277 # The metadata elements to use (required)
[12894]278 if (!$self->{'metadata'}) {
[18568]279 die "Error: No metadata fields specified for List.\n";
[10398]280 }
[24012]281
282 my @metadata_groups = split(/[\/]/, $self->{'metadata'});
[12889]283 $self->{'metadata_groups'} = \@metadata_groups;
[10398]284
[12894]285 # The classifier button name (default: the first metadata element specified)
286 if (!$self->{'buttonname'}) {
287 my $first_metadata_group = $metadata_groups[0];
[24012]288 my $first_metadata_element = (split(/[\;|,\/]/, $first_metadata_group))[0];
[12894]289 $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
[10398]290 }
291
[29094]292 # meta selection mode for each level
[33454]293 $self->set_metadata_groups_info_per_level("metadata_selection_mode_within_level", $metadata_selection_mode_default, $valid_metadata_selection_modes);
[33452]294
295 # meta sort mode for each level
296 if ($self->{'sort_using_unicode_collation'}) {
297 print STDERR "WARNING: sort_using_unicode_collation is set, setting metadata_sort_mode_within_level to unicode for all levels, regardless of current setting\n";
298 $self->{'metadata_sort_mode_within_level'} = "unicode";
299 $metadata_sort_mode_default = "unicode";
[29094]300 } else {
[33452]301 if ($self->{'metadata_sort_mode_within_level'} =~ /unicode/) {
302 $self->{'sort_using_unicode_collation'} = 1;
[29094]303 }
[33452]304 }
305
306
307 $self->set_metadata_groups_info_per_level('metadata_sort_mode_within_level', $metadata_sort_mode_default, $valid_metadata_sort_modes);
308
[18619]309 # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
[12889]310 foreach my $metadata_group (@metadata_groups) {
[18619]311 $self->{$metadata_group . ".bookshelf_type"} = "always";
312 }
313 my $last_metadata_group = $metadata_groups[$#metadata_groups];
314 # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
315 $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
[10499]316
[33452]317 # How the items are grouped into partitions (default: by letter)
[18619]318 # for each level (metadata group), separated by '/'
[33452]319 $self->set_metadata_groups_info_per_level("partition_type_within_level", $partition_type_default, $valid_partition_types);
320 $self->set_metadata_groups_info_per_level("numeric_partition_type_within_level", $numeric_partition_type_default, $valid_numeric_partition_types);
321
[33454]322 # now check whether a level was none - need to set the equivalent level in the other half also to none
323 foreach my $metadata_group (@metadata_groups) {
324 if ($self->{$metadata_group . ".partition_type_within_level"} eq "none" || $self->{$metadata_group . ".numeric_partition_type_within_level"} eq "none") {
325
326 print STDERR "WARNING: one of -partition_type_within_level or -numeric_partition_type_within_level was set to 'none' for level $metadata_group, overriding current value of both these options to 'none'\n";
327
328 $self->{$metadata_group . ".partition_type_within_level"} = "none";
329 $self->{$metadata_group . ".numeric_partition_type_within_level"} = "none";
330 }
331 }
332
[33452]333 $self->set_metadata_groups_info_per_level("partition_size_within_level", $partition_size_default);
334 $self->set_metadata_groups_info_per_level("numeric_partition_size_within_level", $numeric_partition_size_default);
335
336 $self->set_metadata_groups_info_per_level('partition_sort_mode_within_level', $metadata_sort_mode_default, $valid_metadata_sort_modes);
337
338 $self->set_metadata_groups_info_per_level('numeric_partition_sort_mode_within_level', $metadata_sort_mode_default, $valid_metadata_sort_modes);
339
340 $self->set_metadata_groups_info_per_level("numeric_partition_name_length_within_level", $numeric_partition_name_length_default);
341
[18619]342 # The removeprefix and removesuffix expressions
343 if ($self->{'removeprefix'}) {
344 # If there are more than one expressions, use '' to quote each experession and '/' to separate
345 my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
346
347 foreach my $metadata_group (@metadata_groups) {
348 my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
349 if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
350 # Remove the other ' at the beginning and the end if there is any
351 $removeprefix_expr_within_levelelem =~ s/^'//;
352 $removeprefix_expr_within_levelelem =~ s/'$//;
353 # Remove the extra ^ at the beginning
354 $removeprefix_expr_within_levelelem =~ s/^\^//;
355 $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
356 } else {
357 $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
358 }
359 }
360 }
361 if ($self->{'removesuffix'}) {
362 my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
363
364 foreach my $metadata_group (@metadata_groups) {
365 my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
366 if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
367 $removesuffix_expr_within_levelelem =~ s/^'//;
368 $removesuffix_expr_within_levelelem =~ s/'$//;
369 # Remove the extra $ at the end
370 $removesuffix_expr_within_levelelem =~ s/\$$//;
371 $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
372 } else {
373 $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
374 }
375 }
376 }
377
[12894]378 # The metadata elements to use to sort the leaf nodes (default: Title)
[33452]379 my @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
[12894]380 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
[33452]381
[29094]382 foreach my $sort_group (@sort_leaf_nodes_using_metadata_groups) {
383 # set metadata_select_type, if not already set - might be already set if the same group was used in -metadata
[33454]384 if (!defined $self->{$sort_group . ".metadata_selection_mode_within_level"}) {
385 $self->{$sort_group . ".metadata_selection_mode_within_level"} = $metadata_selection_mode_default;
[29094]386 }
[33452]387
[29094]388 }
[33452]389
390 my @leaf_nodes_sort_modes = split (/\|/, $self->{'sort_leaf_nodes_sort_mode'});
391 foreach my $sort_group (@sort_leaf_nodes_using_metadata_groups) {
392 my $leaf_sort_mode = shift(@leaf_nodes_sort_modes);
393 if (!defined $self->{$sort_group . ".metadata_sort_mode_within_level"}) {
394 if (defined $leaf_sort_mode && defined $valid_metadata_sort_modes->{$leaf_sort_mode}) {
395 $self->{$sort_group . ".metadata_sort_mode_within_level"} = $leaf_sort_mode;
396 }
397 else {
398 $self->{$sort_group . ".metadata_sort_mode_within_level"} = $metadata_sort_mode_default;
399 }
400 }
401
402 }
403
[13551]404 # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
405 if ($self->{'sort_using_unicode_collation'}) {
[13791]406 # To use this you first need to download the allkeys.txt file from
407 # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
408 # Unicode/Collate directory.
[13551]409 require Unicode::Collate;
410 $self->{'unicode_collator'} = Unicode::Collate->new();
411 }
412
[23154]413 # An empty array for the document/section OIDs that we are classifying
[12894]414 $self->{'OIDs'} = [];
[23154]415 # A hash for all the doc ids that we have seen, so we don't classify something twice
416 $self->{'all_doc_OIDs'} = {};
[33452]417 return $self;
[10398]418}
419
420
421sub init
422{
423 # Nothing to do...
424}
425
[33452]426sub set_metadata_groups_info_per_level
427{
428 my $self = shift(@_);
429 my $info_name = shift(@_);
430 my $info_default = shift(@_);
431 my $info_valid_types_hash_ref = shift(@_);
[33454]432
433 if (!defined $self->{$info_name}) {
[33463]434 print STDERR "List Error: no values were set for option $info_name\n";
[33454]435 }
[33452]436 my @info_list = split(/\//, $self->{$info_name});
[10398]437
[33452]438 my $first = 1;
439 foreach my $metadata_group (@{$self->{'metadata_groups'}}) {
440 my $info_elem = shift(@info_list);
441 if (defined ($info_elem) && (!defined $info_valid_types_hash_ref || defined $info_valid_types_hash_ref->{$info_elem})) {
442 $self->{$metadata_group .".$info_name"} = $info_elem;
443 } else {
444 # its empty or an invalid entry
445 my $new_info_elem;
446 if ($first) {
447 $new_info_elem = $info_default;
448 } else {
449 # get the value we had at first
450 $new_info_elem = $self->{@{$self->{'metadata_groups'}}[0] . ".$info_name"};
451 }
452 $self->{$metadata_group .".$info_name"} = $new_info_elem;
453 if (defined $info_elem) {
454 print STDERR "List Error: $info_elem is not a valid value for $info_name, changing it to $new_info_elem\n";
455
456 }
457 }
458 $first = 0;
459 }
460
461}
462
[12896]463# Called for each document in the collection
[10398]464sub classify
465{
466 my $self = shift(@_);
[23116]467 my ($doc_obj) = @_;
[10398]468
[23154]469 if (defined $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()}) {
470 print STDERR "Warning, List classifier has already seen document ".$doc_obj->get_OID().", not classifying again\n";
471 return;
472 }
[33452]473
[23154]474 $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()} = 1;
[33452]475
[26267]476 # check against filter here
477 if ($self->{'filter_metadata'}) {
478 my $meta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'filter_metadata'});
479 return unless defined $meta;
480 if ($self->{'filter_regex'} ne "" && $meta !~ /$self->{'filter_regex'}/) {
[33452]481 print STDERR "doc $doc_obj doesn't pass filtering requirement\n" if ($self->{'verbosity'} > 3);
[26267]482 return;
483 }
484 }
485 # if we get here, we have passed the test for filtering
[12896]486 # If "-classify_sections" is set, classify every section of the document
[10398]487 if ($self->{'classify_sections'}) {
488 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
489 while (defined $section) {
[23116]490 $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section);
[10398]491 $section = $doc_obj->get_next_section($section);
492 }
493 }
[12896]494 # Otherwise just classify the top document section
[10398]495 else {
[23116]496 $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section());
[10398]497 }
[23154]498
[10398]499}
500
501sub classify_section
502{
503 my $self = shift(@_);
[23116]504 my ($doc_obj,$section_OID,$section) = @_;
[10398]505
[12889]506 my @metadata_groups = @{$self->{'metadata_groups'}};
[33452]507
[12896]508 # Only classify the section if it has a value for one of the metadata elements in the first group
509 my $classify_section = 0;
510 my $first_metadata_group = $metadata_groups[0];
[22175]511 my $remove_prefix_expr = $self->{$first_metadata_group . ".remove_prefix_expr"};
512 my $remove_suffix_expr = $self->{$first_metadata_group . ".remove_suffix_expr"};
[20008]513 foreach my $first_metadata_group_element (split(/\;|,/, $first_metadata_group)) {
[20424]514 my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
[20421]515 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element);
[18619]516 # Remove prefix/suffix if requested
[22175]517 if (defined ($first_metadata_group_element_value)) {
518 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
519 $first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
520 }
521
522 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
523 $first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
524 }
[18619]525 }
[33452]526
[33899]527 $first_metadata_group_element_value = &sorttools::format_metadata_for_sorting($first_metadata_group, $first_metadata_group_element_value, $doc_obj, $self->{'casefold_metadata_for_sorting'}, $self->{'accentfold_metadata_for_sorting'}) unless $self->{'no_metadata_formatting'};
[12896]528 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
[33452]529 # This section must be included in the classifier as we have found a value
[12896]530 $classify_section = 1;
531 last;
[18619]532 }
[12896]533 }
[33452]534
[12896]535 # We're not classifying this section because it doesn't have the required metadata
536 return if (!$classify_section);
[18455]537
[12896]538 # Otherwise, include this section in the classifier
[23154]539
[12896]540 push(@{$self->{'OIDs'}}, $section_OID);
541
542 # Create a hash for the metadata values of each metadata element we're interested in
543 my %metadata_groups_done = ();
544 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
545 # Take care not to do a metadata group more than once
546 unless ($metadata_groups_done{$metadata_group}) {
[22175]547 my $remove_prefix_expr = $self->{$metadata_group . ".remove_prefix_expr"};
548 my $remove_suffix_expr = $self->{$metadata_group . ".remove_suffix_expr"};
[20008]549 foreach my $metadata_element (split(/\;|,/, $metadata_group)) {
[20424]550 my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);
551
[20421]552 my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
[12896]553 foreach my $metadata_value (@metadata_values) {
[33452]554 #print STDERR "working with value $metadata_value\n";
[12896]555 # Strip leading and trailing whitespace
556 $metadata_value =~ s/^\s*//;
557 $metadata_value =~ s/\s*$//;
[13550]558
[18619]559 # Remove prefix/suffix if requested
560 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
561 $metadata_value =~ s/^$remove_prefix_expr//;
562 }
563 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
564 $metadata_value =~ s/$remove_suffix_expr$//;
565 }
566
[26545]567 # lowercase metadata both for sorting meta (d/D under D), and to allow CSS to be able to
568 # text-transform the stored lowercase values as capitalize or uppercase (can't CSS
569 # text-transform if stored uppercase). 2 CSS text-transforms have been added to core.css
[33899]570 ### no longer do this, as lowercasing is now an option for the user, and is handled by format_metadata_for_sorting
571 my $lc_metadata_value = $metadata_value; #lc($metadata_value);
572 $lc_metadata_value = &sorttools::format_metadata_for_sorting($real_metadata_element, $lc_metadata_value, $doc_obj, $self->{'casefold_metadata_for_sorting'}, $self->{'accentfold_metadata_for_sorting'}) unless $self->{'no_metadata_formatting'};
[33452]573
574 # Add the metadata value into the list for this combination of metadata group
575 # and section - if we have some non-whitespace chars
[33899]576 # test that we have some non-whitespace chars
[29094]577 if ($lc_metadata_value =~ /\S/) {
578
579 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $lc_metadata_value);
580
[33452]581 # add the actual value into the stored values so we can remember the case
[33482]582 if (!$self->{'use_formatted_metadata_for_bookshelf_display'}) {
[33452]583 if (defined $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}) {
584 $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}++;
585 } else {
586 $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value} = 1;
587 }
[27098]588 }
[33454]589 last if ($self->{$metadata_group . ".metadata_selection_mode_within_level"} eq "firstvalue");
[29094]590 }
591 } # foreach metadatavalue
[33454]592 last if ((@metadata_values > 0) && $self->{$metadata_group . ".metadata_selection_mode_within_level"} =~ /^(firstvalue|firstvalidmetadata)$/ );
[29094]593 } # foreach metadata element
[10398]594
[12896]595 $metadata_groups_done{$metadata_group} = 1;
[10398]596 }
597 }
598}
599
600
601sub get_classify_info
602{
603 my $self = shift(@_);
604
[12896]605 # The metadata groups to classify by
[12889]606 my @metadata_groups = @{$self->{'metadata_groups'}};
607 my $first_metadata_group = $metadata_groups[0];
[10398]608
[12896]609 # The OID values of the documents to include in the classifier
[12889]610 my @OIDs = @{$self->{'OIDs'}};
[10398]611
[12896]612 # Create the root node of the classification hierarchy
[12893]613 my %classifier_node = ( 'thistype' => "Invisible",
[33474]614 'childtype' => "VList",
[12894]615 'Title' => $self->{'buttonname'},
[13271]616 'contains' => [],
617 'mdtype' => $first_metadata_group );
[10398]618
[12895]619 # Recursively create the classification hierarchy, one level for each metadata group
[14173]620 $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
[12893]621 return \%classifier_node;
[10398]622}
623
624
[12895]625sub add_level
[10398]626{
627 my $self = shift(@_);
[12889]628 my @metadata_groups = @{shift(@_)};
629 my @OIDs = @{shift(@_)};
[12893]630 my $classifier_node = shift(@_);
[23154]631
[12889]632 my $metadata_group = $metadata_groups[0];
[13340]633 if (!defined($self->{$metadata_group . ".list"})) {
634 print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
635 return;
636 }
[10398]637
638 # Create a mapping from metadata value to OID
[14845]639 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
[33452]640 my %metadata_value_to_OIDs_hash = ();
641 my %numeric_metadata_value_to_OIDs_hash = ();
[14845]642 foreach my $OID (@OIDs)
643 {
644 if ($OID_to_metadata_values_hash_ref->{$OID})
645 {
646 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
647 foreach my $metadata_value (@metadata_values)
648 {
[33452]649 if ($metadata_value =~ /^[0-9]/) {
650 push(@{$numeric_metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
651 } else {
652 push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
653 }
[10398]654 }
655 }
656 }
[24012]657 #print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
[33452]658 #print STDERR "Number of distinct numeric values: " . scalar(keys %numeric_metadata_value_to_OIDs_hash) . "\n";
[10398]659
660 # Partition the values (if necessary)
[18619]661 my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
[20904]662 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
[33452]663 my $partition_sort_mode_within_level = $self->{$metadata_group . ".partition_sort_mode_within_level"};
664 my $bookshelf_type_within_level = $self->{$metadata_group. ".bookshelf_type"};
665
666
667 #############################################
668 ### DO THE NUMBERS IF THEY ARE TO COME FIRST
669 #############################################
670 if ($self->{'numbers_first'} && keys(%numeric_metadata_value_to_OIDs_hash)) {
671 $self->partition_numeric_values(\@metadata_groups, $classifier_node, \%numeric_metadata_value_to_OIDs_hash);
672 }
[10398]673
[33452]674 ############################################
675 # DO THE LETTERS
676 ############################################
677 if (keys(%metadata_value_to_OIDs_hash)){ # make sure we have some values
678 if ($partition_type_within_level =~ /^per_letter$/i) {
679 $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $partition_sort_mode_within_level, \%metadata_value_to_OIDs_hash);
680 }
681 elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
682 $self->split_approximate_size(\@metadata_groups, $classifier_node, $partition_size_within_level, $partition_sort_mode_within_level, $bookshelf_type_within_level, \%metadata_value_to_OIDs_hash, $self->{'partition_name_length'});
683
684 }
685 elsif ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
[33463]686 $self->split_constant_size(\@metadata_groups, $classifier_node, $partition_size_within_level, $partition_sort_mode_within_level, $bookshelf_type_within_level, \%metadata_value_to_OIDs_hash, $self->{'partition_name_length'});
[33452]687 }
688
689 # Otherwise just add all the values to a VList
690 else {
[33460]691 if ($partition_type_within_level =~ /^all_values$/i) {
692 $classifier_node->{'childtype'} = "HList";
693 $self->{$metadata_group. ".bookshelf_type"} = "always";
694 }
[33452]695 $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
696 }
697 }
[10398]698
[33452]699 ###########################################
700 ### DO THE NUMBERS IF THEY ARE TO COME LAST
701 ###########################################
702 if (!$self->{'numbers_first'} && keys(%numeric_metadata_value_to_OIDs_hash)) {
703 $self->partition_numeric_values(\@metadata_groups, $classifier_node, \%numeric_metadata_value_to_OIDs_hash);
704 }
705}
[10398]706
[33452]707sub partition_numeric_values
708{
709 my $self = shift(@_);
710 my @metadata_groups = @{shift(@_)};
711 my $classifier_node = shift(@_);
712 my $numeric_metadata_value_to_OIDs_hash_ref = shift(@_);
[10398]713
[33452]714 my $metadata_group = $metadata_groups[0];
715 my $numeric_partition_type_within_level = $self->{$metadata_group . ".numeric_partition_type_within_level"};
716 my $numeric_partition_size_within_level = $self->{$metadata_group . ".numeric_partition_size_within_level"};
717 my $numeric_partition_sort_mode_within_level = $self->{$metadata_group . ".numeric_partition_sort_mode_within_level"};
718 my $numeric_partition_name_length_within_level = $self->{$metadata_group . ".numeric_partition_name_length_within_level"};
719 my $bookshelf_type_within_level = $self->{$metadata_group. ".bookshelf_type"};
720
721 if ($numeric_partition_type_within_level eq "single_partition") {
722 $self->add_hlist_partition(\@metadata_groups, $classifier_node, "0-9", $numeric_metadata_value_to_OIDs_hash_ref);
723 }
724 elsif ($numeric_partition_type_within_level eq "per_digit") {
725 $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $numeric_partition_sort_mode_within_level, $numeric_metadata_value_to_OIDs_hash_ref, 1, 1);
726 }
727 elsif ($numeric_partition_type_within_level eq "per_number") {
728 # each different number is a bucket
729 $self->split_per_letter_or_digit(\@metadata_groups, $classifier_node, $numeric_partition_sort_mode_within_level, $numeric_metadata_value_to_OIDs_hash_ref, 1, $numeric_partition_name_length_within_level);
730 }
731 elsif ($numeric_partition_type_within_level eq "constant_size" && scalar(keys %$numeric_metadata_value_to_OIDs_hash_ref) > $numeric_partition_size_within_level) {
732 # Generate hlists of a certain size
733
[33463]734 $self->split_constant_size(\@metadata_groups, $classifier_node, $numeric_partition_size_within_level, $numeric_partition_sort_mode_within_level, $bookshelf_type_within_level, $numeric_metadata_value_to_OIDs_hash_ref, $numeric_partition_name_length_within_level, 1);
[33452]735 } elsif ($numeric_partition_type_within_level eq "approximate_size" && scalar(keys %$numeric_metadata_value_to_OIDs_hash_ref) > $numeric_partition_size_within_level) {
736 $self->split_approximate_size(\@metadata_groups, $classifier_node, $numeric_partition_size_within_level, $numeric_partition_sort_mode_within_level, $bookshelf_type_within_level, $numeric_metadata_value_to_OIDs_hash_ref, $numeric_partition_name_length_within_level, 1);
737 }
738 # Otherwise just add all the values to a VList
739 else {
[33460]740 if ($numeric_partition_type_within_level =~ /^all_values$/i) {
741 $classifier_node->{'childtype'} = "HList";
742 $self->{$metadata_group. ".bookshelf_type"} = "always";
743 }
744
[33452]745 $self->add_vlist(\@metadata_groups, $classifier_node, $numeric_metadata_value_to_OIDs_hash_ref);
746 }
747
748}
[10398]749
[33452]750sub split_approximate_size
751{
752
753 my $self = shift(@_);
754 my @metadata_groups = @{shift(@_)};
755 my $classifier_node = shift(@_);
756 my $partition_size = shift(@_);
757 my $sort_mode = shift(@_);
758 my $bookshelf_type = shift(@_);
759 my $metadata_value_to_OIDs_hash_ref = shift(@_);
760 my $partition_name_length = shift(@_);
761 my $is_numeric = shift(@_);
762
763 # Generate hlist based on the first letter of the metadata value (like per_letter), or based on
764 # numbers, but also with restriction on the partition size
765 # If a partition has fewer items than specified by the "partition_size_within_level", then group them together if possible
766 # If a partition has more items than specified, split into several hlists.
767 # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
768
769 my @sortedmetadata_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref));
770
771 # Separate values by their first letter, each form a bucket, like the per_letter partition type
772 my $last_partition = $self->generate_partition_name($sortedmetadata_values[0], $partition_name_length, $is_numeric);
773
774 my @partition_buckets = ();
775 my @metadata_values_in_bucket = ();
776 my $num_items_in_bucket = 0;
777
778 foreach my $metadata_value (@sortedmetadata_values) {
779 my $metadata_valuepartition = $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric);
780 if ($metadata_valuepartition ne $last_partition) {
781 my @temp_array = @metadata_values_in_bucket;
782 # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
783 my %partition_info = ();
784 $partition_info{'metadata_values'} = \@temp_array;
785 $partition_info{'size'} = $num_items_in_bucket;
786 $partition_info{'name'} = $last_partition;
787 push (@partition_buckets, \%partition_info);
788
789 @metadata_values_in_bucket = ($metadata_value);
790 $num_items_in_bucket = ($bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : scalar(@metadata_values_in_bucket));
791 $last_partition = $metadata_valuepartition;
792 } else {
793 $num_items_in_bucket += ($bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1); #scalar(@metadata_values_in_bucket);
794 push (@metadata_values_in_bucket, $metadata_value);
795 }
[10398]796 }
[33452]797 # Last one
798 my %partition_info = ();
799 $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
800 $partition_info{'size'} = $num_items_in_bucket;
801 $partition_info{'name'} = $last_partition;
802
803 push (@partition_buckets, \%partition_info);
804
805 # now go through the array of buckets, and merge small buckets
806 my @new_partition_buckets = ();
807 for (my $i = 0; $i < scalar(@partition_buckets) - 1; $i++) {
808
[33463]809 my $this_bucket = $partition_buckets[$i];
810 my $next_bucket = $partition_buckets[$i+1];
[33452]811
812 my $items_in_partition = $this_bucket->{'size'};
[18619]813
[33452]814 if ($items_in_partition < $partition_size ) {
815 my $items_in_next_partition = $next_bucket->{'size'};
816 if ($items_in_partition + $items_in_next_partition <= $partition_size ) {
817 # merge this bucket into the next bucket
818 foreach my $metadata_value_to_merge (@{$this_bucket->{'metadata_values'}}) {
819 push(@{$next_bucket->{'metadata_values'}}, $metadata_value_to_merge);
820 }
821 $next_bucket->{'size'} += $items_in_partition;
[18619]822
823 } else {
[33452]824 # remember this bucket
825 push (@new_partition_buckets, $this_bucket);
[18619]826 }
[33452]827 } else {
828 # remember this bucket
829 push (@new_partition_buckets, $this_bucket);
[18619]830 }
[33452]831 }
832 # add in the last bucket
833 my $last_bucket = $partition_buckets[scalar(@partition_buckets) - 1];
834 push (@new_partition_buckets, $last_bucket);
835
836 # Add partitions to the main list, but divide big bucket into several
837 my $last_partition_end = "";
838 my $partition_start = "";
[33463]839 my $partition_end = "";
840 my $partition_name = "";
[33452]841 foreach my $partition (@new_partition_buckets) {
842 my @metadata_values = $self->sort_metadata_values_array($sort_mode, @{$partition->{'metadata_values'}});
843 my $items_in_partition = $partition->{'size'};
844 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $partition_name_length, $is_numeric);
845
[33463]846 if ($items_in_partition <= $partition_size) {
847 # we can just add the partition as is
848 my %metadata_values_to_OIDs_subhashes = ();
849 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
850 my $metadata_value = $metadata_values[$i];
851 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
852 }
853 my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
854 $partition_end = $self->generate_partition_end($last_metadata_value, $partition_start, $partition_name_length, $is_numeric);
855 $partition_name = $partition_start;
856 if ($partition_end ne $partition_start) {
857 $partition_name = $partition_name . "-" . $partition_end;
858 }
859 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partition_name, \%metadata_values_to_OIDs_subhashes);
860 $last_partition_end = $partition_end;
861 } else {
862 # we have too many items, need to split the partition
[33452]863 my $items_done = 0;
864 my %metadata_values_to_OIDs_subhashes = ();
865 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
866 my $metadata_value = $metadata_values[$i];
867 # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
868 my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1;
[18619]869
[33452]870 if ($items_done + $items_for_this_md_value > $partition_size && $items_done != 0) {
[33463]871 # Save the stored items into a partition
872 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partition_name, \%metadata_values_to_OIDs_subhashes);
873 $last_partition_end = $partition_end;
[33452]874 $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $is_numeric);
875 $items_done = 0;
876 %metadata_values_to_OIDs_subhashes = ();
[18619]877 }
[33452]878
879 # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
880 if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size) {
[33463]881
[33452]882 my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $is_numeric);
883 # Get the number of partitions needed for this value
884 my $num_splits = int($items_for_this_md_value / $partition_size);
885 $num_splits++ if ($items_for_this_md_value / $partition_size > $num_splits);
[33463]886 my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
[33452]887 for (my $i = 0; $i < $num_splits; $i++) {
888 my %OIDs_subhashes_for_this_value = ();
889 my @OIDs_for_this_partition = ();
890 for (my $d = $i * $partition_size; $d < (($i+1) * $partition_size > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size); $d++) {
891 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
892 }
893
894 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
895 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size) {
[33463]896 $partition_start = $partitionname_for_this_value;
897 $partition_name = $partition_start;
[33452]898 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
899 $items_done += scalar(@OIDs_for_this_partition);
[33463]900 $last_partition_end = $partitionname_for_this_value
901 } else {
902
903 # Add an HList for this bucket
904 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
905 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
906 $last_partition_end = $partitionname_for_this_value;
[18619]907 }
908 }
[33463]909 } else {
910
911 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
912 $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;
913 $partition_end = $self->generate_partition_end($metadata_value, $partition_start, $partition_name_length, $is_numeric);
914 $partition_name = $partition_start;
915 if ($partition_end ne $partition_start) {
916 $partition_name = $partition_name . "-" . $partition_end;
917 }
918
[18619]919 }
[33452]920
921 # The last partition
[33463]922 if($i == scalar(@metadata_values) - 1 && $items_done >0) {
923 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partition_name, \%metadata_values_to_OIDs_subhashes);
[18619]924 }
[33463]925
926 }
[33452]927 } # end if items in partition > partition size
[33463]928
[33452]929 }
930
931 # The partitions are stored in an HList
932 $classifier_node->{'childtype'} = "HList";
[20865]933
[33452]934}
[10398]935
[33452]936sub split_constant_size
937{
938 my $self = shift(@_);
939 my @metadata_groups = @{shift(@_)};
940 my $classifier_node = shift(@_);
941 my $partition_size = shift(@_);
942 my $sort_mode = shift(@_);
[33463]943 my $bookshelf_type = shift(@_);
[33452]944 my $metadata_value_to_OIDs_hash_ref = shift(@_);
945 my $partition_name_length = shift(@_);
946 my $is_numeric = shift(@_);
[10398]947
[33452]948 my @sortedmetadata_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref));
[33463]949 my $items_in_partition = 0;
[33452]950 my %metadata_value_to_OIDs_subhash = ();
951 my $lastpartitionend = "";
952 my $partitionstart;
[33463]953
[33452]954 foreach my $metadata_value (@sortedmetadata_values) {
[33463]955 if ($items_in_partition == 0) {
956 # a new partition, set the name
[33452]957 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $partition_name_length, $is_numeric);
958 }
[33463]959 my $numitems_for_this_value = ($bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1);
960 if ($items_in_partition + $numitems_for_this_value <= $partition_size) {
961 # add all the current values into the temporary list
962 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
963 $items_in_partition += $numitems_for_this_value;
964 } elsif ($items_in_partition < $partition_size) {
965 # only want to add some of the values into temporary list
966 # note, we only get here if bookshelf type is never
967 my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
968 @OIDs = $self->sort_leaf_items(\@OIDs);
969 my $num_items_needed = $partition_size - $items_in_partition;
970 my @slice = splice(@OIDs, 0, $num_items_needed);
971 $metadata_value_to_OIDs_subhash{$metadata_value} = \@slice;
[33452]972
[33463]973 # now we have filled up the partition
[33452]974 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $partition_name_length, $is_numeric);
975 my $partitionname = $partitionstart;
976 if ($partitionend ne $partitionstart) {
977 $partitionname = $partitionname . "-" . $partitionend;
[10398]978 }
979
[33452]980 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
981 %metadata_value_to_OIDs_subhash = ();
[33463]982 $items_in_partition = 0;
[33452]983 $lastpartitionend = $partitionend;
[33463]984
985 # can we get more partitions from this metadata value?
986 while (scalar(@OIDs) >= $partition_size) {
987 my @slice = splice(@OIDs, 0, $partition_size);
988 $metadata_value_to_OIDs_subhash{$metadata_value} = \@slice;
989 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $partition_name_length, $is_numeric);
990 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $partition_name_length, $is_numeric);
991 my $partitionname = $partitionstart;
992 if ($partitionend ne $partitionstart) {
993 $partitionname = $partitionname . "-" . $partitionend;
994 }
995 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
996 %metadata_value_to_OIDs_subhash = ();
997 $items_in_partition = 0;
998 $lastpartitionend = $partitionend;
999
1000 }
1001 if (scalar(@OIDs) > 0) {
1002 $metadata_value_to_OIDs_subhash{$metadata_value} = \@OIDs;
1003 $items_in_partition = scalar(@OIDs);
1004 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $partition_name_length, $is_numeric);
1005 }
1006
1007
[10398]1008 }
[33463]1009
1010 if ($items_in_partition == $partition_size) {
1011 # its the end of a partition
1012 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $partition_name_length, $is_numeric);
1013 my $partitionname = $partitionstart;
1014 if ($partitionend ne $partitionstart) {
1015 $partitionname = $partitionname . "-" . $partitionend;
1016 }
1017
1018 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
1019 %metadata_value_to_OIDs_subhash = ();
1020 $items_in_partition = 0;
1021 $lastpartitionend = $partitionend;
1022 }
1023 } # foreach metadata value
1024
1025 if ($items_in_partition > 0) {
1026 # we have to add the last partition
1027 my $partitionend = $self->generate_partition_end(@sortedmetadata_values[@sortedmetadata_values-1], $partitionstart, $partition_name_length, $is_numeric);
1028 my $partitionname = $partitionstart;
1029 if ($partitionend ne $partitionstart) {
1030 $partitionname = $partitionname . "-" . $partitionend;
1031 }
1032
1033 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
[33452]1034 }
[33463]1035
[33452]1036 # The partitions are stored in an HList
1037 $classifier_node->{'childtype'} = "HList";
1038
1039}
1040
1041sub split_per_letter_or_digit
1042{
1043 my $self = shift(@_);
1044 my @metadata_groups = @{shift(@_)};
1045 my $classifier_node = shift(@_);
1046 my $sort_mode = shift(@_);
1047 my $metadata_value_to_OIDs_hash_ref = shift(@_);
1048 my $is_numeric = shift(@_);
1049 my $numeric_partition_length = shift(@_);
1050
1051 if (not defined $is_numeric) {
1052 $is_numeric = 0;
1053 }
1054 if ($is_numeric && not defined($numeric_partition_length)) {
1055 $numeric_partition_length = 1;
1056 }
1057 # Generate one hlist for each letter
1058 my @sortedmetadata_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref));
1059 my %metadata_value_to_OIDs_subhash = ();
1060
1061 my $lastpartition = $self->generate_partition_name($sortedmetadata_values[0], $numeric_partition_length, $is_numeric);
1062 foreach my $metadata_value (@sortedmetadata_values) {
1063
1064 my $metadata_valuepartition = $self->generate_partition_name($metadata_value, $numeric_partition_length, $is_numeric);
1065
1066 # Is this the start of a new partition?
1067 if ($metadata_valuepartition ne $lastpartition) {
1068 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
1069 %metadata_value_to_OIDs_subhash = ();
1070 $lastpartition = $metadata_valuepartition;
[10398]1071 }
[33452]1072
[33473]1073 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value};
[10398]1074 }
[33452]1075
1076 # Don't forget to add the last partition
1077 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
1078
1079 # The partitions are stored in an HList
1080 $classifier_node->{'childtype'} = "HList";
1081
[10398]1082}
1083
[33452]1084sub generate_partition_name
1085{
1086 my $self = shift(@_);
1087 my $mvalue = shift(@_);
1088 my $numeric_partition_length = shift(@_);
1089 my $is_numeric = shift(@_);
[10398]1090
[33452]1091 if (!$is_numeric || $numeric_partition_length == 1 ) {
1092 return substr($mvalue, 0, 1);
[33899]1093 }
[33452]1094 if ($numeric_partition_length == -1) {
1095 my ($all_digits) = $mvalue =~ /^([0-9]+)/;
1096 return $all_digits;
1097 }
1098 my ($some_digits) = $mvalue =~ /^([0-9]{1,$numeric_partition_length})/;
1099 return $some_digits;
1100}
1101
[10398]1102sub generate_partition_start
1103{
[14173]1104 my $self = shift(@_);
[14845]1105 my $metadata_value = shift(@_);
[10398]1106 my $lastpartitionend = shift(@_);
[14084]1107 my $partition_name_length = shift(@_);
[33452]1108 my $is_numeric = shift(@_);
[10398]1109
[33452]1110 if ($is_numeric) {
1111 return $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric);
1112 }
1113
[14084]1114 if ($partition_name_length) {
[14845]1115 return substr($metadata_value, 0, $partition_name_length);
[14084]1116 }
1117
[14845]1118 my $partitionstart = substr($metadata_value, 0, 1);
[10398]1119 if ($partitionstart le $lastpartitionend) {
[14845]1120 $partitionstart = substr($metadata_value, 0, 2);
[10398]1121 # Give up after three characters
1122 if ($partitionstart le $lastpartitionend) {
[14845]1123 $partitionstart = substr($metadata_value, 0, 3);
[10398]1124 }
1125 }
1126
1127 return $partitionstart;
1128}
1129
1130
1131sub generate_partition_end
1132{
[14173]1133 my $self = shift(@_);
[14845]1134 my $metadata_value = shift(@_);
[10398]1135 my $partitionstart = shift(@_);
[14084]1136 my $partition_name_length = shift(@_);
[33452]1137 my $is_numeric = shift(@_);
[10398]1138
[33452]1139 if ($is_numeric) {
1140 return $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric);
1141 }
[14084]1142 if ($partition_name_length) {
[14845]1143 return substr($metadata_value, 0, $partition_name_length);
[14084]1144 }
1145
[14845]1146 my $partitionend = substr($metadata_value, 0, length($partitionstart));
[10398]1147 if ($partitionend gt $partitionstart) {
[14845]1148 $partitionend = substr($metadata_value, 0, 1);
[10398]1149 if ($partitionend le $partitionstart) {
[14845]1150 $partitionend = substr($metadata_value, 0, 2);
[10398]1151 # Give up after three characters
1152 if ($partitionend le $partitionstart) {
[14845]1153 $partitionend = substr($metadata_value, 0, 3);
[10398]1154 }
1155 }
1156 }
1157
1158 return $partitionend;
1159}
1160
1161
1162sub add_hlist_partition
1163{
1164 my $self = shift(@_);
[12889]1165 my @metadata_groups = @{shift(@_)};
[12893]1166 my $classifier_node = shift(@_);
[10398]1167 my $partitionname = shift(@_);
[14845]1168 my $metadata_value_to_OIDs_hash_ref = shift(@_);
[10398]1169
1170 # Create an hlist partition
[24016]1171 # Note that we don't need to convert from unicode-aware strings
1172 # to utf8 here, as that is handled elsewhere in the code
1173 my %child_classifier_node = ( 'Title' => $partitionname, #'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
[12893]1174 'childtype' => "VList",
1175 'contains' => [] );
[10398]1176
1177 # Add the children to the hlist partition
[14845]1178 $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
[12893]1179 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
[10398]1180}
1181
1182
1183sub add_vlist
1184{
1185 my $self = shift(@_);
[12889]1186 my @metadata_groups = @{shift(@_)};
[12893]1187 my $classifier_node = shift(@_);
[14845]1188 my $metadata_value_to_OIDs_hash_ref = shift(@_);
[12889]1189 my $metadata_group = shift(@metadata_groups);
[13287]1190 $classifier_node->{'mdtype'} = $metadata_group;
[10398]1191
[33452]1192 my $sort_type = $self->{$metadata_group .".metadata_sort_mode_within_level"};
[10398]1193 # Create an entry in the vlist for each value
[33452]1194 foreach my $metadata_value ($self->sort_metadata_values_array($sort_type, keys(%{$metadata_value_to_OIDs_hash_ref})))
[14845]1195 {
1196 my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
[33463]1197
[18619]1198 # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
1199 if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
[13271]1200 my $OID = $OIDs[0];
[21969]1201 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
[13271]1202 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
[21969]1203 }
[18619]1204 # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
1205 elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
[33463]1206
[21969]1207 @OIDs = $self->sort_leaf_items(\@OIDs);
1208 foreach my $OID (@OIDs) {
1209 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
1210 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
1211 }
1212
[10398]1213 }
1214 # Otherwise create a sublist (bookshelf) for the metadata value
[24012]1215 else {
[27098]1216 my $metadata_value_display = $self->get_metadata_value_display($metadata_group, $metadata_value);
[33452]1217 my %child_classifier_node = ( 'Title' => $metadata_value_display,
[12893]1218 'childtype' => "VList",
[21969]1219 'mdtype' => $metadata_group,
[12893]1220 'contains' => [] );
[10398]1221
1222 # If there are metadata elements remaining, recursively apply the process
[12889]1223 if (@metadata_groups > 0) {
[24012]1224 my $next_metadata_group = $metadata_groups[0];
[33463]1225
1226 # separate metadata into those that belong in the next/sub-metadata_group
1227 # and those that belong at the current level's metadata_group
[24012]1228
1229 my $OID_to_metadata_values_hash_ref = $self->{$next_metadata_group . ".list"};
1230 my @current_level_OIDs = ();
1231 my @next_level_OIDs = ();
1232 foreach my $OID (@OIDs)
1233 {
1234 if ($OID_to_metadata_values_hash_ref->{$OID}) {
1235 push(@next_level_OIDs, $OID);
1236 } else {
1237 push(@current_level_OIDs, $OID);
1238 }
1239 }
1240 # recursively process those docs belonging to the sub-metadata_group
1241 $self->add_level(\@metadata_groups, \@next_level_OIDs, \%child_classifier_node);
1242
1243 # For those docs that don't belong in the sub/next_metadata_group, but which belong
1244 # at this level, just add the documents as children of this list at the current level
1245 @current_level_OIDs = $self->sort_leaf_items(\@current_level_OIDs);
1246 foreach my $current_level_OID (@current_level_OIDs) {
1247 my $offset = $self->metadata_offset($metadata_group, $current_level_OID, $metadata_value);
1248 push(@{$child_classifier_node{'contains'}}, { 'OID' => $current_level_OID , 'offset' => $offset });
1249 }
[10398]1250 }
1251 # Otherwise just add the documents as children of this list
1252 else {
[21969]1253 @OIDs = $self->sort_leaf_items(\@OIDs);
1254 foreach my $OID (@OIDs) {
1255 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
1256 push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
1257 }
1258
[10398]1259 }
1260
1261 # Add the sublist to the list
[12893]1262 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
[10398]1263 }
1264 }
1265}
1266
[21969]1267sub metadata_offset
[18619]1268{
1269 my $self = shift(@_);
[21969]1270 my $metadata_group = shift(@_);
1271 my $OID = shift(@_);
1272 my $metadata_value = shift(@_);
[33463]1273
[21969]1274 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
1275 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
1276 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
1277 if ($metadata_value eq $metadata_values[$i]) {
1278 return $i;
1279 }
1280 }
1281
1282 return 0;
1283}
1284
1285sub sort_leaf_items
1286{
1287 my $self = shift(@_);
[18619]1288 my @OIDs = @{shift(@_)};
[33452]1289
[18619]1290 # Sort leaf nodes and add to list
[20825]1291 my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
1292 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) {
[18619]1293 my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
[33452]1294 my $sort_type = $self->{$sort_leaf_nodes_usingmetaelem . ".metadata_sort_mode_within_level"};
[18619]1295 # Force a stable sort (Perl 5.6's sort isn't stable)
1296 # !! The [0] bits aren't ideal (multiple metadata values) !!
[22667]1297 @OIDs = @OIDs[ sort {
[31767]1298 if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]}) && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]}))
[22667]1299 {
[33452]1300 if ($sort_type eq "numeric") {
1301 $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] <=> $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
1302 } elsif ($sort_type eq "alphabetic") {
1303 $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
1304 } else {
1305 ncmp($OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0], $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0]);
1306 }
[22667]1307 }
1308 else
1309 {
1310 $a <=> $b;
1311 }
1312 } 0..$#OIDs ];
[18619]1313 }
[23302]1314 if ($self->{'reverse_sort_leaf_nodes'}) {
1315 return reverse @OIDs;
1316 }
[21969]1317 return @OIDs;
[18619]1318}
1319
1320
[13551]1321sub sort_metadata_values_array
1322{
1323 my $self = shift(@_);
[33452]1324 my ($sort_mode) = shift(@_);
[13551]1325 my @metadata_values = @_;
1326
[33452]1327 if ($sort_mode eq "unicode") {
1328 if ($self->{'unicode_collator'}) {
1329 return $self->{'unicode_collator'}->sort(@metadata_values);
1330 }
1331 # the collator wasn't loaded, fall back on default
1332 $sort_mode = "alphanumeric";
[13551]1333 }
[33452]1334 if ($sort_mode eq "numeric") {
1335 return sort {$a <=> $b} @metadata_values;
[13551]1336 }
[33452]1337 if ($sort_mode eq "alphabetic") {
1338 return sort {$a cmp $b} @metadata_values;
1339 }
1340 # natural sort
1341 return nsort(@metadata_values);
[13551]1342}
1343
[33464]1344
[33452]1345# we are not using this any more. Using nsort instead
[31575]1346# $a and $b args automatically passed in and shouldn't be declared
[31567]1347sub alpha_numeric_cmp
1348{
[31577]1349 my $self = shift (@_);
1350 my ($aStr, $bStr) = @_;
1351 if ($aStr =~ m/^(\d+(\.\d+)?)/)
[31567]1352 {
1353 my $val_a = $1;
[31577]1354 if ($bStr =~ m/^(\d+(\.\d+)?)/)
[31567]1355 {
1356 my $val_b = $1;
1357 if ($val_a != $val_b)
1358 {
1359 return ($val_a <=> $val_b);
1360 }
1361 }
1362 }
1363
[31577]1364 return ($aStr cmp $bStr);
[31567]1365}
1366
1367
[27098]1368sub get_metadata_value_display {
1369 my $self = shift(@_);
1370 my ($metadata_group, $metadata_value) = @_;
[33482]1371 return $metadata_value if $self->{'use_formatted_metadata_for_bookshelf_display'};
[27098]1372 my $actual_values_hash = $self->{$metadata_group . ".actualvalues"}->{$metadata_value};
1373 my $display_value ="";
1374 my $max_count=0;
1375 foreach my $v (keys %$actual_values_hash) {
1376 if ($actual_values_hash->{$v} > $max_count) {
1377 $display_value = $v;
1378 $max_count = $actual_values_hash->{$v};
1379 }
1380 }
1381 return $display_value;
1382}
[10398]13831;
Note: See TracBrowser for help on using the repository browser.