source: main/trunk/greenstone2/perllib/classify/List.pm@ 27098

Last change on this file since 27098 was 27098, checked in by kjdon, 11 years ago

new option for List: standardize_capitalization. If set, then bookshelf names will be lowercase. If not set, the majority case variant will be used instead

  • Property svn:keywords set to Author Date Id Revision
File size: 37.8 KB
Line 
1###########################################################################
2#
3# List.pm -- A general and flexible list classifier with most of
4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
29# TO DO: - Remove punctuation from metadata values before sorting.
30# - Add an AZCompactList-style hlist option?
31#
32###########################################################################
33
34package List;
35
36
37use BaseClassifier;
38
39use strict;
40
41
42sub BEGIN {
43 @List::ISA = ('BaseClassifier');
44}
45
46my $partition_type_list =
47 [ { 'name' => "per_letter",
48 'desc' => "{List.level_partition.per_letter}" },
49 { 'name' => "approximate_size",
50 'desc' => "{List.level_partition.approximate_size}"},
51 { 'name' => "constant_size",
52 'desc' => "{List.level_partition.constant_size}" },
53 { 'name' => "none",
54 'desc' => "{List.level_partition.none}" } ];
55
56# following used to check types later on
57my $valid_partition_types = { 'per_letter' => 1,
58 'constant_size' => 1,
59 'per_letter_fixed_size' => 1,
60 'approximate_size' => 1,
61 'none' => 1};
62
63my $bookshelf_type_list =
64 [ { 'name' => "always",
65 'desc' => "{List.bookshelf_type.always}" },
66 { 'name' => "duplicate_only",
67 'desc' => "{List.bookshelf_type.duplicate_only}" },
68 { 'name' => "never",
69 'desc' => "{List.bookshelf_type.never}" } ];
70
71my $arguments =
72 [ { 'name' => "metadata",
73 'desc' => "{List.metadata}",
74 'type' => "metadata",
75 'reqd' => "yes" },
76
77 # The interesting options
78 { 'name' => "bookshelf_type",
79 'desc' => "{List.bookshelf_type}",
80 'type' => "enum",
81 'list' => $bookshelf_type_list,
82 'deft' => "never" },
83 { 'name' => "classify_sections",
84 'desc' => "{List.classify_sections}",
85 'type' => "flag" },
86 { 'name' => "partition_type_within_level",
87 'desc' => "{List.partition_type_within_level}",
88 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
89 'list' => $partition_type_list,
90 'deft' => "per_letter" },
91 { 'name' => "partition_size_within_level",
92 'desc' => "{List.partition_size_within_level}",
93 'type' => "string" }, # Must be string because multiple values can be specified (separated by '/')
94 { 'name' => "partition_name_length",
95 'desc' => "{List.partition_name_length}",
96 'type' => "string" },
97 { 'name' => "sort_leaf_nodes_using",
98 'desc' => "{List.sort_leaf_nodes_using}",
99 'type' => "metadata",
100 'deft' => "Title" },
101 { 'name' => "reverse_sort_leaf_nodes",
102 'desc' => "{List.reverse_sort_leaf_nodes}",
103 'type' => "flag"},
104 { 'name' => "sort_using_unicode_collation",
105 'desc' => "{List.sort_using_unicode_collation}",
106 'type' => "flag" },
107 { 'name' => "use_hlist_for",
108 'desc' => "{List.use_hlist_for}",
109 'type' => "string" },
110 {'name' => "filter_metadata",
111 'desc' => "{List.filter_metadata}",
112 'type' => "metadata"},
113 {'name' => "filter_regex",
114 'desc' => "{List.filter_regex}",
115 'type' => "regexp"},
116 { 'name' => "standardize_capitalization",
117 'desc' => "{List.standardize_capitalization}",
118 'type' => "flag"},
119 { 'name' => "removeprefix",
120 'desc' => "{BasClas.removeprefix}",
121 'type' => "regexp" },
122 { 'name' => "removesuffix",
123 'desc' => "{BasClas.removesuffix}",
124 'type' => "regexp" } ];
125
126my $options = { 'name' => "List",
127 'desc' => "{List.desc}",
128 'abstract' => "no",
129 'inherits' => "yes",
130 'args' => $arguments };
131
132
133sub new
134{
135 my ($class) = shift(@_);
136 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
137 push(@$classifierslist, $class);
138
139 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
140 push(@{$hashArgOptLists->{"OptList"}}, $options);
141
142 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
143
144 if ($self->{'info_only'}) {
145 # don't worry about any options etc
146 return bless $self, $class;
147 }
148
149 # The metadata elements to use (required)
150 if (!$self->{'metadata'}) {
151 die "Error: No metadata fields specified for List.\n";
152 }
153
154 my @metadata_groups = split(/[\/]/, $self->{'metadata'});
155 $self->{'metadata_groups'} = \@metadata_groups;
156
157 # The classifier button name (default: the first metadata element specified)
158 if (!$self->{'buttonname'}) {
159 my $first_metadata_group = $metadata_groups[0];
160 my $first_metadata_element = (split(/[\;|,\/]/, $first_metadata_group))[0];
161 $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
162 }
163
164 # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
165 foreach my $metadata_group (@metadata_groups) {
166 $self->{$metadata_group . ".bookshelf_type"} = "always";
167 }
168 my $last_metadata_group = $metadata_groups[$#metadata_groups];
169 # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
170 $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
171
172 # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
173 foreach my $metadata_group (@metadata_groups) {
174 $self->{$metadata_group . ".list_type"} = "VList";
175 }
176 foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
177 $self->{$metadata_group . ".list_type"} = "HList";
178 }
179
180 # How the items are grouped into partitions (default: no partition)
181 # for each level (metadata group), separated by '/'
182 if (!$self->{"partition_type_within_level"}) {
183 foreach my $metadata_group (@metadata_groups) {
184 $self->{$metadata_group . ".partition_type_within_level"} = "none";
185 }
186 } else {
187 my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'});
188
189 my $first = 1;
190 foreach my $metadata_group (@metadata_groups) {
191 my $partition_type_within_levelelem = shift(@partition_type_within_levellist);
192 if (defined($partition_type_within_levelelem) && $partition_type_within_levelelem eq "per_letter_fixed_size") {
193 print STDERR "per letter fixed size, changing to approximate size\n";
194 $partition_type_within_levelelem = "approximate_size";
195 }
196 if (defined($partition_type_within_levelelem) && defined $valid_partition_types->{$partition_type_within_levelelem}) {
197 $self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem;
198 }
199 else {
200 if ($first) {
201 $self->{$metadata_group . ".partition_type_within_level"} = "none";
202 $first = 0;
203 } else {
204 $self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"};
205 }
206 if (defined($partition_type_within_levelelem)) {
207 # ie invalid entry
208 print STDERR "invalid partition type for level $metadata_group: $partition_type_within_levelelem, defaulting to ". $self->{$metadata_group . ".partition_type_within_level"} ."\n";
209 }
210 }
211 }
212 }
213
214 # The number of items in each partition
215 if (!$self->{'partition_size_within_level'}) {
216 # Default: 20
217 foreach my $metadata_group (@metadata_groups) {
218 $self->{$metadata_group . ".partition_size_within_level"} = 20;
219 }
220 }
221 else {
222 my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
223
224 # Assign values based on the partition_size_within_level parameter
225 foreach my $metadata_group (@metadata_groups) {
226 my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
227 if (defined($partition_size_within_levelelem)) {
228 $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
229 }
230 else {
231 $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
232 }
233 }
234 }
235
236 # The removeprefix and removesuffix expressions
237 if ($self->{'removeprefix'}) {
238 # If there are more than one expressions, use '' to quote each experession and '/' to separate
239 my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
240
241 foreach my $metadata_group (@metadata_groups) {
242 my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
243 if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
244 # Remove the other ' at the beginning and the end if there is any
245 $removeprefix_expr_within_levelelem =~ s/^'//;
246 $removeprefix_expr_within_levelelem =~ s/'$//;
247 # Remove the extra ^ at the beginning
248 $removeprefix_expr_within_levelelem =~ s/^\^//;
249 $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
250 } else {
251 $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
252 }
253 }
254 }
255 if ($self->{'removesuffix'}) {
256 my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
257
258 foreach my $metadata_group (@metadata_groups) {
259 my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
260 if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
261 $removesuffix_expr_within_levelelem =~ s/^'//;
262 $removesuffix_expr_within_levelelem =~ s/'$//;
263 # Remove the extra $ at the end
264 $removesuffix_expr_within_levelelem =~ s/\$$//;
265 $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
266 } else {
267 $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
268 }
269 }
270 }
271
272 # The metadata elements to use to sort the leaf nodes (default: Title)
273 my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
274 if ($self->{'sort_leaf_nodes_using'}) {
275 @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
276 }
277 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
278
279 # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
280 if ($self->{'sort_using_unicode_collation'}) {
281 # To use this you first need to download the allkeys.txt file from
282 # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
283 # Unicode/Collate directory.
284 require Unicode::Collate;
285 $self->{'unicode_collator'} = Unicode::Collate->new();
286 }
287
288 # An empty array for the document/section OIDs that we are classifying
289 $self->{'OIDs'} = [];
290 # A hash for all the doc ids that we have seen, so we don't classify something twice
291 $self->{'all_doc_OIDs'} = {};
292 return bless $self, $class;
293}
294
295
296sub init
297{
298 # Nothing to do...
299}
300
301
302# Called for each document in the collection
303sub classify
304{
305 my $self = shift(@_);
306 my ($doc_obj) = @_;
307
308 if (defined $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()}) {
309 print STDERR "Warning, List classifier has already seen document ".$doc_obj->get_OID().", not classifying again\n";
310 return;
311 }
312 $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()} = 1;
313 # check against filter here
314 if ($self->{'filter_metadata'}) {
315 #print STDERR "filtering documents on $self->{'filter_metadata'}\n";
316 my $meta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'filter_metadata'});
317 return unless defined $meta;
318 if ($self->{'filter_regex'} ne "" && $meta !~ /$self->{'filter_regex'}/) {
319 #print STDERR "doesn't match regex\n";
320 return;
321
322 }
323 }
324 # if we get here, we have passed the test for filtering
325 # If "-classify_sections" is set, classify every section of the document
326 if ($self->{'classify_sections'}) {
327 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
328 while (defined $section) {
329 $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section);
330 $section = $doc_obj->get_next_section($section);
331 }
332 }
333 # Otherwise just classify the top document section
334 else {
335 $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section());
336 }
337
338}
339
340sub classify_section
341{
342 my $self = shift(@_);
343 my ($doc_obj,$section_OID,$section) = @_;
344
345 my @metadata_groups = @{$self->{'metadata_groups'}};
346
347
348 # Only classify the section if it has a value for one of the metadata elements in the first group
349 my $classify_section = 0;
350 my $first_metadata_group = $metadata_groups[0];
351 my $remove_prefix_expr = $self->{$first_metadata_group . ".remove_prefix_expr"};
352 my $remove_suffix_expr = $self->{$first_metadata_group . ".remove_suffix_expr"};
353 foreach my $first_metadata_group_element (split(/\;|,/, $first_metadata_group)) {
354 my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
355 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element);
356
357 # Remove prefix/suffix if requested
358 if (defined ($first_metadata_group_element_value)) {
359 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
360 $first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
361 }
362
363 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
364 $first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
365 }
366 }
367 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
368 # This section must be included in the classifier
369 $classify_section = 1;
370 last;
371 }
372 }
373
374 # We're not classifying this section because it doesn't have the required metadata
375 return if (!$classify_section);
376
377 # Otherwise, include this section in the classifier
378
379 push(@{$self->{'OIDs'}}, $section_OID);
380
381 # Create a hash for the metadata values of each metadata element we're interested in
382 my %metadata_groups_done = ();
383 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
384 # Take care not to do a metadata group more than once
385 unless ($metadata_groups_done{$metadata_group}) {
386 my $remove_prefix_expr = $self->{$metadata_group . ".remove_prefix_expr"};
387 my $remove_suffix_expr = $self->{$metadata_group . ".remove_suffix_expr"};
388 foreach my $metadata_element (split(/\;|,/, $metadata_group)) {
389 my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);
390
391 my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
392 foreach my $metadata_value (@metadata_values) {
393 # Strip leading and trailing whitespace
394 $metadata_value =~ s/^\s*//;
395 $metadata_value =~ s/\s*$//;
396
397 # Remove prefix/suffix if requested
398 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
399 $metadata_value =~ s/^$remove_prefix_expr//;
400 }
401 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
402 $metadata_value =~ s/$remove_suffix_expr$//;
403 }
404
405 # lowercase metadata both for sorting meta (d/D under D), and to allow CSS to be able to
406 # text-transform the stored lowercase values as capitalize or uppercase (can't CSS
407 # text-transform if stored uppercase). 2 CSS text-transforms have been added to core.css
408 my $lc_metadata_value = lc($metadata_value);
409
410 # We are already working with unicode aware strings at this
411 # stage, so we no longer need to convert from utf8 to unicode
412 #my $metadata_value_unicode_string = $metadata_value; # $self->convert_utf8_string_to_unicode_string($metadata_value);
413
414 # Add the metadata value into the list for this combination of metadata group and section
415 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $lc_metadata_value);
416 # add the actual value into the stored values so we can remember the case
417 if (!$self->{'standardize_capitalization'}) {
418 if (defined $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}) {
419 $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}++;
420 } else {
421 $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value} = 1;
422 }
423 }
424 }
425 last if (@metadata_values > 0);
426 }
427
428 $metadata_groups_done{$metadata_group} = 1;
429 }
430 }
431}
432
433
434sub get_classify_info
435{
436 my $self = shift(@_);
437
438 # The metadata groups to classify by
439 my @metadata_groups = @{$self->{'metadata_groups'}};
440 my $first_metadata_group = $metadata_groups[0];
441
442 # The OID values of the documents to include in the classifier
443 my @OIDs = @{$self->{'OIDs'}};
444
445 # Create the root node of the classification hierarchy
446 my %classifier_node = ( 'thistype' => "Invisible",
447 'childtype' => $self->{$first_metadata_group . ".list_type"},
448 'Title' => $self->{'buttonname'},
449 'contains' => [],
450 'mdtype' => $first_metadata_group );
451
452 # Recursively create the classification hierarchy, one level for each metadata group
453 $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
454 return \%classifier_node;
455}
456
457
458sub add_level
459{
460 my $self = shift(@_);
461 my @metadata_groups = @{shift(@_)};
462 my @OIDs = @{shift(@_)};
463 my $classifier_node = shift(@_);
464
465 my $metadata_group = $metadata_groups[0];
466
467 if (!defined($self->{$metadata_group . ".list"})) {
468 print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
469 return;
470 }
471
472 # Create a mapping from metadata value to OID
473 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
474 my %metadata_value_to_OIDs_hash = ();
475 foreach my $OID (@OIDs)
476 {
477 if ($OID_to_metadata_values_hash_ref->{$OID})
478 {
479 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
480 foreach my $metadata_value (@metadata_values)
481 {
482 push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
483 }
484 }
485 }
486 #print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
487
488 # Partition the values (if necessary)
489 my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
490 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
491 if ($partition_type_within_level =~ /^per_letter$/i) {
492 # Generate one hlist for each letter
493 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
494 my %metadata_value_to_OIDs_subhash = ();
495
496 my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
497 foreach my $metadata_value (@sortedmetadata_values) {
498 my $metadata_valuepartition = substr($metadata_value, 0, 1);
499
500 # Is this the start of a new partition?
501 if ($metadata_valuepartition ne $lastpartition) {
502 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
503 %metadata_value_to_OIDs_subhash = ();
504 $lastpartition = $metadata_valuepartition;
505 }
506
507 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
508 }
509
510 # Don't forget to add the last partition
511 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
512
513 # The partitions are stored in an HList
514 $classifier_node->{'childtype'} = "HList";
515 }
516 elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
517 # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size
518 # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible
519 # If a letter has more items than specified, split into several hlists.
520 # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
521 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
522 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
523 my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};
524
525 # Separate values by their first letter, each form a bucket, like the per_letter partition type
526 my $last_partition = substr($sortedmetadata_values[0], 0, 1);
527 my %partition_buckets = ();
528 my @metadata_values_in_bucket = ();
529 my $num_items_in_bucket = 0;
530 foreach my $metadata_value (@sortedmetadata_values) {
531 my $metadata_valuepartition = substr($metadata_value, 0, 1);
532 if ($metadata_valuepartition ne $last_partition) {
533 my @temp_array = @metadata_values_in_bucket;
534 # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
535 my %partition_info = ();
536 $partition_info{'metadata_values'} = \@temp_array;
537 $partition_info{'size'} = $num_items_in_bucket;
538 $partition_buckets{$last_partition} = \%partition_info;
539
540 @metadata_values_in_bucket = ($metadata_value);
541 $num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
542 $last_partition = $metadata_valuepartition;
543 } else {
544 $num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
545 push (@metadata_values_in_bucket, $metadata_value);
546 }
547 }
548 # Last one
549 my %partition_info = ();
550 $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
551 $partition_info{'size'} = $num_items_in_bucket;
552 $partition_buckets{$last_partition} = \%partition_info;
553
554 my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
555 for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) {
556 my $partition = $partition_keys[$i];
557 my $items_in_partition = $partition_buckets{$partition}->{'size'};
558 # Merge small buckets together, but keep the numeric bucket apart
559 if ($items_in_partition < $partition_size_within_level) {
560 my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'};
561 if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level
562 && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)
563 || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) {
564 foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) {
565 push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge);
566 }
567 $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition;
568 delete $partition_buckets{$partition};
569 }
570 }
571 }
572 @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
573
574 # Add partitions, and divide big bucket into several
575 my $last_partition_end = "";
576 my $partition_start = "";
577 foreach my $partition (@partition_keys) {
578 my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}});
579 my $items_in_partition = $partition_buckets{$partition}->{'size'};
580 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"});
581
582 if ($items_in_partition > $partition_size_within_level) {
583 my $items_done = 0;
584 my %metadata_values_to_OIDs_subhashes = ();
585 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
586 my $metadata_value = $metadata_values[$i];
587 # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
588 my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;
589
590 my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"});
591 my $partitionname = $partition_start;
592 if ($partitionend ne $partition_start) {
593 $partitionname = $partitionname . "-" . $partitionend;
594 }
595
596 # Start a new partition
597 if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) {
598 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
599 $last_partition_end = $partitionend;
600 $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
601 $items_done = 0;
602 %metadata_values_to_OIDs_subhashes = ();
603 }
604
605 # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
606 if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) {
607 my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
608 # Get the number of partitions needed for this value
609 my $num_splits = int($items_for_this_md_value / $partition_size_within_level);
610 $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits);
611
612 my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}};
613 for (my $i = 0; $i < $num_splits; $i++) {
614 my %OIDs_subhashes_for_this_value = ();
615 my @OIDs_for_this_partition = ();
616 for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) {
617 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
618 }
619
620 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
621 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) {
622 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
623 $items_done += scalar(@OIDs_for_this_partition);
624 next;
625 }
626
627 # Add an HList for this bucket
628 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
629 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
630 $last_partition_end = $partitionname_for_this_value;
631 }
632 next;
633 }
634
635 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
636 $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;
637
638 # The last partition
639 if($i == scalar(@metadata_values) - 1) {
640 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
641 }
642 }
643 }
644 else {
645 # The easier case, just add a partition
646 my %metadata_values_to_OIDs_subhashes = ();
647 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
648 my $metadata_value = $metadata_values[$i];
649 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
650 }
651 my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
652 my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"});
653 my $partitionname = $partition_start;
654 if ($partitionend ne $partition_start) {
655 $partitionname = $partitionname . "-" . $partitionend;
656 }
657 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
658 $last_partition_end = $partitionend;
659 }
660 }
661
662 # The partitions are stored in an HList
663 $classifier_node->{'childtype'} = "HList";
664
665 } # end approximate_size
666 else {
667 # Generate hlists of a certain size
668 if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
669 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
670 my $itemsdone = 0;
671 my %metadata_value_to_OIDs_subhash = ();
672 my $lastpartitionend = "";
673 my $partitionstart;
674 foreach my $metadata_value (@sortedmetadata_values) {
675 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
676 $itemsdone++;
677 my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
678
679 # Is this the start of a new partition?
680 if ($itemsinpartition == 1) {
681 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
682 }
683
684 # Is this the end of the partition?
685 if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) {
686 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
687 my $partitionname = $partitionstart;
688 if ($partitionend ne $partitionstart) {
689 $partitionname = $partitionname . "-" . $partitionend;
690 }
691
692 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
693 %metadata_value_to_OIDs_subhash = ();
694 $lastpartitionend = $partitionend;
695 }
696 }
697
698 # The partitions are stored in an HList
699 $classifier_node->{'childtype'} = "HList";
700 }
701
702 # Otherwise just add all the values to a VList
703 else {
704 $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
705 }
706 }
707}
708
709
710sub generate_partition_start
711{
712 my $self = shift(@_);
713 my $metadata_value = shift(@_);
714 my $lastpartitionend = shift(@_);
715 my $partition_name_length = shift(@_);
716
717 if ($partition_name_length) {
718 return substr($metadata_value, 0, $partition_name_length);
719 }
720
721 my $partitionstart = substr($metadata_value, 0, 1);
722 if ($partitionstart le $lastpartitionend) {
723 $partitionstart = substr($metadata_value, 0, 2);
724 # Give up after three characters
725 if ($partitionstart le $lastpartitionend) {
726 $partitionstart = substr($metadata_value, 0, 3);
727 }
728 }
729
730 return $partitionstart;
731}
732
733
734sub generate_partition_end
735{
736 my $self = shift(@_);
737 my $metadata_value = shift(@_);
738 my $partitionstart = shift(@_);
739 my $partition_name_length = shift(@_);
740
741 if ($partition_name_length) {
742 return substr($metadata_value, 0, $partition_name_length);
743 }
744
745 my $partitionend = substr($metadata_value, 0, length($partitionstart));
746 if ($partitionend gt $partitionstart) {
747 $partitionend = substr($metadata_value, 0, 1);
748 if ($partitionend le $partitionstart) {
749 $partitionend = substr($metadata_value, 0, 2);
750 # Give up after three characters
751 if ($partitionend le $partitionstart) {
752 $partitionend = substr($metadata_value, 0, 3);
753 }
754 }
755 }
756
757 return $partitionend;
758}
759
760
761sub add_hlist_partition
762{
763 my $self = shift(@_);
764 my @metadata_groups = @{shift(@_)};
765 my $classifier_node = shift(@_);
766 my $partitionname = shift(@_);
767 my $metadata_value_to_OIDs_hash_ref = shift(@_);
768
769 # Create an hlist partition
770 # Note that we don't need to convert from unicode-aware strings
771 # to utf8 here, as that is handled elsewhere in the code
772 my %child_classifier_node = ( 'Title' => $partitionname, #'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
773 'childtype' => "VList",
774 'contains' => [] );
775
776 # Add the children to the hlist partition
777 $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
778 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
779}
780
781
782sub add_vlist
783{
784 my $self = shift(@_);
785 my @metadata_groups = @{shift(@_)};
786 my $classifier_node = shift(@_);
787 my $metadata_value_to_OIDs_hash_ref = shift(@_);
788
789 my $metadata_group = shift(@metadata_groups);
790 $classifier_node->{'mdtype'} = $metadata_group;
791
792 # Create an entry in the vlist for each value
793 foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
794 {
795 my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
796 # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
797 if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
798 my $OID = $OIDs[0];
799 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
800 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
801 }
802 # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
803 elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
804 @OIDs = $self->sort_leaf_items(\@OIDs);
805 foreach my $OID (@OIDs) {
806 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
807 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
808 }
809
810 }
811 # Otherwise create a sublist (bookshelf) for the metadata value
812 else {
813 my $metadata_value_display = $self->get_metadata_value_display($metadata_group, $metadata_value);
814 # Note that we don't need to convert from unicode-aware strings
815 # to utf8 here, as that is handled elsewhere in the code
816 my %child_classifier_node = ( 'Title' => $metadata_value_display, # 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
817 'childtype' => "VList",
818 'mdtype' => $metadata_group,
819 'contains' => [] );
820
821 #@OIDs = $self->sort_leaf_items(\@OIDs);
822 # If there are metadata elements remaining, recursively apply the process
823 if (@metadata_groups > 0) {
824 my $next_metadata_group = $metadata_groups[0];
825 $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
826
827 # separate metadata into those that below in the next/sub-metadata_group
828 # and those that below at the current level's metadata_group
829
830 my $OID_to_metadata_values_hash_ref = $self->{$next_metadata_group . ".list"};
831 my @current_level_OIDs = ();
832 my @next_level_OIDs = ();
833 foreach my $OID (@OIDs)
834 {
835 if ($OID_to_metadata_values_hash_ref->{$OID}) {
836 push(@next_level_OIDs, $OID);
837 } else {
838 push(@current_level_OIDs, $OID);
839 }
840 }
841 # recursively process those docs belonging to the sub-metadata_group
842 $self->add_level(\@metadata_groups, \@next_level_OIDs, \%child_classifier_node);
843
844 # For those docs that don't belong in the sub/next_metadata_group, but which belong
845 # at this level, just add the documents as children of this list at the current level
846 @current_level_OIDs = $self->sort_leaf_items(\@current_level_OIDs);
847 foreach my $current_level_OID (@current_level_OIDs) {
848 my $offset = $self->metadata_offset($metadata_group, $current_level_OID, $metadata_value);
849 push(@{$child_classifier_node{'contains'}}, { 'OID' => $current_level_OID , 'offset' => $offset });
850 }
851 }
852 # Otherwise just add the documents as children of this list
853 else {
854 @OIDs = $self->sort_leaf_items(\@OIDs);
855 foreach my $OID (@OIDs) {
856 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
857 push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
858 }
859
860 }
861
862 # Add the sublist to the list
863 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
864 }
865 }
866}
867
868sub metadata_offset
869{
870 my $self = shift(@_);
871 my $metadata_group = shift(@_);
872 my $OID = shift(@_);
873 my $metadata_value = shift(@_);
874
875 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
876 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
877 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
878 if ($metadata_value eq $metadata_values[$i]) {
879 return $i;
880 }
881 }
882
883 return 0;
884}
885
886sub sort_leaf_items
887{
888 my $self = shift(@_);
889 my @OIDs = @{shift(@_)};
890# my $classifier_node = shift(@_);
891
892 # Sort leaf nodes and add to list
893 my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
894 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) {
895 my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
896 # Force a stable sort (Perl 5.6's sort isn't stable)
897 # !! The [0] bits aren't ideal (multiple metadata values) !!
898 @OIDs = @OIDs[ sort {
899 if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]} && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]})))
900 {
901 $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
902 }
903 else
904 {
905 $a <=> $b;
906 }
907 } 0..$#OIDs ];
908 }
909 if ($self->{'reverse_sort_leaf_nodes'}) {
910 #print STDERR "reversing\n";
911 return reverse @OIDs;
912 }
913 return @OIDs;
914}
915
916
917
918sub sort_metadata_values_array
919{
920 my $self = shift(@_);
921 my @metadata_values = @_;
922
923 if ($self->{'unicode_collator'}) {
924 return $self->{'unicode_collator'}->sort(@metadata_values);
925 }
926 else {
927 return sort(@metadata_values);
928 }
929}
930
931sub get_metadata_value_display {
932 my $self = shift(@_);
933 my ($metadata_group, $metadata_value) = @_;
934 return $metadata_value if $self->{'standardize_capitalization'};
935 my $actual_values_hash = $self->{$metadata_group . ".actualvalues"}->{$metadata_value};
936 my $display_value ="";
937 my $max_count=0;
938 foreach my $v (keys %$actual_values_hash) {
939 if ($actual_values_hash->{$v} > $max_count) {
940 $display_value = $v;
941 $max_count = $actual_values_hash->{$v};
942 }
943 }
944 return $display_value;
945}
9461;
Note: See TracBrowser for help on using the repository browser.