source: main/trunk/greenstone2/perllib/classify/List.pm@ 26545

Last change on this file since 26545 was 26545, checked in by ak19, 11 years ago

List.pm makes all bucket titles lowercase now, for sorting purposes (so that titles that start with d and D can still go under bucket D), while css is used to titlecase buckets and bookshelves.

  • Property svn:keywords set to Author Date Id Revision
File size: 36.6 KB
Line 
1###########################################################################
2#
3# List.pm -- A general and flexible list classifier with most of
4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
29# TO DO: - Remove punctuation from metadata values before sorting.
30# - Add an AZCompactList-style hlist option?
31#
32###########################################################################
33
34package List;
35
36
37use BaseClassifier;
38
39use strict;
40
41
42sub BEGIN {
43 @List::ISA = ('BaseClassifier');
44}
45
46my $partition_type_list =
47 [ { 'name' => "per_letter",
48 'desc' => "{List.level_partition.per_letter}" },
49 { 'name' => "approximate_size",
50 'desc' => "{List.level_partition.approximate_size}"},
51 { 'name' => "constant_size",
52 'desc' => "{List.level_partition.constant_size}" },
53 { 'name' => "none",
54 'desc' => "{List.level_partition.none}" } ];
55
56# following used to check types later on
57my $valid_partition_types = { 'per_letter' => 1,
58 'constant_size' => 1,
59 'per_letter_fixed_size' => 1,
60 'approximate_size' => 1,
61 'none' => 1};
62
63my $bookshelf_type_list =
64 [ { 'name' => "always",
65 'desc' => "{List.bookshelf_type.always}" },
66 { 'name' => "duplicate_only",
67 'desc' => "{List.bookshelf_type.duplicate_only}" },
68 { 'name' => "never",
69 'desc' => "{List.bookshelf_type.never}" } ];
70
71my $arguments =
72 [ { 'name' => "metadata",
73 'desc' => "{List.metadata}",
74 'type' => "metadata",
75 'reqd' => "yes" },
76
77 # The interesting options
78 { 'name' => "bookshelf_type",
79 'desc' => "{List.bookshelf_type}",
80 'type' => "enum",
81 'list' => $bookshelf_type_list,
82 'deft' => "never" },
83 { 'name' => "classify_sections",
84 'desc' => "{List.classify_sections}",
85 'type' => "flag" },
86 { 'name' => "partition_type_within_level",
87 'desc' => "{List.partition_type_within_level}",
88 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
89 'list' => $partition_type_list,
90 'deft' => "per_letter" },
91 { 'name' => "partition_size_within_level",
92 'desc' => "{List.partition_size_within_level}",
93 'type' => "string" }, # Must be string because multiple values can be specified (separated by '/')
94 { 'name' => "partition_name_length",
95 'desc' => "{List.partition_name_length}",
96 'type' => "string" },
97 { 'name' => "sort_leaf_nodes_using",
98 'desc' => "{List.sort_leaf_nodes_using}",
99 'type' => "metadata",
100 'deft' => "Title" },
101 { 'name' => "reverse_sort_leaf_nodes",
102 'desc' => "{List.reverse_sort_leaf_nodes}",
103 'type' => "flag"},
104 { 'name' => "sort_using_unicode_collation",
105 'desc' => "{List.sort_using_unicode_collation}",
106 'type' => "flag" },
107 { 'name' => "use_hlist_for",
108 'desc' => "{List.use_hlist_for}",
109 'type' => "string" },
110 {'name' => "filter_metadata",
111 'desc' => "{List.filter_metadata}",
112 'type' => "metadata"},
113 {'name' => "filter_regex",
114 'desc' => "{List.filter_regex}",
115 'type' => "regexp"},
116
117 { 'name' => "removeprefix",
118 'desc' => "{BasClas.removeprefix}",
119 'type' => "regexp" },
120 { 'name' => "removesuffix",
121 'desc' => "{BasClas.removesuffix}",
122 'type' => "regexp" } ];
123
124my $options = { 'name' => "List",
125 'desc' => "{List.desc}",
126 'abstract' => "no",
127 'inherits' => "yes",
128 'args' => $arguments };
129
130
131sub new
132{
133 my ($class) = shift(@_);
134 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
135 push(@$classifierslist, $class);
136
137 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
138 push(@{$hashArgOptLists->{"OptList"}}, $options);
139
140 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
141
142 if ($self->{'info_only'}) {
143 # don't worry about any options etc
144 return bless $self, $class;
145 }
146
147 # The metadata elements to use (required)
148 if (!$self->{'metadata'}) {
149 die "Error: No metadata fields specified for List.\n";
150 }
151
152 my @metadata_groups = split(/[\/]/, $self->{'metadata'});
153 $self->{'metadata_groups'} = \@metadata_groups;
154
155 # The classifier button name (default: the first metadata element specified)
156 if (!$self->{'buttonname'}) {
157 my $first_metadata_group = $metadata_groups[0];
158 my $first_metadata_element = (split(/[\;|,\/]/, $first_metadata_group))[0];
159 $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
160 }
161
162 # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
163 foreach my $metadata_group (@metadata_groups) {
164 $self->{$metadata_group . ".bookshelf_type"} = "always";
165 }
166 my $last_metadata_group = $metadata_groups[$#metadata_groups];
167 # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
168 $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
169
170 # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
171 foreach my $metadata_group (@metadata_groups) {
172 $self->{$metadata_group . ".list_type"} = "VList";
173 }
174 foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
175 $self->{$metadata_group . ".list_type"} = "HList";
176 }
177
178 # How the items are grouped into partitions (default: no partition)
179 # for each level (metadata group), separated by '/'
180 if (!$self->{"partition_type_within_level"}) {
181 foreach my $metadata_group (@metadata_groups) {
182 $self->{$metadata_group . ".partition_type_within_level"} = "none";
183 }
184 } else {
185 my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'});
186
187 my $first = 1;
188 foreach my $metadata_group (@metadata_groups) {
189 my $partition_type_within_levelelem = shift(@partition_type_within_levellist);
190 if (defined($partition_type_within_levelelem) && $partition_type_within_levelelem eq "per_letter_fixed_size") {
191 print STDERR "per letter fixed size, changing to approximate size\n";
192 $partition_type_within_levelelem = "approximate_size";
193 }
194 if (defined($partition_type_within_levelelem) && defined $valid_partition_types->{$partition_type_within_levelelem}) {
195 $self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem;
196 }
197 else {
198 if ($first) {
199 $self->{$metadata_group . ".partition_type_within_level"} = "none";
200 $first = 0;
201 } else {
202 $self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"};
203 }
204 if (defined($partition_type_within_levelelem)) {
205 # ie invalid entry
206 print STDERR "invalid partition type for level $metadata_group: $partition_type_within_levelelem, defaulting to ". $self->{$metadata_group . ".partition_type_within_level"} ."\n";
207 }
208 }
209 }
210 }
211
212 # The number of items in each partition
213 if (!$self->{'partition_size_within_level'}) {
214 # Default: 20
215 foreach my $metadata_group (@metadata_groups) {
216 $self->{$metadata_group . ".partition_size_within_level"} = 20;
217 }
218 }
219 else {
220 my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
221
222 # Assign values based on the partition_size_within_level parameter
223 foreach my $metadata_group (@metadata_groups) {
224 my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
225 if (defined($partition_size_within_levelelem)) {
226 $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
227 }
228 else {
229 $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
230 }
231 }
232 }
233
234 # The removeprefix and removesuffix expressions
235 if ($self->{'removeprefix'}) {
236 # If there are more than one expressions, use '' to quote each experession and '/' to separate
237 my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
238
239 foreach my $metadata_group (@metadata_groups) {
240 my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
241 if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
242 # Remove the other ' at the beginning and the end if there is any
243 $removeprefix_expr_within_levelelem =~ s/^'//;
244 $removeprefix_expr_within_levelelem =~ s/'$//;
245 # Remove the extra ^ at the beginning
246 $removeprefix_expr_within_levelelem =~ s/^\^//;
247 $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
248 } else {
249 $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
250 }
251 }
252 }
253 if ($self->{'removesuffix'}) {
254 my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
255
256 foreach my $metadata_group (@metadata_groups) {
257 my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
258 if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
259 $removesuffix_expr_within_levelelem =~ s/^'//;
260 $removesuffix_expr_within_levelelem =~ s/'$//;
261 # Remove the extra $ at the end
262 $removesuffix_expr_within_levelelem =~ s/\$$//;
263 $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
264 } else {
265 $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
266 }
267 }
268 }
269
270 # The metadata elements to use to sort the leaf nodes (default: Title)
271 my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
272 if ($self->{'sort_leaf_nodes_using'}) {
273 @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
274 }
275 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
276
277 # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
278 if ($self->{'sort_using_unicode_collation'}) {
279 # To use this you first need to download the allkeys.txt file from
280 # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
281 # Unicode/Collate directory.
282 require Unicode::Collate;
283 $self->{'unicode_collator'} = Unicode::Collate->new();
284 }
285
286 # An empty array for the document/section OIDs that we are classifying
287 $self->{'OIDs'} = [];
288 # A hash for all the doc ids that we have seen, so we don't classify something twice
289 $self->{'all_doc_OIDs'} = {};
290 return bless $self, $class;
291}
292
293
294sub init
295{
296 # Nothing to do...
297}
298
299
300# Called for each document in the collection
301sub classify
302{
303 my $self = shift(@_);
304 my ($doc_obj) = @_;
305
306 if (defined $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()}) {
307 print STDERR "Warning, List classifier has already seen document ".$doc_obj->get_OID().", not classifying again\n";
308 return;
309 }
310 $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()} = 1;
311 # check against filter here
312 if ($self->{'filter_metadata'}) {
313 #print STDERR "filtering documents on $self->{'filter_metadata'}\n";
314 my $meta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'filter_metadata'});
315 return unless defined $meta;
316 if ($self->{'filter_regex'} ne "" && $meta !~ /$self->{'filter_regex'}/) {
317 #print STDERR "doesn't match regex\n";
318 return;
319
320 }
321 }
322 # if we get here, we have passed the test for filtering
323 # If "-classify_sections" is set, classify every section of the document
324 if ($self->{'classify_sections'}) {
325 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
326 while (defined $section) {
327 $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section);
328 $section = $doc_obj->get_next_section($section);
329 }
330 }
331 # Otherwise just classify the top document section
332 else {
333 $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section());
334 }
335
336}
337
338sub classify_section
339{
340 my $self = shift(@_);
341 my ($doc_obj,$section_OID,$section) = @_;
342
343 my @metadata_groups = @{$self->{'metadata_groups'}};
344
345
346 # Only classify the section if it has a value for one of the metadata elements in the first group
347 my $classify_section = 0;
348 my $first_metadata_group = $metadata_groups[0];
349 my $remove_prefix_expr = $self->{$first_metadata_group . ".remove_prefix_expr"};
350 my $remove_suffix_expr = $self->{$first_metadata_group . ".remove_suffix_expr"};
351 foreach my $first_metadata_group_element (split(/\;|,/, $first_metadata_group)) {
352 my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
353 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element);
354
355 # Remove prefix/suffix if requested
356 if (defined ($first_metadata_group_element_value)) {
357 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
358 $first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
359 }
360
361 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
362 $first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
363 }
364 }
365 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
366 # This section must be included in the classifier
367 $classify_section = 1;
368 last;
369 }
370 }
371
372 # We're not classifying this section because it doesn't have the required metadata
373 return if (!$classify_section);
374
375 # Otherwise, include this section in the classifier
376
377 push(@{$self->{'OIDs'}}, $section_OID);
378
379 # Create a hash for the metadata values of each metadata element we're interested in
380 my %metadata_groups_done = ();
381 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
382 # Take care not to do a metadata group more than once
383 unless ($metadata_groups_done{$metadata_group}) {
384 my $remove_prefix_expr = $self->{$metadata_group . ".remove_prefix_expr"};
385 my $remove_suffix_expr = $self->{$metadata_group . ".remove_suffix_expr"};
386 foreach my $metadata_element (split(/\;|,/, $metadata_group)) {
387 my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);
388
389 my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
390 foreach my $metadata_value (@metadata_values) {
391 # Strip leading and trailing whitespace
392 $metadata_value =~ s/^\s*//;
393 $metadata_value =~ s/\s*$//;
394
395 # Remove prefix/suffix if requested
396 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
397 $metadata_value =~ s/^$remove_prefix_expr//;
398 }
399 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
400 $metadata_value =~ s/$remove_suffix_expr$//;
401 }
402
403 # lowercase metadata both for sorting meta (d/D under D), and to allow CSS to be able to
404 # text-transform the stored lowercase values as capitalize or uppercase (can't CSS
405 # text-transform if stored uppercase). 2 CSS text-transforms have been added to core.css
406 $metadata_value = lc($metadata_value);
407
408 # We are already working with unicode aware strings at this
409 # stage, so we no longer need to convert from utf8 to unicode
410 my $metadata_value_unicode_string = $metadata_value; # $self->convert_utf8_string_to_unicode_string($metadata_value);
411
412 # Add the metadata value into the list for this combination of metadata group and section
413 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $metadata_value_unicode_string);
414 }
415 last if (@metadata_values > 0);
416 }
417
418 $metadata_groups_done{$metadata_group} = 1;
419 }
420 }
421}
422
423
424sub get_classify_info
425{
426 my $self = shift(@_);
427
428 # The metadata groups to classify by
429 my @metadata_groups = @{$self->{'metadata_groups'}};
430 my $first_metadata_group = $metadata_groups[0];
431
432 # The OID values of the documents to include in the classifier
433 my @OIDs = @{$self->{'OIDs'}};
434
435 # Create the root node of the classification hierarchy
436 my %classifier_node = ( 'thistype' => "Invisible",
437 'childtype' => $self->{$first_metadata_group . ".list_type"},
438 'Title' => $self->{'buttonname'},
439 'contains' => [],
440 'mdtype' => $first_metadata_group );
441
442 # Recursively create the classification hierarchy, one level for each metadata group
443 $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
444 return \%classifier_node;
445}
446
447
448sub add_level
449{
450 my $self = shift(@_);
451 my @metadata_groups = @{shift(@_)};
452 my @OIDs = @{shift(@_)};
453 my $classifier_node = shift(@_);
454
455 my $metadata_group = $metadata_groups[0];
456
457 if (!defined($self->{$metadata_group . ".list"})) {
458 print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
459 return;
460 }
461
462 # Create a mapping from metadata value to OID
463 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
464 my %metadata_value_to_OIDs_hash = ();
465 foreach my $OID (@OIDs)
466 {
467 if ($OID_to_metadata_values_hash_ref->{$OID})
468 {
469 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
470 foreach my $metadata_value (@metadata_values)
471 {
472 push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
473 }
474 }
475 }
476 #print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
477
478 # Partition the values (if necessary)
479 my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
480 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
481 if ($partition_type_within_level =~ /^per_letter$/i) {
482 # Generate one hlist for each letter
483 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
484 my %metadata_value_to_OIDs_subhash = ();
485
486 my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
487 foreach my $metadata_value (@sortedmetadata_values) {
488 my $metadata_valuepartition = substr($metadata_value, 0, 1);
489
490 # Is this the start of a new partition?
491 if ($metadata_valuepartition ne $lastpartition) {
492 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
493 %metadata_value_to_OIDs_subhash = ();
494 $lastpartition = $metadata_valuepartition;
495 }
496
497 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
498 }
499
500 # Don't forget to add the last partition
501 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
502
503 # The partitions are stored in an HList
504 $classifier_node->{'childtype'} = "HList";
505 }
506 elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
507 # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size
508 # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible
509 # If a letter has more items than specified, split into several hlists.
510 # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
511 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
512 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
513 my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};
514
515 # Separate values by their first letter, each form a bucket, like the per_letter partition type
516 my $last_partition = substr($sortedmetadata_values[0], 0, 1);
517 my %partition_buckets = ();
518 my @metadata_values_in_bucket = ();
519 my $num_items_in_bucket = 0;
520 foreach my $metadata_value (@sortedmetadata_values) {
521 my $metadata_valuepartition = substr($metadata_value, 0, 1);
522 if ($metadata_valuepartition ne $last_partition) {
523 my @temp_array = @metadata_values_in_bucket;
524 # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
525 my %partition_info = ();
526 $partition_info{'metadata_values'} = \@temp_array;
527 $partition_info{'size'} = $num_items_in_bucket;
528 $partition_buckets{$last_partition} = \%partition_info;
529
530 @metadata_values_in_bucket = ($metadata_value);
531 $num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
532 $last_partition = $metadata_valuepartition;
533 } else {
534 $num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
535 push (@metadata_values_in_bucket, $metadata_value);
536 }
537 }
538 # Last one
539 my %partition_info = ();
540 $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
541 $partition_info{'size'} = $num_items_in_bucket;
542 $partition_buckets{$last_partition} = \%partition_info;
543
544 my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
545 for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) {
546 my $partition = $partition_keys[$i];
547 my $items_in_partition = $partition_buckets{$partition}->{'size'};
548 # Merge small buckets together, but keep the numeric bucket apart
549 if ($items_in_partition < $partition_size_within_level) {
550 my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'};
551 if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level
552 && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)
553 || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) {
554 foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) {
555 push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge);
556 }
557 $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition;
558 delete $partition_buckets{$partition};
559 }
560 }
561 }
562 @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
563
564 # Add partitions, and divide big bucket into several
565 my $last_partition_end = "";
566 my $partition_start = "";
567 foreach my $partition (@partition_keys) {
568 my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}});
569 my $items_in_partition = $partition_buckets{$partition}->{'size'};
570 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"});
571
572 if ($items_in_partition > $partition_size_within_level) {
573 my $items_done = 0;
574 my %metadata_values_to_OIDs_subhashes = ();
575 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
576 my $metadata_value = $metadata_values[$i];
577 # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
578 my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;
579
580 my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"});
581 my $partitionname = $partition_start;
582 if ($partitionend ne $partition_start) {
583 $partitionname = $partitionname . "-" . $partitionend;
584 }
585
586 # Start a new partition
587 if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) {
588 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
589 $last_partition_end = $partitionend;
590 $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
591 $items_done = 0;
592 %metadata_values_to_OIDs_subhashes = ();
593 }
594
595 # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
596 if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) {
597 my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
598 # Get the number of partitions needed for this value
599 my $num_splits = int($items_for_this_md_value / $partition_size_within_level);
600 $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits);
601
602 my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}};
603 for (my $i = 0; $i < $num_splits; $i++) {
604 my %OIDs_subhashes_for_this_value = ();
605 my @OIDs_for_this_partition = ();
606 for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) {
607 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
608 }
609
610 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
611 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) {
612 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
613 $items_done += scalar(@OIDs_for_this_partition);
614 next;
615 }
616
617 # Add an HList for this bucket
618 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
619 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
620 $last_partition_end = $partitionname_for_this_value;
621 }
622 next;
623 }
624
625 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
626 $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;
627
628 # The last partition
629 if($i == scalar(@metadata_values) - 1) {
630 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
631 }
632 }
633 }
634 else {
635 # The easier case, just add a partition
636 my %metadata_values_to_OIDs_subhashes = ();
637 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
638 my $metadata_value = $metadata_values[$i];
639 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
640 }
641 my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
642 my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"});
643 my $partitionname = $partition_start;
644 if ($partitionend ne $partition_start) {
645 $partitionname = $partitionname . "-" . $partitionend;
646 }
647 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
648 $last_partition_end = $partitionend;
649 }
650 }
651
652 # The partitions are stored in an HList
653 $classifier_node->{'childtype'} = "HList";
654
655 } # end approximate_size
656 else {
657 # Generate hlists of a certain size
658 if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
659 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
660 my $itemsdone = 0;
661 my %metadata_value_to_OIDs_subhash = ();
662 my $lastpartitionend = "";
663 my $partitionstart;
664 foreach my $metadata_value (@sortedmetadata_values) {
665 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
666 $itemsdone++;
667 my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
668
669 # Is this the start of a new partition?
670 if ($itemsinpartition == 1) {
671 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
672 }
673
674 # Is this the end of the partition?
675 if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) {
676 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
677 my $partitionname = $partitionstart;
678 if ($partitionend ne $partitionstart) {
679 $partitionname = $partitionname . "-" . $partitionend;
680 }
681
682 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
683 %metadata_value_to_OIDs_subhash = ();
684 $lastpartitionend = $partitionend;
685 }
686 }
687
688 # The partitions are stored in an HList
689 $classifier_node->{'childtype'} = "HList";
690 }
691
692 # Otherwise just add all the values to a VList
693 else {
694 $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
695 }
696 }
697}
698
699
700sub generate_partition_start
701{
702 my $self = shift(@_);
703 my $metadata_value = shift(@_);
704 my $lastpartitionend = shift(@_);
705 my $partition_name_length = shift(@_);
706
707 if ($partition_name_length) {
708 return substr($metadata_value, 0, $partition_name_length);
709 }
710
711 my $partitionstart = substr($metadata_value, 0, 1);
712 if ($partitionstart le $lastpartitionend) {
713 $partitionstart = substr($metadata_value, 0, 2);
714 # Give up after three characters
715 if ($partitionstart le $lastpartitionend) {
716 $partitionstart = substr($metadata_value, 0, 3);
717 }
718 }
719
720 return $partitionstart;
721}
722
723
724sub generate_partition_end
725{
726 my $self = shift(@_);
727 my $metadata_value = shift(@_);
728 my $partitionstart = shift(@_);
729 my $partition_name_length = shift(@_);
730
731 if ($partition_name_length) {
732 return substr($metadata_value, 0, $partition_name_length);
733 }
734
735 my $partitionend = substr($metadata_value, 0, length($partitionstart));
736 if ($partitionend gt $partitionstart) {
737 $partitionend = substr($metadata_value, 0, 1);
738 if ($partitionend le $partitionstart) {
739 $partitionend = substr($metadata_value, 0, 2);
740 # Give up after three characters
741 if ($partitionend le $partitionstart) {
742 $partitionend = substr($metadata_value, 0, 3);
743 }
744 }
745 }
746
747 return $partitionend;
748}
749
750
751sub add_hlist_partition
752{
753 my $self = shift(@_);
754 my @metadata_groups = @{shift(@_)};
755 my $classifier_node = shift(@_);
756 my $partitionname = shift(@_);
757 my $metadata_value_to_OIDs_hash_ref = shift(@_);
758
759 # Create an hlist partition
760 # Note that we don't need to convert from unicode-aware strings
761 # to utf8 here, as that is handled elsewhere in the code
762 my %child_classifier_node = ( 'Title' => $partitionname, #'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
763 'childtype' => "VList",
764 'contains' => [] );
765
766 # Add the children to the hlist partition
767 $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
768 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
769}
770
771
772sub add_vlist
773{
774 my $self = shift(@_);
775 my @metadata_groups = @{shift(@_)};
776 my $classifier_node = shift(@_);
777 my $metadata_value_to_OIDs_hash_ref = shift(@_);
778
779 my $metadata_group = shift(@metadata_groups);
780 $classifier_node->{'mdtype'} = $metadata_group;
781
782 # Create an entry in the vlist for each value
783 foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
784 {
785 my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
786 # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
787 if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
788 my $OID = $OIDs[0];
789 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
790 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
791 }
792 # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
793 elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
794 @OIDs = $self->sort_leaf_items(\@OIDs);
795 foreach my $OID (@OIDs) {
796 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
797 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
798 }
799
800 }
801 # Otherwise create a sublist (bookshelf) for the metadata value
802 else {
803 # Note that we don't need to convert from unicode-aware strings
804 # to utf8 here, as that is handled elsewhere in the code
805 my %child_classifier_node = ( 'Title' => $metadata_value, # 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
806 'childtype' => "VList",
807 'mdtype' => $metadata_group,
808 'contains' => [] );
809
810 #@OIDs = $self->sort_leaf_items(\@OIDs);
811 # If there are metadata elements remaining, recursively apply the process
812 if (@metadata_groups > 0) {
813 my $next_metadata_group = $metadata_groups[0];
814 $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
815
816 # separate metadata into those that below in the next/sub-metadata_group
817 # and those that below at the current level's metadata_group
818
819 my $OID_to_metadata_values_hash_ref = $self->{$next_metadata_group . ".list"};
820 my @current_level_OIDs = ();
821 my @next_level_OIDs = ();
822 foreach my $OID (@OIDs)
823 {
824 if ($OID_to_metadata_values_hash_ref->{$OID}) {
825 push(@next_level_OIDs, $OID);
826 } else {
827 push(@current_level_OIDs, $OID);
828 }
829 }
830 # recursively process those docs belonging to the sub-metadata_group
831 $self->add_level(\@metadata_groups, \@next_level_OIDs, \%child_classifier_node);
832
833 # For those docs that don't belong in the sub/next_metadata_group, but which belong
834 # at this level, just add the documents as children of this list at the current level
835 @current_level_OIDs = $self->sort_leaf_items(\@current_level_OIDs);
836 foreach my $current_level_OID (@current_level_OIDs) {
837 my $offset = $self->metadata_offset($metadata_group, $current_level_OID, $metadata_value);
838 push(@{$child_classifier_node{'contains'}}, { 'OID' => $current_level_OID , 'offset' => $offset });
839 }
840 }
841 # Otherwise just add the documents as children of this list
842 else {
843 @OIDs = $self->sort_leaf_items(\@OIDs);
844 foreach my $OID (@OIDs) {
845 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
846 push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
847 }
848
849 }
850
851 # Add the sublist to the list
852 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
853 }
854 }
855}
856
857sub metadata_offset
858{
859 my $self = shift(@_);
860 my $metadata_group = shift(@_);
861 my $OID = shift(@_);
862 my $metadata_value = shift(@_);
863
864 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
865 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
866 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
867 if ($metadata_value eq $metadata_values[$i]) {
868 return $i;
869 }
870 }
871
872 return 0;
873}
874
875sub sort_leaf_items
876{
877 my $self = shift(@_);
878 my @OIDs = @{shift(@_)};
879# my $classifier_node = shift(@_);
880
881 # Sort leaf nodes and add to list
882 my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
883 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) {
884 my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
885 # Force a stable sort (Perl 5.6's sort isn't stable)
886 # !! The [0] bits aren't ideal (multiple metadata values) !!
887 @OIDs = @OIDs[ sort {
888 if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]} && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]})))
889 {
890 $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
891 }
892 else
893 {
894 $a <=> $b;
895 }
896 } 0..$#OIDs ];
897 }
898 if ($self->{'reverse_sort_leaf_nodes'}) {
899 #print STDERR "reversing\n";
900 return reverse @OIDs;
901 }
902 return @OIDs;
903}
904
905
906
907sub sort_metadata_values_array
908{
909 my $self = shift(@_);
910 my @metadata_values = @_;
911
912 if ($self->{'unicode_collator'}) {
913 return $self->{'unicode_collator'}->sort(@metadata_values);
914 }
915 else {
916 return sort(@metadata_values);
917 }
918}
919
920
9211;
Note: See TracBrowser for help on using the repository browser.