source: main/trunk/greenstone2/perllib/classify/List.pm@ 23302

Last change on this file since 23302 was 23302, checked in by kjdon, 14 years ago

added reverse_sort_leaf_nodes option

  • Property svn:keywords set to Author Date Id Revision
File size: 35.0 KB
Line 
1###########################################################################
2#
3# List.pm -- A general and flexible list classifier with most of
4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
29# TO DO: - Remove punctuation from metadata values before sorting.
30# - Add an AZCompactList-style hlist option?
31#
32###########################################################################
33
34package List;
35
36
37use BaseClassifier;
38
39use strict;
40
41
42sub BEGIN {
43 @List::ISA = ('BaseClassifier');
44}
45
46my $partition_type_list =
47 [ { 'name' => "per_letter",
48 'desc' => "{List.level_partition.per_letter}" },
49 { 'name' => "approximate_size",
50 'desc' => "{List.level_partition.approximate_size}"},
51 { 'name' => "constant_size",
52 'desc' => "{List.level_partition.constant_size}" },
53 { 'name' => "none",
54 'desc' => "{List.level_partition.none}" } ];
55
56# following used to check types later on
57my $valid_partition_types = { 'per_letter' => 1,
58 'constant_size' => 1,
59 'per_letter_fixed_size' => 1,
60 'approximate_size' => 1,
61 'none' => 1};
62
63my $bookshelf_type_list =
64 [ { 'name' => "always",
65 'desc' => "{List.bookshelf_type.always}" },
66 { 'name' => "duplicate_only",
67 'desc' => "{List.bookshelf_type.duplicate_only}" },
68 { 'name' => "never",
69 'desc' => "{List.bookshelf_type.never}" } ];
70
71my $arguments =
72 [ { 'name' => "metadata",
73 'desc' => "{List.metadata}",
74 'type' => "metadata",
75 'reqd' => "yes" },
76
77 # The interesting options
78 { 'name' => "bookshelf_type",
79 'desc' => "{List.bookshelf_type}",
80 'type' => "enum",
81 'list' => $bookshelf_type_list,
82 'deft' => "never" },
83 { 'name' => "classify_sections",
84 'desc' => "{List.classify_sections}",
85 'type' => "flag" },
86 { 'name' => "partition_type_within_level",
87 'desc' => "{List.partition_type_within_level}",
88 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
89 'list' => $partition_type_list,
90 'deft' => "per_letter" },
91 { 'name' => "partition_size_within_level",
92 'desc' => "{List.partition_size_within_level}",
93 'type' => "string" }, # Must be string because multiple values can be specified (separated by '/')
94 { 'name' => "partition_name_length",
95 'desc' => "{List.partition_name_length}",
96 'type' => "string" },
97 { 'name' => "sort_leaf_nodes_using",
98 'desc' => "{List.sort_leaf_nodes_using}",
99 'type' => "metadata",
100 'deft' => "Title" },
101 { 'name' => "reverse_sort_leaf_nodes",
102 'desc' => "{List.reverse_sort_leaf_nodes}",
103 'type' => "flag"},
104 { 'name' => "sort_using_unicode_collation",
105 'desc' => "{List.sort_using_unicode_collation}",
106 'type' => "flag" },
107 { 'name' => "use_hlist_for",
108 'desc' => "{List.use_hlist_for}",
109 'type' => "string" },
110 { 'name' => "removeprefix",
111 'desc' => "{BasClas.removeprefix}",
112 'type' => "regexp" },
113 { 'name' => "removesuffix",
114 'desc' => "{BasClas.removesuffix}",
115 'type' => "regexp" } ];
116
117my $options = { 'name' => "List",
118 'desc' => "{List.desc}",
119 'abstract' => "no",
120 'inherits' => "yes",
121 'args' => $arguments };
122
123
124sub new
125{
126 my ($class) = shift(@_);
127 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
128 push(@$classifierslist, $class);
129
130 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
131 push(@{$hashArgOptLists->{"OptList"}}, $options);
132
133 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
134
135 if ($self->{'info_only'}) {
136 # don't worry about any options etc
137 return bless $self, $class;
138 }
139
140 # The metadata elements to use (required)
141 if (!$self->{'metadata'}) {
142 die "Error: No metadata fields specified for List.\n";
143 }
144 my @metadata_groups = split(/\//, $self->{'metadata'});
145 $self->{'metadata_groups'} = \@metadata_groups;
146
147 # The classifier button name (default: the first metadata element specified)
148 if (!$self->{'buttonname'}) {
149 my $first_metadata_group = $metadata_groups[0];
150 my $first_metadata_element = (split(/\;|,/, $first_metadata_group))[0];
151 $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
152 }
153
154 # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
155 foreach my $metadata_group (@metadata_groups) {
156 $self->{$metadata_group . ".bookshelf_type"} = "always";
157 }
158 my $last_metadata_group = $metadata_groups[$#metadata_groups];
159 # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
160 $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
161
162 # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
163 foreach my $metadata_group (@metadata_groups) {
164 $self->{$metadata_group . ".list_type"} = "VList";
165 }
166 foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
167 $self->{$metadata_group . ".list_type"} = "HList";
168 }
169
170 # How the items are grouped into partitions (default: no partition)
171 # for each level (metadata group), separated by '/'
172 if (!$self->{"partition_type_within_level"}) {
173 foreach my $metadata_group (@metadata_groups) {
174 $self->{$metadata_group . ".partition_type_within_level"} = "none";
175 }
176 } else {
177 my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'});
178
179 my $first = 1;
180 foreach my $metadata_group (@metadata_groups) {
181 my $partition_type_within_levelelem = shift(@partition_type_within_levellist);
182 if (defined($partition_type_within_levelelem) && $partition_type_within_levelelem eq "per_letter_fixed_size") {
183 print STDERR "per letter fixed size, changing to approximate size\n";
184 $partition_type_within_levelelem = "approximate_size";
185 }
186 if (defined($partition_type_within_levelelem) && defined $valid_partition_types->{$partition_type_within_levelelem}) {
187 $self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem;
188 }
189 else {
190 if ($first) {
191 $self->{$metadata_group . ".partition_type_within_level"} = "none";
192 $first = 0;
193 } else {
194 $self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"};
195 }
196 if (defined($partition_type_within_levelelem)) {
197 # ie invalid entry
198 print STDERR "invalid partition type for level $metadata_group: $partition_type_within_levelelem, defaulting to ". $self->{$metadata_group . ".partition_type_within_level"} ."\n";
199 }
200 }
201 }
202 }
203
204 # The number of items in each partition
205 if (!$self->{'partition_size_within_level'}) {
206 # Default: 20
207 foreach my $metadata_group (@metadata_groups) {
208 $self->{$metadata_group . ".partition_size_within_level"} = 20;
209 }
210 }
211 else {
212 my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
213
214 # Assign values based on the partition_size_within_level parameter
215 foreach my $metadata_group (@metadata_groups) {
216 my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
217 if (defined($partition_size_within_levelelem)) {
218 $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
219 }
220 else {
221 $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
222 }
223 }
224 }
225
226 # The removeprefix and removesuffix expressions
227 if ($self->{'removeprefix'}) {
228 # If there are more than one expressions, use '' to quote each experession and '/' to separate
229 my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
230
231 foreach my $metadata_group (@metadata_groups) {
232 my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
233 if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
234 # Remove the other ' at the beginning and the end if there is any
235 $removeprefix_expr_within_levelelem =~ s/^'//;
236 $removeprefix_expr_within_levelelem =~ s/'$//;
237 # Remove the extra ^ at the beginning
238 $removeprefix_expr_within_levelelem =~ s/^\^//;
239 $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
240 } else {
241 $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
242 }
243 }
244 }
245 if ($self->{'removesuffix'}) {
246 my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
247
248 foreach my $metadata_group (@metadata_groups) {
249 my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
250 if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
251 $removesuffix_expr_within_levelelem =~ s/^'//;
252 $removesuffix_expr_within_levelelem =~ s/'$//;
253 # Remove the extra $ at the end
254 $removesuffix_expr_within_levelelem =~ s/\$$//;
255 $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
256 } else {
257 $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
258 }
259 }
260 }
261
262 # The metadata elements to use to sort the leaf nodes (default: Title)
263 my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
264 if ($self->{'sort_leaf_nodes_using'}) {
265 @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
266 }
267 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
268
269 # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
270 if ($self->{'sort_using_unicode_collation'}) {
271 # To use this you first need to download the allkeys.txt file from
272 # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
273 # Unicode/Collate directory.
274 require Unicode::Collate;
275 $self->{'unicode_collator'} = Unicode::Collate->new();
276 }
277
278 # An empty array for the document/section OIDs that we are classifying
279 $self->{'OIDs'} = [];
280 # A hash for all the doc ids that we have seen, so we don't classify something twice
281 $self->{'all_doc_OIDs'} = {};
282 return bless $self, $class;
283}
284
285
286sub init
287{
288 # Nothing to do...
289}
290
291
292# Called for each document in the collection
293sub classify
294{
295 my $self = shift(@_);
296 my ($doc_obj) = @_;
297
298 if (defined $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()}) {
299 print STDERR "Warning, List classifier has already seen document ".$doc_obj->get_OID().", not classifying again\n";
300 return;
301 }
302 $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()} = 1;
303 # If "-classify_sections" is set, classify every section of the document
304 if ($self->{'classify_sections'}) {
305 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
306 while (defined $section) {
307 $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section);
308 $section = $doc_obj->get_next_section($section);
309 }
310 }
311 # Otherwise just classify the top document section
312 else {
313 $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section());
314 }
315
316}
317
318sub classify_section
319{
320 my $self = shift(@_);
321 my ($doc_obj,$section_OID,$section) = @_;
322
323 my @metadata_groups = @{$self->{'metadata_groups'}};
324
325 # Only classify the section if it has a value for one of the metadata elements in the first group
326 my $classify_section = 0;
327 my $first_metadata_group = $metadata_groups[0];
328 my $remove_prefix_expr = $self->{$first_metadata_group . ".remove_prefix_expr"};
329 my $remove_suffix_expr = $self->{$first_metadata_group . ".remove_suffix_expr"};
330 foreach my $first_metadata_group_element (split(/\;|,/, $first_metadata_group)) {
331 my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
332 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element);
333
334 # Remove prefix/suffix if requested
335 if (defined ($first_metadata_group_element_value)) {
336 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
337 $first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
338 }
339
340 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
341 $first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
342 }
343 }
344 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
345 # This section must be included in the classifier
346 $classify_section = 1;
347 last;
348 }
349 }
350
351 # We're not classifying this section because it doesn't have the required metadata
352 return if (!$classify_section);
353
354 # Otherwise, include this section in the classifier
355
356 push(@{$self->{'OIDs'}}, $section_OID);
357
358 # Create a hash for the metadata values of each metadata element we're interested in
359 my %metadata_groups_done = ();
360 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
361 # Take care not to do a metadata group more than once
362 unless ($metadata_groups_done{$metadata_group}) {
363 my $remove_prefix_expr = $self->{$metadata_group . ".remove_prefix_expr"};
364 my $remove_suffix_expr = $self->{$metadata_group . ".remove_suffix_expr"};
365 foreach my $metadata_element (split(/\;|,/, $metadata_group)) {
366 my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);
367
368 my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
369 foreach my $metadata_value (@metadata_values) {
370 # Strip leading and trailing whitespace
371 $metadata_value =~ s/^\s*//;
372 $metadata_value =~ s/\s*$//;
373
374 # Remove prefix/suffix if requested
375 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
376 $metadata_value =~ s/^$remove_prefix_expr//;
377 }
378 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
379 $metadata_value =~ s/$remove_suffix_expr$//;
380 }
381
382 # uppercase the metadata - makes the AZList nicer
383 $metadata_value = uc($metadata_value);
384 # Convert the metadata value from a UTF-8 string to a Unicode string
385 # This means that length() and substr() work properly
386 # We need to be careful to convert classifier node title values back to UTF-8, however
387 my $metadata_value_unicode_string = $self->convert_utf8_string_to_unicode_string($metadata_value);
388
389 # Add the metadata value into the list for this combination of metadata group and section
390 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $metadata_value_unicode_string);
391 }
392 last if (@metadata_values > 0);
393 }
394
395 $metadata_groups_done{$metadata_group} = 1;
396 }
397 }
398}
399
400
401sub get_classify_info
402{
403 my $self = shift(@_);
404
405 # The metadata groups to classify by
406 my @metadata_groups = @{$self->{'metadata_groups'}};
407 my $first_metadata_group = $metadata_groups[0];
408
409 # The OID values of the documents to include in the classifier
410 my @OIDs = @{$self->{'OIDs'}};
411
412 # Create the root node of the classification hierarchy
413 my %classifier_node = ( 'thistype' => "Invisible",
414 'childtype' => $self->{$first_metadata_group . ".list_type"},
415 'Title' => $self->{'buttonname'},
416 'contains' => [],
417 'mdtype' => $first_metadata_group );
418
419 # Recursively create the classification hierarchy, one level for each metadata group
420 $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
421 return \%classifier_node;
422}
423
424
425sub add_level
426{
427 my $self = shift(@_);
428 my @metadata_groups = @{shift(@_)};
429 my @OIDs = @{shift(@_)};
430 my $classifier_node = shift(@_);
431
432 my $metadata_group = $metadata_groups[0];
433
434 if (!defined($self->{$metadata_group . ".list"})) {
435 print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
436 return;
437 }
438
439 # Create a mapping from metadata value to OID
440 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
441 my %metadata_value_to_OIDs_hash = ();
442 foreach my $OID (@OIDs)
443 {
444 if ($OID_to_metadata_values_hash_ref->{$OID})
445 {
446 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
447 foreach my $metadata_value (@metadata_values)
448 {
449 push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
450 }
451 }
452 }
453 #print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
454
455 # Partition the values (if necessary)
456 my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
457 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
458 if ($partition_type_within_level =~ /^per_letter$/i) {
459 # Generate one hlist for each letter
460 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
461 my %metadata_value_to_OIDs_subhash = ();
462
463 my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
464 foreach my $metadata_value (@sortedmetadata_values) {
465 my $metadata_valuepartition = substr($metadata_value, 0, 1);
466
467 # Is this the start of a new partition?
468 if ($metadata_valuepartition ne $lastpartition) {
469 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
470 %metadata_value_to_OIDs_subhash = ();
471 $lastpartition = $metadata_valuepartition;
472 }
473
474 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
475 }
476
477 # Don't forget to add the last partition
478 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
479
480 # The partitions are stored in an HList
481 $classifier_node->{'childtype'} = "HList";
482 }
483 elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
484 # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size
485 # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible
486 # If a letter has more items than specified, split into several hlists.
487 # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
488 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
489 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
490 my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};
491
492 # Separate values by their first letter, each form a bucket, like the per_letter partition type
493 my $last_partition = substr($sortedmetadata_values[0], 0, 1);
494 my %partition_buckets = ();
495 my @metadata_values_in_bucket = ();
496 my $num_items_in_bucket = 0;
497 foreach my $metadata_value (@sortedmetadata_values) {
498 my $metadata_valuepartition = substr($metadata_value, 0, 1);
499 if ($metadata_valuepartition ne $last_partition) {
500 my @temp_array = @metadata_values_in_bucket;
501 # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
502 my %partition_info = ();
503 $partition_info{'metadata_values'} = \@temp_array;
504 $partition_info{'size'} = $num_items_in_bucket;
505 $partition_buckets{$last_partition} = \%partition_info;
506
507 @metadata_values_in_bucket = ($metadata_value);
508 $num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
509 $last_partition = $metadata_valuepartition;
510 } else {
511 $num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
512 push (@metadata_values_in_bucket, $metadata_value);
513 }
514 }
515 # Last one
516 my %partition_info = ();
517 $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
518 $partition_info{'size'} = $num_items_in_bucket;
519 $partition_buckets{$last_partition} = \%partition_info;
520
521 my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
522 for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) {
523 my $partition = $partition_keys[$i];
524 my $items_in_partition = $partition_buckets{$partition}->{'size'};
525 # Merge small buckets together, but keep the numeric bucket apart
526 if ($items_in_partition < $partition_size_within_level) {
527 my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'};
528 if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level
529 && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)
530 || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) {
531 foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) {
532 push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge);
533 }
534 $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition;
535 delete $partition_buckets{$partition};
536 }
537 }
538 }
539 @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
540
541 # Add partitions, and divide big bucket into several
542 my $last_partition_end = "";
543 my $partition_start = "";
544 foreach my $partition (@partition_keys) {
545 my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}});
546 my $items_in_partition = $partition_buckets{$partition}->{'size'};
547 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"});
548
549 if ($items_in_partition > $partition_size_within_level) {
550 my $items_done = 0;
551 my %metadata_values_to_OIDs_subhashes = ();
552 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
553 my $metadata_value = $metadata_values[$i];
554 # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
555 my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;
556
557 my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"});
558 my $partitionname = $partition_start;
559 if ($partitionend ne $partition_start) {
560 $partitionname = $partitionname . "-" . $partitionend;
561 }
562
563 # Start a new partition
564 if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) {
565 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
566 $last_partition_end = $partitionend;
567 $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
568 $items_done = 0;
569 %metadata_values_to_OIDs_subhashes = ();
570 }
571
572 # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
573 if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) {
574 my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
575 # Get the number of partitions needed for this value
576 my $num_splits = int($items_for_this_md_value / $partition_size_within_level);
577 $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits);
578
579 my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}};
580 for (my $i = 0; $i < $num_splits; $i++) {
581 my %OIDs_subhashes_for_this_value = ();
582 my @OIDs_for_this_partition = ();
583 for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) {
584 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
585 }
586
587 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
588 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) {
589 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
590 $items_done += scalar(@OIDs_for_this_partition);
591 next;
592 }
593
594 # Add an HList for this bucket
595 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
596 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
597 $last_partition_end = $partitionname_for_this_value;
598 }
599 next;
600 }
601
602 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
603 $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;
604
605 # The last partition
606 if($i == scalar(@metadata_values) - 1) {
607 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
608 }
609 }
610 }
611 else {
612 # The easier case, just add a partition
613 my %metadata_values_to_OIDs_subhashes = ();
614 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
615 my $metadata_value = $metadata_values[$i];
616 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
617 }
618 my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
619 my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"});
620 my $partitionname = $partition_start;
621 if ($partitionend ne $partition_start) {
622 $partitionname = $partitionname . "-" . $partitionend;
623 }
624 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
625 $last_partition_end = $partitionend;
626 }
627 }
628
629 # The partitions are stored in an HList
630 $classifier_node->{'childtype'} = "HList";
631
632 } # end approximate_size
633 else {
634 # Generate hlists of a certain size
635 if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
636 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
637 my $itemsdone = 0;
638 my %metadata_value_to_OIDs_subhash = ();
639 my $lastpartitionend = "";
640 my $partitionstart;
641 foreach my $metadata_value (@sortedmetadata_values) {
642 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
643 $itemsdone++;
644 my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
645
646 # Is this the start of a new partition?
647 if ($itemsinpartition == 1) {
648 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
649 }
650
651 # Is this the end of the partition?
652 if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) {
653 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
654 my $partitionname = $partitionstart;
655 if ($partitionend ne $partitionstart) {
656 $partitionname = $partitionname . "-" . $partitionend;
657 }
658
659 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
660 %metadata_value_to_OIDs_subhash = ();
661 $lastpartitionend = $partitionend;
662 }
663 }
664
665 # The partitions are stored in an HList
666 $classifier_node->{'childtype'} = "HList";
667 }
668
669 # Otherwise just add all the values to a VList
670 else {
671 $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
672 }
673 }
674}
675
676
677sub convert_utf8_string_to_unicode_string
678{
679 my $self = shift(@_);
680 my $utf8_string = shift(@_);
681
682 my $unicode_string = "";
683 foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
684 $unicode_string .= chr($unicode_value);
685 }
686 return $unicode_string;
687}
688
689
690sub convert_unicode_string_to_utf8_string
691{
692 my $self = shift(@_);
693 my $unicode_string = shift(@_);
694
695 my @unicode_array;
696 for (my $i = 0; $i < length($unicode_string); $i++) {
697 push(@unicode_array, ord(substr($unicode_string, $i, 1)));
698 }
699 return &unicode::unicode2utf8(\@unicode_array);
700}
701
702
703sub generate_partition_start
704{
705 my $self = shift(@_);
706 my $metadata_value = shift(@_);
707 my $lastpartitionend = shift(@_);
708 my $partition_name_length = shift(@_);
709
710 if ($partition_name_length) {
711 return substr($metadata_value, 0, $partition_name_length);
712 }
713
714 my $partitionstart = substr($metadata_value, 0, 1);
715 if ($partitionstart le $lastpartitionend) {
716 $partitionstart = substr($metadata_value, 0, 2);
717 # Give up after three characters
718 if ($partitionstart le $lastpartitionend) {
719 $partitionstart = substr($metadata_value, 0, 3);
720 }
721 }
722
723 return $partitionstart;
724}
725
726
727sub generate_partition_end
728{
729 my $self = shift(@_);
730 my $metadata_value = shift(@_);
731 my $partitionstart = shift(@_);
732 my $partition_name_length = shift(@_);
733
734 if ($partition_name_length) {
735 return substr($metadata_value, 0, $partition_name_length);
736 }
737
738 my $partitionend = substr($metadata_value, 0, length($partitionstart));
739 if ($partitionend gt $partitionstart) {
740 $partitionend = substr($metadata_value, 0, 1);
741 if ($partitionend le $partitionstart) {
742 $partitionend = substr($metadata_value, 0, 2);
743 # Give up after three characters
744 if ($partitionend le $partitionstart) {
745 $partitionend = substr($metadata_value, 0, 3);
746 }
747 }
748 }
749
750 return $partitionend;
751}
752
753
754sub add_hlist_partition
755{
756 my $self = shift(@_);
757 my @metadata_groups = @{shift(@_)};
758 my $classifier_node = shift(@_);
759 my $partitionname = shift(@_);
760 my $metadata_value_to_OIDs_hash_ref = shift(@_);
761
762 # Create an hlist partition
763 my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
764 'childtype' => "VList",
765 'contains' => [] );
766
767 # Add the children to the hlist partition
768 $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
769 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
770}
771
772
773sub add_vlist
774{
775 my $self = shift(@_);
776 my @metadata_groups = @{shift(@_)};
777 my $classifier_node = shift(@_);
778 my $metadata_value_to_OIDs_hash_ref = shift(@_);
779
780 my $metadata_group = shift(@metadata_groups);
781 $classifier_node->{'mdtype'} = $metadata_group;
782
783 # Create an entry in the vlist for each value
784 foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
785 {
786 my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
787 # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
788 if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
789 my $OID = $OIDs[0];
790 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
791 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
792 }
793 # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
794 elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
795 @OIDs = $self->sort_leaf_items(\@OIDs);
796 foreach my $OID (@OIDs) {
797 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
798 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
799 }
800
801 }
802 # Otherwise create a sublist (bookshelf) for the metadata value
803 else {
804 my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
805 'childtype' => "VList",
806 'mdtype' => $metadata_group,
807 'contains' => [] );
808
809 # If there are metadata elements remaining, recursively apply the process
810 if (@metadata_groups > 0) {
811 my $next_metadata_group = $metadata_groups[0];
812 $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
813 $self->add_level(\@metadata_groups, \@OIDs, \%child_classifier_node);
814 }
815 # Otherwise just add the documents as children of this list
816 else {
817 @OIDs = $self->sort_leaf_items(\@OIDs);
818 foreach my $OID (@OIDs) {
819 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
820 push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
821 }
822
823 }
824
825 # Add the sublist to the list
826 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
827 }
828 }
829}
830
831sub metadata_offset
832{
833 my $self = shift(@_);
834 my $metadata_group = shift(@_);
835 my $OID = shift(@_);
836 my $metadata_value = shift(@_);
837
838 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
839 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
840 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
841 if ($metadata_value eq $metadata_values[$i]) {
842 return $i;
843 }
844 }
845
846 return 0;
847}
848
849sub sort_leaf_items
850{
851 my $self = shift(@_);
852 my @OIDs = @{shift(@_)};
853# my $classifier_node = shift(@_);
854
855 # Sort leaf nodes and add to list
856 my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
857 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) {
858 my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
859 # Force a stable sort (Perl 5.6's sort isn't stable)
860 # !! The [0] bits aren't ideal (multiple metadata values) !!
861 @OIDs = @OIDs[ sort {
862 if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]} && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]})))
863 {
864 $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
865 }
866 else
867 {
868 $a <=> $b;
869 }
870 } 0..$#OIDs ];
871 }
872 if ($self->{'reverse_sort_leaf_nodes'}) {
873 print STDERR "reversing\n";
874 return reverse @OIDs;
875 }
876 return @OIDs;
877}
878
879
880
881sub sort_metadata_values_array
882{
883 my $self = shift(@_);
884 my @metadata_values = @_;
885
886 if ($self->{'unicode_collator'}) {
887 return $self->{'unicode_collator'}->sort(@metadata_values);
888 }
889 else {
890 return sort(@metadata_values);
891 }
892}
893
894
8951;
Note: See TracBrowser for help on using the repository browser.