source: main/trunk/greenstone2/perllib/classify/List.pm@ 22667

Last change on this file since 22667 was 22667, checked in by mdewsnip, 14 years ago

Added code to prevent uninitialized value errors when trying to sort on metadata fields that don't exist in the collection.

  • Property svn:keywords set to Author Date Id Revision
File size: 35.0 KB
Line 
1###########################################################################
2#
3# List.pm -- A general and flexible list classifier with most of
4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
29# TO DO: - Remove punctuation from metadata values before sorting.
30# - Add an AZCompactList-style hlist option?
31#
32###########################################################################
33
34package List;
35
36
37use BaseClassifier;
38
39use strict;
40
41
42sub BEGIN {
43 @List::ISA = ('BaseClassifier');
44}
45
46my $partition_type_list =
47 [ { 'name' => "per_letter",
48 'desc' => "{List.level_partition.per_letter}" },
49 { 'name' => "approximate_size",
50 'desc' => "{List.level_partition.approximate_size}"},
51 { 'name' => "constant_size",
52 'desc' => "{List.level_partition.constant_size}" },
53 { 'name' => "none",
54 'desc' => "{List.level_partition.none}" } ];
55
56# following used to check types later on
57my $valid_partition_types = { 'per_letter' => 1,
58 'constant_size' => 1,
59 'per_letter_fixed_size' => 1,
60 'approximate_size' => 1,
61 'none' => 1};
62
63my $bookshelf_type_list =
64 [ { 'name' => "always",
65 'desc' => "{List.bookshelf_type.always}" },
66 { 'name' => "duplicate_only",
67 'desc' => "{List.bookshelf_type.duplicate_only}" },
68 { 'name' => "never",
69 'desc' => "{List.bookshelf_type.never}" } ];
70
71my $arguments =
72 [ { 'name' => "metadata",
73 'desc' => "{List.metadata}",
74 'type' => "metadata",
75 'reqd' => "yes" },
76
77 # The interesting options
78 { 'name' => "bookshelf_type",
79 'desc' => "{List.bookshelf_type}",
80 'type' => "enum",
81 'list' => $bookshelf_type_list,
82 'deft' => "never" },
83 { 'name' => "classify_sections",
84 'desc' => "{List.classify_sections}",
85 'type' => "flag" },
86 { 'name' => "partition_type_within_level",
87 'desc' => "{List.partition_type_within_level}",
88 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
89 'list' => $partition_type_list,
90 'deft' => "per_letter" },
91 { 'name' => "partition_size_within_level",
92 'desc' => "{List.partition_size_within_level}",
93 'type' => "string" }, # Must be string because multiple values can be specified (separated by '/')
94 { 'name' => "partition_name_length",
95 'desc' => "{List.partition_name_length}",
96 'type' => "string" },
97 { 'name' => "sort_leaf_nodes_using",
98 'desc' => "{List.sort_leaf_nodes_using}",
99 'type' => "metadata",
100 'deft' => "Title" },
101 { 'name' => "sort_using_unicode_collation",
102 'desc' => "{List.sort_using_unicode_collation}",
103 'type' => "flag" },
104 { 'name' => "use_hlist_for",
105 'desc' => "{List.use_hlist_for}",
106 'type' => "string" },
107 { 'name' => "removeprefix",
108 'desc' => "{BasClas.removeprefix}",
109 'type' => "regexp" },
110 { 'name' => "removesuffix",
111 'desc' => "{BasClas.removesuffix}",
112 'type' => "regexp" } ];
113
114my $options = { 'name' => "List",
115 'desc' => "{List.desc}",
116 'abstract' => "no",
117 'inherits' => "yes",
118 'args' => $arguments };
119
120
121sub new
122{
123 my ($class) = shift(@_);
124 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
125 push(@$classifierslist, $class);
126
127 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
128 push(@{$hashArgOptLists->{"OptList"}}, $options);
129
130 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
131
132 if ($self->{'info_only'}) {
133 # don't worry about any options etc
134 return bless $self, $class;
135 }
136
137 # The metadata elements to use (required)
138 if (!$self->{'metadata'}) {
139 die "Error: No metadata fields specified for List.\n";
140 }
141 my @metadata_groups = split(/\//, $self->{'metadata'});
142 $self->{'metadata_groups'} = \@metadata_groups;
143
144 # The classifier button name (default: the first metadata element specified)
145 if (!$self->{'buttonname'}) {
146 my $first_metadata_group = $metadata_groups[0];
147 my $first_metadata_element = (split(/\;|,/, $first_metadata_group))[0];
148 $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
149 }
150
151 # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
152 foreach my $metadata_group (@metadata_groups) {
153 $self->{$metadata_group . ".bookshelf_type"} = "always";
154 }
155 my $last_metadata_group = $metadata_groups[$#metadata_groups];
156 # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
157 $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
158
159 # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
160 foreach my $metadata_group (@metadata_groups) {
161 $self->{$metadata_group . ".list_type"} = "VList";
162 }
163 foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
164 $self->{$metadata_group . ".list_type"} = "HList";
165 }
166
167 # How the items are grouped into partitions (default: no partition)
168 # for each level (metadata group), separated by '/'
169 if (!$self->{"partition_type_within_level"}) {
170 foreach my $metadata_group (@metadata_groups) {
171 $self->{$metadata_group . ".partition_type_within_level"} = "none";
172 }
173 } else {
174 my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'});
175
176 my $first = 1;
177 foreach my $metadata_group (@metadata_groups) {
178 my $partition_type_within_levelelem = shift(@partition_type_within_levellist);
179 if (defined($partition_type_within_levelelem) && $partition_type_within_levelelem eq "per_letter_fixed_size") {
180 print STDERR "per letter fixed size, changing to approximate size\n";
181 $partition_type_within_levelelem = "approximate_size";
182 }
183 if (defined($partition_type_within_levelelem) && defined $valid_partition_types->{$partition_type_within_levelelem}) {
184 $self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem;
185 }
186 else {
187 if ($first) {
188 $self->{$metadata_group . ".partition_type_within_level"} = "none";
189 $first = 0;
190 } else {
191 $self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"};
192 }
193 if (defined($partition_type_within_levelelem)) {
194 # ie invalid entry
195 print STDERR "invalid partition type for level $metadata_group: $partition_type_within_levelelem, defaulting to ". $self->{$metadata_group . ".partition_type_within_level"} ."\n";
196 }
197 }
198 }
199 }
200
201 # The number of items in each partition
202 if (!$self->{'partition_size_within_level'}) {
203 # Default: 20
204 foreach my $metadata_group (@metadata_groups) {
205 $self->{$metadata_group . ".partition_size_within_level"} = 20;
206 }
207 }
208 else {
209 my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
210
211 # Assign values based on the partition_size_within_level parameter
212 foreach my $metadata_group (@metadata_groups) {
213 my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
214 if (defined($partition_size_within_levelelem)) {
215 $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
216 }
217 else {
218 $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
219 }
220 }
221 }
222
223 # The removeprefix and removesuffix expressions
224 if ($self->{'removeprefix'}) {
225 # If there are more than one expressions, use '' to quote each experession and '/' to separate
226 my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
227
228 foreach my $metadata_group (@metadata_groups) {
229 my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
230 if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
231 # Remove the other ' at the beginning and the end if there is any
232 $removeprefix_expr_within_levelelem =~ s/^'//;
233 $removeprefix_expr_within_levelelem =~ s/'$//;
234 # Remove the extra ^ at the beginning
235 $removeprefix_expr_within_levelelem =~ s/^\^//;
236 $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
237 } else {
238 $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
239 }
240 }
241 }
242 if ($self->{'removesuffix'}) {
243 my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
244
245 foreach my $metadata_group (@metadata_groups) {
246 my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
247 if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
248 $removesuffix_expr_within_levelelem =~ s/^'//;
249 $removesuffix_expr_within_levelelem =~ s/'$//;
250 # Remove the extra $ at the end
251 $removesuffix_expr_within_levelelem =~ s/\$$//;
252 $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
253 } else {
254 $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
255 }
256 }
257 }
258
259 # The metadata elements to use to sort the leaf nodes (default: Title)
260 my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
261 if ($self->{'sort_leaf_nodes_using'}) {
262 @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
263 }
264 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
265
266 # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
267 if ($self->{'sort_using_unicode_collation'}) {
268 # To use this you first need to download the allkeys.txt file from
269 # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
270 # Unicode/Collate directory.
271 require Unicode::Collate;
272 $self->{'unicode_collator'} = Unicode::Collate->new();
273 }
274
275 # An empty array for the document OIDs
276 $self->{'OIDs'} = [];
277
278 return bless $self, $class;
279}
280
281
282sub init
283{
284 # Nothing to do...
285}
286
287
288# Called for each document in the collection
289sub classify
290{
291 my $self = shift(@_);
292 my ($doc_obj,$edit_mode) = @_;
293
294 # If "-classify_sections" is set, classify every section of the document
295 if ($self->{'classify_sections'}) {
296 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
297 while (defined $section) {
298 $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section, $edit_mode);
299 $section = $doc_obj->get_next_section($section);
300 }
301 }
302 # Otherwise just classify the top document section
303 else {
304 $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section(), $edit_mode);
305 }
306}
307
308
309sub classify_section
310{
311 my $self = shift(@_);
312 my ($doc_obj,$section_OID,$section,$edit_mode) = @_;
313
314 my @metadata_groups = @{$self->{'metadata_groups'}};
315
316 # Only classify the section if it has a value for one of the metadata elements in the first group
317 my $classify_section = 0;
318 my $first_metadata_group = $metadata_groups[0];
319 my $remove_prefix_expr = $self->{$first_metadata_group . ".remove_prefix_expr"};
320 my $remove_suffix_expr = $self->{$first_metadata_group . ".remove_suffix_expr"};
321 foreach my $first_metadata_group_element (split(/\;|,/, $first_metadata_group)) {
322 my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
323 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element);
324
325 # Remove prefix/suffix if requested
326 if (defined ($first_metadata_group_element_value)) {
327 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
328 $first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
329 }
330
331 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
332 $first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
333 }
334 }
335 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
336 # This section must be included in the classifier
337 $classify_section = 1;
338 last;
339 }
340 }
341
342 # We're not classifying this section because it doesn't have the required metadata
343 return if (!$classify_section);
344
345 if (($edit_mode eq "delete") || ($edit_mode eq "update")) {
346 $self->oid_array_delete($section_OID,'OIDs');
347 if ($edit_mode eq "delete") {
348 return;
349 }
350 }
351
352 # Otherwise, include this section in the classifier
353 push(@{$self->{'OIDs'}}, $section_OID);
354
355 # Create a hash for the metadata values of each metadata element we're interested in
356 my %metadata_groups_done = ();
357 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
358 # Take care not to do a metadata group more than once
359 unless ($metadata_groups_done{$metadata_group}) {
360 if ($edit_mode eq "update") {
361 # if we are updating, we delete all the old values before
362 # adding the new ones, otherwise, the section will end up in
363 # the classifier twice.
364 delete $self->{$metadata_group . ".list"}->{$section_OID};
365 }
366
367 my $remove_prefix_expr = $self->{$metadata_group . ".remove_prefix_expr"};
368 my $remove_suffix_expr = $self->{$metadata_group . ".remove_suffix_expr"};
369 foreach my $metadata_element (split(/\;|,/, $metadata_group)) {
370 my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);
371
372 my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
373 foreach my $metadata_value (@metadata_values) {
374 # Strip leading and trailing whitespace
375 $metadata_value =~ s/^\s*//;
376 $metadata_value =~ s/\s*$//;
377
378 # Remove prefix/suffix if requested
379 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
380 $metadata_value =~ s/^$remove_prefix_expr//;
381 }
382 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
383 $metadata_value =~ s/$remove_suffix_expr$//;
384 }
385
386 # uppercase the metadata - makes the AZList nicer
387 $metadata_value = uc($metadata_value);
388 # Convert the metadata value from a UTF-8 string to a Unicode string
389 # This means that length() and substr() work properly
390 # We need to be careful to convert classifier node title values back to UTF-8, however
391 my $metadata_value_unicode_string = $self->convert_utf8_string_to_unicode_string($metadata_value);
392
393 # Add the metadata value into the list for this combination of metadata group and section
394 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $metadata_value_unicode_string);
395 }
396 last if (@metadata_values > 0);
397 }
398
399 $metadata_groups_done{$metadata_group} = 1;
400 }
401 }
402}
403
404
405sub get_classify_info
406{
407 my $self = shift(@_);
408
409 # The metadata groups to classify by
410 my @metadata_groups = @{$self->{'metadata_groups'}};
411 my $first_metadata_group = $metadata_groups[0];
412
413 # The OID values of the documents to include in the classifier
414 my @OIDs = @{$self->{'OIDs'}};
415
416 # Create the root node of the classification hierarchy
417 my %classifier_node = ( 'thistype' => "Invisible",
418 'childtype' => $self->{$first_metadata_group . ".list_type"},
419 'Title' => $self->{'buttonname'},
420 'contains' => [],
421 'mdtype' => $first_metadata_group );
422
423 # Recursively create the classification hierarchy, one level for each metadata group
424 $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
425 return \%classifier_node;
426}
427
428
429sub add_level
430{
431 my $self = shift(@_);
432 my @metadata_groups = @{shift(@_)};
433 my @OIDs = @{shift(@_)};
434 my $classifier_node = shift(@_);
435 # print STDERR "\nAdding AZ list for " . $classifier_node->{'Title'} . "\n";
436
437 my $metadata_group = $metadata_groups[0];
438 # print STDERR "Processing metadata group: " . $metadata_group . "\n";
439 # print STDERR "Number of OID values: " . @OIDs . "\n";
440
441 if (!defined($self->{$metadata_group . ".list"})) {
442 print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
443 return;
444 }
445
446 # Create a mapping from metadata value to OID
447 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
448 my %metadata_value_to_OIDs_hash = ();
449 foreach my $OID (@OIDs)
450 {
451 if ($OID_to_metadata_values_hash_ref->{$OID})
452 {
453 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
454 foreach my $metadata_value (@metadata_values)
455 {
456 push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
457 }
458 }
459 }
460 # print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
461
462 # Partition the values (if necessary)
463 my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
464 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
465 if ($partition_type_within_level =~ /^per_letter$/i) {
466 # Generate one hlist for each letter
467 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
468 my %metadata_value_to_OIDs_subhash = ();
469
470 my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
471 foreach my $metadata_value (@sortedmetadata_values) {
472 my $metadata_valuepartition = substr($metadata_value, 0, 1);
473
474 # Is this the start of a new partition?
475 if ($metadata_valuepartition ne $lastpartition) {
476 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
477 %metadata_value_to_OIDs_subhash = ();
478 $lastpartition = $metadata_valuepartition;
479 }
480
481 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
482 }
483
484 # Don't forget to add the last partition
485 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
486
487 # The partitions are stored in an HList
488 $classifier_node->{'childtype'} = "HList";
489 }
490 elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
491 # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size
492 # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible
493 # If a letter has more items than specified, split into several hlists.
494 # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
495 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
496 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
497 my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};
498
499 # Separate values by their first letter, each form a bucket, like the per_letter partition type
500 my $last_partition = substr($sortedmetadata_values[0], 0, 1);
501 my %partition_buckets = ();
502 my @metadata_values_in_bucket = ();
503 my $num_items_in_bucket = 0;
504 foreach my $metadata_value (@sortedmetadata_values) {
505 my $metadata_valuepartition = substr($metadata_value, 0, 1);
506 if ($metadata_valuepartition ne $last_partition) {
507 my @temp_array = @metadata_values_in_bucket;
508 # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
509 my %partition_info = ();
510 $partition_info{'metadata_values'} = \@temp_array;
511 $partition_info{'size'} = $num_items_in_bucket;
512 $partition_buckets{$last_partition} = \%partition_info;
513
514 @metadata_values_in_bucket = ($metadata_value);
515 $num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
516 $last_partition = $metadata_valuepartition;
517 } else {
518 $num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
519 push (@metadata_values_in_bucket, $metadata_value);
520 }
521 }
522 # Last one
523 my %partition_info = ();
524 $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
525 $partition_info{'size'} = $num_items_in_bucket;
526 $partition_buckets{$last_partition} = \%partition_info;
527
528 my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
529 for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) {
530 my $partition = $partition_keys[$i];
531 my $items_in_partition = $partition_buckets{$partition}->{'size'};
532 # Merge small buckets together, but keep the numeric bucket apart
533 if ($items_in_partition < $partition_size_within_level) {
534 my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'};
535 if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level
536 && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)
537 || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) {
538 foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) {
539 push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge);
540 }
541 $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition;
542 delete $partition_buckets{$partition};
543 }
544 }
545 }
546 @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
547
548 # Add partitions, and divide big bucket into several
549 my $last_partition_end = "";
550 my $partition_start = "";
551 foreach my $partition (@partition_keys) {
552 my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}});
553 my $items_in_partition = $partition_buckets{$partition}->{'size'};
554 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"});
555
556 if ($items_in_partition > $partition_size_within_level) {
557 my $items_done = 0;
558 my %metadata_values_to_OIDs_subhashes = ();
559 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
560 my $metadata_value = $metadata_values[$i];
561 # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
562 my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;
563
564 my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"});
565 my $partitionname = $partition_start;
566 if ($partitionend ne $partition_start) {
567 $partitionname = $partitionname . "-" . $partitionend;
568 }
569
570 # Start a new partition
571 if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) {
572 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
573 $last_partition_end = $partitionend;
574 $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
575 $items_done = 0;
576 %metadata_values_to_OIDs_subhashes = ();
577 }
578
579 # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
580 if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) {
581 my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
582 # Get the number of partitions needed for this value
583 my $num_splits = int($items_for_this_md_value / $partition_size_within_level);
584 $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits);
585
586 my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}};
587 for (my $i = 0; $i < $num_splits; $i++) {
588 my %OIDs_subhashes_for_this_value = ();
589 my @OIDs_for_this_partition = ();
590 for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) {
591 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
592 }
593
594 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
595 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) {
596 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
597 $items_done += scalar(@OIDs_for_this_partition);
598 next;
599 }
600
601 # Add an HList for this bucket
602 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
603 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
604 $last_partition_end = $partitionname_for_this_value;
605 }
606 next;
607 }
608
609 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
610 $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;
611
612 # The last partition
613 if($i == scalar(@metadata_values) - 1) {
614 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
615 }
616 }
617 }
618 else {
619 # The easier case, just add a partition
620 my %metadata_values_to_OIDs_subhashes = ();
621 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
622 my $metadata_value = $metadata_values[$i];
623 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
624 }
625 my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
626 my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"});
627 my $partitionname = $partition_start;
628 if ($partitionend ne $partition_start) {
629 $partitionname = $partitionname . "-" . $partitionend;
630 }
631 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
632 $last_partition_end = $partitionend;
633 }
634 }
635
636 # The partitions are stored in an HList
637 $classifier_node->{'childtype'} = "HList";
638
639 } # end approximate_size
640 else {
641 # Generate hlists of a certain size
642 if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
643 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
644 my $itemsdone = 0;
645 my %metadata_value_to_OIDs_subhash = ();
646 my $lastpartitionend = "";
647 my $partitionstart;
648 foreach my $metadata_value (@sortedmetadata_values) {
649 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
650 $itemsdone++;
651 my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
652
653 # Is this the start of a new partition?
654 if ($itemsinpartition == 1) {
655 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
656 }
657
658 # Is this the end of the partition?
659 if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) {
660 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
661 my $partitionname = $partitionstart;
662 if ($partitionend ne $partitionstart) {
663 $partitionname = $partitionname . "-" . $partitionend;
664 }
665
666 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
667 %metadata_value_to_OIDs_subhash = ();
668 $lastpartitionend = $partitionend;
669 }
670 }
671
672 # The partitions are stored in an HList
673 $classifier_node->{'childtype'} = "HList";
674 }
675
676 # Otherwise just add all the values to a VList
677 else {
678 $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
679 }
680 }
681}
682
683
684sub convert_utf8_string_to_unicode_string
685{
686 my $self = shift(@_);
687 my $utf8_string = shift(@_);
688
689 my $unicode_string = "";
690 foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
691 $unicode_string .= chr($unicode_value);
692 }
693 return $unicode_string;
694}
695
696
697sub convert_unicode_string_to_utf8_string
698{
699 my $self = shift(@_);
700 my $unicode_string = shift(@_);
701
702 my @unicode_array;
703 for (my $i = 0; $i < length($unicode_string); $i++) {
704 push(@unicode_array, ord(substr($unicode_string, $i, 1)));
705 }
706 return &unicode::unicode2utf8(\@unicode_array);
707}
708
709
710sub generate_partition_start
711{
712 my $self = shift(@_);
713 my $metadata_value = shift(@_);
714 my $lastpartitionend = shift(@_);
715 my $partition_name_length = shift(@_);
716
717 if ($partition_name_length) {
718 return substr($metadata_value, 0, $partition_name_length);
719 }
720
721 my $partitionstart = substr($metadata_value, 0, 1);
722 if ($partitionstart le $lastpartitionend) {
723 $partitionstart = substr($metadata_value, 0, 2);
724 # Give up after three characters
725 if ($partitionstart le $lastpartitionend) {
726 $partitionstart = substr($metadata_value, 0, 3);
727 }
728 }
729
730 return $partitionstart;
731}
732
733
734sub generate_partition_end
735{
736 my $self = shift(@_);
737 my $metadata_value = shift(@_);
738 my $partitionstart = shift(@_);
739 my $partition_name_length = shift(@_);
740
741 if ($partition_name_length) {
742 return substr($metadata_value, 0, $partition_name_length);
743 }
744
745 my $partitionend = substr($metadata_value, 0, length($partitionstart));
746 if ($partitionend gt $partitionstart) {
747 $partitionend = substr($metadata_value, 0, 1);
748 if ($partitionend le $partitionstart) {
749 $partitionend = substr($metadata_value, 0, 2);
750 # Give up after three characters
751 if ($partitionend le $partitionstart) {
752 $partitionend = substr($metadata_value, 0, 3);
753 }
754 }
755 }
756
757 return $partitionend;
758}
759
760
761sub add_hlist_partition
762{
763 my $self = shift(@_);
764 my @metadata_groups = @{shift(@_)};
765 my $classifier_node = shift(@_);
766 my $partitionname = shift(@_);
767 my $metadata_value_to_OIDs_hash_ref = shift(@_);
768
769 # Create an hlist partition
770 my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
771 'childtype' => "VList",
772 'contains' => [] );
773
774 # Add the children to the hlist partition
775 $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
776 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
777}
778
779
780sub add_vlist
781{
782 my $self = shift(@_);
783 my @metadata_groups = @{shift(@_)};
784 my $classifier_node = shift(@_);
785 my $metadata_value_to_OIDs_hash_ref = shift(@_);
786
787 my $metadata_group = shift(@metadata_groups);
788 $classifier_node->{'mdtype'} = $metadata_group;
789
790 # Create an entry in the vlist for each value
791 foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
792 {
793 my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
794
795 # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
796 if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
797 my $OID = $OIDs[0];
798 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
799 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
800 }
801 # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
802 elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
803 @OIDs = $self->sort_leaf_items(\@OIDs);
804 foreach my $OID (@OIDs) {
805 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
806 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
807 }
808
809 }
810 # Otherwise create a sublist (bookshelf) for the metadata value
811 else {
812 my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
813 'childtype' => "VList",
814 'mdtype' => $metadata_group,
815 'contains' => [] );
816
817 # If there are metadata elements remaining, recursively apply the process
818 if (@metadata_groups > 0) {
819 my $next_metadata_group = $metadata_groups[0];
820 $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
821 $self->add_level(\@metadata_groups, \@OIDs, \%child_classifier_node);
822 }
823 # Otherwise just add the documents as children of this list
824 else {
825 @OIDs = $self->sort_leaf_items(\@OIDs);
826 foreach my $OID (@OIDs) {
827 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
828 push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
829 }
830
831 }
832
833 # Add the sublist to the list
834 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
835 }
836 }
837}
838
839sub metadata_offset
840{
841 my $self = shift(@_);
842 my $metadata_group = shift(@_);
843 my $OID = shift(@_);
844 my $metadata_value = shift(@_);
845
846 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
847 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
848 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
849 if ($metadata_value eq $metadata_values[$i]) {
850 return $i;
851 }
852 }
853
854 return 0;
855}
856
857sub sort_leaf_items
858{
859 my $self = shift(@_);
860 my @OIDs = @{shift(@_)};
861# my $classifier_node = shift(@_);
862
863 # Sort leaf nodes and add to list
864 my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
865 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) {
866 my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
867 # Force a stable sort (Perl 5.6's sort isn't stable)
868 # !! The [0] bits aren't ideal (multiple metadata values) !!
869 @OIDs = @OIDs[ sort {
870 if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]} && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]})))
871 {
872 $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
873 }
874 else
875 {
876 $a <=> $b;
877 }
878 } 0..$#OIDs ];
879 }
880 return @OIDs;
881}
882
883
884
885sub sort_metadata_values_array
886{
887 my $self = shift(@_);
888 my @metadata_values = @_;
889
890 if ($self->{'unicode_collator'}) {
891 return $self->{'unicode_collator'}->sort(@metadata_values);
892 }
893 else {
894 return sort(@metadata_values);
895 }
896}
897
898
8991;
Note: See TracBrowser for help on using the repository browser.