source: gsdl/trunk/perllib/classify/List.pm@ 19645

Last change on this file since 19645 was 19645, checked in by anna, 15 years ago

Changed default bookshelf type to 'never' so it behaves more like AZList.

  • Property svn:keywords set to Author Date Id Revision
File size: 32.5 KB
Line 
1###########################################################################
2#
3# List.pm -- A general and flexible list classifier with most of
4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
29# TO DO: - Remove punctuation from metadata values before sorting.
30# - Add an AZCompactList-style hlist option?
31#
32###########################################################################
33
34package List;
35
36
37use BaseClassifier;
38
39use strict;
40
41
42sub BEGIN {
43 @List::ISA = ('BaseClassifier');
44}
45
46my $partition_type_list =
47 [ { 'name' => "per_letter",
48 'desc' => "{List.level_partition.per_letter}" },
49 { 'name' => "constant_size",
50 'desc' => "{List.level_partition.constant_size}" },
51 { 'name' => "per_letter_fixed_size",
52 'desc' => "{List.level_partition.per_letter_fixed_size}"},
53 { 'name' => "none",
54 'desc' => "{List.level_partition.none}" } ];
55
56my $bookshelf_type_list =
57 [ { 'name' => "always",
58 'desc' => "{List.bookshelf_type.always}" },
59 { 'name' => "duplicate_only",
60 'desc' => "{List.bookshelf_type.duplicate_only}" },
61 { 'name' => "never",
62 'desc' => "{List.bookshelf_type.never}" } ];
63
64my $arguments =
65 [ { 'name' => "metadata",
66 'desc' => "{List.metadata}",
67 'type' => "metadata",
68 'reqd' => "yes" },
69
70 # The interesting options
71 { 'name' => "bookshelf_type",
72 'desc' => "{List.bookshelf_type}",
73 'type' => "enum",
74 'list' => $bookshelf_type_list,
75 'deft' => "never" },
76 { 'name' => "classify_sections",
77 'desc' => "{List.classify_sections}",
78 'type' => "flag" },
79 { 'name' => "partition_type_within_level",
80 'desc' => "{List.partition_type_within_level}",
81 'type' => "string",
82 'deft' => "none" },
83 { 'name' => "partition_size_within_level",
84 'desc' => "{List.partition_size_within_level}",
85 'type' => "int",
86 'range' => "1,",
87 'deft' => "39" },
88 { 'name' => "partition_name_length",
89 'desc' => "{List.partition_name_length}",
90 'type' => "string" },
91 { 'name' => "sort_leaf_nodes_using",
92 'desc' => "{List.sort_leaf_nodes_using}",
93 'type' => "metadata",
94 'deft' => "Title" },
95 { 'name' => "sort_using_unicode_collation",
96 'desc' => "{List.sort_using_unicode_collation}",
97 'type' => "flag" },
98 { 'name' => "use_hlist_for",
99 'desc' => "{List.use_hlist_for}",
100 'type' => "string" },
101 { 'name' => "removeprefix",
102 'desc' => "{BasClas.removeprefix}",
103 'type' => "regexp" },
104 { 'name' => "removesuffix",
105 'desc' => "{BasClas.removesuffix}",
106 'type' => "regexp" } ];
107
108my $options = { 'name' => "List",
109 'desc' => "{List.desc}",
110 'abstract' => "no",
111 'inherits' => "yes",
112 'args' => $arguments };
113
114
115sub new
116{
117 my ($class) = shift(@_);
118 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
119 push(@$classifierslist, $class);
120
121 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
122 push(@{$hashArgOptLists->{"OptList"}}, $options);
123
124 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
125
126 if ($self->{'info_only'}) {
127 # don't worry about any options etc
128 return bless $self, $class;
129 }
130
131 # The metadata elements to use (required)
132 if (!$self->{'metadata'}) {
133 die "Error: No metadata fields specified for List.\n";
134 }
135 my @metadata_groups = split(/\//, $self->{'metadata'});
136 $self->{'metadata_groups'} = \@metadata_groups;
137
138 # The classifier button name (default: the first metadata element specified)
139 if (!$self->{'buttonname'}) {
140 my $first_metadata_group = $metadata_groups[0];
141 my $first_metadata_element = (split(/\;/, $first_metadata_group))[0];
142 $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
143 }
144
145 # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
146 foreach my $metadata_group (@metadata_groups) {
147 $self->{$metadata_group . ".bookshelf_type"} = "always";
148 }
149 my $last_metadata_group = $metadata_groups[$#metadata_groups];
150 # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
151 $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
152
153 # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
154 foreach my $metadata_group (@metadata_groups) {
155 $self->{$metadata_group . ".list_type"} = "VList";
156 }
157 foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
158 $self->{$metadata_group . ".list_type"} = "HList";
159 }
160
161 # How the items are grouped into partitions (default: no partition)
162 # for each level (metadata group), separated by '/'
163 if (!$self->{"partition_type_within_level"}) {
164 foreach my $metadata_group (@metadata_groups) {
165 $self->{$metadata_group . ".partition_type_within_level"} = "none";
166 }
167 } else {
168 my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'});
169 foreach my $metadata_group (@metadata_groups) {
170 my $partition_type_within_levelelem = shift(@partition_type_within_levellist);
171 if (defined($partition_type_within_levelelem)) {
172 $self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem;
173 }
174 else {
175 $self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"};
176 }
177 }
178 }
179
180 # The number of items in each partition
181 if (!$self->{'partition_size_within_level'}) {
182 # Default: 20
183 foreach my $metadata_group (@metadata_groups) {
184 $self->{$metadata_group . ".partition_size_within_level"} = 20;
185 }
186 }
187 else {
188 my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
189
190 # Assign values based on the partition_size_within_level parameter
191 foreach my $metadata_group (@metadata_groups) {
192 my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
193 if (defined($partition_size_within_levelelem)) {
194 $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
195 }
196 else {
197 $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
198 }
199 }
200 }
201
202 # The removeprefix and removesuffix expressions
203 if ($self->{'removeprefix'}) {
204 # If there are more than one expressions, use '' to quote each experession and '/' to separate
205 my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
206
207 foreach my $metadata_group (@metadata_groups) {
208 my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
209 if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
210 # Remove the other ' at the beginning and the end if there is any
211 $removeprefix_expr_within_levelelem =~ s/^'//;
212 $removeprefix_expr_within_levelelem =~ s/'$//;
213 # Remove the extra ^ at the beginning
214 $removeprefix_expr_within_levelelem =~ s/^\^//;
215 $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
216 } else {
217 $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
218 }
219 }
220 }
221 if ($self->{'removesuffix'}) {
222 my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
223
224 foreach my $metadata_group (@metadata_groups) {
225 my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
226 if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
227 $removesuffix_expr_within_levelelem =~ s/^'//;
228 $removesuffix_expr_within_levelelem =~ s/'$//;
229 # Remove the extra $ at the end
230 $removesuffix_expr_within_levelelem =~ s/\$$//;
231 $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
232 } else {
233 $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
234 }
235 }
236 }
237
238 # The metadata elements to use to sort the leaf nodes (default: Title)
239 my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
240 if ($self->{'sort_leaf_nodes_using'}) {
241 @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
242 }
243 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
244
245 # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
246 if ($self->{'sort_using_unicode_collation'}) {
247 # To use this you first need to download the allkeys.txt file from
248 # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
249 # Unicode/Collate directory.
250 require Unicode::Collate;
251 $self->{'unicode_collator'} = Unicode::Collate->new();
252 }
253
254 # An empty array for the document OIDs
255 $self->{'OIDs'} = [];
256
257 return bless $self, $class;
258}
259
260
261sub init
262{
263 # Nothing to do...
264}
265
266
267# Called for each document in the collection
268sub classify
269{
270 my $self = shift(@_);
271 my ($doc_obj,$edit_mode) = @_;
272
273 # If "-classify_sections" is set, classify every section of the document
274 if ($self->{'classify_sections'}) {
275 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
276 while (defined $section) {
277 $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section, $edit_mode);
278 $section = $doc_obj->get_next_section($section);
279 }
280 }
281 # Otherwise just classify the top document section
282 else {
283 $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section(), $edit_mode);
284 }
285}
286
287
288sub classify_section
289{
290 my $self = shift(@_);
291 my ($doc_obj,$section_OID,$section,$edit_mode) = @_;
292
293 my @metadata_groups = @{$self->{'metadata_groups'}};
294
295 # Only classify the section if it has a value for one of the metadata elements in the first group
296 my $classify_section = 0;
297 my $first_metadata_group = $metadata_groups[0];
298 foreach my $first_metadata_group_element (split(/\;/, $first_metadata_group)) {
299 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $first_metadata_group_element);
300
301 # Remove prefix/suffix if requested
302 my $remove_prefix_expr = $self->{$first_metadata_group_element . ".remove_prefix_expr"};
303 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
304 $first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
305 }
306 my $remove_suffix_expr = $self->{$first_metadata_group_element . ".remove_suffix_expr"};
307 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
308 $first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
309 }
310
311 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
312 # This section must be included in the classifier
313 $classify_section = 1;
314 last;
315 }
316 }
317
318 # We're not classifying this section because it doesn't have the required metadata
319 return if (!$classify_section);
320
321 if (($edit_mode eq "delete") || ($edit_mode eq "update")) {
322 $self->oid_array_delete($section_OID,'OIDs');
323 if ($edit_mode eq "delete") {
324 return;
325 }
326 }
327
328 # Otherwise, include this section in the classifier
329 push(@{$self->{'OIDs'}}, $section_OID);
330
331 # Create a hash for the metadata values of each metadata element we're interested in
332 my %metadata_groups_done = ();
333 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
334 # Take care not to do a metadata group more than once
335 unless ($metadata_groups_done{$metadata_group}) {
336 foreach my $metadata_element (split(/\;/, $metadata_group)) {
337 my $remove_prefix_expr = $self->{$metadata_element . ".remove_prefix_expr"};
338 my $remove_suffix_expr = $self->{$metadata_element . ".remove_suffix_expr"};
339 my @metadata_values = @{$doc_obj->get_metadata($section, $metadata_element)};
340 foreach my $metadata_value (@metadata_values) {
341 # Strip leading and trailing whitespace
342 $metadata_value =~ s/^\s*//;
343 $metadata_value =~ s/\s*$//;
344
345 # Remove prefix/suffix if requested
346 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
347 $metadata_value =~ s/^$remove_prefix_expr//;
348 }
349 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
350 $metadata_value =~ s/$remove_suffix_expr$//;
351 }
352
353 # Convert the metadata value from a UTF-8 string to a Unicode string
354 # This means that length() and substr() work properly
355 # We need to be careful to convert classifier node title values back to UTF-8, however
356 my $metadata_value_unicode_string = $self->convert_utf8_string_to_unicode_string($metadata_value);
357
358 # Add the metadata value into the list for this combination of metadata group and section
359 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $metadata_value_unicode_string);
360 }
361 last if (@metadata_values > 0);
362 }
363
364 $metadata_groups_done{$metadata_group} = 1;
365 }
366 }
367}
368
369
370sub get_classify_info
371{
372 my $self = shift(@_);
373
374 # The metadata groups to classify by
375 my @metadata_groups = @{$self->{'metadata_groups'}};
376 my $first_metadata_group = $metadata_groups[0];
377
378 # The OID values of the documents to include in the classifier
379 my @OIDs = @{$self->{'OIDs'}};
380
381 # Create the root node of the classification hierarchy
382 my %classifier_node = ( 'thistype' => "Invisible",
383 'childtype' => $self->{$first_metadata_group . ".list_type"},
384 'Title' => $self->{'buttonname'},
385 'contains' => [],
386 'mdtype' => $first_metadata_group );
387
388 # Recursively create the classification hierarchy, one level for each metadata group
389 $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
390 return \%classifier_node;
391}
392
393
394sub add_level
395{
396 my $self = shift(@_);
397 my @metadata_groups = @{shift(@_)};
398 my @OIDs = @{shift(@_)};
399 my $classifier_node = shift(@_);
400 # print STDERR "\nAdding AZ list for " . $classifier_node->{'Title'} . "\n";
401
402 my $metadata_group = $metadata_groups[0];
403 # print STDERR "Processing metadata group: " . $metadata_group . "\n";
404 # print STDERR "Number of OID values: " . @OIDs . "\n";
405
406 if (!defined($self->{$metadata_group . ".list"})) {
407 print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
408 return;
409 }
410
411 # Create a mapping from metadata value to OID
412 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
413 my %metadata_value_to_OIDs_hash = ();
414 foreach my $OID (@OIDs)
415 {
416 if ($OID_to_metadata_values_hash_ref->{$OID})
417 {
418 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
419 foreach my $metadata_value (@metadata_values)
420 {
421 push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
422 }
423 }
424 }
425 # print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
426
427 # Partition the values (if necessary)
428 my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
429 if ($partition_type_within_level =~ /^per_letter$/i) {
430 # Generate one hlist for each letter
431 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
432 my %metadata_value_to_OIDs_subhash = ();
433
434 my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
435 foreach my $metadata_value (@sortedmetadata_values) {
436 my $metadata_valuepartition = substr($metadata_value, 0, 1);
437
438 # Is this the start of a new partition?
439 if ($metadata_valuepartition ne $lastpartition) {
440 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
441 %metadata_value_to_OIDs_subhash = ();
442 $lastpartition = $metadata_valuepartition;
443 }
444
445 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
446 }
447
448 # Don't forget to add the last partition
449 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
450
451 # The partitions are stored in an HList
452 $classifier_node->{'childtype'} = "HList";
453 }
454 elsif ($partition_type_within_level =~ /^per_letter_fixed_size$/i) {
455 # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size
456 # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible
457 # If a letter has more items than specified, split into several hlists.
458 # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
459 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
460 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
461 my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};
462
463 # Separate values by their first letter, each form a bucket, like the per_letter partition type
464 my $last_partition = substr($sortedmetadata_values[0], 0, 1);
465 my %partition_buckets = ();
466 my @metadata_values_in_bucket = ();
467 my $num_items_in_bucket = 0;
468 foreach my $metadata_value (@sortedmetadata_values) {
469 my $metadata_valuepartition = substr($metadata_value, 0, 1);
470 if ($metadata_valuepartition ne $last_partition) {
471 my @temp_array = @metadata_values_in_bucket;
472 # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
473 my %partition_info = ();
474 $partition_info{'metadata_values'} = \@temp_array;
475 $partition_info{'size'} = $num_items_in_bucket;
476 $partition_buckets{$last_partition} = \%partition_info;
477
478 @metadata_values_in_bucket = ($metadata_value);
479 $num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
480 $last_partition = $metadata_valuepartition;
481 } else {
482 $num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
483 push (@metadata_values_in_bucket, $metadata_value);
484 }
485 }
486 # Last one
487 my %partition_info = ();
488 $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
489 $partition_info{'size'} = $num_items_in_bucket;
490 $partition_buckets{$last_partition} = \%partition_info;
491
492 my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
493 for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) {
494 my $partition = $partition_keys[$i];
495 my $items_in_partition = $partition_buckets{$partition}->{'size'};
496 # Merge small buckets together, but keep the numeric bucket apart
497 if ($items_in_partition < $partition_size_within_level) {
498 my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'};
499 if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level
500 && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)
501 || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) {
502 foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) {
503 push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge);
504 }
505 $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition;
506 delete $partition_buckets{$partition};
507 }
508 }
509 }
510 @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
511
512 # Add partitions, and divide big bucket into several
513 my $last_partition_end = "";
514 my $partition_start = "";
515 foreach my $partition (@partition_keys) {
516 my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}});
517 my $items_in_partition = $partition_buckets{$partition}->{'size'};
518 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"});
519
520 if ($items_in_partition > $partition_size_within_level) {
521 my $items_done = 0;
522 my %metadata_values_to_OIDs_subhashes = ();
523 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
524 my $metadata_value = $metadata_values[$i];
525 # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
526 my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;
527
528 my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"});
529 my $partitionname = $partition_start;
530 if ($partitionend ne $partition_start) {
531 $partitionname = $partitionname . "-" . $partitionend;
532 }
533
534 # Start a new partition
535 if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) {
536 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
537 $last_partition_end = $partitionend;
538 $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
539 $items_done = 0;
540 %metadata_values_to_OIDs_subhashes = ();
541 }
542
543 # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
544 if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) {
545 my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
546 # Get the number of partitions needed for this value
547 my $num_splits = int($items_for_this_md_value / $partition_size_within_level);
548 $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits);
549
550 my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}};
551 for (my $i = 0; $i < $num_splits; $i++) {
552 my %OIDs_subhashes_for_this_value = ();
553 my @OIDs_for_this_partition = ();
554 for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) {
555 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
556 }
557
558 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
559 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) {
560 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
561 $items_done += scalar(@OIDs_for_this_partition);
562 next;
563 }
564
565 # Add an HList for this bucket
566 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
567 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
568 $last_partition_end = $partitionname_for_this_value;
569 }
570 next;
571 }
572
573 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
574 $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;
575
576 # The last partition
577 if($i == scalar(@metadata_values) - 1) {
578 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
579 }
580 }
581 }
582 else {
583 # The easier case, just add a partition
584 my %metadata_values_to_OIDs_subhashes = ();
585 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
586 my $metadata_value = $metadata_values[$i];
587 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
588 }
589 my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
590 my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"});
591 my $partitionname = $partition_start;
592 if ($partitionend ne $partition_start) {
593 $partitionname = $partitionname . "-" . $partitionend;
594 }
595 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
596 $last_partition_end = $partitionend;
597 }
598 }
599 }
600 else {
601 # Generate hlists of a certain size
602 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
603 if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
604 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
605 my $itemsdone = 0;
606 my %metadata_value_to_OIDs_subhash = ();
607 my $lastpartitionend = "";
608 my $partitionstart;
609 foreach my $metadata_value (@sortedmetadata_values) {
610 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
611 $itemsdone++;
612 my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
613
614 # Is this the start of a new partition?
615 if ($itemsinpartition == 1) {
616 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
617 }
618
619 # Is this the end of the partition?
620 if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) {
621 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
622 my $partitionname = $partitionstart;
623 if ($partitionend ne $partitionstart) {
624 $partitionname = $partitionname . "-" . $partitionend;
625 }
626
627 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
628 %metadata_value_to_OIDs_subhash = ();
629 $lastpartitionend = $partitionend;
630 }
631 }
632
633 # The partitions are stored in an HList
634 $classifier_node->{'childtype'} = "HList";
635 }
636
637 # Otherwise just add all the values to a VList
638 else {
639 $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
640 }
641 }
642}
643
644
645sub convert_utf8_string_to_unicode_string
646{
647 my $self = shift(@_);
648 my $utf8_string = shift(@_);
649
650 my $unicode_string = "";
651 foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
652 $unicode_string .= chr($unicode_value);
653 }
654 return $unicode_string;
655}
656
657
658sub convert_unicode_string_to_utf8_string
659{
660 my $self = shift(@_);
661 my $unicode_string = shift(@_);
662
663 my @unicode_array;
664 for (my $i = 0; $i < length($unicode_string); $i++) {
665 push(@unicode_array, ord(substr($unicode_string, $i, 1)));
666 }
667 return &unicode::unicode2utf8(\@unicode_array);
668}
669
670
671sub generate_partition_start
672{
673 my $self = shift(@_);
674 my $metadata_value = shift(@_);
675 my $lastpartitionend = shift(@_);
676 my $partition_name_length = shift(@_);
677
678 if ($partition_name_length) {
679 return substr($metadata_value, 0, $partition_name_length);
680 }
681
682 my $partitionstart = substr($metadata_value, 0, 1);
683 if ($partitionstart le $lastpartitionend) {
684 $partitionstart = substr($metadata_value, 0, 2);
685 # Give up after three characters
686 if ($partitionstart le $lastpartitionend) {
687 $partitionstart = substr($metadata_value, 0, 3);
688 }
689 }
690
691 return $partitionstart;
692}
693
694
695sub generate_partition_end
696{
697 my $self = shift(@_);
698 my $metadata_value = shift(@_);
699 my $partitionstart = shift(@_);
700 my $partition_name_length = shift(@_);
701
702 if ($partition_name_length) {
703 return substr($metadata_value, 0, $partition_name_length);
704 }
705
706 my $partitionend = substr($metadata_value, 0, length($partitionstart));
707 if ($partitionend gt $partitionstart) {
708 $partitionend = substr($metadata_value, 0, 1);
709 if ($partitionend le $partitionstart) {
710 $partitionend = substr($metadata_value, 0, 2);
711 # Give up after three characters
712 if ($partitionend le $partitionstart) {
713 $partitionend = substr($metadata_value, 0, 3);
714 }
715 }
716 }
717
718 return $partitionend;
719}
720
721
722sub add_hlist_partition
723{
724 my $self = shift(@_);
725 my @metadata_groups = @{shift(@_)};
726 my $classifier_node = shift(@_);
727 my $partitionname = shift(@_);
728 my $metadata_value_to_OIDs_hash_ref = shift(@_);
729
730 # Create an hlist partition
731 my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
732 'childtype' => "VList",
733 'contains' => [] );
734
735 # Add the children to the hlist partition
736 $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
737 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
738}
739
740
741sub add_vlist
742{
743 my $self = shift(@_);
744 my @metadata_groups = @{shift(@_)};
745 my $classifier_node = shift(@_);
746 my $metadata_value_to_OIDs_hash_ref = shift(@_);
747
748 my $metadata_group = shift(@metadata_groups);
749 $classifier_node->{'mdtype'} = $metadata_group;
750
751 # Create an entry in the vlist for each value
752 foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
753 {
754 my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
755
756 # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
757 if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
758 my $OID = $OIDs[0];
759
760 # Find the offset of this metadata value
761 my $offset = 0;
762 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
763 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
764 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
765 if ($metadata_value eq $metadata_values[$i]) {
766 $offset = $i;
767 last;
768 }
769 }
770 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
771 }
772 # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
773 elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
774 $self->add_sorted_leaf_items(\@OIDs, $classifier_node);
775 }
776 # Otherwise create a sublist (bookshelf) for the metadata value
777 else {
778 my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
779 'childtype' => "VList",
780 'contains' => [] );
781
782 # If there are metadata elements remaining, recursively apply the process
783 if (@metadata_groups > 0) {
784 my $next_metadata_group = $metadata_groups[0];
785 $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
786 $self->add_level(\@metadata_groups, \@OIDs, \%child_classifier_node);
787 }
788 # Otherwise just add the documents as children of this list
789 else {
790 $self->add_sorted_leaf_items(\@OIDs, \%child_classifier_node);
791 }
792
793 # Add the sublist to the list
794 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
795 }
796 }
797}
798
799sub add_sorted_leaf_items
800{
801 my $self = shift(@_);
802 my @OIDs = @{shift(@_)};
803 my $classifier_node = shift(@_);
804
805 # Sort leaf nodes and add to list
806 my @sort_leaf_nodes_usingmetadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
807 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_usingmetadata_groups) {
808 my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
809 # Force a stable sort (Perl 5.6's sort isn't stable)
810 # !! The [0] bits aren't ideal (multiple metadata values) !!
811 @OIDs = @OIDs[ sort { $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0] || $a <=> $b; } 0..$#OIDs ];
812 }
813
814 foreach my $OID (@OIDs) {
815 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID });
816 }
817}
818
819
820sub sort_metadata_values_array
821{
822 my $self = shift(@_);
823 my @metadata_values = @_;
824
825 if ($self->{'unicode_collator'}) {
826 return $self->{'unicode_collator'}->sort(@metadata_values);
827 }
828 else {
829 return sort(@metadata_values);
830 }
831}
832
833
8341;
Note: See TracBrowser for help on using the repository browser.