source: gsdl/trunk/perllib/classify/List.pm@ 20008

Last change on this file since 20008 was 20008, checked in by ak19, 15 years ago

Adjusted the code to deal with 1. semicolons and commas separating metadata fieldnames, 2. Greenstone extracted metadata fieldnames being referred to with the ex. prefix, 3. List's partition_type_within_level option is to be an enumerated type (dropdown list).

  • Property svn:keywords set to Author Date Id Revision
File size: 32.6 KB
Line 
1###########################################################################
2#
3# List.pm -- A general and flexible list classifier with most of
4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
29# TO DO: - Remove punctuation from metadata values before sorting.
30# - Add an AZCompactList-style hlist option?
31#
32###########################################################################
33
34package List;
35
36
37use BaseClassifier;
38
39use strict;
40
41
42sub BEGIN {
43 @List::ISA = ('BaseClassifier');
44}
45
46my $partition_type_list =
47 [ { 'name' => "per_letter",
48 'desc' => "{List.level_partition.per_letter}" },
49 { 'name' => "constant_size",
50 'desc' => "{List.level_partition.constant_size}" },
51 { 'name' => "per_letter_fixed_size",
52 'desc' => "{List.level_partition.per_letter_fixed_size}"},
53 { 'name' => "none",
54 'desc' => "{List.level_partition.none}" } ];
55
56my $bookshelf_type_list =
57 [ { 'name' => "always",
58 'desc' => "{List.bookshelf_type.always}" },
59 { 'name' => "duplicate_only",
60 'desc' => "{List.bookshelf_type.duplicate_only}" },
61 { 'name' => "never",
62 'desc' => "{List.bookshelf_type.never}" } ];
63
64my $arguments =
65 [ { 'name' => "metadata",
66 'desc' => "{List.metadata}",
67 'type' => "metadata",
68 'reqd' => "yes" },
69
70 # The interesting options
71 { 'name' => "bookshelf_type",
72 'desc' => "{List.bookshelf_type}",
73 'type' => "enum",
74 'list' => $bookshelf_type_list,
75 'deft' => "never" },
76 { 'name' => "classify_sections",
77 'desc' => "{List.classify_sections}",
78 'type' => "flag" },
79 { 'name' => "partition_type_within_level",
80 'desc' => "{List.partition_type_within_level}",
81 'type' => "enum",
82 'list' => $partition_type_list,
83 'deft' => "per_letter" },
84 { 'name' => "partition_size_within_level",
85 'desc' => "{List.partition_size_within_level}",
86 'type' => "int",
87 'range' => "1,",
88 'deft' => "39" },
89 { 'name' => "partition_name_length",
90 'desc' => "{List.partition_name_length}",
91 'type' => "string" },
92 { 'name' => "sort_leaf_nodes_using",
93 'desc' => "{List.sort_leaf_nodes_using}",
94 'type' => "metadata",
95 'deft' => "Title" },
96 { 'name' => "sort_using_unicode_collation",
97 'desc' => "{List.sort_using_unicode_collation}",
98 'type' => "flag" },
99 { 'name' => "use_hlist_for",
100 'desc' => "{List.use_hlist_for}",
101 'type' => "string" },
102 { 'name' => "removeprefix",
103 'desc' => "{BasClas.removeprefix}",
104 'type' => "regexp" },
105 { 'name' => "removesuffix",
106 'desc' => "{BasClas.removesuffix}",
107 'type' => "regexp" } ];
108
109my $options = { 'name' => "List",
110 'desc' => "{List.desc}",
111 'abstract' => "no",
112 'inherits' => "yes",
113 'args' => $arguments };
114
115
116sub new
117{
118 my ($class) = shift(@_);
119 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
120 push(@$classifierslist, $class);
121
122 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
123 push(@{$hashArgOptLists->{"OptList"}}, $options);
124
125 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
126
127 if ($self->{'info_only'}) {
128 # don't worry about any options etc
129 return bless $self, $class;
130 }
131
132 # The metadata elements to use (required)
133 if (!$self->{'metadata'}) {
134 die "Error: No metadata fields specified for List.\n";
135 }
136 my @metadata_groups = split(/\//, $self->{'metadata'});
137 $self->{'metadata_groups'} = \@metadata_groups;
138
139 # The classifier button name (default: the first metadata element specified)
140 if (!$self->{'buttonname'}) {
141 my $first_metadata_group = $metadata_groups[0];
142 my $first_metadata_element = (split(/\;|,/, $first_metadata_group))[0];
143 $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
144 }
145
146 # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
147 foreach my $metadata_group (@metadata_groups) {
148 $self->{$metadata_group . ".bookshelf_type"} = "always";
149 }
150 my $last_metadata_group = $metadata_groups[$#metadata_groups];
151 # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
152 $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
153
154 # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
155 foreach my $metadata_group (@metadata_groups) {
156 $self->{$metadata_group . ".list_type"} = "VList";
157 }
158 foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
159 $self->{$metadata_group . ".list_type"} = "HList";
160 }
161
162 # How the items are grouped into partitions (default: no partition)
163 # for each level (metadata group), separated by '/'
164 if (!$self->{"partition_type_within_level"}) {
165 foreach my $metadata_group (@metadata_groups) {
166 $self->{$metadata_group . ".partition_type_within_level"} = "none";
167 }
168 } else {
169 my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'});
170 foreach my $metadata_group (@metadata_groups) {
171 my $partition_type_within_levelelem = shift(@partition_type_within_levellist);
172 if (defined($partition_type_within_levelelem)) {
173 $self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem;
174 }
175 else {
176 $self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"};
177 }
178 }
179 }
180
181 # The number of items in each partition
182 if (!$self->{'partition_size_within_level'}) {
183 # Default: 20
184 foreach my $metadata_group (@metadata_groups) {
185 $self->{$metadata_group . ".partition_size_within_level"} = 20;
186 }
187 }
188 else {
189 my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
190
191 # Assign values based on the partition_size_within_level parameter
192 foreach my $metadata_group (@metadata_groups) {
193 my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
194 if (defined($partition_size_within_levelelem)) {
195 $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
196 }
197 else {
198 $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
199 }
200 }
201 }
202
203 # The removeprefix and removesuffix expressions
204 if ($self->{'removeprefix'}) {
205 # If there are more than one expressions, use '' to quote each experession and '/' to separate
206 my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
207
208 foreach my $metadata_group (@metadata_groups) {
209 my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
210 if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
211 # Remove the other ' at the beginning and the end if there is any
212 $removeprefix_expr_within_levelelem =~ s/^'//;
213 $removeprefix_expr_within_levelelem =~ s/'$//;
214 # Remove the extra ^ at the beginning
215 $removeprefix_expr_within_levelelem =~ s/^\^//;
216 $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
217 } else {
218 $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
219 }
220 }
221 }
222 if ($self->{'removesuffix'}) {
223 my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
224
225 foreach my $metadata_group (@metadata_groups) {
226 my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
227 if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
228 $removesuffix_expr_within_levelelem =~ s/^'//;
229 $removesuffix_expr_within_levelelem =~ s/'$//;
230 # Remove the extra $ at the end
231 $removesuffix_expr_within_levelelem =~ s/\$$//;
232 $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
233 } else {
234 $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
235 }
236 }
237 }
238
239 # The metadata elements to use to sort the leaf nodes (default: Title)
240 my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
241 if ($self->{'sort_leaf_nodes_using'}) {
242 @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
243 }
244 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
245
246 # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
247 if ($self->{'sort_using_unicode_collation'}) {
248 # To use this you first need to download the allkeys.txt file from
249 # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
250 # Unicode/Collate directory.
251 require Unicode::Collate;
252 $self->{'unicode_collator'} = Unicode::Collate->new();
253 }
254
255 # An empty array for the document OIDs
256 $self->{'OIDs'} = [];
257
258 return bless $self, $class;
259}
260
261
262sub init
263{
264 # Nothing to do...
265}
266
267
268# Called for each document in the collection
269sub classify
270{
271 my $self = shift(@_);
272 my ($doc_obj,$edit_mode) = @_;
273
274 # If "-classify_sections" is set, classify every section of the document
275 if ($self->{'classify_sections'}) {
276 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
277 while (defined $section) {
278 $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section, $edit_mode);
279 $section = $doc_obj->get_next_section($section);
280 }
281 }
282 # Otherwise just classify the top document section
283 else {
284 $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section(), $edit_mode);
285 }
286}
287
288
289sub classify_section
290{
291 my $self = shift(@_);
292 my ($doc_obj,$section_OID,$section,$edit_mode) = @_;
293
294 my @metadata_groups = @{$self->{'metadata_groups'}};
295
296 # Only classify the section if it has a value for one of the metadata elements in the first group
297 my $classify_section = 0;
298 my $first_metadata_group = $metadata_groups[0];
299 foreach my $first_metadata_group_element (split(/\;|,/, $first_metadata_group)) {
300 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $first_metadata_group_element);
301
302 # Remove prefix/suffix if requested
303 my $remove_prefix_expr = $self->{$first_metadata_group_element . ".remove_prefix_expr"};
304 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
305 $first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
306 }
307 my $remove_suffix_expr = $self->{$first_metadata_group_element . ".remove_suffix_expr"};
308 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
309 $first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
310 }
311
312 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
313 # This section must be included in the classifier
314 $classify_section = 1;
315 last;
316 }
317 }
318
319 # We're not classifying this section because it doesn't have the required metadata
320 return if (!$classify_section);
321
322 if (($edit_mode eq "delete") || ($edit_mode eq "update")) {
323 $self->oid_array_delete($section_OID,'OIDs');
324 if ($edit_mode eq "delete") {
325 return;
326 }
327 }
328
329 # Otherwise, include this section in the classifier
330 push(@{$self->{'OIDs'}}, $section_OID);
331
332 # Create a hash for the metadata values of each metadata element we're interested in
333 my %metadata_groups_done = ();
334 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
335 # Take care not to do a metadata group more than once
336 unless ($metadata_groups_done{$metadata_group}) {
337 foreach my $metadata_element (split(/\;|,/, $metadata_group)) {
338 my $remove_prefix_expr = $self->{$metadata_element . ".remove_prefix_expr"};
339 my $remove_suffix_expr = $self->{$metadata_element . ".remove_suffix_expr"};
340 my @metadata_values = @{$doc_obj->get_metadata($section, $metadata_element)};
341 foreach my $metadata_value (@metadata_values) {
342 # Strip leading and trailing whitespace
343 $metadata_value =~ s/^\s*//;
344 $metadata_value =~ s/\s*$//;
345
346 # Remove prefix/suffix if requested
347 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
348 $metadata_value =~ s/^$remove_prefix_expr//;
349 }
350 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
351 $metadata_value =~ s/$remove_suffix_expr$//;
352 }
353
354 # Convert the metadata value from a UTF-8 string to a Unicode string
355 # This means that length() and substr() work properly
356 # We need to be careful to convert classifier node title values back to UTF-8, however
357 my $metadata_value_unicode_string = $self->convert_utf8_string_to_unicode_string($metadata_value);
358
359 # Add the metadata value into the list for this combination of metadata group and section
360 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $metadata_value_unicode_string);
361 }
362 last if (@metadata_values > 0);
363 }
364
365 $metadata_groups_done{$metadata_group} = 1;
366 }
367 }
368}
369
370
371sub get_classify_info
372{
373 my $self = shift(@_);
374
375 # The metadata groups to classify by
376 my @metadata_groups = @{$self->{'metadata_groups'}};
377 my $first_metadata_group = $metadata_groups[0];
378
379 # The OID values of the documents to include in the classifier
380 my @OIDs = @{$self->{'OIDs'}};
381
382 # Create the root node of the classification hierarchy
383 my %classifier_node = ( 'thistype' => "Invisible",
384 'childtype' => $self->{$first_metadata_group . ".list_type"},
385 'Title' => $self->{'buttonname'},
386 'contains' => [],
387 'mdtype' => $first_metadata_group );
388
389 # Recursively create the classification hierarchy, one level for each metadata group
390 $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
391 return \%classifier_node;
392}
393
394
395sub add_level
396{
397 my $self = shift(@_);
398 my @metadata_groups = @{shift(@_)};
399 my @OIDs = @{shift(@_)};
400 my $classifier_node = shift(@_);
401 # print STDERR "\nAdding AZ list for " . $classifier_node->{'Title'} . "\n";
402
403 my $metadata_group = $metadata_groups[0];
404 # print STDERR "Processing metadata group: " . $metadata_group . "\n";
405 # print STDERR "Number of OID values: " . @OIDs . "\n";
406
407 if (!defined($self->{$metadata_group . ".list"})) {
408 print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
409 return;
410 }
411
412 # Create a mapping from metadata value to OID
413 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
414 my %metadata_value_to_OIDs_hash = ();
415 foreach my $OID (@OIDs)
416 {
417 if ($OID_to_metadata_values_hash_ref->{$OID})
418 {
419 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
420 foreach my $metadata_value (@metadata_values)
421 {
422 push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
423 }
424 }
425 }
426 # print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
427
428 # Partition the values (if necessary)
429 my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
430 if ($partition_type_within_level =~ /^per_letter$/i) {
431 # Generate one hlist for each letter
432 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
433 my %metadata_value_to_OIDs_subhash = ();
434
435 my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
436 foreach my $metadata_value (@sortedmetadata_values) {
437 my $metadata_valuepartition = substr($metadata_value, 0, 1);
438
439 # Is this the start of a new partition?
440 if ($metadata_valuepartition ne $lastpartition) {
441 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
442 %metadata_value_to_OIDs_subhash = ();
443 $lastpartition = $metadata_valuepartition;
444 }
445
446 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
447 }
448
449 # Don't forget to add the last partition
450 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
451
452 # The partitions are stored in an HList
453 $classifier_node->{'childtype'} = "HList";
454 }
455 elsif ($partition_type_within_level =~ /^per_letter_fixed_size$/i) {
456 # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size
457 # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible
458 # If a letter has more items than specified, split into several hlists.
459 # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
460 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
461 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
462 my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};
463
464 # Separate values by their first letter, each form a bucket, like the per_letter partition type
465 my $last_partition = substr($sortedmetadata_values[0], 0, 1);
466 my %partition_buckets = ();
467 my @metadata_values_in_bucket = ();
468 my $num_items_in_bucket = 0;
469 foreach my $metadata_value (@sortedmetadata_values) {
470 my $metadata_valuepartition = substr($metadata_value, 0, 1);
471 if ($metadata_valuepartition ne $last_partition) {
472 my @temp_array = @metadata_values_in_bucket;
473 # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
474 my %partition_info = ();
475 $partition_info{'metadata_values'} = \@temp_array;
476 $partition_info{'size'} = $num_items_in_bucket;
477 $partition_buckets{$last_partition} = \%partition_info;
478
479 @metadata_values_in_bucket = ($metadata_value);
480 $num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
481 $last_partition = $metadata_valuepartition;
482 } else {
483 $num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
484 push (@metadata_values_in_bucket, $metadata_value);
485 }
486 }
487 # Last one
488 my %partition_info = ();
489 $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
490 $partition_info{'size'} = $num_items_in_bucket;
491 $partition_buckets{$last_partition} = \%partition_info;
492
493 my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
494 for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) {
495 my $partition = $partition_keys[$i];
496 my $items_in_partition = $partition_buckets{$partition}->{'size'};
497 # Merge small buckets together, but keep the numeric bucket apart
498 if ($items_in_partition < $partition_size_within_level) {
499 my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'};
500 if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level
501 && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)
502 || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) {
503 foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) {
504 push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge);
505 }
506 $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition;
507 delete $partition_buckets{$partition};
508 }
509 }
510 }
511 @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
512
513 # Add partitions, and divide big bucket into several
514 my $last_partition_end = "";
515 my $partition_start = "";
516 foreach my $partition (@partition_keys) {
517 my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}});
518 my $items_in_partition = $partition_buckets{$partition}->{'size'};
519 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"});
520
521 if ($items_in_partition > $partition_size_within_level) {
522 my $items_done = 0;
523 my %metadata_values_to_OIDs_subhashes = ();
524 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
525 my $metadata_value = $metadata_values[$i];
526 # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
527 my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;
528
529 my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"});
530 my $partitionname = $partition_start;
531 if ($partitionend ne $partition_start) {
532 $partitionname = $partitionname . "-" . $partitionend;
533 }
534
535 # Start a new partition
536 if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) {
537 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
538 $last_partition_end = $partitionend;
539 $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
540 $items_done = 0;
541 %metadata_values_to_OIDs_subhashes = ();
542 }
543
544 # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
545 if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) {
546 my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
547 # Get the number of partitions needed for this value
548 my $num_splits = int($items_for_this_md_value / $partition_size_within_level);
549 $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits);
550
551 my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}};
552 for (my $i = 0; $i < $num_splits; $i++) {
553 my %OIDs_subhashes_for_this_value = ();
554 my @OIDs_for_this_partition = ();
555 for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) {
556 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
557 }
558
559 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
560 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) {
561 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
562 $items_done += scalar(@OIDs_for_this_partition);
563 next;
564 }
565
566 # Add an HList for this bucket
567 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
568 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
569 $last_partition_end = $partitionname_for_this_value;
570 }
571 next;
572 }
573
574 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
575 $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;
576
577 # The last partition
578 if($i == scalar(@metadata_values) - 1) {
579 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
580 }
581 }
582 }
583 else {
584 # The easier case, just add a partition
585 my %metadata_values_to_OIDs_subhashes = ();
586 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
587 my $metadata_value = $metadata_values[$i];
588 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
589 }
590 my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
591 my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"});
592 my $partitionname = $partition_start;
593 if ($partitionend ne $partition_start) {
594 $partitionname = $partitionname . "-" . $partitionend;
595 }
596 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
597 $last_partition_end = $partitionend;
598 }
599 }
600 }
601 else {
602 # Generate hlists of a certain size
603 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
604 if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
605 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
606 my $itemsdone = 0;
607 my %metadata_value_to_OIDs_subhash = ();
608 my $lastpartitionend = "";
609 my $partitionstart;
610 foreach my $metadata_value (@sortedmetadata_values) {
611 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
612 $itemsdone++;
613 my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
614
615 # Is this the start of a new partition?
616 if ($itemsinpartition == 1) {
617 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
618 }
619
620 # Is this the end of the partition?
621 if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) {
622 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
623 my $partitionname = $partitionstart;
624 if ($partitionend ne $partitionstart) {
625 $partitionname = $partitionname . "-" . $partitionend;
626 }
627
628 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
629 %metadata_value_to_OIDs_subhash = ();
630 $lastpartitionend = $partitionend;
631 }
632 }
633
634 # The partitions are stored in an HList
635 $classifier_node->{'childtype'} = "HList";
636 }
637
638 # Otherwise just add all the values to a VList
639 else {
640 $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
641 }
642 }
643}
644
645
646sub convert_utf8_string_to_unicode_string
647{
648 my $self = shift(@_);
649 my $utf8_string = shift(@_);
650
651 my $unicode_string = "";
652 foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
653 $unicode_string .= chr($unicode_value);
654 }
655 return $unicode_string;
656}
657
658
659sub convert_unicode_string_to_utf8_string
660{
661 my $self = shift(@_);
662 my $unicode_string = shift(@_);
663
664 my @unicode_array;
665 for (my $i = 0; $i < length($unicode_string); $i++) {
666 push(@unicode_array, ord(substr($unicode_string, $i, 1)));
667 }
668 return &unicode::unicode2utf8(\@unicode_array);
669}
670
671
672sub generate_partition_start
673{
674 my $self = shift(@_);
675 my $metadata_value = shift(@_);
676 my $lastpartitionend = shift(@_);
677 my $partition_name_length = shift(@_);
678
679 if ($partition_name_length) {
680 return substr($metadata_value, 0, $partition_name_length);
681 }
682
683 my $partitionstart = substr($metadata_value, 0, 1);
684 if ($partitionstart le $lastpartitionend) {
685 $partitionstart = substr($metadata_value, 0, 2);
686 # Give up after three characters
687 if ($partitionstart le $lastpartitionend) {
688 $partitionstart = substr($metadata_value, 0, 3);
689 }
690 }
691
692 return $partitionstart;
693}
694
695
696sub generate_partition_end
697{
698 my $self = shift(@_);
699 my $metadata_value = shift(@_);
700 my $partitionstart = shift(@_);
701 my $partition_name_length = shift(@_);
702
703 if ($partition_name_length) {
704 return substr($metadata_value, 0, $partition_name_length);
705 }
706
707 my $partitionend = substr($metadata_value, 0, length($partitionstart));
708 if ($partitionend gt $partitionstart) {
709 $partitionend = substr($metadata_value, 0, 1);
710 if ($partitionend le $partitionstart) {
711 $partitionend = substr($metadata_value, 0, 2);
712 # Give up after three characters
713 if ($partitionend le $partitionstart) {
714 $partitionend = substr($metadata_value, 0, 3);
715 }
716 }
717 }
718
719 return $partitionend;
720}
721
722
723sub add_hlist_partition
724{
725 my $self = shift(@_);
726 my @metadata_groups = @{shift(@_)};
727 my $classifier_node = shift(@_);
728 my $partitionname = shift(@_);
729 my $metadata_value_to_OIDs_hash_ref = shift(@_);
730
731 # Create an hlist partition
732 my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
733 'childtype' => "VList",
734 'contains' => [] );
735
736 # Add the children to the hlist partition
737 $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
738 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
739}
740
741
742sub add_vlist
743{
744 my $self = shift(@_);
745 my @metadata_groups = @{shift(@_)};
746 my $classifier_node = shift(@_);
747 my $metadata_value_to_OIDs_hash_ref = shift(@_);
748
749 my $metadata_group = shift(@metadata_groups);
750 $classifier_node->{'mdtype'} = $metadata_group;
751
752 # Create an entry in the vlist for each value
753 foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
754 {
755 my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
756
757 # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
758 if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
759 my $OID = $OIDs[0];
760
761 # Find the offset of this metadata value
762 my $offset = 0;
763 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
764 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
765 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
766 if ($metadata_value eq $metadata_values[$i]) {
767 $offset = $i;
768 last;
769 }
770 }
771 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
772 }
773 # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
774 elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
775 $self->add_sorted_leaf_items(\@OIDs, $classifier_node);
776 }
777 # Otherwise create a sublist (bookshelf) for the metadata value
778 else {
779 my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
780 'childtype' => "VList",
781 'contains' => [] );
782
783 # If there are metadata elements remaining, recursively apply the process
784 if (@metadata_groups > 0) {
785 my $next_metadata_group = $metadata_groups[0];
786 $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
787 $self->add_level(\@metadata_groups, \@OIDs, \%child_classifier_node);
788 }
789 # Otherwise just add the documents as children of this list
790 else {
791 $self->add_sorted_leaf_items(\@OIDs, \%child_classifier_node);
792 }
793
794 # Add the sublist to the list
795 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
796 }
797 }
798}
799
800sub add_sorted_leaf_items
801{
802 my $self = shift(@_);
803 my @OIDs = @{shift(@_)};
804 my $classifier_node = shift(@_);
805
806 # Sort leaf nodes and add to list
807 my @sort_leaf_nodes_usingmetadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
808 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_usingmetadata_groups) {
809 my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
810 # Force a stable sort (Perl 5.6's sort isn't stable)
811 # !! The [0] bits aren't ideal (multiple metadata values) !!
812 @OIDs = @OIDs[ sort { $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0] || $a <=> $b; } 0..$#OIDs ];
813 }
814
815 foreach my $OID (@OIDs) {
816 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID });
817 }
818}
819
820
821sub sort_metadata_values_array
822{
823 my $self = shift(@_);
824 my @metadata_values = @_;
825
826 if ($self->{'unicode_collator'}) {
827 return $self->{'unicode_collator'}->sort(@metadata_values);
828 }
829 else {
830 return sort(@metadata_values);
831 }
832}
833
834
8351;
Note: See TracBrowser for help on using the repository browser.