source: gsdl/trunk/perllib/classify/List.pm@ 20904

Last change on this file since 20904 was 20904, checked in by kjdon, 14 years ago

approximate size partition, don't do it if too few items

  • Property svn:keywords set to Author Date Id Revision
File size: 34.3 KB
RevLine 
[10398]1###########################################################################
2#
[18568]3# List.pm -- A general and flexible list classifier with most of
[10398]4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
[13741]29# TO DO: - Remove punctuation from metadata values before sorting.
30# - Add an AZCompactList-style hlist option?
[10398]31#
32###########################################################################
33
[18568]34package List;
[10398]35
36
[17209]37use BaseClassifier;
[10398]38
39use strict;
40
41
42sub BEGIN {
[18568]43 @List::ISA = ('BaseClassifier');
[10398]44}
45
[18572]46my $partition_type_list =
47 [ { 'name' => "per_letter",
48 'desc' => "{List.level_partition.per_letter}" },
[20865]49 { 'name' => "approximate_size",
50 'desc' => "{List.level_partition.approximate_size}"},
[18572]51 { 'name' => "constant_size",
[18619]52 'desc' => "{List.level_partition.constant_size}" },
[18572]53 { 'name' => "none",
54 'desc' => "{List.level_partition.none}" } ];
[10398]55
[20825]56# following used to check types later on
57my $valid_partition_types = { 'per_letter' => 1,
58 'constant_size' => 1,
59 'per_letter_fixed_size' => 1,
[20865]60 'approximate_size' => 1,
[20825]61 'none' => 1};
62
[18619]63my $bookshelf_type_list =
64 [ { 'name' => "always",
65 'desc' => "{List.bookshelf_type.always}" },
66 { 'name' => "duplicate_only",
67 'desc' => "{List.bookshelf_type.duplicate_only}" },
68 { 'name' => "never",
[20008]69 'desc' => "{List.bookshelf_type.never}" } ];
[18619]70
[10398]71my $arguments =
72 [ { 'name' => "metadata",
[19234]73 'desc' => "{List.metadata}",
[10398]74 'type' => "metadata",
75 'reqd' => "yes" },
[12889]76
[10398]77 # The interesting options
[18619]78 { 'name' => "bookshelf_type",
[19234]79 'desc' => "{List.bookshelf_type}",
[18619]80 'type' => "enum",
81 'list' => $bookshelf_type_list,
[19645]82 'deft' => "never" },
[10498]83 { 'name' => "classify_sections",
[19234]84 'desc' => "{List.classify_sections}",
[10498]85 'type' => "flag" },
86 { 'name' => "partition_type_within_level",
[19234]87 'desc' => "{List.partition_type_within_level}",
[20679]88 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
[20008]89 'list' => $partition_type_list,
90 'deft' => "per_letter" },
[10498]91 { 'name' => "partition_size_within_level",
[19234]92 'desc' => "{List.partition_size_within_level}",
[20679]93 'type' => "string" }, # Must be string because multiple values can be specified (separated by '/')
[14084]94 { 'name' => "partition_name_length",
[19234]95 'desc' => "{List.partition_name_length}",
[14084]96 'type' => "string" },
[10498]97 { 'name' => "sort_leaf_nodes_using",
[19234]98 'desc' => "{List.sort_leaf_nodes_using}",
[10398]99 'type' => "metadata",
[10499]100 'deft' => "Title" },
[13551]101 { 'name' => "sort_using_unicode_collation",
[19234]102 'desc' => "{List.sort_using_unicode_collation}",
[13551]103 'type' => "flag" },
[10499]104 { 'name' => "use_hlist_for",
[19234]105 'desc' => "{List.use_hlist_for}",
[18619]106 'type' => "string" },
107 { 'name' => "removeprefix",
108 'desc' => "{BasClas.removeprefix}",
109 'type' => "regexp" },
110 { 'name' => "removesuffix",
111 'desc' => "{BasClas.removesuffix}",
112 'type' => "regexp" } ];
[10398]113
[18568]114my $options = { 'name' => "List",
[19234]115 'desc' => "{List.desc}",
[10502]116 'abstract' => "no",
[18572]117 'inherits' => "yes",
[10398]118 'args' => $arguments };
119
120
121sub new
122{
123 my ($class) = shift(@_);
124 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
125 push(@$classifierslist, $class);
126
[17209]127 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
128 push(@{$hashArgOptLists->{"OptList"}}, $options);
[10398]129
[17209]130 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
[10398]131
132 if ($self->{'info_only'}) {
133 # don't worry about any options etc
134 return bless $self, $class;
135 }
136
137 # The metadata elements to use (required)
[12894]138 if (!$self->{'metadata'}) {
[18568]139 die "Error: No metadata fields specified for List.\n";
[10398]140 }
[12894]141 my @metadata_groups = split(/\//, $self->{'metadata'});
[12889]142 $self->{'metadata_groups'} = \@metadata_groups;
[10398]143
[12894]144 # The classifier button name (default: the first metadata element specified)
145 if (!$self->{'buttonname'}) {
146 my $first_metadata_group = $metadata_groups[0];
[20008]147 my $first_metadata_element = (split(/\;|,/, $first_metadata_group))[0];
[12894]148 $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
[10398]149 }
150
[18619]151 # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
[12889]152 foreach my $metadata_group (@metadata_groups) {
[18619]153 $self->{$metadata_group . ".bookshelf_type"} = "always";
154 }
155 my $last_metadata_group = $metadata_groups[$#metadata_groups];
156 # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
157 $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
158
[12892]159 # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
[12889]160 foreach my $metadata_group (@metadata_groups) {
[12892]161 $self->{$metadata_group . ".list_type"} = "VList";
[10499]162 }
[12892]163 foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
164 $self->{$metadata_group . ".list_type"} = "HList";
[10499]165 }
166
[18619]167 # How the items are grouped into partitions (default: no partition)
168 # for each level (metadata group), separated by '/'
[12894]169 if (!$self->{"partition_type_within_level"}) {
[18619]170 foreach my $metadata_group (@metadata_groups) {
171 $self->{$metadata_group . ".partition_type_within_level"} = "none";
172 }
173 } else {
174 my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'});
[20825]175
176 my $first = 1;
[18619]177 foreach my $metadata_group (@metadata_groups) {
178 my $partition_type_within_levelelem = shift(@partition_type_within_levellist);
[20865]179 if (defined($partition_type_within_levelelem) && $partition_type_within_levelelem eq "per_letter_fixed_size") {
180 print STDERR "per letter fixed size, changing to approximate size\n";
181 $partition_type_within_levelelem = "approximate_size";
182 }
[20825]183 if (defined($partition_type_within_levelelem) && defined $valid_partition_types->{$partition_type_within_levelelem}) {
[18619]184 $self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem;
185 }
186 else {
[20825]187 if ($first) {
188 $self->{$metadata_group . ".partition_type_within_level"} = "none";
189 $first = 0;
190 } else {
191 $self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"};
192 }
193 if (defined($partition_type_within_levelelem)) {
194 # ie invalid entry
195 print STDERR "invalid partition type for level $metadata_group: $partition_type_within_levelelem, defaulting to ". $self->{$metadata_group . ".partition_type_within_level"} ."\n";
196 }
[18619]197 }
198 }
[10398]199 }
[20825]200
[10499]201 # The number of items in each partition
[12894]202 if (!$self->{'partition_size_within_level'}) {
[10398]203 # Default: 20
[12889]204 foreach my $metadata_group (@metadata_groups) {
205 $self->{$metadata_group . ".partition_size_within_level"} = 20;
[10398]206 }
207 }
208 else {
[12894]209 my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
[10398]210
[10498]211 # Assign values based on the partition_size_within_level parameter
[12889]212 foreach my $metadata_group (@metadata_groups) {
[10498]213 my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
214 if (defined($partition_size_within_levelelem)) {
[12889]215 $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
[10398]216 }
217 else {
[12889]218 $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
[10398]219 }
220 }
221 }
222
[18619]223 # The removeprefix and removesuffix expressions
224 if ($self->{'removeprefix'}) {
225 # If there are more than one expressions, use '' to quote each experession and '/' to separate
226 my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
227
228 foreach my $metadata_group (@metadata_groups) {
229 my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
230 if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
231 # Remove the other ' at the beginning and the end if there is any
232 $removeprefix_expr_within_levelelem =~ s/^'//;
233 $removeprefix_expr_within_levelelem =~ s/'$//;
234 # Remove the extra ^ at the beginning
235 $removeprefix_expr_within_levelelem =~ s/^\^//;
236 $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
237 } else {
238 $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
239 }
240 }
241 }
242 if ($self->{'removesuffix'}) {
243 my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
244
245 foreach my $metadata_group (@metadata_groups) {
246 my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
247 if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
248 $removesuffix_expr_within_levelelem =~ s/^'//;
249 $removesuffix_expr_within_levelelem =~ s/'$//;
250 # Remove the extra $ at the end
251 $removesuffix_expr_within_levelelem =~ s/\$$//;
252 $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
253 } else {
254 $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
255 }
256 }
257 }
258
[12894]259 # The metadata elements to use to sort the leaf nodes (default: Title)
260 my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
261 if ($self->{'sort_leaf_nodes_using'}) {
262 @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
[10398]263 }
[12894]264 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
[10398]265
[13551]266 # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
267 if ($self->{'sort_using_unicode_collation'}) {
[13791]268 # To use this you first need to download the allkeys.txt file from
269 # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
270 # Unicode/Collate directory.
[13551]271 require Unicode::Collate;
272 $self->{'unicode_collator'} = Unicode::Collate->new();
273 }
274
[12894]275 # An empty array for the document OIDs
276 $self->{'OIDs'} = [];
277
[10398]278 return bless $self, $class;
279}
280
281
282sub init
283{
284 # Nothing to do...
285}
286
287
[12896]288# Called for each document in the collection
[10398]289sub classify
290{
291 my $self = shift(@_);
[18455]292 my ($doc_obj,$edit_mode) = @_;
[10398]293
[12896]294 # If "-classify_sections" is set, classify every section of the document
[10398]295 if ($self->{'classify_sections'}) {
296 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
297 while (defined $section) {
[18455]298 $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section, $edit_mode);
[10398]299 $section = $doc_obj->get_next_section($section);
300 }
301 }
[12896]302 # Otherwise just classify the top document section
[10398]303 else {
[18455]304 $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section(), $edit_mode);
[10398]305 }
306}
307
308
309sub classify_section
310{
311 my $self = shift(@_);
[18455]312 my ($doc_obj,$section_OID,$section,$edit_mode) = @_;
[10398]313
[12889]314 my @metadata_groups = @{$self->{'metadata_groups'}};
[10398]315
[12896]316 # Only classify the section if it has a value for one of the metadata elements in the first group
317 my $classify_section = 0;
318 my $first_metadata_group = $metadata_groups[0];
[20008]319 foreach my $first_metadata_group_element (split(/\;|,/, $first_metadata_group)) {
[20424]320 my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
[20421]321 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element);
[18619]322
323 # Remove prefix/suffix if requested
324 my $remove_prefix_expr = $self->{$first_metadata_group_element . ".remove_prefix_expr"};
325 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
326 $first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
327 }
328 my $remove_suffix_expr = $self->{$first_metadata_group_element . ".remove_suffix_expr"};
329 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
330 $first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
331 }
332
[12896]333 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
334 # This section must be included in the classifier
335 $classify_section = 1;
336 last;
[18619]337 }
[12896]338 }
[10398]339
[12896]340 # We're not classifying this section because it doesn't have the required metadata
341 return if (!$classify_section);
[10398]342
[18508]343 if (($edit_mode eq "delete") || ($edit_mode eq "update")) {
[18455]344 $self->oid_array_delete($section_OID,'OIDs');
[18508]345 if ($edit_mode eq "delete") {
346 return;
347 }
[18455]348 }
349
[12896]350 # Otherwise, include this section in the classifier
351 push(@{$self->{'OIDs'}}, $section_OID);
352
353 # Create a hash for the metadata values of each metadata element we're interested in
354 my %metadata_groups_done = ();
355 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
356 # Take care not to do a metadata group more than once
357 unless ($metadata_groups_done{$metadata_group}) {
[20653]358 if ($edit_mode eq "update") {
359 # if we are updating, we delete all the old values before
360 # adding the new ones, otherwise, the section will end up in
361 # the classifier twice.
362 delete $self->{$metadata_group . ".list"}->{$section_OID};
363 }
[20008]364 foreach my $metadata_element (split(/\;|,/, $metadata_group)) {
[20424]365 my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);
366
[18619]367 my $remove_prefix_expr = $self->{$metadata_element . ".remove_prefix_expr"};
368 my $remove_suffix_expr = $self->{$metadata_element . ".remove_suffix_expr"};
[20421]369 my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
[12896]370 foreach my $metadata_value (@metadata_values) {
371 # Strip leading and trailing whitespace
372 $metadata_value =~ s/^\s*//;
373 $metadata_value =~ s/\s*$//;
[13550]374
[18619]375 # Remove prefix/suffix if requested
376 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
377 $metadata_value =~ s/^$remove_prefix_expr//;
378 }
379 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
380 $metadata_value =~ s/$remove_suffix_expr$//;
381 }
382
[20865]383 # uppercase the metadata - makes the AZList nicer
384 $metadata_value = uc($metadata_value);
[13550]385 # Convert the metadata value from a UTF-8 string to a Unicode string
386 # This means that length() and substr() work properly
387 # We need to be careful to convert classifier node title values back to UTF-8, however
[14173]388 my $metadata_value_unicode_string = $self->convert_utf8_string_to_unicode_string($metadata_value);
[13550]389
390 # Add the metadata value into the list for this combination of metadata group and section
391 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $metadata_value_unicode_string);
[10398]392 }
[12896]393 last if (@metadata_values > 0);
[10398]394 }
395
[12896]396 $metadata_groups_done{$metadata_group} = 1;
[10398]397 }
398 }
399}
400
401
402sub get_classify_info
403{
404 my $self = shift(@_);
405
[12896]406 # The metadata groups to classify by
[12889]407 my @metadata_groups = @{$self->{'metadata_groups'}};
408 my $first_metadata_group = $metadata_groups[0];
[10398]409
[12896]410 # The OID values of the documents to include in the classifier
[12889]411 my @OIDs = @{$self->{'OIDs'}};
[10398]412
[12896]413 # Create the root node of the classification hierarchy
[12893]414 my %classifier_node = ( 'thistype' => "Invisible",
[12895]415 'childtype' => $self->{$first_metadata_group . ".list_type"},
[12894]416 'Title' => $self->{'buttonname'},
[13271]417 'contains' => [],
418 'mdtype' => $first_metadata_group );
[10398]419
[12895]420 # Recursively create the classification hierarchy, one level for each metadata group
[14173]421 $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
[12893]422 return \%classifier_node;
[10398]423}
424
425
[12895]426sub add_level
[10398]427{
428 my $self = shift(@_);
[12889]429 my @metadata_groups = @{shift(@_)};
430 my @OIDs = @{shift(@_)};
[12893]431 my $classifier_node = shift(@_);
432 # print STDERR "\nAdding AZ list for " . $classifier_node->{'Title'} . "\n";
[10398]433
[12889]434 my $metadata_group = $metadata_groups[0];
435 # print STDERR "Processing metadata group: " . $metadata_group . "\n";
436 # print STDERR "Number of OID values: " . @OIDs . "\n";
[10398]437
[13340]438 if (!defined($self->{$metadata_group . ".list"})) {
439 print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
440 return;
441 }
[10398]442
443 # Create a mapping from metadata value to OID
[14845]444 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
[18619]445 my %metadata_value_to_OIDs_hash = ();
[14845]446 foreach my $OID (@OIDs)
447 {
448 if ($OID_to_metadata_values_hash_ref->{$OID})
449 {
450 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
451 foreach my $metadata_value (@metadata_values)
452 {
453 push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
[10398]454 }
455 }
456 }
[14845]457 # print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
[10398]458
459 # Partition the values (if necessary)
[18619]460 my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
[20904]461 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
[10498]462 if ($partition_type_within_level =~ /^per_letter$/i) {
[10398]463 # Generate one hlist for each letter
[14845]464 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
465 my %metadata_value_to_OIDs_subhash = ();
[10398]466
[14845]467 my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
468 foreach my $metadata_value (@sortedmetadata_values) {
469 my $metadata_valuepartition = substr($metadata_value, 0, 1);
[10398]470
471 # Is this the start of a new partition?
[14845]472 if ($metadata_valuepartition ne $lastpartition) {
473 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
474 %metadata_value_to_OIDs_subhash = ();
475 $lastpartition = $metadata_valuepartition;
[10398]476 }
477
[14845]478 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
[10398]479 }
480
481 # Don't forget to add the last partition
[14845]482 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
[10398]483
484 # The partitions are stored in an HList
[12893]485 $classifier_node->{'childtype'} = "HList";
[10398]486 }
[20904]487 elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
[18619]488 # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size
489 # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible
490 # If a letter has more items than specified, split into several hlists.
491 # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
492 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
493 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
494 my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};
495
496 # Separate values by their first letter, each form a bucket, like the per_letter partition type
497 my $last_partition = substr($sortedmetadata_values[0], 0, 1);
498 my %partition_buckets = ();
499 my @metadata_values_in_bucket = ();
500 my $num_items_in_bucket = 0;
501 foreach my $metadata_value (@sortedmetadata_values) {
502 my $metadata_valuepartition = substr($metadata_value, 0, 1);
503 if ($metadata_valuepartition ne $last_partition) {
504 my @temp_array = @metadata_values_in_bucket;
505 # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
506 my %partition_info = ();
507 $partition_info{'metadata_values'} = \@temp_array;
508 $partition_info{'size'} = $num_items_in_bucket;
509 $partition_buckets{$last_partition} = \%partition_info;
510
511 @metadata_values_in_bucket = ($metadata_value);
512 $num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
513 $last_partition = $metadata_valuepartition;
514 } else {
515 $num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
516 push (@metadata_values_in_bucket, $metadata_value);
517 }
518 }
519 # Last one
520 my %partition_info = ();
521 $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
522 $partition_info{'size'} = $num_items_in_bucket;
523 $partition_buckets{$last_partition} = \%partition_info;
524
525 my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
526 for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) {
527 my $partition = $partition_keys[$i];
528 my $items_in_partition = $partition_buckets{$partition}->{'size'};
529 # Merge small buckets together, but keep the numeric bucket apart
530 if ($items_in_partition < $partition_size_within_level) {
531 my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'};
532 if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level
533 && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)
534 || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) {
535 foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) {
536 push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge);
537 }
538 $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition;
539 delete $partition_buckets{$partition};
540 }
541 }
542 }
543 @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
544
545 # Add partitions, and divide big bucket into several
546 my $last_partition_end = "";
547 my $partition_start = "";
548 foreach my $partition (@partition_keys) {
549 my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}});
550 my $items_in_partition = $partition_buckets{$partition}->{'size'};
551 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"});
552
553 if ($items_in_partition > $partition_size_within_level) {
554 my $items_done = 0;
555 my %metadata_values_to_OIDs_subhashes = ();
556 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
557 my $metadata_value = $metadata_values[$i];
558 # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
559 my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;
[10398]560
[18619]561 my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"});
562 my $partitionname = $partition_start;
563 if ($partitionend ne $partition_start) {
564 $partitionname = $partitionname . "-" . $partitionend;
565 }
566
567 # Start a new partition
568 if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) {
569 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
570 $last_partition_end = $partitionend;
571 $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
572 $items_done = 0;
573 %metadata_values_to_OIDs_subhashes = ();
574 }
575
576 # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
577 if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) {
578 my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
579 # Get the number of partitions needed for this value
580 my $num_splits = int($items_for_this_md_value / $partition_size_within_level);
581 $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits);
582
583 my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}};
584 for (my $i = 0; $i < $num_splits; $i++) {
585 my %OIDs_subhashes_for_this_value = ();
586 my @OIDs_for_this_partition = ();
587 for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) {
588 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
589 }
590
591 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
592 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) {
593 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
594 $items_done += scalar(@OIDs_for_this_partition);
595 next;
596 }
597
598 # Add an HList for this bucket
599 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
600 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
601 $last_partition_end = $partitionname_for_this_value;
602 }
603 next;
604 }
605
606 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
607 $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;
608
609 # The last partition
610 if($i == scalar(@metadata_values) - 1) {
611 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
612 }
613 }
614 }
615 else {
616 # The easier case, just add a partition
617 my %metadata_values_to_OIDs_subhashes = ();
618 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
619 my $metadata_value = $metadata_values[$i];
620 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
621 }
622 my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
623 my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"});
624 my $partitionname = $partition_start;
625 if ($partitionend ne $partition_start) {
626 $partitionname = $partitionname . "-" . $partitionend;
627 }
628 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
629 $last_partition_end = $partitionend;
630 }
631 }
[20865]632
633 # The partitions are stored in an HList
634 $classifier_node->{'childtype'} = "HList";
635
636 } # end approximate_size
[10398]637 else {
638 # Generate hlists of a certain size
[14845]639 if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
640 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
[10398]641 my $itemsdone = 0;
[14845]642 my %metadata_value_to_OIDs_subhash = ();
[10398]643 my $lastpartitionend = "";
644 my $partitionstart;
[14845]645 foreach my $metadata_value (@sortedmetadata_values) {
646 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
[10398]647 $itemsdone++;
[14845]648 my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
[10398]649
650 # Is this the start of a new partition?
651 if ($itemsinpartition == 1) {
[14845]652 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
[10398]653 }
654
655 # Is this the end of the partition?
[14845]656 if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) {
657 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
[10398]658 my $partitionname = $partitionstart;
659 if ($partitionend ne $partitionstart) {
660 $partitionname = $partitionname . "-" . $partitionend;
661 }
662
[14845]663 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
664 %metadata_value_to_OIDs_subhash = ();
[10398]665 $lastpartitionend = $partitionend;
666 }
667 }
668
669 # The partitions are stored in an HList
[12893]670 $classifier_node->{'childtype'} = "HList";
[10398]671 }
672
673 # Otherwise just add all the values to a VList
674 else {
[14845]675 $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
[10398]676 }
677 }
678}
679
680
[13550]681sub convert_utf8_string_to_unicode_string
[10398]682{
[14173]683 my $self = shift(@_);
[13550]684 my $utf8_string = shift(@_);
[10398]685
[13550]686 my $unicode_string = "";
687 foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
688 $unicode_string .= chr($unicode_value);
689 }
690 return $unicode_string;
[10398]691}
692
693
[13550]694sub convert_unicode_string_to_utf8_string
695{
[14173]696 my $self = shift(@_);
[13550]697 my $unicode_string = shift(@_);
698
699 my @unicode_array;
700 for (my $i = 0; $i < length($unicode_string); $i++) {
701 push(@unicode_array, ord(substr($unicode_string, $i, 1)));
702 }
703 return &unicode::unicode2utf8(\@unicode_array);
704}
705
706
[10398]707sub generate_partition_start
708{
[14173]709 my $self = shift(@_);
[14845]710 my $metadata_value = shift(@_);
[10398]711 my $lastpartitionend = shift(@_);
[14084]712 my $partition_name_length = shift(@_);
[10398]713
[14084]714 if ($partition_name_length) {
[14845]715 return substr($metadata_value, 0, $partition_name_length);
[14084]716 }
717
[14845]718 my $partitionstart = substr($metadata_value, 0, 1);
[10398]719 if ($partitionstart le $lastpartitionend) {
[14845]720 $partitionstart = substr($metadata_value, 0, 2);
[10398]721 # Give up after three characters
722 if ($partitionstart le $lastpartitionend) {
[14845]723 $partitionstart = substr($metadata_value, 0, 3);
[10398]724 }
725 }
726
727 return $partitionstart;
728}
729
730
731sub generate_partition_end
732{
[14173]733 my $self = shift(@_);
[14845]734 my $metadata_value = shift(@_);
[10398]735 my $partitionstart = shift(@_);
[14084]736 my $partition_name_length = shift(@_);
[10398]737
[14084]738 if ($partition_name_length) {
[14845]739 return substr($metadata_value, 0, $partition_name_length);
[14084]740 }
741
[14845]742 my $partitionend = substr($metadata_value, 0, length($partitionstart));
[10398]743 if ($partitionend gt $partitionstart) {
[14845]744 $partitionend = substr($metadata_value, 0, 1);
[10398]745 if ($partitionend le $partitionstart) {
[14845]746 $partitionend = substr($metadata_value, 0, 2);
[10398]747 # Give up after three characters
748 if ($partitionend le $partitionstart) {
[14845]749 $partitionend = substr($metadata_value, 0, 3);
[10398]750 }
751 }
752 }
753
754 return $partitionend;
755}
756
757
758sub add_hlist_partition
759{
760 my $self = shift(@_);
[12889]761 my @metadata_groups = @{shift(@_)};
[12893]762 my $classifier_node = shift(@_);
[10398]763 my $partitionname = shift(@_);
[14845]764 my $metadata_value_to_OIDs_hash_ref = shift(@_);
[10398]765
766 # Create an hlist partition
[14173]767 my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
[12893]768 'childtype' => "VList",
769 'contains' => [] );
[10398]770
771 # Add the children to the hlist partition
[14845]772 $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
[12893]773 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
[10398]774}
775
776
777sub add_vlist
778{
779 my $self = shift(@_);
[12889]780 my @metadata_groups = @{shift(@_)};
[12893]781 my $classifier_node = shift(@_);
[14845]782 my $metadata_value_to_OIDs_hash_ref = shift(@_);
[10398]783
[12889]784 my $metadata_group = shift(@metadata_groups);
[13287]785 $classifier_node->{'mdtype'} = $metadata_group;
[10398]786
787 # Create an entry in the vlist for each value
[14845]788 foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
789 {
790 my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
[10398]791
[18619]792 # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
793 if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
[13271]794 my $OID = $OIDs[0];
795
796 # Find the offset of this metadata value
797 my $offset = 0;
[14845]798 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
799 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
800 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
801 if ($metadata_value eq $metadata_values[$i]) {
[13271]802 $offset = $i;
803 last;
804 }
805 }
806 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
[18619]807 }
808 # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
809 elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
810 $self->add_sorted_leaf_items(\@OIDs, $classifier_node);
[10398]811 }
812 # Otherwise create a sublist (bookshelf) for the metadata value
[18619]813 else {
[14845]814 my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
[12893]815 'childtype' => "VList",
816 'contains' => [] );
[10398]817
818 # If there are metadata elements remaining, recursively apply the process
[12889]819 if (@metadata_groups > 0) {
820 my $next_metadata_group = $metadata_groups[0];
[12895]821 $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
[14173]822 $self->add_level(\@metadata_groups, \@OIDs, \%child_classifier_node);
[10398]823 }
824 # Otherwise just add the documents as children of this list
825 else {
[18619]826 $self->add_sorted_leaf_items(\@OIDs, \%child_classifier_node);
[10398]827 }
828
829 # Add the sublist to the list
[12893]830 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
[10398]831 }
832 }
833}
834
[18619]835sub add_sorted_leaf_items
836{
837 my $self = shift(@_);
838 my @OIDs = @{shift(@_)};
839 my $classifier_node = shift(@_);
840
841 # Sort leaf nodes and add to list
[20825]842 my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
843 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) {
[18619]844 my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
845 # Force a stable sort (Perl 5.6's sort isn't stable)
846 # !! The [0] bits aren't ideal (multiple metadata values) !!
847 @OIDs = @OIDs[ sort { $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0] || $a <=> $b; } 0..$#OIDs ];
848 }
[10398]849
[18619]850 foreach my $OID (@OIDs) {
851 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID });
852 }
853}
854
855
[13551]856sub sort_metadata_values_array
857{
858 my $self = shift(@_);
859 my @metadata_values = @_;
860
861 if ($self->{'unicode_collator'}) {
862 return $self->{'unicode_collator'}->sort(@metadata_values);
863 }
864 else {
865 return sort(@metadata_values);
866 }
867}
868
869
[10398]8701;
Note: See TracBrowser for help on using the repository browser.