source: main/trunk/greenstone2/perllib/classify/List.pm@ 24016

Last change on this file since 24016 was 24016, checked in by ak19, 13 years ago

Fixed the unicode-issue that Diego discovered. When using the List classifier, the bookshelf titles were displayed without any special/accented characters, whereas, when the same titles appeared in documents under such bookshelves, the document titles were correctly displayed. Turns out the problem was from utf8-to-unicode and unicode-to-utf8 calls that were not removed after the changes to GS perlcode for making string unicode aware in Perl. So such conversions are already handled elsewhere in the code and no longer belong in List, which fixed the problem.

  • Property svn:keywords set to Author Date Id Revision
File size: 36.3 KB
Line 
1###########################################################################
2#
3# List.pm -- A general and flexible list classifier with most of
4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
29# TO DO: - Remove punctuation from metadata values before sorting.
30# - Add an AZCompactList-style hlist option?
31#
32###########################################################################
33
34package List;
35
36
37use BaseClassifier;
38
39use strict;
40
41
42sub BEGIN {
43 @List::ISA = ('BaseClassifier');
44}
45
46my $partition_type_list =
47 [ { 'name' => "per_letter",
48 'desc' => "{List.level_partition.per_letter}" },
49 { 'name' => "approximate_size",
50 'desc' => "{List.level_partition.approximate_size}"},
51 { 'name' => "constant_size",
52 'desc' => "{List.level_partition.constant_size}" },
53 { 'name' => "none",
54 'desc' => "{List.level_partition.none}" } ];
55
56# following used to check types later on
57my $valid_partition_types = { 'per_letter' => 1,
58 'constant_size' => 1,
59 'per_letter_fixed_size' => 1,
60 'approximate_size' => 1,
61 'none' => 1};
62
63my $bookshelf_type_list =
64 [ { 'name' => "always",
65 'desc' => "{List.bookshelf_type.always}" },
66 { 'name' => "duplicate_only",
67 'desc' => "{List.bookshelf_type.duplicate_only}" },
68 { 'name' => "never",
69 'desc' => "{List.bookshelf_type.never}" } ];
70
71my $arguments =
72 [ { 'name' => "metadata",
73 'desc' => "{List.metadata}",
74 'type' => "metadata",
75 'reqd' => "yes" },
76
77 # The interesting options
78 { 'name' => "bookshelf_type",
79 'desc' => "{List.bookshelf_type}",
80 'type' => "enum",
81 'list' => $bookshelf_type_list,
82 'deft' => "never" },
83 { 'name' => "classify_sections",
84 'desc' => "{List.classify_sections}",
85 'type' => "flag" },
86 { 'name' => "partition_type_within_level",
87 'desc' => "{List.partition_type_within_level}",
88 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
89 'list' => $partition_type_list,
90 'deft' => "per_letter" },
91 { 'name' => "partition_size_within_level",
92 'desc' => "{List.partition_size_within_level}",
93 'type' => "string" }, # Must be string because multiple values can be specified (separated by '/')
94 { 'name' => "partition_name_length",
95 'desc' => "{List.partition_name_length}",
96 'type' => "string" },
97 { 'name' => "sort_leaf_nodes_using",
98 'desc' => "{List.sort_leaf_nodes_using}",
99 'type' => "metadata",
100 'deft' => "Title" },
101 { 'name' => "reverse_sort_leaf_nodes",
102 'desc' => "{List.reverse_sort_leaf_nodes}",
103 'type' => "flag"},
104 { 'name' => "sort_using_unicode_collation",
105 'desc' => "{List.sort_using_unicode_collation}",
106 'type' => "flag" },
107 { 'name' => "use_hlist_for",
108 'desc' => "{List.use_hlist_for}",
109 'type' => "string" },
110 { 'name' => "removeprefix",
111 'desc' => "{BasClas.removeprefix}",
112 'type' => "regexp" },
113 { 'name' => "removesuffix",
114 'desc' => "{BasClas.removesuffix}",
115 'type' => "regexp" } ];
116
117my $options = { 'name' => "List",
118 'desc' => "{List.desc}",
119 'abstract' => "no",
120 'inherits' => "yes",
121 'args' => $arguments };
122
123
124sub new
125{
126 my ($class) = shift(@_);
127 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
128 push(@$classifierslist, $class);
129
130 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
131 push(@{$hashArgOptLists->{"OptList"}}, $options);
132
133 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
134
135 if ($self->{'info_only'}) {
136 # don't worry about any options etc
137 return bless $self, $class;
138 }
139
140 # The metadata elements to use (required)
141 if (!$self->{'metadata'}) {
142 die "Error: No metadata fields specified for List.\n";
143 }
144
145 my @metadata_groups = split(/[\/]/, $self->{'metadata'});
146 $self->{'metadata_groups'} = \@metadata_groups;
147
148 # The classifier button name (default: the first metadata element specified)
149 if (!$self->{'buttonname'}) {
150 my $first_metadata_group = $metadata_groups[0];
151 my $first_metadata_element = (split(/[\;|,\/]/, $first_metadata_group))[0];
152 $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
153 }
154
155 # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
156 foreach my $metadata_group (@metadata_groups) {
157 $self->{$metadata_group . ".bookshelf_type"} = "always";
158 }
159 my $last_metadata_group = $metadata_groups[$#metadata_groups];
160 # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
161 $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
162
163 # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
164 foreach my $metadata_group (@metadata_groups) {
165 $self->{$metadata_group . ".list_type"} = "VList";
166 }
167 foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
168 $self->{$metadata_group . ".list_type"} = "HList";
169 }
170
171 # How the items are grouped into partitions (default: no partition)
172 # for each level (metadata group), separated by '/'
173 if (!$self->{"partition_type_within_level"}) {
174 foreach my $metadata_group (@metadata_groups) {
175 $self->{$metadata_group . ".partition_type_within_level"} = "none";
176 }
177 } else {
178 my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'});
179
180 my $first = 1;
181 foreach my $metadata_group (@metadata_groups) {
182 my $partition_type_within_levelelem = shift(@partition_type_within_levellist);
183 if (defined($partition_type_within_levelelem) && $partition_type_within_levelelem eq "per_letter_fixed_size") {
184 print STDERR "per letter fixed size, changing to approximate size\n";
185 $partition_type_within_levelelem = "approximate_size";
186 }
187 if (defined($partition_type_within_levelelem) && defined $valid_partition_types->{$partition_type_within_levelelem}) {
188 $self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem;
189 }
190 else {
191 if ($first) {
192 $self->{$metadata_group . ".partition_type_within_level"} = "none";
193 $first = 0;
194 } else {
195 $self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"};
196 }
197 if (defined($partition_type_within_levelelem)) {
198 # ie invalid entry
199 print STDERR "invalid partition type for level $metadata_group: $partition_type_within_levelelem, defaulting to ". $self->{$metadata_group . ".partition_type_within_level"} ."\n";
200 }
201 }
202 }
203 }
204
205 # The number of items in each partition
206 if (!$self->{'partition_size_within_level'}) {
207 # Default: 20
208 foreach my $metadata_group (@metadata_groups) {
209 $self->{$metadata_group . ".partition_size_within_level"} = 20;
210 }
211 }
212 else {
213 my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
214
215 # Assign values based on the partition_size_within_level parameter
216 foreach my $metadata_group (@metadata_groups) {
217 my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
218 if (defined($partition_size_within_levelelem)) {
219 $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
220 }
221 else {
222 $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
223 }
224 }
225 }
226
227 # The removeprefix and removesuffix expressions
228 if ($self->{'removeprefix'}) {
229 # If there are more than one expressions, use '' to quote each experession and '/' to separate
230 my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
231
232 foreach my $metadata_group (@metadata_groups) {
233 my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
234 if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
235 # Remove the other ' at the beginning and the end if there is any
236 $removeprefix_expr_within_levelelem =~ s/^'//;
237 $removeprefix_expr_within_levelelem =~ s/'$//;
238 # Remove the extra ^ at the beginning
239 $removeprefix_expr_within_levelelem =~ s/^\^//;
240 $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
241 } else {
242 $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
243 }
244 }
245 }
246 if ($self->{'removesuffix'}) {
247 my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
248
249 foreach my $metadata_group (@metadata_groups) {
250 my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
251 if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
252 $removesuffix_expr_within_levelelem =~ s/^'//;
253 $removesuffix_expr_within_levelelem =~ s/'$//;
254 # Remove the extra $ at the end
255 $removesuffix_expr_within_levelelem =~ s/\$$//;
256 $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
257 } else {
258 $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
259 }
260 }
261 }
262
263 # The metadata elements to use to sort the leaf nodes (default: Title)
264 my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
265 if ($self->{'sort_leaf_nodes_using'}) {
266 @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
267 }
268 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
269
270 # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
271 if ($self->{'sort_using_unicode_collation'}) {
272 # To use this you first need to download the allkeys.txt file from
273 # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
274 # Unicode/Collate directory.
275 require Unicode::Collate;
276 $self->{'unicode_collator'} = Unicode::Collate->new();
277 }
278
279 # An empty array for the document/section OIDs that we are classifying
280 $self->{'OIDs'} = [];
281 # A hash for all the doc ids that we have seen, so we don't classify something twice
282 $self->{'all_doc_OIDs'} = {};
283 return bless $self, $class;
284}
285
286
287sub init
288{
289 # Nothing to do...
290}
291
292
293# Called for each document in the collection
294sub classify
295{
296 my $self = shift(@_);
297 my ($doc_obj) = @_;
298
299 if (defined $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()}) {
300 print STDERR "Warning, List classifier has already seen document ".$doc_obj->get_OID().", not classifying again\n";
301 return;
302 }
303 $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()} = 1;
304 # If "-classify_sections" is set, classify every section of the document
305 if ($self->{'classify_sections'}) {
306 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
307 while (defined $section) {
308 $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section);
309 $section = $doc_obj->get_next_section($section);
310 }
311 }
312 # Otherwise just classify the top document section
313 else {
314 $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section());
315 }
316
317}
318
319sub classify_section
320{
321 my $self = shift(@_);
322 my ($doc_obj,$section_OID,$section) = @_;
323
324 my @metadata_groups = @{$self->{'metadata_groups'}};
325
326 # Only classify the section if it has a value for one of the metadata elements in the first group
327 my $classify_section = 0;
328 my $first_metadata_group = $metadata_groups[0];
329 my $remove_prefix_expr = $self->{$first_metadata_group . ".remove_prefix_expr"};
330 my $remove_suffix_expr = $self->{$first_metadata_group . ".remove_suffix_expr"};
331 foreach my $first_metadata_group_element (split(/\;|,/, $first_metadata_group)) {
332 my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
333 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element);
334
335 # Remove prefix/suffix if requested
336 if (defined ($first_metadata_group_element_value)) {
337 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
338 $first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
339 }
340
341 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
342 $first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
343 }
344 }
345 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
346 # This section must be included in the classifier
347 $classify_section = 1;
348 last;
349 }
350 }
351
352 # We're not classifying this section because it doesn't have the required metadata
353 return if (!$classify_section);
354
355 # Otherwise, include this section in the classifier
356
357 push(@{$self->{'OIDs'}}, $section_OID);
358
359 # Create a hash for the metadata values of each metadata element we're interested in
360 my %metadata_groups_done = ();
361 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
362 # Take care not to do a metadata group more than once
363 unless ($metadata_groups_done{$metadata_group}) {
364 my $remove_prefix_expr = $self->{$metadata_group . ".remove_prefix_expr"};
365 my $remove_suffix_expr = $self->{$metadata_group . ".remove_suffix_expr"};
366 foreach my $metadata_element (split(/\;|,/, $metadata_group)) {
367 my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);
368
369 my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
370 foreach my $metadata_value (@metadata_values) {
371 # Strip leading and trailing whitespace
372 $metadata_value =~ s/^\s*//;
373 $metadata_value =~ s/\s*$//;
374
375 # Remove prefix/suffix if requested
376 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
377 $metadata_value =~ s/^$remove_prefix_expr//;
378 }
379 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
380 $metadata_value =~ s/$remove_suffix_expr$//;
381 }
382
383 # uppercase the metadata - makes the AZList nicer
384 $metadata_value = uc($metadata_value);
385
386 # We are already working with unicode aware strings at this
387 # stage, so we no longer need to convert from utf8 to unicode
388 my $metadata_value_unicode_string = $metadata_value; # $self->convert_utf8_string_to_unicode_string($metadata_value);
389
390 # Add the metadata value into the list for this combination of metadata group and section
391 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $metadata_value_unicode_string);
392 }
393 last if (@metadata_values > 0);
394 }
395
396 $metadata_groups_done{$metadata_group} = 1;
397 }
398 }
399}
400
401
402sub get_classify_info
403{
404 my $self = shift(@_);
405
406 # The metadata groups to classify by
407 my @metadata_groups = @{$self->{'metadata_groups'}};
408 my $first_metadata_group = $metadata_groups[0];
409
410 # The OID values of the documents to include in the classifier
411 my @OIDs = @{$self->{'OIDs'}};
412
413 # Create the root node of the classification hierarchy
414 my %classifier_node = ( 'thistype' => "Invisible",
415 'childtype' => $self->{$first_metadata_group . ".list_type"},
416 'Title' => $self->{'buttonname'},
417 'contains' => [],
418 'mdtype' => $first_metadata_group );
419
420 # Recursively create the classification hierarchy, one level for each metadata group
421 $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
422 return \%classifier_node;
423}
424
425
426sub add_level
427{
428 my $self = shift(@_);
429 my @metadata_groups = @{shift(@_)};
430 my @OIDs = @{shift(@_)};
431 my $classifier_node = shift(@_);
432
433 my $metadata_group = $metadata_groups[0];
434
435 if (!defined($self->{$metadata_group . ".list"})) {
436 print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
437 return;
438 }
439
440 # Create a mapping from metadata value to OID
441 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
442 my %metadata_value_to_OIDs_hash = ();
443 foreach my $OID (@OIDs)
444 {
445 if ($OID_to_metadata_values_hash_ref->{$OID})
446 {
447 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
448 foreach my $metadata_value (@metadata_values)
449 {
450 push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
451 }
452 }
453 }
454 #print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
455
456 # Partition the values (if necessary)
457 my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
458 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
459 if ($partition_type_within_level =~ /^per_letter$/i) {
460 # Generate one hlist for each letter
461 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
462 my %metadata_value_to_OIDs_subhash = ();
463
464 my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
465 foreach my $metadata_value (@sortedmetadata_values) {
466 my $metadata_valuepartition = substr($metadata_value, 0, 1);
467
468 # Is this the start of a new partition?
469 if ($metadata_valuepartition ne $lastpartition) {
470 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
471 %metadata_value_to_OIDs_subhash = ();
472 $lastpartition = $metadata_valuepartition;
473 }
474
475 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
476 }
477
478 # Don't forget to add the last partition
479 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
480
481 # The partitions are stored in an HList
482 $classifier_node->{'childtype'} = "HList";
483 }
484 elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
485 # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size
486 # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible
487 # If a letter has more items than specified, split into several hlists.
488 # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
489 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
490 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
491 my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};
492
493 # Separate values by their first letter, each form a bucket, like the per_letter partition type
494 my $last_partition = substr($sortedmetadata_values[0], 0, 1);
495 my %partition_buckets = ();
496 my @metadata_values_in_bucket = ();
497 my $num_items_in_bucket = 0;
498 foreach my $metadata_value (@sortedmetadata_values) {
499 my $metadata_valuepartition = substr($metadata_value, 0, 1);
500 if ($metadata_valuepartition ne $last_partition) {
501 my @temp_array = @metadata_values_in_bucket;
502 # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
503 my %partition_info = ();
504 $partition_info{'metadata_values'} = \@temp_array;
505 $partition_info{'size'} = $num_items_in_bucket;
506 $partition_buckets{$last_partition} = \%partition_info;
507
508 @metadata_values_in_bucket = ($metadata_value);
509 $num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
510 $last_partition = $metadata_valuepartition;
511 } else {
512 $num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
513 push (@metadata_values_in_bucket, $metadata_value);
514 }
515 }
516 # Last one
517 my %partition_info = ();
518 $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
519 $partition_info{'size'} = $num_items_in_bucket;
520 $partition_buckets{$last_partition} = \%partition_info;
521
522 my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
523 for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) {
524 my $partition = $partition_keys[$i];
525 my $items_in_partition = $partition_buckets{$partition}->{'size'};
526 # Merge small buckets together, but keep the numeric bucket apart
527 if ($items_in_partition < $partition_size_within_level) {
528 my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'};
529 if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level
530 && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)
531 || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) {
532 foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) {
533 push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge);
534 }
535 $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition;
536 delete $partition_buckets{$partition};
537 }
538 }
539 }
540 @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
541
542 # Add partitions, and divide big bucket into several
543 my $last_partition_end = "";
544 my $partition_start = "";
545 foreach my $partition (@partition_keys) {
546 my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}});
547 my $items_in_partition = $partition_buckets{$partition}->{'size'};
548 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"});
549
550 if ($items_in_partition > $partition_size_within_level) {
551 my $items_done = 0;
552 my %metadata_values_to_OIDs_subhashes = ();
553 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
554 my $metadata_value = $metadata_values[$i];
555 # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
556 my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;
557
558 my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"});
559 my $partitionname = $partition_start;
560 if ($partitionend ne $partition_start) {
561 $partitionname = $partitionname . "-" . $partitionend;
562 }
563
564 # Start a new partition
565 if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) {
566 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
567 $last_partition_end = $partitionend;
568 $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
569 $items_done = 0;
570 %metadata_values_to_OIDs_subhashes = ();
571 }
572
573 # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
574 if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) {
575 my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
576 # Get the number of partitions needed for this value
577 my $num_splits = int($items_for_this_md_value / $partition_size_within_level);
578 $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits);
579
580 my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}};
581 for (my $i = 0; $i < $num_splits; $i++) {
582 my %OIDs_subhashes_for_this_value = ();
583 my @OIDs_for_this_partition = ();
584 for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) {
585 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
586 }
587
588 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
589 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) {
590 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
591 $items_done += scalar(@OIDs_for_this_partition);
592 next;
593 }
594
595 # Add an HList for this bucket
596 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
597 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
598 $last_partition_end = $partitionname_for_this_value;
599 }
600 next;
601 }
602
603 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
604 $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;
605
606 # The last partition
607 if($i == scalar(@metadata_values) - 1) {
608 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
609 }
610 }
611 }
612 else {
613 # The easier case, just add a partition
614 my %metadata_values_to_OIDs_subhashes = ();
615 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
616 my $metadata_value = $metadata_values[$i];
617 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
618 }
619 my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
620 my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"});
621 my $partitionname = $partition_start;
622 if ($partitionend ne $partition_start) {
623 $partitionname = $partitionname . "-" . $partitionend;
624 }
625 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
626 $last_partition_end = $partitionend;
627 }
628 }
629
630 # The partitions are stored in an HList
631 $classifier_node->{'childtype'} = "HList";
632
633 } # end approximate_size
634 else {
635 # Generate hlists of a certain size
636 if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
637 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
638 my $itemsdone = 0;
639 my %metadata_value_to_OIDs_subhash = ();
640 my $lastpartitionend = "";
641 my $partitionstart;
642 foreach my $metadata_value (@sortedmetadata_values) {
643 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
644 $itemsdone++;
645 my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
646
647 # Is this the start of a new partition?
648 if ($itemsinpartition == 1) {
649 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
650 }
651
652 # Is this the end of the partition?
653 if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) {
654 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
655 my $partitionname = $partitionstart;
656 if ($partitionend ne $partitionstart) {
657 $partitionname = $partitionname . "-" . $partitionend;
658 }
659
660 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
661 %metadata_value_to_OIDs_subhash = ();
662 $lastpartitionend = $partitionend;
663 }
664 }
665
666 # The partitions are stored in an HList
667 $classifier_node->{'childtype'} = "HList";
668 }
669
670 # Otherwise just add all the values to a VList
671 else {
672 $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
673 }
674 }
675}
676
677
678sub convert_utf8_string_to_unicode_string
679{
680 my $self = shift(@_);
681 my $utf8_string = shift(@_);
682
683 my $unicode_string = "";
684 foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
685 $unicode_string .= chr($unicode_value);
686 }
687 return $unicode_string;
688}
689
690
691sub convert_unicode_string_to_utf8_string
692{
693 my $self = shift(@_);
694 my $unicode_string = shift(@_);
695
696 my @unicode_array;
697 for (my $i = 0; $i < length($unicode_string); $i++) {
698 push(@unicode_array, ord(substr($unicode_string, $i, 1)));
699 }
700 return &unicode::unicode2utf8(\@unicode_array);
701}
702
703
704sub generate_partition_start
705{
706 my $self = shift(@_);
707 my $metadata_value = shift(@_);
708 my $lastpartitionend = shift(@_);
709 my $partition_name_length = shift(@_);
710
711 if ($partition_name_length) {
712 return substr($metadata_value, 0, $partition_name_length);
713 }
714
715 my $partitionstart = substr($metadata_value, 0, 1);
716 if ($partitionstart le $lastpartitionend) {
717 $partitionstart = substr($metadata_value, 0, 2);
718 # Give up after three characters
719 if ($partitionstart le $lastpartitionend) {
720 $partitionstart = substr($metadata_value, 0, 3);
721 }
722 }
723
724 return $partitionstart;
725}
726
727
728sub generate_partition_end
729{
730 my $self = shift(@_);
731 my $metadata_value = shift(@_);
732 my $partitionstart = shift(@_);
733 my $partition_name_length = shift(@_);
734
735 if ($partition_name_length) {
736 return substr($metadata_value, 0, $partition_name_length);
737 }
738
739 my $partitionend = substr($metadata_value, 0, length($partitionstart));
740 if ($partitionend gt $partitionstart) {
741 $partitionend = substr($metadata_value, 0, 1);
742 if ($partitionend le $partitionstart) {
743 $partitionend = substr($metadata_value, 0, 2);
744 # Give up after three characters
745 if ($partitionend le $partitionstart) {
746 $partitionend = substr($metadata_value, 0, 3);
747 }
748 }
749 }
750
751 return $partitionend;
752}
753
754
755sub add_hlist_partition
756{
757 my $self = shift(@_);
758 my @metadata_groups = @{shift(@_)};
759 my $classifier_node = shift(@_);
760 my $partitionname = shift(@_);
761 my $metadata_value_to_OIDs_hash_ref = shift(@_);
762
763 # Create an hlist partition
764 # Note that we don't need to convert from unicode-aware strings
765 # to utf8 here, as that is handled elsewhere in the code
766 my %child_classifier_node = ( 'Title' => $partitionname, #'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
767 'childtype' => "VList",
768 'contains' => [] );
769
770 # Add the children to the hlist partition
771 $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
772 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
773}
774
775
776sub add_vlist
777{
778 my $self = shift(@_);
779 my @metadata_groups = @{shift(@_)};
780 my $classifier_node = shift(@_);
781 my $metadata_value_to_OIDs_hash_ref = shift(@_);
782
783 my $metadata_group = shift(@metadata_groups);
784 $classifier_node->{'mdtype'} = $metadata_group;
785
786 # Create an entry in the vlist for each value
787 foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
788 {
789 my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
790 # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
791 if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
792 my $OID = $OIDs[0];
793 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
794 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
795 }
796 # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
797 elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
798 @OIDs = $self->sort_leaf_items(\@OIDs);
799 foreach my $OID (@OIDs) {
800 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
801 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
802 }
803
804 }
805 # Otherwise create a sublist (bookshelf) for the metadata value
806 else {
807 # Note that we don't need to convert from unicode-aware strings
808 # to utf8 here, as that is handled elsewhere in the code
809 my %child_classifier_node = ( 'Title' => $metadata_value, # 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
810 'childtype' => "VList",
811 'mdtype' => $metadata_group,
812 'contains' => [] );
813
814 #@OIDs = $self->sort_leaf_items(\@OIDs);
815 # If there are metadata elements remaining, recursively apply the process
816 if (@metadata_groups > 0) {
817 my $next_metadata_group = $metadata_groups[0];
818 $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
819
820 # separate metadata into those that below in the next/sub-metadata_group
821 # and those that below at the current level's metadata_group
822
823 my $OID_to_metadata_values_hash_ref = $self->{$next_metadata_group . ".list"};
824 my @current_level_OIDs = ();
825 my @next_level_OIDs = ();
826 foreach my $OID (@OIDs)
827 {
828 if ($OID_to_metadata_values_hash_ref->{$OID}) {
829 push(@next_level_OIDs, $OID);
830 } else {
831 push(@current_level_OIDs, $OID);
832 }
833 }
834 # recursively process those docs belonging to the sub-metadata_group
835 $self->add_level(\@metadata_groups, \@next_level_OIDs, \%child_classifier_node);
836
837 # For those docs that don't belong in the sub/next_metadata_group, but which belong
838 # at this level, just add the documents as children of this list at the current level
839 @current_level_OIDs = $self->sort_leaf_items(\@current_level_OIDs);
840 foreach my $current_level_OID (@current_level_OIDs) {
841 my $offset = $self->metadata_offset($metadata_group, $current_level_OID, $metadata_value);
842 push(@{$child_classifier_node{'contains'}}, { 'OID' => $current_level_OID , 'offset' => $offset });
843 }
844 }
845 # Otherwise just add the documents as children of this list
846 else {
847 @OIDs = $self->sort_leaf_items(\@OIDs);
848 foreach my $OID (@OIDs) {
849 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
850 push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
851 }
852
853 }
854
855 # Add the sublist to the list
856 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
857 }
858 }
859}
860
861sub metadata_offset
862{
863 my $self = shift(@_);
864 my $metadata_group = shift(@_);
865 my $OID = shift(@_);
866 my $metadata_value = shift(@_);
867
868 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
869 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
870 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
871 if ($metadata_value eq $metadata_values[$i]) {
872 return $i;
873 }
874 }
875
876 return 0;
877}
878
879sub sort_leaf_items
880{
881 my $self = shift(@_);
882 my @OIDs = @{shift(@_)};
883# my $classifier_node = shift(@_);
884
885 # Sort leaf nodes and add to list
886 my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
887 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) {
888 my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
889 # Force a stable sort (Perl 5.6's sort isn't stable)
890 # !! The [0] bits aren't ideal (multiple metadata values) !!
891 @OIDs = @OIDs[ sort {
892 if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]} && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]})))
893 {
894 $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
895 }
896 else
897 {
898 $a <=> $b;
899 }
900 } 0..$#OIDs ];
901 }
902 if ($self->{'reverse_sort_leaf_nodes'}) {
903 print STDERR "reversing\n";
904 return reverse @OIDs;
905 }
906 return @OIDs;
907}
908
909
910
911sub sort_metadata_values_array
912{
913 my $self = shift(@_);
914 my @metadata_values = @_;
915
916 if ($self->{'unicode_collator'}) {
917 return $self->{'unicode_collator'}->sort(@metadata_values);
918 }
919 else {
920 return sort(@metadata_values);
921 }
922}
923
924
9251;
Note: See TracBrowser for help on using the repository browser.