source: main/trunk/greenstone2/perllib/classify/List.pm@ 24012

Last change on this file since 24012 was 24012, checked in by ak19, 13 years ago

Bugfix to the way List deals with subclassification levels specified by the forward slash token. The bug was that when sub classifications were not specified for a document, it would never appear (even in a classification level higher up, which was specified for it). Now such docs appears under the lowest classification level specified for them. E.g. dc.Creator/dc.Date. If Date is not specified for doc x even if Creator is, then x used to not appear under dc.Creator either. Now it appears under dc.Creator.

  • Property svn:keywords set to Author Date Id Revision
File size: 36.1 KB
Line 
1###########################################################################
2#
3# List.pm -- A general and flexible list classifier with most of
4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
29# TO DO: - Remove punctuation from metadata values before sorting.
30# - Add an AZCompactList-style hlist option?
31#
32###########################################################################
33
34package List;
35
36
37use BaseClassifier;
38
39use strict;
40
41
42sub BEGIN {
43 @List::ISA = ('BaseClassifier');
44}
45
46my $partition_type_list =
47 [ { 'name' => "per_letter",
48 'desc' => "{List.level_partition.per_letter}" },
49 { 'name' => "approximate_size",
50 'desc' => "{List.level_partition.approximate_size}"},
51 { 'name' => "constant_size",
52 'desc' => "{List.level_partition.constant_size}" },
53 { 'name' => "none",
54 'desc' => "{List.level_partition.none}" } ];
55
56# following used to check types later on
57my $valid_partition_types = { 'per_letter' => 1,
58 'constant_size' => 1,
59 'per_letter_fixed_size' => 1,
60 'approximate_size' => 1,
61 'none' => 1};
62
63my $bookshelf_type_list =
64 [ { 'name' => "always",
65 'desc' => "{List.bookshelf_type.always}" },
66 { 'name' => "duplicate_only",
67 'desc' => "{List.bookshelf_type.duplicate_only}" },
68 { 'name' => "never",
69 'desc' => "{List.bookshelf_type.never}" } ];
70
71my $arguments =
72 [ { 'name' => "metadata",
73 'desc' => "{List.metadata}",
74 'type' => "metadata",
75 'reqd' => "yes" },
76
77 # The interesting options
78 { 'name' => "bookshelf_type",
79 'desc' => "{List.bookshelf_type}",
80 'type' => "enum",
81 'list' => $bookshelf_type_list,
82 'deft' => "never" },
83 { 'name' => "classify_sections",
84 'desc' => "{List.classify_sections}",
85 'type' => "flag" },
86 { 'name' => "partition_type_within_level",
87 'desc' => "{List.partition_type_within_level}",
88 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
89 'list' => $partition_type_list,
90 'deft' => "per_letter" },
91 { 'name' => "partition_size_within_level",
92 'desc' => "{List.partition_size_within_level}",
93 'type' => "string" }, # Must be string because multiple values can be specified (separated by '/')
94 { 'name' => "partition_name_length",
95 'desc' => "{List.partition_name_length}",
96 'type' => "string" },
97 { 'name' => "sort_leaf_nodes_using",
98 'desc' => "{List.sort_leaf_nodes_using}",
99 'type' => "metadata",
100 'deft' => "Title" },
101 { 'name' => "reverse_sort_leaf_nodes",
102 'desc' => "{List.reverse_sort_leaf_nodes}",
103 'type' => "flag"},
104 { 'name' => "sort_using_unicode_collation",
105 'desc' => "{List.sort_using_unicode_collation}",
106 'type' => "flag" },
107 { 'name' => "use_hlist_for",
108 'desc' => "{List.use_hlist_for}",
109 'type' => "string" },
110 { 'name' => "removeprefix",
111 'desc' => "{BasClas.removeprefix}",
112 'type' => "regexp" },
113 { 'name' => "removesuffix",
114 'desc' => "{BasClas.removesuffix}",
115 'type' => "regexp" } ];
116
117my $options = { 'name' => "List",
118 'desc' => "{List.desc}",
119 'abstract' => "no",
120 'inherits' => "yes",
121 'args' => $arguments };
122
123
124sub new
125{
126 my ($class) = shift(@_);
127 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
128 push(@$classifierslist, $class);
129
130 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
131 push(@{$hashArgOptLists->{"OptList"}}, $options);
132
133 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
134
135 if ($self->{'info_only'}) {
136 # don't worry about any options etc
137 return bless $self, $class;
138 }
139
140 # The metadata elements to use (required)
141 if (!$self->{'metadata'}) {
142 die "Error: No metadata fields specified for List.\n";
143 }
144
145 my @metadata_groups = split(/[\/]/, $self->{'metadata'});
146 $self->{'metadata_groups'} = \@metadata_groups;
147
148 # The classifier button name (default: the first metadata element specified)
149 if (!$self->{'buttonname'}) {
150 my $first_metadata_group = $metadata_groups[0];
151 my $first_metadata_element = (split(/[\;|,\/]/, $first_metadata_group))[0];
152 $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
153 }
154
155 # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
156 foreach my $metadata_group (@metadata_groups) {
157 $self->{$metadata_group . ".bookshelf_type"} = "always";
158 }
159 my $last_metadata_group = $metadata_groups[$#metadata_groups];
160 # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
161 $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
162
163 # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
164 foreach my $metadata_group (@metadata_groups) {
165 $self->{$metadata_group . ".list_type"} = "VList";
166 }
167 foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
168 $self->{$metadata_group . ".list_type"} = "HList";
169 }
170
171 # How the items are grouped into partitions (default: no partition)
172 # for each level (metadata group), separated by '/'
173 if (!$self->{"partition_type_within_level"}) {
174 foreach my $metadata_group (@metadata_groups) {
175 $self->{$metadata_group . ".partition_type_within_level"} = "none";
176 }
177 } else {
178 my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'});
179
180 my $first = 1;
181 foreach my $metadata_group (@metadata_groups) {
182 my $partition_type_within_levelelem = shift(@partition_type_within_levellist);
183 if (defined($partition_type_within_levelelem) && $partition_type_within_levelelem eq "per_letter_fixed_size") {
184 print STDERR "per letter fixed size, changing to approximate size\n";
185 $partition_type_within_levelelem = "approximate_size";
186 }
187 if (defined($partition_type_within_levelelem) && defined $valid_partition_types->{$partition_type_within_levelelem}) {
188 $self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem;
189 }
190 else {
191 if ($first) {
192 $self->{$metadata_group . ".partition_type_within_level"} = "none";
193 $first = 0;
194 } else {
195 $self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"};
196 }
197 if (defined($partition_type_within_levelelem)) {
198 # ie invalid entry
199 print STDERR "invalid partition type for level $metadata_group: $partition_type_within_levelelem, defaulting to ". $self->{$metadata_group . ".partition_type_within_level"} ."\n";
200 }
201 }
202 }
203 }
204
205 # The number of items in each partition
206 if (!$self->{'partition_size_within_level'}) {
207 # Default: 20
208 foreach my $metadata_group (@metadata_groups) {
209 $self->{$metadata_group . ".partition_size_within_level"} = 20;
210 }
211 }
212 else {
213 my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
214
215 # Assign values based on the partition_size_within_level parameter
216 foreach my $metadata_group (@metadata_groups) {
217 my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
218 if (defined($partition_size_within_levelelem)) {
219 $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
220 }
221 else {
222 $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
223 }
224 }
225 }
226
227 # The removeprefix and removesuffix expressions
228 if ($self->{'removeprefix'}) {
229 # If there are more than one expressions, use '' to quote each experession and '/' to separate
230 my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
231
232 foreach my $metadata_group (@metadata_groups) {
233 my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
234 if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
235 # Remove the other ' at the beginning and the end if there is any
236 $removeprefix_expr_within_levelelem =~ s/^'//;
237 $removeprefix_expr_within_levelelem =~ s/'$//;
238 # Remove the extra ^ at the beginning
239 $removeprefix_expr_within_levelelem =~ s/^\^//;
240 $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
241 } else {
242 $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
243 }
244 }
245 }
246 if ($self->{'removesuffix'}) {
247 my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
248
249 foreach my $metadata_group (@metadata_groups) {
250 my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
251 if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
252 $removesuffix_expr_within_levelelem =~ s/^'//;
253 $removesuffix_expr_within_levelelem =~ s/'$//;
254 # Remove the extra $ at the end
255 $removesuffix_expr_within_levelelem =~ s/\$$//;
256 $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
257 } else {
258 $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
259 }
260 }
261 }
262
263 # The metadata elements to use to sort the leaf nodes (default: Title)
264 my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
265 if ($self->{'sort_leaf_nodes_using'}) {
266 @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
267 }
268 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
269
270 # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
271 if ($self->{'sort_using_unicode_collation'}) {
272 # To use this you first need to download the allkeys.txt file from
273 # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
274 # Unicode/Collate directory.
275 require Unicode::Collate;
276 $self->{'unicode_collator'} = Unicode::Collate->new();
277 }
278
279 # An empty array for the document/section OIDs that we are classifying
280 $self->{'OIDs'} = [];
281 # A hash for all the doc ids that we have seen, so we don't classify something twice
282 $self->{'all_doc_OIDs'} = {};
283 return bless $self, $class;
284}
285
286
287sub init
288{
289 # Nothing to do...
290}
291
292
293# Called for each document in the collection
294sub classify
295{
296 my $self = shift(@_);
297 my ($doc_obj) = @_;
298
299 if (defined $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()}) {
300 print STDERR "Warning, List classifier has already seen document ".$doc_obj->get_OID().", not classifying again\n";
301 return;
302 }
303 $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()} = 1;
304 # If "-classify_sections" is set, classify every section of the document
305 if ($self->{'classify_sections'}) {
306 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
307 while (defined $section) {
308 $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section);
309 $section = $doc_obj->get_next_section($section);
310 }
311 }
312 # Otherwise just classify the top document section
313 else {
314 $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section());
315 }
316
317}
318
319sub classify_section
320{
321 my $self = shift(@_);
322 my ($doc_obj,$section_OID,$section) = @_;
323
324 my @metadata_groups = @{$self->{'metadata_groups'}};
325
326 # Only classify the section if it has a value for one of the metadata elements in the first group
327 my $classify_section = 0;
328 my $first_metadata_group = $metadata_groups[0];
329 my $remove_prefix_expr = $self->{$first_metadata_group . ".remove_prefix_expr"};
330 my $remove_suffix_expr = $self->{$first_metadata_group . ".remove_suffix_expr"};
331 foreach my $first_metadata_group_element (split(/\;|,/, $first_metadata_group)) {
332 my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
333 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element);
334
335 # Remove prefix/suffix if requested
336 if (defined ($first_metadata_group_element_value)) {
337 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
338 $first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
339 }
340
341 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
342 $first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
343 }
344 }
345 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
346 # This section must be included in the classifier
347 $classify_section = 1;
348 last;
349 }
350 }
351
352 # We're not classifying this section because it doesn't have the required metadata
353 return if (!$classify_section);
354
355 # Otherwise, include this section in the classifier
356
357 push(@{$self->{'OIDs'}}, $section_OID);
358
359 # Create a hash for the metadata values of each metadata element we're interested in
360 my %metadata_groups_done = ();
361 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
362 # Take care not to do a metadata group more than once
363 unless ($metadata_groups_done{$metadata_group}) {
364 my $remove_prefix_expr = $self->{$metadata_group . ".remove_prefix_expr"};
365 my $remove_suffix_expr = $self->{$metadata_group . ".remove_suffix_expr"};
366 foreach my $metadata_element (split(/\;|,/, $metadata_group)) {
367 my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);
368
369 my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
370 foreach my $metadata_value (@metadata_values) {
371 # Strip leading and trailing whitespace
372 $metadata_value =~ s/^\s*//;
373 $metadata_value =~ s/\s*$//;
374
375 # Remove prefix/suffix if requested
376 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
377 $metadata_value =~ s/^$remove_prefix_expr//;
378 }
379 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
380 $metadata_value =~ s/$remove_suffix_expr$//;
381 }
382
383 # uppercase the metadata - makes the AZList nicer
384 $metadata_value = uc($metadata_value);
385 # Convert the metadata value from a UTF-8 string to a Unicode string
386 # This means that length() and substr() work properly
387 # We need to be careful to convert classifier node title values back to UTF-8, however
388 my $metadata_value_unicode_string = $self->convert_utf8_string_to_unicode_string($metadata_value);
389
390 # Add the metadata value into the list for this combination of metadata group and section
391 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $metadata_value_unicode_string);
392 }
393 last if (@metadata_values > 0);
394 }
395
396 $metadata_groups_done{$metadata_group} = 1;
397 }
398 }
399}
400
401
402sub get_classify_info
403{
404 my $self = shift(@_);
405
406 # The metadata groups to classify by
407 my @metadata_groups = @{$self->{'metadata_groups'}};
408 my $first_metadata_group = $metadata_groups[0];
409
410 # The OID values of the documents to include in the classifier
411 my @OIDs = @{$self->{'OIDs'}};
412
413 # Create the root node of the classification hierarchy
414 my %classifier_node = ( 'thistype' => "Invisible",
415 'childtype' => $self->{$first_metadata_group . ".list_type"},
416 'Title' => $self->{'buttonname'},
417 'contains' => [],
418 'mdtype' => $first_metadata_group );
419
420 # Recursively create the classification hierarchy, one level for each metadata group
421 $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
422 return \%classifier_node;
423}
424
425
426sub add_level
427{
428 my $self = shift(@_);
429 my @metadata_groups = @{shift(@_)};
430 my @OIDs = @{shift(@_)};
431 my $classifier_node = shift(@_);
432
433 my $metadata_group = $metadata_groups[0];
434
435 if (!defined($self->{$metadata_group . ".list"})) {
436 print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
437 return;
438 }
439
440 # Create a mapping from metadata value to OID
441 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
442 my %metadata_value_to_OIDs_hash = ();
443 foreach my $OID (@OIDs)
444 {
445 if ($OID_to_metadata_values_hash_ref->{$OID})
446 {
447 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
448 foreach my $metadata_value (@metadata_values)
449 {
450 push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
451 }
452 }
453 }
454 #print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
455
456 # Partition the values (if necessary)
457 my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
458 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
459 if ($partition_type_within_level =~ /^per_letter$/i) {
460 # Generate one hlist for each letter
461 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
462 my %metadata_value_to_OIDs_subhash = ();
463
464 my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
465 foreach my $metadata_value (@sortedmetadata_values) {
466 my $metadata_valuepartition = substr($metadata_value, 0, 1);
467
468 # Is this the start of a new partition?
469 if ($metadata_valuepartition ne $lastpartition) {
470 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
471 %metadata_value_to_OIDs_subhash = ();
472 $lastpartition = $metadata_valuepartition;
473 }
474
475 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
476 }
477
478 # Don't forget to add the last partition
479 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
480
481 # The partitions are stored in an HList
482 $classifier_node->{'childtype'} = "HList";
483 }
484 elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
485 # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size
486 # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible
487 # If a letter has more items than specified, split into several hlists.
488 # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
489 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
490 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
491 my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};
492
493 # Separate values by their first letter, each form a bucket, like the per_letter partition type
494 my $last_partition = substr($sortedmetadata_values[0], 0, 1);
495 my %partition_buckets = ();
496 my @metadata_values_in_bucket = ();
497 my $num_items_in_bucket = 0;
498 foreach my $metadata_value (@sortedmetadata_values) {
499 my $metadata_valuepartition = substr($metadata_value, 0, 1);
500 if ($metadata_valuepartition ne $last_partition) {
501 my @temp_array = @metadata_values_in_bucket;
502 # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
503 my %partition_info = ();
504 $partition_info{'metadata_values'} = \@temp_array;
505 $partition_info{'size'} = $num_items_in_bucket;
506 $partition_buckets{$last_partition} = \%partition_info;
507
508 @metadata_values_in_bucket = ($metadata_value);
509 $num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
510 $last_partition = $metadata_valuepartition;
511 } else {
512 $num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
513 push (@metadata_values_in_bucket, $metadata_value);
514 }
515 }
516 # Last one
517 my %partition_info = ();
518 $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
519 $partition_info{'size'} = $num_items_in_bucket;
520 $partition_buckets{$last_partition} = \%partition_info;
521
522 my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
523 for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) {
524 my $partition = $partition_keys[$i];
525 my $items_in_partition = $partition_buckets{$partition}->{'size'};
526 # Merge small buckets together, but keep the numeric bucket apart
527 if ($items_in_partition < $partition_size_within_level) {
528 my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'};
529 if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level
530 && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)
531 || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) {
532 foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) {
533 push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge);
534 }
535 $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition;
536 delete $partition_buckets{$partition};
537 }
538 }
539 }
540 @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
541
542 # Add partitions, and divide big bucket into several
543 my $last_partition_end = "";
544 my $partition_start = "";
545 foreach my $partition (@partition_keys) {
546 my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}});
547 my $items_in_partition = $partition_buckets{$partition}->{'size'};
548 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"});
549
550 if ($items_in_partition > $partition_size_within_level) {
551 my $items_done = 0;
552 my %metadata_values_to_OIDs_subhashes = ();
553 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
554 my $metadata_value = $metadata_values[$i];
555 # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
556 my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;
557
558 my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"});
559 my $partitionname = $partition_start;
560 if ($partitionend ne $partition_start) {
561 $partitionname = $partitionname . "-" . $partitionend;
562 }
563
564 # Start a new partition
565 if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) {
566 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
567 $last_partition_end = $partitionend;
568 $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
569 $items_done = 0;
570 %metadata_values_to_OIDs_subhashes = ();
571 }
572
573 # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
574 if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) {
575 my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
576 # Get the number of partitions needed for this value
577 my $num_splits = int($items_for_this_md_value / $partition_size_within_level);
578 $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits);
579
580 my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}};
581 for (my $i = 0; $i < $num_splits; $i++) {
582 my %OIDs_subhashes_for_this_value = ();
583 my @OIDs_for_this_partition = ();
584 for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) {
585 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
586 }
587
588 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
589 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) {
590 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
591 $items_done += scalar(@OIDs_for_this_partition);
592 next;
593 }
594
595 # Add an HList for this bucket
596 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
597 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
598 $last_partition_end = $partitionname_for_this_value;
599 }
600 next;
601 }
602
603 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
604 $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;
605
606 # The last partition
607 if($i == scalar(@metadata_values) - 1) {
608 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
609 }
610 }
611 }
612 else {
613 # The easier case, just add a partition
614 my %metadata_values_to_OIDs_subhashes = ();
615 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
616 my $metadata_value = $metadata_values[$i];
617 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
618 }
619 my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
620 my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"});
621 my $partitionname = $partition_start;
622 if ($partitionend ne $partition_start) {
623 $partitionname = $partitionname . "-" . $partitionend;
624 }
625 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
626 $last_partition_end = $partitionend;
627 }
628 }
629
630 # The partitions are stored in an HList
631 $classifier_node->{'childtype'} = "HList";
632
633 } # end approximate_size
634 else {
635 # Generate hlists of a certain size
636 if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
637 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
638 my $itemsdone = 0;
639 my %metadata_value_to_OIDs_subhash = ();
640 my $lastpartitionend = "";
641 my $partitionstart;
642 foreach my $metadata_value (@sortedmetadata_values) {
643 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
644 $itemsdone++;
645 my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
646
647 # Is this the start of a new partition?
648 if ($itemsinpartition == 1) {
649 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
650 }
651
652 # Is this the end of the partition?
653 if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) {
654 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
655 my $partitionname = $partitionstart;
656 if ($partitionend ne $partitionstart) {
657 $partitionname = $partitionname . "-" . $partitionend;
658 }
659
660 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
661 %metadata_value_to_OIDs_subhash = ();
662 $lastpartitionend = $partitionend;
663 }
664 }
665
666 # The partitions are stored in an HList
667 $classifier_node->{'childtype'} = "HList";
668 }
669
670 # Otherwise just add all the values to a VList
671 else {
672 $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
673 }
674 }
675}
676
677
678sub convert_utf8_string_to_unicode_string
679{
680 my $self = shift(@_);
681 my $utf8_string = shift(@_);
682
683 my $unicode_string = "";
684 foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
685 $unicode_string .= chr($unicode_value);
686 }
687 return $unicode_string;
688}
689
690
691sub convert_unicode_string_to_utf8_string
692{
693 my $self = shift(@_);
694 my $unicode_string = shift(@_);
695
696 my @unicode_array;
697 for (my $i = 0; $i < length($unicode_string); $i++) {
698 push(@unicode_array, ord(substr($unicode_string, $i, 1)));
699 }
700 return &unicode::unicode2utf8(\@unicode_array);
701}
702
703
704sub generate_partition_start
705{
706 my $self = shift(@_);
707 my $metadata_value = shift(@_);
708 my $lastpartitionend = shift(@_);
709 my $partition_name_length = shift(@_);
710
711 if ($partition_name_length) {
712 return substr($metadata_value, 0, $partition_name_length);
713 }
714
715 my $partitionstart = substr($metadata_value, 0, 1);
716 if ($partitionstart le $lastpartitionend) {
717 $partitionstart = substr($metadata_value, 0, 2);
718 # Give up after three characters
719 if ($partitionstart le $lastpartitionend) {
720 $partitionstart = substr($metadata_value, 0, 3);
721 }
722 }
723
724 return $partitionstart;
725}
726
727
728sub generate_partition_end
729{
730 my $self = shift(@_);
731 my $metadata_value = shift(@_);
732 my $partitionstart = shift(@_);
733 my $partition_name_length = shift(@_);
734
735 if ($partition_name_length) {
736 return substr($metadata_value, 0, $partition_name_length);
737 }
738
739 my $partitionend = substr($metadata_value, 0, length($partitionstart));
740 if ($partitionend gt $partitionstart) {
741 $partitionend = substr($metadata_value, 0, 1);
742 if ($partitionend le $partitionstart) {
743 $partitionend = substr($metadata_value, 0, 2);
744 # Give up after three characters
745 if ($partitionend le $partitionstart) {
746 $partitionend = substr($metadata_value, 0, 3);
747 }
748 }
749 }
750
751 return $partitionend;
752}
753
754
755sub add_hlist_partition
756{
757 my $self = shift(@_);
758 my @metadata_groups = @{shift(@_)};
759 my $classifier_node = shift(@_);
760 my $partitionname = shift(@_);
761 my $metadata_value_to_OIDs_hash_ref = shift(@_);
762
763 # Create an hlist partition
764 my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
765 'childtype' => "VList",
766 'contains' => [] );
767
768 # Add the children to the hlist partition
769 $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
770 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
771}
772
773
774sub add_vlist
775{
776 my $self = shift(@_);
777 my @metadata_groups = @{shift(@_)};
778 my $classifier_node = shift(@_);
779 my $metadata_value_to_OIDs_hash_ref = shift(@_);
780
781 my $metadata_group = shift(@metadata_groups);
782 $classifier_node->{'mdtype'} = $metadata_group;
783
784 # Create an entry in the vlist for each value
785 foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
786 {
787 my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
788 # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
789 if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
790 my $OID = $OIDs[0];
791 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
792 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
793 }
794 # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
795 elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
796 @OIDs = $self->sort_leaf_items(\@OIDs);
797 foreach my $OID (@OIDs) {
798 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
799 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
800 }
801
802 }
803 # Otherwise create a sublist (bookshelf) for the metadata value
804 else {
805 my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
806 'childtype' => "VList",
807 'mdtype' => $metadata_group,
808 'contains' => [] );
809
810 #@OIDs = $self->sort_leaf_items(\@OIDs);
811 # If there are metadata elements remaining, recursively apply the process
812 if (@metadata_groups > 0) {
813 my $next_metadata_group = $metadata_groups[0];
814 $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
815
816 # separate metadata into those that below in the next/sub-metadata_group
817 # and those that below at the current level's metadata_group
818
819 my $OID_to_metadata_values_hash_ref = $self->{$next_metadata_group . ".list"};
820 my @current_level_OIDs = ();
821 my @next_level_OIDs = ();
822 foreach my $OID (@OIDs)
823 {
824 if ($OID_to_metadata_values_hash_ref->{$OID}) {
825 push(@next_level_OIDs, $OID);
826 } else {
827 push(@current_level_OIDs, $OID);
828 }
829 }
830 # recursively process those docs belonging to the sub-metadata_group
831 $self->add_level(\@metadata_groups, \@next_level_OIDs, \%child_classifier_node);
832
833 # For those docs that don't belong in the sub/next_metadata_group, but which belong
834 # at this level, just add the documents as children of this list at the current level
835 @current_level_OIDs = $self->sort_leaf_items(\@current_level_OIDs);
836 foreach my $current_level_OID (@current_level_OIDs) {
837 my $offset = $self->metadata_offset($metadata_group, $current_level_OID, $metadata_value);
838 push(@{$child_classifier_node{'contains'}}, { 'OID' => $current_level_OID , 'offset' => $offset });
839 }
840 }
841 # Otherwise just add the documents as children of this list
842 else {
843 @OIDs = $self->sort_leaf_items(\@OIDs);
844 foreach my $OID (@OIDs) {
845 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
846 push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
847 }
848
849 }
850
851 # Add the sublist to the list
852 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
853 }
854 }
855}
856
857sub metadata_offset
858{
859 my $self = shift(@_);
860 my $metadata_group = shift(@_);
861 my $OID = shift(@_);
862 my $metadata_value = shift(@_);
863
864 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
865 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
866 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
867 if ($metadata_value eq $metadata_values[$i]) {
868 return $i;
869 }
870 }
871
872 return 0;
873}
874
875sub sort_leaf_items
876{
877 my $self = shift(@_);
878 my @OIDs = @{shift(@_)};
879# my $classifier_node = shift(@_);
880
881 # Sort leaf nodes and add to list
882 my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
883 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) {
884 my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
885 # Force a stable sort (Perl 5.6's sort isn't stable)
886 # !! The [0] bits aren't ideal (multiple metadata values) !!
887 @OIDs = @OIDs[ sort {
888 if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]} && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]})))
889 {
890 $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
891 }
892 else
893 {
894 $a <=> $b;
895 }
896 } 0..$#OIDs ];
897 }
898 if ($self->{'reverse_sort_leaf_nodes'}) {
899 print STDERR "reversing\n";
900 return reverse @OIDs;
901 }
902 return @OIDs;
903}
904
905
906
907sub sort_metadata_values_array
908{
909 my $self = shift(@_);
910 my @metadata_values = @_;
911
912 if ($self->{'unicode_collator'}) {
913 return $self->{'unicode_collator'}->sort(@metadata_values);
914 }
915 else {
916 return sort(@metadata_values);
917 }
918}
919
920
9211;
Note: See TracBrowser for help on using the repository browser.