source: main/trunk/greenstone2/perllib/classify/List.pm@ 23154

Last change on this file since 23154 was 23154, checked in by kjdon, 14 years ago

store a hash of all doc oids, then check against this hash when asked to classify something, so that we don't classify anything twice, as this leads to the document being in the list 4 times

  • Property svn:keywords set to Author Date Id Revision
File size: 34.8 KB
Line 
1###########################################################################
2#
3# List.pm -- A general and flexible list classifier with most of
4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
29# TO DO: - Remove punctuation from metadata values before sorting.
30# - Add an AZCompactList-style hlist option?
31#
32###########################################################################
33
34package List;
35
36
37use BaseClassifier;
38
39use strict;
40
41
42sub BEGIN {
43 @List::ISA = ('BaseClassifier');
44}
45
46my $partition_type_list =
47 [ { 'name' => "per_letter",
48 'desc' => "{List.level_partition.per_letter}" },
49 { 'name' => "approximate_size",
50 'desc' => "{List.level_partition.approximate_size}"},
51 { 'name' => "constant_size",
52 'desc' => "{List.level_partition.constant_size}" },
53 { 'name' => "none",
54 'desc' => "{List.level_partition.none}" } ];
55
56# following used to check types later on
57my $valid_partition_types = { 'per_letter' => 1,
58 'constant_size' => 1,
59 'per_letter_fixed_size' => 1,
60 'approximate_size' => 1,
61 'none' => 1};
62
63my $bookshelf_type_list =
64 [ { 'name' => "always",
65 'desc' => "{List.bookshelf_type.always}" },
66 { 'name' => "duplicate_only",
67 'desc' => "{List.bookshelf_type.duplicate_only}" },
68 { 'name' => "never",
69 'desc' => "{List.bookshelf_type.never}" } ];
70
71my $arguments =
72 [ { 'name' => "metadata",
73 'desc' => "{List.metadata}",
74 'type' => "metadata",
75 'reqd' => "yes" },
76
77 # The interesting options
78 { 'name' => "bookshelf_type",
79 'desc' => "{List.bookshelf_type}",
80 'type' => "enum",
81 'list' => $bookshelf_type_list,
82 'deft' => "never" },
83 { 'name' => "classify_sections",
84 'desc' => "{List.classify_sections}",
85 'type' => "flag" },
86 { 'name' => "partition_type_within_level",
87 'desc' => "{List.partition_type_within_level}",
88 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
89 'list' => $partition_type_list,
90 'deft' => "per_letter" },
91 { 'name' => "partition_size_within_level",
92 'desc' => "{List.partition_size_within_level}",
93 'type' => "string" }, # Must be string because multiple values can be specified (separated by '/')
94 { 'name' => "partition_name_length",
95 'desc' => "{List.partition_name_length}",
96 'type' => "string" },
97 { 'name' => "sort_leaf_nodes_using",
98 'desc' => "{List.sort_leaf_nodes_using}",
99 'type' => "metadata",
100 'deft' => "Title" },
101 { 'name' => "sort_using_unicode_collation",
102 'desc' => "{List.sort_using_unicode_collation}",
103 'type' => "flag" },
104 { 'name' => "use_hlist_for",
105 'desc' => "{List.use_hlist_for}",
106 'type' => "string" },
107 { 'name' => "removeprefix",
108 'desc' => "{BasClas.removeprefix}",
109 'type' => "regexp" },
110 { 'name' => "removesuffix",
111 'desc' => "{BasClas.removesuffix}",
112 'type' => "regexp" } ];
113
114my $options = { 'name' => "List",
115 'desc' => "{List.desc}",
116 'abstract' => "no",
117 'inherits' => "yes",
118 'args' => $arguments };
119
120
121sub new
122{
123 my ($class) = shift(@_);
124 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
125 push(@$classifierslist, $class);
126
127 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
128 push(@{$hashArgOptLists->{"OptList"}}, $options);
129
130 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
131
132 if ($self->{'info_only'}) {
133 # don't worry about any options etc
134 return bless $self, $class;
135 }
136
137 # The metadata elements to use (required)
138 if (!$self->{'metadata'}) {
139 die "Error: No metadata fields specified for List.\n";
140 }
141 my @metadata_groups = split(/\//, $self->{'metadata'});
142 $self->{'metadata_groups'} = \@metadata_groups;
143
144 # The classifier button name (default: the first metadata element specified)
145 if (!$self->{'buttonname'}) {
146 my $first_metadata_group = $metadata_groups[0];
147 my $first_metadata_element = (split(/\;|,/, $first_metadata_group))[0];
148 $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
149 }
150
151 # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
152 foreach my $metadata_group (@metadata_groups) {
153 $self->{$metadata_group . ".bookshelf_type"} = "always";
154 }
155 my $last_metadata_group = $metadata_groups[$#metadata_groups];
156 # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
157 $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
158
159 # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
160 foreach my $metadata_group (@metadata_groups) {
161 $self->{$metadata_group . ".list_type"} = "VList";
162 }
163 foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
164 $self->{$metadata_group . ".list_type"} = "HList";
165 }
166
167 # How the items are grouped into partitions (default: no partition)
168 # for each level (metadata group), separated by '/'
169 if (!$self->{"partition_type_within_level"}) {
170 foreach my $metadata_group (@metadata_groups) {
171 $self->{$metadata_group . ".partition_type_within_level"} = "none";
172 }
173 } else {
174 my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'});
175
176 my $first = 1;
177 foreach my $metadata_group (@metadata_groups) {
178 my $partition_type_within_levelelem = shift(@partition_type_within_levellist);
179 if (defined($partition_type_within_levelelem) && $partition_type_within_levelelem eq "per_letter_fixed_size") {
180 print STDERR "per letter fixed size, changing to approximate size\n";
181 $partition_type_within_levelelem = "approximate_size";
182 }
183 if (defined($partition_type_within_levelelem) && defined $valid_partition_types->{$partition_type_within_levelelem}) {
184 $self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem;
185 }
186 else {
187 if ($first) {
188 $self->{$metadata_group . ".partition_type_within_level"} = "none";
189 $first = 0;
190 } else {
191 $self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"};
192 }
193 if (defined($partition_type_within_levelelem)) {
194 # ie invalid entry
195 print STDERR "invalid partition type for level $metadata_group: $partition_type_within_levelelem, defaulting to ". $self->{$metadata_group . ".partition_type_within_level"} ."\n";
196 }
197 }
198 }
199 }
200
201 # The number of items in each partition
202 if (!$self->{'partition_size_within_level'}) {
203 # Default: 20
204 foreach my $metadata_group (@metadata_groups) {
205 $self->{$metadata_group . ".partition_size_within_level"} = 20;
206 }
207 }
208 else {
209 my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
210
211 # Assign values based on the partition_size_within_level parameter
212 foreach my $metadata_group (@metadata_groups) {
213 my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
214 if (defined($partition_size_within_levelelem)) {
215 $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
216 }
217 else {
218 $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
219 }
220 }
221 }
222
223 # The removeprefix and removesuffix expressions
224 if ($self->{'removeprefix'}) {
225 # If there are more than one expressions, use '' to quote each experession and '/' to separate
226 my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
227
228 foreach my $metadata_group (@metadata_groups) {
229 my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
230 if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
231 # Remove the other ' at the beginning and the end if there is any
232 $removeprefix_expr_within_levelelem =~ s/^'//;
233 $removeprefix_expr_within_levelelem =~ s/'$//;
234 # Remove the extra ^ at the beginning
235 $removeprefix_expr_within_levelelem =~ s/^\^//;
236 $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
237 } else {
238 $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
239 }
240 }
241 }
242 if ($self->{'removesuffix'}) {
243 my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
244
245 foreach my $metadata_group (@metadata_groups) {
246 my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
247 if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
248 $removesuffix_expr_within_levelelem =~ s/^'//;
249 $removesuffix_expr_within_levelelem =~ s/'$//;
250 # Remove the extra $ at the end
251 $removesuffix_expr_within_levelelem =~ s/\$$//;
252 $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
253 } else {
254 $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
255 }
256 }
257 }
258
259 # The metadata elements to use to sort the leaf nodes (default: Title)
260 my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
261 if ($self->{'sort_leaf_nodes_using'}) {
262 @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
263 }
264 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
265
266 # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
267 if ($self->{'sort_using_unicode_collation'}) {
268 # To use this you first need to download the allkeys.txt file from
269 # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
270 # Unicode/Collate directory.
271 require Unicode::Collate;
272 $self->{'unicode_collator'} = Unicode::Collate->new();
273 }
274
275 # An empty array for the document/section OIDs that we are classifying
276 $self->{'OIDs'} = [];
277 # A hash for all the doc ids that we have seen, so we don't classify something twice
278 $self->{'all_doc_OIDs'} = {};
279 return bless $self, $class;
280}
281
282
283sub init
284{
285 # Nothing to do...
286}
287
288
289# Called for each document in the collection
290sub classify
291{
292 my $self = shift(@_);
293 my ($doc_obj) = @_;
294
295 if (defined $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()}) {
296 print STDERR "Warning, List classifier has already seen document ".$doc_obj->get_OID().", not classifying again\n";
297 return;
298 }
299 $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()} = 1;
300 # If "-classify_sections" is set, classify every section of the document
301 if ($self->{'classify_sections'}) {
302 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
303 while (defined $section) {
304 $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section);
305 $section = $doc_obj->get_next_section($section);
306 }
307 }
308 # Otherwise just classify the top document section
309 else {
310 $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section());
311 }
312
313}
314
315sub classify_section
316{
317 my $self = shift(@_);
318 my ($doc_obj,$section_OID,$section) = @_;
319
320 my @metadata_groups = @{$self->{'metadata_groups'}};
321
322 # Only classify the section if it has a value for one of the metadata elements in the first group
323 my $classify_section = 0;
324 my $first_metadata_group = $metadata_groups[0];
325 my $remove_prefix_expr = $self->{$first_metadata_group . ".remove_prefix_expr"};
326 my $remove_suffix_expr = $self->{$first_metadata_group . ".remove_suffix_expr"};
327 foreach my $first_metadata_group_element (split(/\;|,/, $first_metadata_group)) {
328 my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
329 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element);
330
331 # Remove prefix/suffix if requested
332 if (defined ($first_metadata_group_element_value)) {
333 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
334 $first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
335 }
336
337 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
338 $first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
339 }
340 }
341 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
342 # This section must be included in the classifier
343 $classify_section = 1;
344 last;
345 }
346 }
347
348 # We're not classifying this section because it doesn't have the required metadata
349 return if (!$classify_section);
350
351 # Otherwise, include this section in the classifier
352
353 push(@{$self->{'OIDs'}}, $section_OID);
354
355 # Create a hash for the metadata values of each metadata element we're interested in
356 my %metadata_groups_done = ();
357 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
358 # Take care not to do a metadata group more than once
359 unless ($metadata_groups_done{$metadata_group}) {
360 my $remove_prefix_expr = $self->{$metadata_group . ".remove_prefix_expr"};
361 my $remove_suffix_expr = $self->{$metadata_group . ".remove_suffix_expr"};
362 foreach my $metadata_element (split(/\;|,/, $metadata_group)) {
363 my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);
364
365 my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
366 foreach my $metadata_value (@metadata_values) {
367 # Strip leading and trailing whitespace
368 $metadata_value =~ s/^\s*//;
369 $metadata_value =~ s/\s*$//;
370
371 # Remove prefix/suffix if requested
372 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
373 $metadata_value =~ s/^$remove_prefix_expr//;
374 }
375 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
376 $metadata_value =~ s/$remove_suffix_expr$//;
377 }
378
379 # uppercase the metadata - makes the AZList nicer
380 $metadata_value = uc($metadata_value);
381 # Convert the metadata value from a UTF-8 string to a Unicode string
382 # This means that length() and substr() work properly
383 # We need to be careful to convert classifier node title values back to UTF-8, however
384 my $metadata_value_unicode_string = $self->convert_utf8_string_to_unicode_string($metadata_value);
385
386 # Add the metadata value into the list for this combination of metadata group and section
387 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $metadata_value_unicode_string);
388 }
389 last if (@metadata_values > 0);
390 }
391
392 $metadata_groups_done{$metadata_group} = 1;
393 }
394 }
395}
396
397
398sub get_classify_info
399{
400 my $self = shift(@_);
401
402 # The metadata groups to classify by
403 my @metadata_groups = @{$self->{'metadata_groups'}};
404 my $first_metadata_group = $metadata_groups[0];
405
406 # The OID values of the documents to include in the classifier
407 my @OIDs = @{$self->{'OIDs'}};
408
409 # Create the root node of the classification hierarchy
410 my %classifier_node = ( 'thistype' => "Invisible",
411 'childtype' => $self->{$first_metadata_group . ".list_type"},
412 'Title' => $self->{'buttonname'},
413 'contains' => [],
414 'mdtype' => $first_metadata_group );
415
416 # Recursively create the classification hierarchy, one level for each metadata group
417 $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
418 return \%classifier_node;
419}
420
421
422sub add_level
423{
424 my $self = shift(@_);
425 my @metadata_groups = @{shift(@_)};
426 my @OIDs = @{shift(@_)};
427 my $classifier_node = shift(@_);
428
429 my $metadata_group = $metadata_groups[0];
430
431 if (!defined($self->{$metadata_group . ".list"})) {
432 print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
433 return;
434 }
435
436 # Create a mapping from metadata value to OID
437 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
438 my %metadata_value_to_OIDs_hash = ();
439 foreach my $OID (@OIDs)
440 {
441 if ($OID_to_metadata_values_hash_ref->{$OID})
442 {
443 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
444 foreach my $metadata_value (@metadata_values)
445 {
446 push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
447 }
448 }
449 }
450 #print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
451
452 # Partition the values (if necessary)
453 my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
454 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
455 if ($partition_type_within_level =~ /^per_letter$/i) {
456 # Generate one hlist for each letter
457 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
458 my %metadata_value_to_OIDs_subhash = ();
459
460 my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
461 foreach my $metadata_value (@sortedmetadata_values) {
462 my $metadata_valuepartition = substr($metadata_value, 0, 1);
463
464 # Is this the start of a new partition?
465 if ($metadata_valuepartition ne $lastpartition) {
466 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
467 %metadata_value_to_OIDs_subhash = ();
468 $lastpartition = $metadata_valuepartition;
469 }
470
471 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
472 }
473
474 # Don't forget to add the last partition
475 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
476
477 # The partitions are stored in an HList
478 $classifier_node->{'childtype'} = "HList";
479 }
480 elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
481 # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size
482 # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible
483 # If a letter has more items than specified, split into several hlists.
484 # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
485 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
486 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
487 my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};
488
489 # Separate values by their first letter, each form a bucket, like the per_letter partition type
490 my $last_partition = substr($sortedmetadata_values[0], 0, 1);
491 my %partition_buckets = ();
492 my @metadata_values_in_bucket = ();
493 my $num_items_in_bucket = 0;
494 foreach my $metadata_value (@sortedmetadata_values) {
495 my $metadata_valuepartition = substr($metadata_value, 0, 1);
496 if ($metadata_valuepartition ne $last_partition) {
497 my @temp_array = @metadata_values_in_bucket;
498 # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
499 my %partition_info = ();
500 $partition_info{'metadata_values'} = \@temp_array;
501 $partition_info{'size'} = $num_items_in_bucket;
502 $partition_buckets{$last_partition} = \%partition_info;
503
504 @metadata_values_in_bucket = ($metadata_value);
505 $num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
506 $last_partition = $metadata_valuepartition;
507 } else {
508 $num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
509 push (@metadata_values_in_bucket, $metadata_value);
510 }
511 }
512 # Last one
513 my %partition_info = ();
514 $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
515 $partition_info{'size'} = $num_items_in_bucket;
516 $partition_buckets{$last_partition} = \%partition_info;
517
518 my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
519 for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) {
520 my $partition = $partition_keys[$i];
521 my $items_in_partition = $partition_buckets{$partition}->{'size'};
522 # Merge small buckets together, but keep the numeric bucket apart
523 if ($items_in_partition < $partition_size_within_level) {
524 my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'};
525 if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level
526 && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)
527 || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) {
528 foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) {
529 push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge);
530 }
531 $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition;
532 delete $partition_buckets{$partition};
533 }
534 }
535 }
536 @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
537
538 # Add partitions, and divide big bucket into several
539 my $last_partition_end = "";
540 my $partition_start = "";
541 foreach my $partition (@partition_keys) {
542 my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}});
543 my $items_in_partition = $partition_buckets{$partition}->{'size'};
544 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"});
545
546 if ($items_in_partition > $partition_size_within_level) {
547 my $items_done = 0;
548 my %metadata_values_to_OIDs_subhashes = ();
549 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
550 my $metadata_value = $metadata_values[$i];
551 # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
552 my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;
553
554 my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"});
555 my $partitionname = $partition_start;
556 if ($partitionend ne $partition_start) {
557 $partitionname = $partitionname . "-" . $partitionend;
558 }
559
560 # Start a new partition
561 if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) {
562 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
563 $last_partition_end = $partitionend;
564 $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
565 $items_done = 0;
566 %metadata_values_to_OIDs_subhashes = ();
567 }
568
569 # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
570 if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) {
571 my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
572 # Get the number of partitions needed for this value
573 my $num_splits = int($items_for_this_md_value / $partition_size_within_level);
574 $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits);
575
576 my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}};
577 for (my $i = 0; $i < $num_splits; $i++) {
578 my %OIDs_subhashes_for_this_value = ();
579 my @OIDs_for_this_partition = ();
580 for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) {
581 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
582 }
583
584 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
585 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) {
586 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
587 $items_done += scalar(@OIDs_for_this_partition);
588 next;
589 }
590
591 # Add an HList for this bucket
592 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
593 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
594 $last_partition_end = $partitionname_for_this_value;
595 }
596 next;
597 }
598
599 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
600 $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;
601
602 # The last partition
603 if($i == scalar(@metadata_values) - 1) {
604 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
605 }
606 }
607 }
608 else {
609 # The easier case, just add a partition
610 my %metadata_values_to_OIDs_subhashes = ();
611 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
612 my $metadata_value = $metadata_values[$i];
613 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
614 }
615 my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
616 my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"});
617 my $partitionname = $partition_start;
618 if ($partitionend ne $partition_start) {
619 $partitionname = $partitionname . "-" . $partitionend;
620 }
621 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
622 $last_partition_end = $partitionend;
623 }
624 }
625
626 # The partitions are stored in an HList
627 $classifier_node->{'childtype'} = "HList";
628
629 } # end approximate_size
630 else {
631 # Generate hlists of a certain size
632 if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
633 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
634 my $itemsdone = 0;
635 my %metadata_value_to_OIDs_subhash = ();
636 my $lastpartitionend = "";
637 my $partitionstart;
638 foreach my $metadata_value (@sortedmetadata_values) {
639 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
640 $itemsdone++;
641 my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
642
643 # Is this the start of a new partition?
644 if ($itemsinpartition == 1) {
645 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
646 }
647
648 # Is this the end of the partition?
649 if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) {
650 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
651 my $partitionname = $partitionstart;
652 if ($partitionend ne $partitionstart) {
653 $partitionname = $partitionname . "-" . $partitionend;
654 }
655
656 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
657 %metadata_value_to_OIDs_subhash = ();
658 $lastpartitionend = $partitionend;
659 }
660 }
661
662 # The partitions are stored in an HList
663 $classifier_node->{'childtype'} = "HList";
664 }
665
666 # Otherwise just add all the values to a VList
667 else {
668 $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
669 }
670 }
671}
672
673
674sub convert_utf8_string_to_unicode_string
675{
676 my $self = shift(@_);
677 my $utf8_string = shift(@_);
678
679 my $unicode_string = "";
680 foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
681 $unicode_string .= chr($unicode_value);
682 }
683 return $unicode_string;
684}
685
686
687sub convert_unicode_string_to_utf8_string
688{
689 my $self = shift(@_);
690 my $unicode_string = shift(@_);
691
692 my @unicode_array;
693 for (my $i = 0; $i < length($unicode_string); $i++) {
694 push(@unicode_array, ord(substr($unicode_string, $i, 1)));
695 }
696 return &unicode::unicode2utf8(\@unicode_array);
697}
698
699
700sub generate_partition_start
701{
702 my $self = shift(@_);
703 my $metadata_value = shift(@_);
704 my $lastpartitionend = shift(@_);
705 my $partition_name_length = shift(@_);
706
707 if ($partition_name_length) {
708 return substr($metadata_value, 0, $partition_name_length);
709 }
710
711 my $partitionstart = substr($metadata_value, 0, 1);
712 if ($partitionstart le $lastpartitionend) {
713 $partitionstart = substr($metadata_value, 0, 2);
714 # Give up after three characters
715 if ($partitionstart le $lastpartitionend) {
716 $partitionstart = substr($metadata_value, 0, 3);
717 }
718 }
719
720 return $partitionstart;
721}
722
723
724sub generate_partition_end
725{
726 my $self = shift(@_);
727 my $metadata_value = shift(@_);
728 my $partitionstart = shift(@_);
729 my $partition_name_length = shift(@_);
730
731 if ($partition_name_length) {
732 return substr($metadata_value, 0, $partition_name_length);
733 }
734
735 my $partitionend = substr($metadata_value, 0, length($partitionstart));
736 if ($partitionend gt $partitionstart) {
737 $partitionend = substr($metadata_value, 0, 1);
738 if ($partitionend le $partitionstart) {
739 $partitionend = substr($metadata_value, 0, 2);
740 # Give up after three characters
741 if ($partitionend le $partitionstart) {
742 $partitionend = substr($metadata_value, 0, 3);
743 }
744 }
745 }
746
747 return $partitionend;
748}
749
750
751sub add_hlist_partition
752{
753 my $self = shift(@_);
754 my @metadata_groups = @{shift(@_)};
755 my $classifier_node = shift(@_);
756 my $partitionname = shift(@_);
757 my $metadata_value_to_OIDs_hash_ref = shift(@_);
758
759 # Create an hlist partition
760 my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
761 'childtype' => "VList",
762 'contains' => [] );
763
764 # Add the children to the hlist partition
765 $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
766 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
767}
768
769
770sub add_vlist
771{
772 my $self = shift(@_);
773 my @metadata_groups = @{shift(@_)};
774 my $classifier_node = shift(@_);
775 my $metadata_value_to_OIDs_hash_ref = shift(@_);
776
777 my $metadata_group = shift(@metadata_groups);
778 $classifier_node->{'mdtype'} = $metadata_group;
779
780 # Create an entry in the vlist for each value
781 foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
782 {
783 my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
784 # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
785 if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
786 my $OID = $OIDs[0];
787 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
788 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
789 }
790 # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
791 elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
792 @OIDs = $self->sort_leaf_items(\@OIDs);
793 foreach my $OID (@OIDs) {
794 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
795 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
796 }
797
798 }
799 # Otherwise create a sublist (bookshelf) for the metadata value
800 else {
801 my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
802 'childtype' => "VList",
803 'mdtype' => $metadata_group,
804 'contains' => [] );
805
806 # If there are metadata elements remaining, recursively apply the process
807 if (@metadata_groups > 0) {
808 my $next_metadata_group = $metadata_groups[0];
809 $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
810 $self->add_level(\@metadata_groups, \@OIDs, \%child_classifier_node);
811 }
812 # Otherwise just add the documents as children of this list
813 else {
814 @OIDs = $self->sort_leaf_items(\@OIDs);
815 foreach my $OID (@OIDs) {
816 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
817 push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
818 }
819
820 }
821
822 # Add the sublist to the list
823 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
824 }
825 }
826}
827
828sub metadata_offset
829{
830 my $self = shift(@_);
831 my $metadata_group = shift(@_);
832 my $OID = shift(@_);
833 my $metadata_value = shift(@_);
834
835 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
836 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
837 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
838 if ($metadata_value eq $metadata_values[$i]) {
839 return $i;
840 }
841 }
842
843 return 0;
844}
845
846sub sort_leaf_items
847{
848 my $self = shift(@_);
849 my @OIDs = @{shift(@_)};
850# my $classifier_node = shift(@_);
851
852 # Sort leaf nodes and add to list
853 my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
854 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) {
855 my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
856 # Force a stable sort (Perl 5.6's sort isn't stable)
857 # !! The [0] bits aren't ideal (multiple metadata values) !!
858 @OIDs = @OIDs[ sort {
859 if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]} && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]})))
860 {
861 $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
862 }
863 else
864 {
865 $a <=> $b;
866 }
867 } 0..$#OIDs ];
868 }
869 return @OIDs;
870}
871
872
873
874sub sort_metadata_values_array
875{
876 my $self = shift(@_);
877 my @metadata_values = @_;
878
879 if ($self->{'unicode_collator'}) {
880 return $self->{'unicode_collator'}->sort(@metadata_values);
881 }
882 else {
883 return sort(@metadata_values);
884 }
885}
886
887
8881;
Note: See TracBrowser for help on using the repository browser.