source: gsdl/trunk/perllib/classify/List.pm@ 20426

Last change on this file since 20426 was 20424, checked in by kjdon, 15 years ago

made a subroutine for removing ex. Call this instead of doing it explicitly

  • Property svn:keywords set to Author Date Id Revision
File size: 32.8 KB
Line 
1###########################################################################
2#
3# List.pm -- A general and flexible list classifier with most of
4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
29# TO DO: - Remove punctuation from metadata values before sorting.
30# - Add an AZCompactList-style hlist option?
31#
32###########################################################################
33
34package List;
35
36
37use BaseClassifier;
38
39use strict;
40
41
42sub BEGIN {
43 @List::ISA = ('BaseClassifier');
44}
45
46my $partition_type_list =
47 [ { 'name' => "per_letter",
48 'desc' => "{List.level_partition.per_letter}" },
49 { 'name' => "constant_size",
50 'desc' => "{List.level_partition.constant_size}" },
51 { 'name' => "per_letter_fixed_size",
52 'desc' => "{List.level_partition.per_letter_fixed_size}"},
53 { 'name' => "none",
54 'desc' => "{List.level_partition.none}" } ];
55
56my $bookshelf_type_list =
57 [ { 'name' => "always",
58 'desc' => "{List.bookshelf_type.always}" },
59 { 'name' => "duplicate_only",
60 'desc' => "{List.bookshelf_type.duplicate_only}" },
61 { 'name' => "never",
62 'desc' => "{List.bookshelf_type.never}" } ];
63
64my $arguments =
65 [ { 'name' => "metadata",
66 'desc' => "{List.metadata}",
67 'type' => "metadata",
68 'reqd' => "yes" },
69
70 # The interesting options
71 { 'name' => "bookshelf_type",
72 'desc' => "{List.bookshelf_type}",
73 'type' => "enum",
74 'list' => $bookshelf_type_list,
75 'deft' => "never" },
76 { 'name' => "classify_sections",
77 'desc' => "{List.classify_sections}",
78 'type' => "flag" },
79 { 'name' => "partition_type_within_level",
80 'desc' => "{List.partition_type_within_level}",
81 'type' => "enum",
82 'list' => $partition_type_list,
83 'deft' => "per_letter" },
84 { 'name' => "partition_size_within_level",
85 'desc' => "{List.partition_size_within_level}",
86 'type' => "int",
87 'range' => "1,",
88 'deft' => "39" },
89 { 'name' => "partition_name_length",
90 'desc' => "{List.partition_name_length}",
91 'type' => "string" },
92 { 'name' => "sort_leaf_nodes_using",
93 'desc' => "{List.sort_leaf_nodes_using}",
94 'type' => "metadata",
95 'deft' => "Title" },
96 { 'name' => "sort_using_unicode_collation",
97 'desc' => "{List.sort_using_unicode_collation}",
98 'type' => "flag" },
99 { 'name' => "use_hlist_for",
100 'desc' => "{List.use_hlist_for}",
101 'type' => "string" },
102 { 'name' => "removeprefix",
103 'desc' => "{BasClas.removeprefix}",
104 'type' => "regexp" },
105 { 'name' => "removesuffix",
106 'desc' => "{BasClas.removesuffix}",
107 'type' => "regexp" } ];
108
109my $options = { 'name' => "List",
110 'desc' => "{List.desc}",
111 'abstract' => "no",
112 'inherits' => "yes",
113 'args' => $arguments };
114
115
116sub new
117{
118 my ($class) = shift(@_);
119 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
120 push(@$classifierslist, $class);
121
122 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
123 push(@{$hashArgOptLists->{"OptList"}}, $options);
124
125 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
126
127 if ($self->{'info_only'}) {
128 # don't worry about any options etc
129 return bless $self, $class;
130 }
131
132 # The metadata elements to use (required)
133 if (!$self->{'metadata'}) {
134 die "Error: No metadata fields specified for List.\n";
135 }
136 my @metadata_groups = split(/\//, $self->{'metadata'});
137 $self->{'metadata_groups'} = \@metadata_groups;
138
139 # The classifier button name (default: the first metadata element specified)
140 if (!$self->{'buttonname'}) {
141 my $first_metadata_group = $metadata_groups[0];
142 my $first_metadata_element = (split(/\;|,/, $first_metadata_group))[0];
143 $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
144 }
145
146 # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
147 foreach my $metadata_group (@metadata_groups) {
148 $self->{$metadata_group . ".bookshelf_type"} = "always";
149 }
150 my $last_metadata_group = $metadata_groups[$#metadata_groups];
151 # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
152 $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
153
154 # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
155 foreach my $metadata_group (@metadata_groups) {
156 $self->{$metadata_group . ".list_type"} = "VList";
157 }
158 foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
159 $self->{$metadata_group . ".list_type"} = "HList";
160 }
161
162 # How the items are grouped into partitions (default: no partition)
163 # for each level (metadata group), separated by '/'
164 if (!$self->{"partition_type_within_level"}) {
165 foreach my $metadata_group (@metadata_groups) {
166 $self->{$metadata_group . ".partition_type_within_level"} = "none";
167 }
168 } else {
169 my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'});
170 foreach my $metadata_group (@metadata_groups) {
171 my $partition_type_within_levelelem = shift(@partition_type_within_levellist);
172 if (defined($partition_type_within_levelelem)) {
173 $self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem;
174 }
175 else {
176 $self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"};
177 }
178 }
179 }
180
181 # The number of items in each partition
182 if (!$self->{'partition_size_within_level'}) {
183 # Default: 20
184 foreach my $metadata_group (@metadata_groups) {
185 $self->{$metadata_group . ".partition_size_within_level"} = 20;
186 }
187 }
188 else {
189 my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
190
191 # Assign values based on the partition_size_within_level parameter
192 foreach my $metadata_group (@metadata_groups) {
193 my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
194 if (defined($partition_size_within_levelelem)) {
195 $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
196 }
197 else {
198 $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
199 }
200 }
201 }
202
203 # The removeprefix and removesuffix expressions
204 if ($self->{'removeprefix'}) {
205 # If there are more than one expressions, use '' to quote each experession and '/' to separate
206 my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
207
208 foreach my $metadata_group (@metadata_groups) {
209 my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
210 if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
211 # Remove the other ' at the beginning and the end if there is any
212 $removeprefix_expr_within_levelelem =~ s/^'//;
213 $removeprefix_expr_within_levelelem =~ s/'$//;
214 # Remove the extra ^ at the beginning
215 $removeprefix_expr_within_levelelem =~ s/^\^//;
216 $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
217 } else {
218 $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
219 }
220 }
221 }
222 if ($self->{'removesuffix'}) {
223 my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
224
225 foreach my $metadata_group (@metadata_groups) {
226 my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
227 if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
228 $removesuffix_expr_within_levelelem =~ s/^'//;
229 $removesuffix_expr_within_levelelem =~ s/'$//;
230 # Remove the extra $ at the end
231 $removesuffix_expr_within_levelelem =~ s/\$$//;
232 $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
233 } else {
234 $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
235 }
236 }
237 }
238
239 # The metadata elements to use to sort the leaf nodes (default: Title)
240 my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
241 if ($self->{'sort_leaf_nodes_using'}) {
242 @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
243 }
244 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
245
246 # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
247 if ($self->{'sort_using_unicode_collation'}) {
248 # To use this you first need to download the allkeys.txt file from
249 # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
250 # Unicode/Collate directory.
251 require Unicode::Collate;
252 $self->{'unicode_collator'} = Unicode::Collate->new();
253 }
254
255 # An empty array for the document OIDs
256 $self->{'OIDs'} = [];
257
258 return bless $self, $class;
259}
260
261
262sub init
263{
264 # Nothing to do...
265}
266
267
268# Called for each document in the collection
269sub classify
270{
271 my $self = shift(@_);
272 my ($doc_obj,$edit_mode) = @_;
273
274 # If "-classify_sections" is set, classify every section of the document
275 if ($self->{'classify_sections'}) {
276 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
277 while (defined $section) {
278 $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section, $edit_mode);
279 $section = $doc_obj->get_next_section($section);
280 }
281 }
282 # Otherwise just classify the top document section
283 else {
284 $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section(), $edit_mode);
285 }
286}
287
288
289sub classify_section
290{
291 my $self = shift(@_);
292 my ($doc_obj,$section_OID,$section,$edit_mode) = @_;
293
294 my @metadata_groups = @{$self->{'metadata_groups'}};
295
296 # Only classify the section if it has a value for one of the metadata elements in the first group
297 my $classify_section = 0;
298 my $first_metadata_group = $metadata_groups[0];
299 foreach my $first_metadata_group_element (split(/\;|,/, $first_metadata_group)) {
300 my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
301 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element);
302
303 # Remove prefix/suffix if requested
304 my $remove_prefix_expr = $self->{$first_metadata_group_element . ".remove_prefix_expr"};
305 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
306 $first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
307 }
308 my $remove_suffix_expr = $self->{$first_metadata_group_element . ".remove_suffix_expr"};
309 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
310 $first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
311 }
312
313 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
314 # This section must be included in the classifier
315 $classify_section = 1;
316 last;
317 }
318 }
319
320 # We're not classifying this section because it doesn't have the required metadata
321 return if (!$classify_section);
322
323 if (($edit_mode eq "delete") || ($edit_mode eq "update")) {
324 $self->oid_array_delete($section_OID,'OIDs');
325 if ($edit_mode eq "delete") {
326 return;
327 }
328 }
329
330 # Otherwise, include this section in the classifier
331 push(@{$self->{'OIDs'}}, $section_OID);
332
333 # Create a hash for the metadata values of each metadata element we're interested in
334 my %metadata_groups_done = ();
335 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
336 # Take care not to do a metadata group more than once
337 unless ($metadata_groups_done{$metadata_group}) {
338 foreach my $metadata_element (split(/\;|,/, $metadata_group)) {
339 my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);
340
341 my $remove_prefix_expr = $self->{$metadata_element . ".remove_prefix_expr"};
342 my $remove_suffix_expr = $self->{$metadata_element . ".remove_suffix_expr"};
343 my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
344 foreach my $metadata_value (@metadata_values) {
345 # Strip leading and trailing whitespace
346 $metadata_value =~ s/^\s*//;
347 $metadata_value =~ s/\s*$//;
348
349 # Remove prefix/suffix if requested
350 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
351 $metadata_value =~ s/^$remove_prefix_expr//;
352 }
353 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
354 $metadata_value =~ s/$remove_suffix_expr$//;
355 }
356
357 # Convert the metadata value from a UTF-8 string to a Unicode string
358 # This means that length() and substr() work properly
359 # We need to be careful to convert classifier node title values back to UTF-8, however
360 my $metadata_value_unicode_string = $self->convert_utf8_string_to_unicode_string($metadata_value);
361
362 # Add the metadata value into the list for this combination of metadata group and section
363 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $metadata_value_unicode_string);
364 }
365 last if (@metadata_values > 0);
366 }
367
368 $metadata_groups_done{$metadata_group} = 1;
369 }
370 }
371}
372
373
374sub get_classify_info
375{
376 my $self = shift(@_);
377
378 # The metadata groups to classify by
379 my @metadata_groups = @{$self->{'metadata_groups'}};
380 my $first_metadata_group = $metadata_groups[0];
381
382 # The OID values of the documents to include in the classifier
383 my @OIDs = @{$self->{'OIDs'}};
384
385 # Create the root node of the classification hierarchy
386 my %classifier_node = ( 'thistype' => "Invisible",
387 'childtype' => $self->{$first_metadata_group . ".list_type"},
388 'Title' => $self->{'buttonname'},
389 'contains' => [],
390 'mdtype' => $first_metadata_group );
391
392 # Recursively create the classification hierarchy, one level for each metadata group
393 $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
394 return \%classifier_node;
395}
396
397
398sub add_level
399{
400 my $self = shift(@_);
401 my @metadata_groups = @{shift(@_)};
402 my @OIDs = @{shift(@_)};
403 my $classifier_node = shift(@_);
404 # print STDERR "\nAdding AZ list for " . $classifier_node->{'Title'} . "\n";
405
406 my $metadata_group = $metadata_groups[0];
407 # print STDERR "Processing metadata group: " . $metadata_group . "\n";
408 # print STDERR "Number of OID values: " . @OIDs . "\n";
409
410 if (!defined($self->{$metadata_group . ".list"})) {
411 print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
412 return;
413 }
414
415 # Create a mapping from metadata value to OID
416 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
417 my %metadata_value_to_OIDs_hash = ();
418 foreach my $OID (@OIDs)
419 {
420 if ($OID_to_metadata_values_hash_ref->{$OID})
421 {
422 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
423 foreach my $metadata_value (@metadata_values)
424 {
425 push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
426 }
427 }
428 }
429 # print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
430
431 # Partition the values (if necessary)
432 my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
433 if ($partition_type_within_level =~ /^per_letter$/i) {
434 # Generate one hlist for each letter
435 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
436 my %metadata_value_to_OIDs_subhash = ();
437
438 my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
439 foreach my $metadata_value (@sortedmetadata_values) {
440 my $metadata_valuepartition = substr($metadata_value, 0, 1);
441
442 # Is this the start of a new partition?
443 if ($metadata_valuepartition ne $lastpartition) {
444 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
445 %metadata_value_to_OIDs_subhash = ();
446 $lastpartition = $metadata_valuepartition;
447 }
448
449 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
450 }
451
452 # Don't forget to add the last partition
453 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
454
455 # The partitions are stored in an HList
456 $classifier_node->{'childtype'} = "HList";
457 }
458 elsif ($partition_type_within_level =~ /^per_letter_fixed_size$/i) {
459 # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size
460 # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible
461 # If a letter has more items than specified, split into several hlists.
462 # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
463 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
464 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
465 my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};
466
467 # Separate values by their first letter, each form a bucket, like the per_letter partition type
468 my $last_partition = substr($sortedmetadata_values[0], 0, 1);
469 my %partition_buckets = ();
470 my @metadata_values_in_bucket = ();
471 my $num_items_in_bucket = 0;
472 foreach my $metadata_value (@sortedmetadata_values) {
473 my $metadata_valuepartition = substr($metadata_value, 0, 1);
474 if ($metadata_valuepartition ne $last_partition) {
475 my @temp_array = @metadata_values_in_bucket;
476 # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
477 my %partition_info = ();
478 $partition_info{'metadata_values'} = \@temp_array;
479 $partition_info{'size'} = $num_items_in_bucket;
480 $partition_buckets{$last_partition} = \%partition_info;
481
482 @metadata_values_in_bucket = ($metadata_value);
483 $num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
484 $last_partition = $metadata_valuepartition;
485 } else {
486 $num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
487 push (@metadata_values_in_bucket, $metadata_value);
488 }
489 }
490 # Last one
491 my %partition_info = ();
492 $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
493 $partition_info{'size'} = $num_items_in_bucket;
494 $partition_buckets{$last_partition} = \%partition_info;
495
496 my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
497 for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) {
498 my $partition = $partition_keys[$i];
499 my $items_in_partition = $partition_buckets{$partition}->{'size'};
500 # Merge small buckets together, but keep the numeric bucket apart
501 if ($items_in_partition < $partition_size_within_level) {
502 my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'};
503 if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level
504 && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)
505 || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) {
506 foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) {
507 push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge);
508 }
509 $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition;
510 delete $partition_buckets{$partition};
511 }
512 }
513 }
514 @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
515
516 # Add partitions, and divide big bucket into several
517 my $last_partition_end = "";
518 my $partition_start = "";
519 foreach my $partition (@partition_keys) {
520 my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}});
521 my $items_in_partition = $partition_buckets{$partition}->{'size'};
522 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"});
523
524 if ($items_in_partition > $partition_size_within_level) {
525 my $items_done = 0;
526 my %metadata_values_to_OIDs_subhashes = ();
527 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
528 my $metadata_value = $metadata_values[$i];
529 # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
530 my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;
531
532 my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"});
533 my $partitionname = $partition_start;
534 if ($partitionend ne $partition_start) {
535 $partitionname = $partitionname . "-" . $partitionend;
536 }
537
538 # Start a new partition
539 if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) {
540 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
541 $last_partition_end = $partitionend;
542 $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
543 $items_done = 0;
544 %metadata_values_to_OIDs_subhashes = ();
545 }
546
547 # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
548 if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) {
549 my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
550 # Get the number of partitions needed for this value
551 my $num_splits = int($items_for_this_md_value / $partition_size_within_level);
552 $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits);
553
554 my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}};
555 for (my $i = 0; $i < $num_splits; $i++) {
556 my %OIDs_subhashes_for_this_value = ();
557 my @OIDs_for_this_partition = ();
558 for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) {
559 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
560 }
561
562 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
563 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) {
564 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
565 $items_done += scalar(@OIDs_for_this_partition);
566 next;
567 }
568
569 # Add an HList for this bucket
570 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
571 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
572 $last_partition_end = $partitionname_for_this_value;
573 }
574 next;
575 }
576
577 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
578 $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;
579
580 # The last partition
581 if($i == scalar(@metadata_values) - 1) {
582 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
583 }
584 }
585 }
586 else {
587 # The easier case, just add a partition
588 my %metadata_values_to_OIDs_subhashes = ();
589 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
590 my $metadata_value = $metadata_values[$i];
591 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
592 }
593 my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
594 my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"});
595 my $partitionname = $partition_start;
596 if ($partitionend ne $partition_start) {
597 $partitionname = $partitionname . "-" . $partitionend;
598 }
599 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
600 $last_partition_end = $partitionend;
601 }
602 }
603 }
604 else {
605 # Generate hlists of a certain size
606 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
607 if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
608 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
609 my $itemsdone = 0;
610 my %metadata_value_to_OIDs_subhash = ();
611 my $lastpartitionend = "";
612 my $partitionstart;
613 foreach my $metadata_value (@sortedmetadata_values) {
614 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
615 $itemsdone++;
616 my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
617
618 # Is this the start of a new partition?
619 if ($itemsinpartition == 1) {
620 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
621 }
622
623 # Is this the end of the partition?
624 if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) {
625 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
626 my $partitionname = $partitionstart;
627 if ($partitionend ne $partitionstart) {
628 $partitionname = $partitionname . "-" . $partitionend;
629 }
630
631 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
632 %metadata_value_to_OIDs_subhash = ();
633 $lastpartitionend = $partitionend;
634 }
635 }
636
637 # The partitions are stored in an HList
638 $classifier_node->{'childtype'} = "HList";
639 }
640
641 # Otherwise just add all the values to a VList
642 else {
643 $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
644 }
645 }
646}
647
648
649sub convert_utf8_string_to_unicode_string
650{
651 my $self = shift(@_);
652 my $utf8_string = shift(@_);
653
654 my $unicode_string = "";
655 foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
656 $unicode_string .= chr($unicode_value);
657 }
658 return $unicode_string;
659}
660
661
662sub convert_unicode_string_to_utf8_string
663{
664 my $self = shift(@_);
665 my $unicode_string = shift(@_);
666
667 my @unicode_array;
668 for (my $i = 0; $i < length($unicode_string); $i++) {
669 push(@unicode_array, ord(substr($unicode_string, $i, 1)));
670 }
671 return &unicode::unicode2utf8(\@unicode_array);
672}
673
674
675sub generate_partition_start
676{
677 my $self = shift(@_);
678 my $metadata_value = shift(@_);
679 my $lastpartitionend = shift(@_);
680 my $partition_name_length = shift(@_);
681
682 if ($partition_name_length) {
683 return substr($metadata_value, 0, $partition_name_length);
684 }
685
686 my $partitionstart = substr($metadata_value, 0, 1);
687 if ($partitionstart le $lastpartitionend) {
688 $partitionstart = substr($metadata_value, 0, 2);
689 # Give up after three characters
690 if ($partitionstart le $lastpartitionend) {
691 $partitionstart = substr($metadata_value, 0, 3);
692 }
693 }
694
695 return $partitionstart;
696}
697
698
699sub generate_partition_end
700{
701 my $self = shift(@_);
702 my $metadata_value = shift(@_);
703 my $partitionstart = shift(@_);
704 my $partition_name_length = shift(@_);
705
706 if ($partition_name_length) {
707 return substr($metadata_value, 0, $partition_name_length);
708 }
709
710 my $partitionend = substr($metadata_value, 0, length($partitionstart));
711 if ($partitionend gt $partitionstart) {
712 $partitionend = substr($metadata_value, 0, 1);
713 if ($partitionend le $partitionstart) {
714 $partitionend = substr($metadata_value, 0, 2);
715 # Give up after three characters
716 if ($partitionend le $partitionstart) {
717 $partitionend = substr($metadata_value, 0, 3);
718 }
719 }
720 }
721
722 return $partitionend;
723}
724
725
726sub add_hlist_partition
727{
728 my $self = shift(@_);
729 my @metadata_groups = @{shift(@_)};
730 my $classifier_node = shift(@_);
731 my $partitionname = shift(@_);
732 my $metadata_value_to_OIDs_hash_ref = shift(@_);
733
734 # Create an hlist partition
735 my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
736 'childtype' => "VList",
737 'contains' => [] );
738
739 # Add the children to the hlist partition
740 $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
741 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
742}
743
744
745sub add_vlist
746{
747 my $self = shift(@_);
748 my @metadata_groups = @{shift(@_)};
749 my $classifier_node = shift(@_);
750 my $metadata_value_to_OIDs_hash_ref = shift(@_);
751
752 my $metadata_group = shift(@metadata_groups);
753 $classifier_node->{'mdtype'} = $metadata_group;
754
755 # Create an entry in the vlist for each value
756 foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
757 {
758 my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
759
760 # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
761 if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
762 my $OID = $OIDs[0];
763
764 # Find the offset of this metadata value
765 my $offset = 0;
766 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
767 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
768 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
769 if ($metadata_value eq $metadata_values[$i]) {
770 $offset = $i;
771 last;
772 }
773 }
774 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
775 }
776 # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
777 elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
778 $self->add_sorted_leaf_items(\@OIDs, $classifier_node);
779 }
780 # Otherwise create a sublist (bookshelf) for the metadata value
781 else {
782 my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
783 'childtype' => "VList",
784 'contains' => [] );
785
786 # If there are metadata elements remaining, recursively apply the process
787 if (@metadata_groups > 0) {
788 my $next_metadata_group = $metadata_groups[0];
789 $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
790 $self->add_level(\@metadata_groups, \@OIDs, \%child_classifier_node);
791 }
792 # Otherwise just add the documents as children of this list
793 else {
794 $self->add_sorted_leaf_items(\@OIDs, \%child_classifier_node);
795 }
796
797 # Add the sublist to the list
798 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
799 }
800 }
801}
802
803sub add_sorted_leaf_items
804{
805 my $self = shift(@_);
806 my @OIDs = @{shift(@_)};
807 my $classifier_node = shift(@_);
808
809 # Sort leaf nodes and add to list
810 my @sort_leaf_nodes_usingmetadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
811 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_usingmetadata_groups) {
812 my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
813 # Force a stable sort (Perl 5.6's sort isn't stable)
814 # !! The [0] bits aren't ideal (multiple metadata values) !!
815 @OIDs = @OIDs[ sort { $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0] || $a <=> $b; } 0..$#OIDs ];
816 }
817
818 foreach my $OID (@OIDs) {
819 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID });
820 }
821}
822
823
824sub sort_metadata_values_array
825{
826 my $self = shift(@_);
827 my @metadata_values = @_;
828
829 if ($self->{'unicode_collator'}) {
830 return $self->{'unicode_collator'}->sort(@metadata_values);
831 }
832 else {
833 return sort(@metadata_values);
834 }
835}
836
837
8381;
Note: See TracBrowser for help on using the repository browser.