source: main/trunk/greenstone2/perllib/classify/List.pm@ 23116

Last change on this file since 23116 was 23116, checked in by kjdon, 14 years ago

for incremental build, classifiers are not really done incrementally. Previously, we reconstructed all the docs from the database, and classified them, then processed any new/edited/deleted docs, updating the classifier as necessary. Now, we process all new/updated docs, then reconstruct the docs from the database, but only classify those not changed/deleted. This means that we are only ever adding docs to a classifier, never updating or deleting. I have removed edit_mode and all code handling deleting stuff from the classifier.

  • Property svn:keywords set to Author Date Id Revision
File size: 34.6 KB
Line 
1###########################################################################
2#
3# List.pm -- A general and flexible list classifier with most of
4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
29# TO DO: - Remove punctuation from metadata values before sorting.
30# - Add an AZCompactList-style hlist option?
31#
32###########################################################################
33
34package List;
35
36
37use BaseClassifier;
38
39use strict;
40
41
42sub BEGIN {
43 @List::ISA = ('BaseClassifier');
44}
45
46my $partition_type_list =
47 [ { 'name' => "per_letter",
48 'desc' => "{List.level_partition.per_letter}" },
49 { 'name' => "approximate_size",
50 'desc' => "{List.level_partition.approximate_size}"},
51 { 'name' => "constant_size",
52 'desc' => "{List.level_partition.constant_size}" },
53 { 'name' => "none",
54 'desc' => "{List.level_partition.none}" } ];
55
56# following used to check types later on
57my $valid_partition_types = { 'per_letter' => 1,
58 'constant_size' => 1,
59 'per_letter_fixed_size' => 1,
60 'approximate_size' => 1,
61 'none' => 1};
62
63my $bookshelf_type_list =
64 [ { 'name' => "always",
65 'desc' => "{List.bookshelf_type.always}" },
66 { 'name' => "duplicate_only",
67 'desc' => "{List.bookshelf_type.duplicate_only}" },
68 { 'name' => "never",
69 'desc' => "{List.bookshelf_type.never}" } ];
70
71my $arguments =
72 [ { 'name' => "metadata",
73 'desc' => "{List.metadata}",
74 'type' => "metadata",
75 'reqd' => "yes" },
76
77 # The interesting options
78 { 'name' => "bookshelf_type",
79 'desc' => "{List.bookshelf_type}",
80 'type' => "enum",
81 'list' => $bookshelf_type_list,
82 'deft' => "never" },
83 { 'name' => "classify_sections",
84 'desc' => "{List.classify_sections}",
85 'type' => "flag" },
86 { 'name' => "partition_type_within_level",
87 'desc' => "{List.partition_type_within_level}",
88 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
89 'list' => $partition_type_list,
90 'deft' => "per_letter" },
91 { 'name' => "partition_size_within_level",
92 'desc' => "{List.partition_size_within_level}",
93 'type' => "string" }, # Must be string because multiple values can be specified (separated by '/')
94 { 'name' => "partition_name_length",
95 'desc' => "{List.partition_name_length}",
96 'type' => "string" },
97 { 'name' => "sort_leaf_nodes_using",
98 'desc' => "{List.sort_leaf_nodes_using}",
99 'type' => "metadata",
100 'deft' => "Title" },
101 { 'name' => "sort_using_unicode_collation",
102 'desc' => "{List.sort_using_unicode_collation}",
103 'type' => "flag" },
104 { 'name' => "use_hlist_for",
105 'desc' => "{List.use_hlist_for}",
106 'type' => "string" },
107 { 'name' => "removeprefix",
108 'desc' => "{BasClas.removeprefix}",
109 'type' => "regexp" },
110 { 'name' => "removesuffix",
111 'desc' => "{BasClas.removesuffix}",
112 'type' => "regexp" } ];
113
114my $options = { 'name' => "List",
115 'desc' => "{List.desc}",
116 'abstract' => "no",
117 'inherits' => "yes",
118 'args' => $arguments };
119
120
121sub new
122{
123 my ($class) = shift(@_);
124 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
125 push(@$classifierslist, $class);
126
127 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
128 push(@{$hashArgOptLists->{"OptList"}}, $options);
129
130 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
131
132 if ($self->{'info_only'}) {
133 # don't worry about any options etc
134 return bless $self, $class;
135 }
136
137 # The metadata elements to use (required)
138 if (!$self->{'metadata'}) {
139 die "Error: No metadata fields specified for List.\n";
140 }
141 my @metadata_groups = split(/\//, $self->{'metadata'});
142 $self->{'metadata_groups'} = \@metadata_groups;
143
144 # The classifier button name (default: the first metadata element specified)
145 if (!$self->{'buttonname'}) {
146 my $first_metadata_group = $metadata_groups[0];
147 my $first_metadata_element = (split(/\;|,/, $first_metadata_group))[0];
148 $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
149 }
150
151 # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
152 foreach my $metadata_group (@metadata_groups) {
153 $self->{$metadata_group . ".bookshelf_type"} = "always";
154 }
155 my $last_metadata_group = $metadata_groups[$#metadata_groups];
156 # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
157 $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
158
159 # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
160 foreach my $metadata_group (@metadata_groups) {
161 $self->{$metadata_group . ".list_type"} = "VList";
162 }
163 foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
164 $self->{$metadata_group . ".list_type"} = "HList";
165 }
166
167 # How the items are grouped into partitions (default: no partition)
168 # for each level (metadata group), separated by '/'
169 if (!$self->{"partition_type_within_level"}) {
170 foreach my $metadata_group (@metadata_groups) {
171 $self->{$metadata_group . ".partition_type_within_level"} = "none";
172 }
173 } else {
174 my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'});
175
176 my $first = 1;
177 foreach my $metadata_group (@metadata_groups) {
178 my $partition_type_within_levelelem = shift(@partition_type_within_levellist);
179 if (defined($partition_type_within_levelelem) && $partition_type_within_levelelem eq "per_letter_fixed_size") {
180 print STDERR "per letter fixed size, changing to approximate size\n";
181 $partition_type_within_levelelem = "approximate_size";
182 }
183 if (defined($partition_type_within_levelelem) && defined $valid_partition_types->{$partition_type_within_levelelem}) {
184 $self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem;
185 }
186 else {
187 if ($first) {
188 $self->{$metadata_group . ".partition_type_within_level"} = "none";
189 $first = 0;
190 } else {
191 $self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"};
192 }
193 if (defined($partition_type_within_levelelem)) {
194 # ie invalid entry
195 print STDERR "invalid partition type for level $metadata_group: $partition_type_within_levelelem, defaulting to ". $self->{$metadata_group . ".partition_type_within_level"} ."\n";
196 }
197 }
198 }
199 }
200
201 # The number of items in each partition
202 if (!$self->{'partition_size_within_level'}) {
203 # Default: 20
204 foreach my $metadata_group (@metadata_groups) {
205 $self->{$metadata_group . ".partition_size_within_level"} = 20;
206 }
207 }
208 else {
209 my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
210
211 # Assign values based on the partition_size_within_level parameter
212 foreach my $metadata_group (@metadata_groups) {
213 my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
214 if (defined($partition_size_within_levelelem)) {
215 $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
216 }
217 else {
218 $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
219 }
220 }
221 }
222
223 # The removeprefix and removesuffix expressions
224 if ($self->{'removeprefix'}) {
225 # If there are more than one expressions, use '' to quote each experession and '/' to separate
226 my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
227
228 foreach my $metadata_group (@metadata_groups) {
229 my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
230 if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
231 # Remove the other ' at the beginning and the end if there is any
232 $removeprefix_expr_within_levelelem =~ s/^'//;
233 $removeprefix_expr_within_levelelem =~ s/'$//;
234 # Remove the extra ^ at the beginning
235 $removeprefix_expr_within_levelelem =~ s/^\^//;
236 $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
237 } else {
238 $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
239 }
240 }
241 }
242 if ($self->{'removesuffix'}) {
243 my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
244
245 foreach my $metadata_group (@metadata_groups) {
246 my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
247 if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
248 $removesuffix_expr_within_levelelem =~ s/^'//;
249 $removesuffix_expr_within_levelelem =~ s/'$//;
250 # Remove the extra $ at the end
251 $removesuffix_expr_within_levelelem =~ s/\$$//;
252 $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
253 } else {
254 $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
255 }
256 }
257 }
258
259 # The metadata elements to use to sort the leaf nodes (default: Title)
260 my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
261 if ($self->{'sort_leaf_nodes_using'}) {
262 @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
263 }
264 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
265
266 # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
267 if ($self->{'sort_using_unicode_collation'}) {
268 # To use this you first need to download the allkeys.txt file from
269 # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
270 # Unicode/Collate directory.
271 require Unicode::Collate;
272 $self->{'unicode_collator'} = Unicode::Collate->new();
273 }
274
275 # An empty array for the document OIDs
276 $self->{'OIDs'} = [];
277
278 return bless $self, $class;
279}
280
281
282sub init
283{
284 # Nothing to do...
285}
286
287
288# Called for each document in the collection
289sub classify
290{
291 my $self = shift(@_);
292 my ($doc_obj) = @_;
293
294 # If "-classify_sections" is set, classify every section of the document
295 if ($self->{'classify_sections'}) {
296 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
297 while (defined $section) {
298 $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section);
299 $section = $doc_obj->get_next_section($section);
300 }
301 }
302 # Otherwise just classify the top document section
303 else {
304 $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section());
305 }
306}
307
308
309sub classify_section
310{
311 my $self = shift(@_);
312 my ($doc_obj,$section_OID,$section) = @_;
313
314 my @metadata_groups = @{$self->{'metadata_groups'}};
315
316 # Only classify the section if it has a value for one of the metadata elements in the first group
317 my $classify_section = 0;
318 my $first_metadata_group = $metadata_groups[0];
319 my $remove_prefix_expr = $self->{$first_metadata_group . ".remove_prefix_expr"};
320 my $remove_suffix_expr = $self->{$first_metadata_group . ".remove_suffix_expr"};
321 foreach my $first_metadata_group_element (split(/\;|,/, $first_metadata_group)) {
322 my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
323 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element);
324
325 # Remove prefix/suffix if requested
326 if (defined ($first_metadata_group_element_value)) {
327 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
328 $first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
329 }
330
331 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
332 $first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
333 }
334 }
335 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
336 # This section must be included in the classifier
337 $classify_section = 1;
338 last;
339 }
340 }
341
342 # We're not classifying this section because it doesn't have the required metadata
343 return if (!$classify_section);
344
345 # Otherwise, include this section in the classifier
346 push(@{$self->{'OIDs'}}, $section_OID);
347
348 # Create a hash for the metadata values of each metadata element we're interested in
349 my %metadata_groups_done = ();
350 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
351 # Take care not to do a metadata group more than once
352 unless ($metadata_groups_done{$metadata_group}) {
353 my $remove_prefix_expr = $self->{$metadata_group . ".remove_prefix_expr"};
354 my $remove_suffix_expr = $self->{$metadata_group . ".remove_suffix_expr"};
355 foreach my $metadata_element (split(/\;|,/, $metadata_group)) {
356 my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);
357
358 my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
359 foreach my $metadata_value (@metadata_values) {
360 # Strip leading and trailing whitespace
361 $metadata_value =~ s/^\s*//;
362 $metadata_value =~ s/\s*$//;
363
364 # Remove prefix/suffix if requested
365 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
366 $metadata_value =~ s/^$remove_prefix_expr//;
367 }
368 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
369 $metadata_value =~ s/$remove_suffix_expr$//;
370 }
371
372 # uppercase the metadata - makes the AZList nicer
373 $metadata_value = uc($metadata_value);
374 # Convert the metadata value from a UTF-8 string to a Unicode string
375 # This means that length() and substr() work properly
376 # We need to be careful to convert classifier node title values back to UTF-8, however
377 my $metadata_value_unicode_string = $self->convert_utf8_string_to_unicode_string($metadata_value);
378
379 # Add the metadata value into the list for this combination of metadata group and section
380 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $metadata_value_unicode_string);
381 }
382 last if (@metadata_values > 0);
383 }
384
385 $metadata_groups_done{$metadata_group} = 1;
386 }
387 }
388}
389
390
391sub get_classify_info
392{
393 my $self = shift(@_);
394
395 # The metadata groups to classify by
396 my @metadata_groups = @{$self->{'metadata_groups'}};
397 my $first_metadata_group = $metadata_groups[0];
398
399 # The OID values of the documents to include in the classifier
400 my @OIDs = @{$self->{'OIDs'}};
401
402 # Create the root node of the classification hierarchy
403 my %classifier_node = ( 'thistype' => "Invisible",
404 'childtype' => $self->{$first_metadata_group . ".list_type"},
405 'Title' => $self->{'buttonname'},
406 'contains' => [],
407 'mdtype' => $first_metadata_group );
408
409 # Recursively create the classification hierarchy, one level for each metadata group
410 $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
411 return \%classifier_node;
412}
413
414
415sub add_level
416{
417 my $self = shift(@_);
418 my @metadata_groups = @{shift(@_)};
419 my @OIDs = @{shift(@_)};
420 my $classifier_node = shift(@_);
421 # print STDERR "\nAdding AZ list for " . $classifier_node->{'Title'} . "\n";
422
423 my $metadata_group = $metadata_groups[0];
424 # print STDERR "Processing metadata group: " . $metadata_group . "\n";
425 # print STDERR "Number of OID values: " . @OIDs . "\n";
426
427 if (!defined($self->{$metadata_group . ".list"})) {
428 print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
429 return;
430 }
431
432 # Create a mapping from metadata value to OID
433 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
434 my %metadata_value_to_OIDs_hash = ();
435 foreach my $OID (@OIDs)
436 {
437 if ($OID_to_metadata_values_hash_ref->{$OID})
438 {
439 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
440 foreach my $metadata_value (@metadata_values)
441 {
442 push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
443 }
444 }
445 }
446 # print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
447
448 # Partition the values (if necessary)
449 my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
450 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
451 if ($partition_type_within_level =~ /^per_letter$/i) {
452 # Generate one hlist for each letter
453 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
454 my %metadata_value_to_OIDs_subhash = ();
455
456 my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
457 foreach my $metadata_value (@sortedmetadata_values) {
458 my $metadata_valuepartition = substr($metadata_value, 0, 1);
459
460 # Is this the start of a new partition?
461 if ($metadata_valuepartition ne $lastpartition) {
462 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
463 %metadata_value_to_OIDs_subhash = ();
464 $lastpartition = $metadata_valuepartition;
465 }
466
467 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
468 }
469
470 # Don't forget to add the last partition
471 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
472
473 # The partitions are stored in an HList
474 $classifier_node->{'childtype'} = "HList";
475 }
476 elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
477 # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size
478 # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible
479 # If a letter has more items than specified, split into several hlists.
480 # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
481 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
482 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
483 my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};
484
485 # Separate values by their first letter, each form a bucket, like the per_letter partition type
486 my $last_partition = substr($sortedmetadata_values[0], 0, 1);
487 my %partition_buckets = ();
488 my @metadata_values_in_bucket = ();
489 my $num_items_in_bucket = 0;
490 foreach my $metadata_value (@sortedmetadata_values) {
491 my $metadata_valuepartition = substr($metadata_value, 0, 1);
492 if ($metadata_valuepartition ne $last_partition) {
493 my @temp_array = @metadata_values_in_bucket;
494 # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
495 my %partition_info = ();
496 $partition_info{'metadata_values'} = \@temp_array;
497 $partition_info{'size'} = $num_items_in_bucket;
498 $partition_buckets{$last_partition} = \%partition_info;
499
500 @metadata_values_in_bucket = ($metadata_value);
501 $num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
502 $last_partition = $metadata_valuepartition;
503 } else {
504 $num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
505 push (@metadata_values_in_bucket, $metadata_value);
506 }
507 }
508 # Last one
509 my %partition_info = ();
510 $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
511 $partition_info{'size'} = $num_items_in_bucket;
512 $partition_buckets{$last_partition} = \%partition_info;
513
514 my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
515 for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) {
516 my $partition = $partition_keys[$i];
517 my $items_in_partition = $partition_buckets{$partition}->{'size'};
518 # Merge small buckets together, but keep the numeric bucket apart
519 if ($items_in_partition < $partition_size_within_level) {
520 my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'};
521 if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level
522 && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)
523 || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) {
524 foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) {
525 push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge);
526 }
527 $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition;
528 delete $partition_buckets{$partition};
529 }
530 }
531 }
532 @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
533
534 # Add partitions, and divide big bucket into several
535 my $last_partition_end = "";
536 my $partition_start = "";
537 foreach my $partition (@partition_keys) {
538 my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}});
539 my $items_in_partition = $partition_buckets{$partition}->{'size'};
540 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"});
541
542 if ($items_in_partition > $partition_size_within_level) {
543 my $items_done = 0;
544 my %metadata_values_to_OIDs_subhashes = ();
545 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
546 my $metadata_value = $metadata_values[$i];
547 # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
548 my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;
549
550 my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"});
551 my $partitionname = $partition_start;
552 if ($partitionend ne $partition_start) {
553 $partitionname = $partitionname . "-" . $partitionend;
554 }
555
556 # Start a new partition
557 if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) {
558 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
559 $last_partition_end = $partitionend;
560 $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
561 $items_done = 0;
562 %metadata_values_to_OIDs_subhashes = ();
563 }
564
565 # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
566 if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) {
567 my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
568 # Get the number of partitions needed for this value
569 my $num_splits = int($items_for_this_md_value / $partition_size_within_level);
570 $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits);
571
572 my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}};
573 for (my $i = 0; $i < $num_splits; $i++) {
574 my %OIDs_subhashes_for_this_value = ();
575 my @OIDs_for_this_partition = ();
576 for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) {
577 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
578 }
579
580 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
581 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) {
582 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
583 $items_done += scalar(@OIDs_for_this_partition);
584 next;
585 }
586
587 # Add an HList for this bucket
588 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
589 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
590 $last_partition_end = $partitionname_for_this_value;
591 }
592 next;
593 }
594
595 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
596 $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;
597
598 # The last partition
599 if($i == scalar(@metadata_values) - 1) {
600 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
601 }
602 }
603 }
604 else {
605 # The easier case, just add a partition
606 my %metadata_values_to_OIDs_subhashes = ();
607 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
608 my $metadata_value = $metadata_values[$i];
609 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
610 }
611 my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
612 my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"});
613 my $partitionname = $partition_start;
614 if ($partitionend ne $partition_start) {
615 $partitionname = $partitionname . "-" . $partitionend;
616 }
617 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
618 $last_partition_end = $partitionend;
619 }
620 }
621
622 # The partitions are stored in an HList
623 $classifier_node->{'childtype'} = "HList";
624
625 } # end approximate_size
626 else {
627 # Generate hlists of a certain size
628 if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
629 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
630 my $itemsdone = 0;
631 my %metadata_value_to_OIDs_subhash = ();
632 my $lastpartitionend = "";
633 my $partitionstart;
634 foreach my $metadata_value (@sortedmetadata_values) {
635 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
636 $itemsdone++;
637 my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
638
639 # Is this the start of a new partition?
640 if ($itemsinpartition == 1) {
641 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
642 }
643
644 # Is this the end of the partition?
645 if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) {
646 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
647 my $partitionname = $partitionstart;
648 if ($partitionend ne $partitionstart) {
649 $partitionname = $partitionname . "-" . $partitionend;
650 }
651
652 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
653 %metadata_value_to_OIDs_subhash = ();
654 $lastpartitionend = $partitionend;
655 }
656 }
657
658 # The partitions are stored in an HList
659 $classifier_node->{'childtype'} = "HList";
660 }
661
662 # Otherwise just add all the values to a VList
663 else {
664 $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
665 }
666 }
667}
668
669
670sub convert_utf8_string_to_unicode_string
671{
672 my $self = shift(@_);
673 my $utf8_string = shift(@_);
674
675 my $unicode_string = "";
676 foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
677 $unicode_string .= chr($unicode_value);
678 }
679 return $unicode_string;
680}
681
682
683sub convert_unicode_string_to_utf8_string
684{
685 my $self = shift(@_);
686 my $unicode_string = shift(@_);
687
688 my @unicode_array;
689 for (my $i = 0; $i < length($unicode_string); $i++) {
690 push(@unicode_array, ord(substr($unicode_string, $i, 1)));
691 }
692 return &unicode::unicode2utf8(\@unicode_array);
693}
694
695
696sub generate_partition_start
697{
698 my $self = shift(@_);
699 my $metadata_value = shift(@_);
700 my $lastpartitionend = shift(@_);
701 my $partition_name_length = shift(@_);
702
703 if ($partition_name_length) {
704 return substr($metadata_value, 0, $partition_name_length);
705 }
706
707 my $partitionstart = substr($metadata_value, 0, 1);
708 if ($partitionstart le $lastpartitionend) {
709 $partitionstart = substr($metadata_value, 0, 2);
710 # Give up after three characters
711 if ($partitionstart le $lastpartitionend) {
712 $partitionstart = substr($metadata_value, 0, 3);
713 }
714 }
715
716 return $partitionstart;
717}
718
719
720sub generate_partition_end
721{
722 my $self = shift(@_);
723 my $metadata_value = shift(@_);
724 my $partitionstart = shift(@_);
725 my $partition_name_length = shift(@_);
726
727 if ($partition_name_length) {
728 return substr($metadata_value, 0, $partition_name_length);
729 }
730
731 my $partitionend = substr($metadata_value, 0, length($partitionstart));
732 if ($partitionend gt $partitionstart) {
733 $partitionend = substr($metadata_value, 0, 1);
734 if ($partitionend le $partitionstart) {
735 $partitionend = substr($metadata_value, 0, 2);
736 # Give up after three characters
737 if ($partitionend le $partitionstart) {
738 $partitionend = substr($metadata_value, 0, 3);
739 }
740 }
741 }
742
743 return $partitionend;
744}
745
746
747sub add_hlist_partition
748{
749 my $self = shift(@_);
750 my @metadata_groups = @{shift(@_)};
751 my $classifier_node = shift(@_);
752 my $partitionname = shift(@_);
753 my $metadata_value_to_OIDs_hash_ref = shift(@_);
754
755 # Create an hlist partition
756 my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
757 'childtype' => "VList",
758 'contains' => [] );
759
760 # Add the children to the hlist partition
761 $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
762 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
763}
764
765
766sub add_vlist
767{
768 my $self = shift(@_);
769 my @metadata_groups = @{shift(@_)};
770 my $classifier_node = shift(@_);
771 my $metadata_value_to_OIDs_hash_ref = shift(@_);
772
773 my $metadata_group = shift(@metadata_groups);
774 $classifier_node->{'mdtype'} = $metadata_group;
775
776 # Create an entry in the vlist for each value
777 foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
778 {
779 my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
780
781 # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
782 if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
783 my $OID = $OIDs[0];
784 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
785 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
786 }
787 # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
788 elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
789 @OIDs = $self->sort_leaf_items(\@OIDs);
790 foreach my $OID (@OIDs) {
791 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
792 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
793 }
794
795 }
796 # Otherwise create a sublist (bookshelf) for the metadata value
797 else {
798 my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
799 'childtype' => "VList",
800 'mdtype' => $metadata_group,
801 'contains' => [] );
802
803 # If there are metadata elements remaining, recursively apply the process
804 if (@metadata_groups > 0) {
805 my $next_metadata_group = $metadata_groups[0];
806 $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
807 $self->add_level(\@metadata_groups, \@OIDs, \%child_classifier_node);
808 }
809 # Otherwise just add the documents as children of this list
810 else {
811 @OIDs = $self->sort_leaf_items(\@OIDs);
812 foreach my $OID (@OIDs) {
813 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
814 push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
815 }
816
817 }
818
819 # Add the sublist to the list
820 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
821 }
822 }
823}
824
825sub metadata_offset
826{
827 my $self = shift(@_);
828 my $metadata_group = shift(@_);
829 my $OID = shift(@_);
830 my $metadata_value = shift(@_);
831
832 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
833 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
834 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
835 if ($metadata_value eq $metadata_values[$i]) {
836 return $i;
837 }
838 }
839
840 return 0;
841}
842
843sub sort_leaf_items
844{
845 my $self = shift(@_);
846 my @OIDs = @{shift(@_)};
847# my $classifier_node = shift(@_);
848
849 # Sort leaf nodes and add to list
850 my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
851 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) {
852 my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
853 # Force a stable sort (Perl 5.6's sort isn't stable)
854 # !! The [0] bits aren't ideal (multiple metadata values) !!
855 @OIDs = @OIDs[ sort {
856 if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]} && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]})))
857 {
858 $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
859 }
860 else
861 {
862 $a <=> $b;
863 }
864 } 0..$#OIDs ];
865 }
866 return @OIDs;
867}
868
869
870
871sub sort_metadata_values_array
872{
873 my $self = shift(@_);
874 my @metadata_values = @_;
875
876 if ($self->{'unicode_collator'}) {
877 return $self->{'unicode_collator'}->sort(@metadata_values);
878 }
879 else {
880 return sort(@metadata_values);
881 }
882}
883
884
8851;
Note: See TracBrowser for help on using the repository browser.