source: main/trunk/greenstone2/perllib/classify/List.pm@ 26267

Last change on this file since 26267 was 26267, checked in by kjdon, 12 years ago

added filter_metadata and filer_regex to filter documents going into the classifier. Can use these to restrict which documents get added to the classifier. eg If have several different DocType metadatas, can build a classifier on just one/some of the doctypes

  • Property svn:keywords set to Author Date Id Revision
File size: 36.7 KB
Line 
1###########################################################################
2#
3# List.pm -- A general and flexible list classifier with most of
4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
29# TO DO: - Remove punctuation from metadata values before sorting.
30# - Add an AZCompactList-style hlist option?
31#
32###########################################################################
33
34package List;
35
36
37use BaseClassifier;
38
39use strict;
40
41
42sub BEGIN {
43 @List::ISA = ('BaseClassifier');
44}
45
46my $partition_type_list =
47 [ { 'name' => "per_letter",
48 'desc' => "{List.level_partition.per_letter}" },
49 { 'name' => "approximate_size",
50 'desc' => "{List.level_partition.approximate_size}"},
51 { 'name' => "constant_size",
52 'desc' => "{List.level_partition.constant_size}" },
53 { 'name' => "none",
54 'desc' => "{List.level_partition.none}" } ];
55
56# following used to check types later on
57my $valid_partition_types = { 'per_letter' => 1,
58 'constant_size' => 1,
59 'per_letter_fixed_size' => 1,
60 'approximate_size' => 1,
61 'none' => 1};
62
63my $bookshelf_type_list =
64 [ { 'name' => "always",
65 'desc' => "{List.bookshelf_type.always}" },
66 { 'name' => "duplicate_only",
67 'desc' => "{List.bookshelf_type.duplicate_only}" },
68 { 'name' => "never",
69 'desc' => "{List.bookshelf_type.never}" } ];
70
71my $arguments =
72 [ { 'name' => "metadata",
73 'desc' => "{List.metadata}",
74 'type' => "metadata",
75 'reqd' => "yes" },
76
77 # The interesting options
78 { 'name' => "bookshelf_type",
79 'desc' => "{List.bookshelf_type}",
80 'type' => "enum",
81 'list' => $bookshelf_type_list,
82 'deft' => "never" },
83 { 'name' => "classify_sections",
84 'desc' => "{List.classify_sections}",
85 'type' => "flag" },
86 { 'name' => "partition_type_within_level",
87 'desc' => "{List.partition_type_within_level}",
88 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
89 'list' => $partition_type_list,
90 'deft' => "per_letter" },
91 { 'name' => "partition_size_within_level",
92 'desc' => "{List.partition_size_within_level}",
93 'type' => "string" }, # Must be string because multiple values can be specified (separated by '/')
94 { 'name' => "partition_name_length",
95 'desc' => "{List.partition_name_length}",
96 'type' => "string" },
97 { 'name' => "sort_leaf_nodes_using",
98 'desc' => "{List.sort_leaf_nodes_using}",
99 'type' => "metadata",
100 'deft' => "Title" },
101 { 'name' => "reverse_sort_leaf_nodes",
102 'desc' => "{List.reverse_sort_leaf_nodes}",
103 'type' => "flag"},
104 { 'name' => "sort_using_unicode_collation",
105 'desc' => "{List.sort_using_unicode_collation}",
106 'type' => "flag" },
107 { 'name' => "use_hlist_for",
108 'desc' => "{List.use_hlist_for}",
109 'type' => "string" },
110 {'name' => "filter_metadata",
111 'desc' => "{List.filter_metadata}",
112 'type' => "metadata"},
113 {'name' => "filter_regex",
114 'desc' => "{List.filter_regex}",
115 'type' => "regexp"},
116
117 { 'name' => "removeprefix",
118 'desc' => "{BasClas.removeprefix}",
119 'type' => "regexp" },
120 { 'name' => "removesuffix",
121 'desc' => "{BasClas.removesuffix}",
122 'type' => "regexp" } ];
123
124my $options = { 'name' => "List",
125 'desc' => "{List.desc}",
126 'abstract' => "no",
127 'inherits' => "yes",
128 'args' => $arguments };
129
130
131sub new
132{
133 my ($class) = shift(@_);
134 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
135 push(@$classifierslist, $class);
136
137 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
138 push(@{$hashArgOptLists->{"OptList"}}, $options);
139
140 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
141
142 if ($self->{'info_only'}) {
143 # don't worry about any options etc
144 return bless $self, $class;
145 }
146
147 # The metadata elements to use (required)
148 if (!$self->{'metadata'}) {
149 die "Error: No metadata fields specified for List.\n";
150 }
151
152 my @metadata_groups = split(/[\/]/, $self->{'metadata'});
153 $self->{'metadata_groups'} = \@metadata_groups;
154
155 # The classifier button name (default: the first metadata element specified)
156 if (!$self->{'buttonname'}) {
157 my $first_metadata_group = $metadata_groups[0];
158 my $first_metadata_element = (split(/[\;|,\/]/, $first_metadata_group))[0];
159 $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
160 }
161
162 # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
163 foreach my $metadata_group (@metadata_groups) {
164 $self->{$metadata_group . ".bookshelf_type"} = "always";
165 }
166 my $last_metadata_group = $metadata_groups[$#metadata_groups];
167 # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
168 $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
169
170 # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
171 foreach my $metadata_group (@metadata_groups) {
172 $self->{$metadata_group . ".list_type"} = "VList";
173 }
174 foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
175 $self->{$metadata_group . ".list_type"} = "HList";
176 }
177
178 # How the items are grouped into partitions (default: no partition)
179 # for each level (metadata group), separated by '/'
180 if (!$self->{"partition_type_within_level"}) {
181 foreach my $metadata_group (@metadata_groups) {
182 $self->{$metadata_group . ".partition_type_within_level"} = "none";
183 }
184 } else {
185 my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'});
186
187 my $first = 1;
188 foreach my $metadata_group (@metadata_groups) {
189 my $partition_type_within_levelelem = shift(@partition_type_within_levellist);
190 if (defined($partition_type_within_levelelem) && $partition_type_within_levelelem eq "per_letter_fixed_size") {
191 print STDERR "per letter fixed size, changing to approximate size\n";
192 $partition_type_within_levelelem = "approximate_size";
193 }
194 if (defined($partition_type_within_levelelem) && defined $valid_partition_types->{$partition_type_within_levelelem}) {
195 $self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem;
196 }
197 else {
198 if ($first) {
199 $self->{$metadata_group . ".partition_type_within_level"} = "none";
200 $first = 0;
201 } else {
202 $self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"};
203 }
204 if (defined($partition_type_within_levelelem)) {
205 # ie invalid entry
206 print STDERR "invalid partition type for level $metadata_group: $partition_type_within_levelelem, defaulting to ". $self->{$metadata_group . ".partition_type_within_level"} ."\n";
207 }
208 }
209 }
210 }
211
212 # The number of items in each partition
213 if (!$self->{'partition_size_within_level'}) {
214 # Default: 20
215 foreach my $metadata_group (@metadata_groups) {
216 $self->{$metadata_group . ".partition_size_within_level"} = 20;
217 }
218 }
219 else {
220 my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
221
222 # Assign values based on the partition_size_within_level parameter
223 foreach my $metadata_group (@metadata_groups) {
224 my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
225 if (defined($partition_size_within_levelelem)) {
226 $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
227 }
228 else {
229 $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
230 }
231 }
232 }
233
234 # The removeprefix and removesuffix expressions
235 if ($self->{'removeprefix'}) {
236 # If there are more than one expressions, use '' to quote each experession and '/' to separate
237 my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
238
239 foreach my $metadata_group (@metadata_groups) {
240 my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
241 if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
242 # Remove the other ' at the beginning and the end if there is any
243 $removeprefix_expr_within_levelelem =~ s/^'//;
244 $removeprefix_expr_within_levelelem =~ s/'$//;
245 # Remove the extra ^ at the beginning
246 $removeprefix_expr_within_levelelem =~ s/^\^//;
247 $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
248 } else {
249 $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
250 }
251 }
252 }
253 if ($self->{'removesuffix'}) {
254 my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
255
256 foreach my $metadata_group (@metadata_groups) {
257 my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
258 if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
259 $removesuffix_expr_within_levelelem =~ s/^'//;
260 $removesuffix_expr_within_levelelem =~ s/'$//;
261 # Remove the extra $ at the end
262 $removesuffix_expr_within_levelelem =~ s/\$$//;
263 $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
264 } else {
265 $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
266 }
267 }
268 }
269
270 # The metadata elements to use to sort the leaf nodes (default: Title)
271 my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
272 if ($self->{'sort_leaf_nodes_using'}) {
273 @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
274 }
275 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
276
277 # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
278 if ($self->{'sort_using_unicode_collation'}) {
279 # To use this you first need to download the allkeys.txt file from
280 # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
281 # Unicode/Collate directory.
282 require Unicode::Collate;
283 $self->{'unicode_collator'} = Unicode::Collate->new();
284 }
285
286 # An empty array for the document/section OIDs that we are classifying
287 $self->{'OIDs'} = [];
288 # A hash for all the doc ids that we have seen, so we don't classify something twice
289 $self->{'all_doc_OIDs'} = {};
290 return bless $self, $class;
291}
292
293
294sub init
295{
296 # Nothing to do...
297}
298
299
300# Called for each document in the collection
301sub classify
302{
303 my $self = shift(@_);
304 my ($doc_obj) = @_;
305
306 if (defined $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()}) {
307 print STDERR "Warning, List classifier has already seen document ".$doc_obj->get_OID().", not classifying again\n";
308 return;
309 }
310 $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()} = 1;
311 # check against filter here
312 if ($self->{'filter_metadata'}) {
313 #print STDERR "filtering documents on $self->{'filter_metadata'}\n";
314 my $meta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'filter_metadata'});
315 return unless defined $meta;
316 if ($self->{'filter_regex'} ne "" && $meta !~ /$self->{'filter_regex'}/) {
317 #print STDERR "doesn't match regex\n";
318 return;
319
320 }
321 }
322 # if we get here, we have passed the test for filtering
323 # If "-classify_sections" is set, classify every section of the document
324 if ($self->{'classify_sections'}) {
325 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
326 while (defined $section) {
327 $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section);
328 $section = $doc_obj->get_next_section($section);
329 }
330 }
331 # Otherwise just classify the top document section
332 else {
333 $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section());
334 }
335
336}
337
338sub classify_section
339{
340 my $self = shift(@_);
341 my ($doc_obj,$section_OID,$section) = @_;
342
343 my @metadata_groups = @{$self->{'metadata_groups'}};
344
345
346 # Only classify the section if it has a value for one of the metadata elements in the first group
347 my $classify_section = 0;
348 my $first_metadata_group = $metadata_groups[0];
349 my $remove_prefix_expr = $self->{$first_metadata_group . ".remove_prefix_expr"};
350 my $remove_suffix_expr = $self->{$first_metadata_group . ".remove_suffix_expr"};
351 foreach my $first_metadata_group_element (split(/\;|,/, $first_metadata_group)) {
352 my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
353 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element);
354
355 # Remove prefix/suffix if requested
356 if (defined ($first_metadata_group_element_value)) {
357 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
358 $first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
359 }
360
361 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
362 $first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
363 }
364 }
365 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
366 # This section must be included in the classifier
367 $classify_section = 1;
368 last;
369 }
370 }
371
372 # We're not classifying this section because it doesn't have the required metadata
373 return if (!$classify_section);
374
375 # Otherwise, include this section in the classifier
376
377 push(@{$self->{'OIDs'}}, $section_OID);
378
379 # Create a hash for the metadata values of each metadata element we're interested in
380 my %metadata_groups_done = ();
381 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
382 # Take care not to do a metadata group more than once
383 unless ($metadata_groups_done{$metadata_group}) {
384 my $remove_prefix_expr = $self->{$metadata_group . ".remove_prefix_expr"};
385 my $remove_suffix_expr = $self->{$metadata_group . ".remove_suffix_expr"};
386 foreach my $metadata_element (split(/\;|,/, $metadata_group)) {
387 my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);
388
389 my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
390 foreach my $metadata_value (@metadata_values) {
391 # Strip leading and trailing whitespace
392 $metadata_value =~ s/^\s*//;
393 $metadata_value =~ s/\s*$//;
394
395 # Remove prefix/suffix if requested
396 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
397 $metadata_value =~ s/^$remove_prefix_expr//;
398 }
399 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
400 $metadata_value =~ s/$remove_suffix_expr$//;
401 }
402
403 # Don't uppercase metadata here, user can change the display with CSS or javascript
404 # further on. Moreover, CSS is unable to turn letters that are capitals by default
405 # into lowercase, whereas letters that are lowercase by default can be titlecased or
406 # capitalised later with CSS if necessary.
407 # uppercase the metadata - makes the AZList nicer
408 #$metadata_value = uc($metadata_value);
409
410 # We are already working with unicode aware strings at this
411 # stage, so we no longer need to convert from utf8 to unicode
412 my $metadata_value_unicode_string = $metadata_value; # $self->convert_utf8_string_to_unicode_string($metadata_value);
413
414 # Add the metadata value into the list for this combination of metadata group and section
415 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $metadata_value_unicode_string);
416 }
417 last if (@metadata_values > 0);
418 }
419
420 $metadata_groups_done{$metadata_group} = 1;
421 }
422 }
423}
424
425
426sub get_classify_info
427{
428 my $self = shift(@_);
429
430 # The metadata groups to classify by
431 my @metadata_groups = @{$self->{'metadata_groups'}};
432 my $first_metadata_group = $metadata_groups[0];
433
434 # The OID values of the documents to include in the classifier
435 my @OIDs = @{$self->{'OIDs'}};
436
437 # Create the root node of the classification hierarchy
438 my %classifier_node = ( 'thistype' => "Invisible",
439 'childtype' => $self->{$first_metadata_group . ".list_type"},
440 'Title' => $self->{'buttonname'},
441 'contains' => [],
442 'mdtype' => $first_metadata_group );
443
444 # Recursively create the classification hierarchy, one level for each metadata group
445 $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
446 return \%classifier_node;
447}
448
449
450sub add_level
451{
452 my $self = shift(@_);
453 my @metadata_groups = @{shift(@_)};
454 my @OIDs = @{shift(@_)};
455 my $classifier_node = shift(@_);
456
457 my $metadata_group = $metadata_groups[0];
458
459 if (!defined($self->{$metadata_group . ".list"})) {
460 print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
461 return;
462 }
463
464 # Create a mapping from metadata value to OID
465 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
466 my %metadata_value_to_OIDs_hash = ();
467 foreach my $OID (@OIDs)
468 {
469 if ($OID_to_metadata_values_hash_ref->{$OID})
470 {
471 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
472 foreach my $metadata_value (@metadata_values)
473 {
474 push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
475 }
476 }
477 }
478 #print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
479
480 # Partition the values (if necessary)
481 my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
482 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
483 if ($partition_type_within_level =~ /^per_letter$/i) {
484 # Generate one hlist for each letter
485 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
486 my %metadata_value_to_OIDs_subhash = ();
487
488 my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
489 foreach my $metadata_value (@sortedmetadata_values) {
490 my $metadata_valuepartition = substr($metadata_value, 0, 1);
491
492 # Is this the start of a new partition?
493 if ($metadata_valuepartition ne $lastpartition) {
494 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
495 %metadata_value_to_OIDs_subhash = ();
496 $lastpartition = $metadata_valuepartition;
497 }
498
499 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
500 }
501
502 # Don't forget to add the last partition
503 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
504
505 # The partitions are stored in an HList
506 $classifier_node->{'childtype'} = "HList";
507 }
508 elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
509 # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size
510 # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible
511 # If a letter has more items than specified, split into several hlists.
512 # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
513 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
514 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
515 my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};
516
517 # Separate values by their first letter, each form a bucket, like the per_letter partition type
518 my $last_partition = substr($sortedmetadata_values[0], 0, 1);
519 my %partition_buckets = ();
520 my @metadata_values_in_bucket = ();
521 my $num_items_in_bucket = 0;
522 foreach my $metadata_value (@sortedmetadata_values) {
523 my $metadata_valuepartition = substr($metadata_value, 0, 1);
524 if ($metadata_valuepartition ne $last_partition) {
525 my @temp_array = @metadata_values_in_bucket;
526 # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
527 my %partition_info = ();
528 $partition_info{'metadata_values'} = \@temp_array;
529 $partition_info{'size'} = $num_items_in_bucket;
530 $partition_buckets{$last_partition} = \%partition_info;
531
532 @metadata_values_in_bucket = ($metadata_value);
533 $num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
534 $last_partition = $metadata_valuepartition;
535 } else {
536 $num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
537 push (@metadata_values_in_bucket, $metadata_value);
538 }
539 }
540 # Last one
541 my %partition_info = ();
542 $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
543 $partition_info{'size'} = $num_items_in_bucket;
544 $partition_buckets{$last_partition} = \%partition_info;
545
546 my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
547 for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) {
548 my $partition = $partition_keys[$i];
549 my $items_in_partition = $partition_buckets{$partition}->{'size'};
550 # Merge small buckets together, but keep the numeric bucket apart
551 if ($items_in_partition < $partition_size_within_level) {
552 my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'};
553 if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level
554 && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)
555 || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) {
556 foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) {
557 push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge);
558 }
559 $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition;
560 delete $partition_buckets{$partition};
561 }
562 }
563 }
564 @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
565
566 # Add partitions, and divide big bucket into several
567 my $last_partition_end = "";
568 my $partition_start = "";
569 foreach my $partition (@partition_keys) {
570 my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}});
571 my $items_in_partition = $partition_buckets{$partition}->{'size'};
572 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"});
573
574 if ($items_in_partition > $partition_size_within_level) {
575 my $items_done = 0;
576 my %metadata_values_to_OIDs_subhashes = ();
577 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
578 my $metadata_value = $metadata_values[$i];
579 # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
580 my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;
581
582 my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"});
583 my $partitionname = $partition_start;
584 if ($partitionend ne $partition_start) {
585 $partitionname = $partitionname . "-" . $partitionend;
586 }
587
588 # Start a new partition
589 if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) {
590 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
591 $last_partition_end = $partitionend;
592 $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
593 $items_done = 0;
594 %metadata_values_to_OIDs_subhashes = ();
595 }
596
597 # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
598 if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) {
599 my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
600 # Get the number of partitions needed for this value
601 my $num_splits = int($items_for_this_md_value / $partition_size_within_level);
602 $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits);
603
604 my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}};
605 for (my $i = 0; $i < $num_splits; $i++) {
606 my %OIDs_subhashes_for_this_value = ();
607 my @OIDs_for_this_partition = ();
608 for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) {
609 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
610 }
611
612 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
613 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) {
614 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
615 $items_done += scalar(@OIDs_for_this_partition);
616 next;
617 }
618
619 # Add an HList for this bucket
620 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
621 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
622 $last_partition_end = $partitionname_for_this_value;
623 }
624 next;
625 }
626
627 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
628 $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;
629
630 # The last partition
631 if($i == scalar(@metadata_values) - 1) {
632 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
633 }
634 }
635 }
636 else {
637 # The easier case, just add a partition
638 my %metadata_values_to_OIDs_subhashes = ();
639 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
640 my $metadata_value = $metadata_values[$i];
641 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
642 }
643 my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
644 my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"});
645 my $partitionname = $partition_start;
646 if ($partitionend ne $partition_start) {
647 $partitionname = $partitionname . "-" . $partitionend;
648 }
649 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
650 $last_partition_end = $partitionend;
651 }
652 }
653
654 # The partitions are stored in an HList
655 $classifier_node->{'childtype'} = "HList";
656
657 } # end approximate_size
658 else {
659 # Generate hlists of a certain size
660 if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
661 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
662 my $itemsdone = 0;
663 my %metadata_value_to_OIDs_subhash = ();
664 my $lastpartitionend = "";
665 my $partitionstart;
666 foreach my $metadata_value (@sortedmetadata_values) {
667 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
668 $itemsdone++;
669 my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
670
671 # Is this the start of a new partition?
672 if ($itemsinpartition == 1) {
673 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
674 }
675
676 # Is this the end of the partition?
677 if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) {
678 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
679 my $partitionname = $partitionstart;
680 if ($partitionend ne $partitionstart) {
681 $partitionname = $partitionname . "-" . $partitionend;
682 }
683
684 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
685 %metadata_value_to_OIDs_subhash = ();
686 $lastpartitionend = $partitionend;
687 }
688 }
689
690 # The partitions are stored in an HList
691 $classifier_node->{'childtype'} = "HList";
692 }
693
694 # Otherwise just add all the values to a VList
695 else {
696 $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
697 }
698 }
699}
700
701
702sub generate_partition_start
703{
704 my $self = shift(@_);
705 my $metadata_value = shift(@_);
706 my $lastpartitionend = shift(@_);
707 my $partition_name_length = shift(@_);
708
709 if ($partition_name_length) {
710 return substr($metadata_value, 0, $partition_name_length);
711 }
712
713 my $partitionstart = substr($metadata_value, 0, 1);
714 if ($partitionstart le $lastpartitionend) {
715 $partitionstart = substr($metadata_value, 0, 2);
716 # Give up after three characters
717 if ($partitionstart le $lastpartitionend) {
718 $partitionstart = substr($metadata_value, 0, 3);
719 }
720 }
721
722 return $partitionstart;
723}
724
725
726sub generate_partition_end
727{
728 my $self = shift(@_);
729 my $metadata_value = shift(@_);
730 my $partitionstart = shift(@_);
731 my $partition_name_length = shift(@_);
732
733 if ($partition_name_length) {
734 return substr($metadata_value, 0, $partition_name_length);
735 }
736
737 my $partitionend = substr($metadata_value, 0, length($partitionstart));
738 if ($partitionend gt $partitionstart) {
739 $partitionend = substr($metadata_value, 0, 1);
740 if ($partitionend le $partitionstart) {
741 $partitionend = substr($metadata_value, 0, 2);
742 # Give up after three characters
743 if ($partitionend le $partitionstart) {
744 $partitionend = substr($metadata_value, 0, 3);
745 }
746 }
747 }
748
749 return $partitionend;
750}
751
752
753sub add_hlist_partition
754{
755 my $self = shift(@_);
756 my @metadata_groups = @{shift(@_)};
757 my $classifier_node = shift(@_);
758 my $partitionname = shift(@_);
759 my $metadata_value_to_OIDs_hash_ref = shift(@_);
760
761 # Create an hlist partition
762 # Note that we don't need to convert from unicode-aware strings
763 # to utf8 here, as that is handled elsewhere in the code
764 my %child_classifier_node = ( 'Title' => $partitionname, #'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
765 'childtype' => "VList",
766 'contains' => [] );
767
768 # Add the children to the hlist partition
769 $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
770 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
771}
772
773
774sub add_vlist
775{
776 my $self = shift(@_);
777 my @metadata_groups = @{shift(@_)};
778 my $classifier_node = shift(@_);
779 my $metadata_value_to_OIDs_hash_ref = shift(@_);
780
781 my $metadata_group = shift(@metadata_groups);
782 $classifier_node->{'mdtype'} = $metadata_group;
783
784 # Create an entry in the vlist for each value
785 foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
786 {
787 my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
788 # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
789 if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
790 my $OID = $OIDs[0];
791 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
792 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
793 }
794 # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
795 elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
796 @OIDs = $self->sort_leaf_items(\@OIDs);
797 foreach my $OID (@OIDs) {
798 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
799 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
800 }
801
802 }
803 # Otherwise create a sublist (bookshelf) for the metadata value
804 else {
805 # Note that we don't need to convert from unicode-aware strings
806 # to utf8 here, as that is handled elsewhere in the code
807 my %child_classifier_node = ( 'Title' => $metadata_value, # 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
808 'childtype' => "VList",
809 'mdtype' => $metadata_group,
810 'contains' => [] );
811
812 #@OIDs = $self->sort_leaf_items(\@OIDs);
813 # If there are metadata elements remaining, recursively apply the process
814 if (@metadata_groups > 0) {
815 my $next_metadata_group = $metadata_groups[0];
816 $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
817
818 # separate metadata into those that below in the next/sub-metadata_group
819 # and those that below at the current level's metadata_group
820
821 my $OID_to_metadata_values_hash_ref = $self->{$next_metadata_group . ".list"};
822 my @current_level_OIDs = ();
823 my @next_level_OIDs = ();
824 foreach my $OID (@OIDs)
825 {
826 if ($OID_to_metadata_values_hash_ref->{$OID}) {
827 push(@next_level_OIDs, $OID);
828 } else {
829 push(@current_level_OIDs, $OID);
830 }
831 }
832 # recursively process those docs belonging to the sub-metadata_group
833 $self->add_level(\@metadata_groups, \@next_level_OIDs, \%child_classifier_node);
834
835 # For those docs that don't belong in the sub/next_metadata_group, but which belong
836 # at this level, just add the documents as children of this list at the current level
837 @current_level_OIDs = $self->sort_leaf_items(\@current_level_OIDs);
838 foreach my $current_level_OID (@current_level_OIDs) {
839 my $offset = $self->metadata_offset($metadata_group, $current_level_OID, $metadata_value);
840 push(@{$child_classifier_node{'contains'}}, { 'OID' => $current_level_OID , 'offset' => $offset });
841 }
842 }
843 # Otherwise just add the documents as children of this list
844 else {
845 @OIDs = $self->sort_leaf_items(\@OIDs);
846 foreach my $OID (@OIDs) {
847 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
848 push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
849 }
850
851 }
852
853 # Add the sublist to the list
854 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
855 }
856 }
857}
858
859sub metadata_offset
860{
861 my $self = shift(@_);
862 my $metadata_group = shift(@_);
863 my $OID = shift(@_);
864 my $metadata_value = shift(@_);
865
866 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
867 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
868 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
869 if ($metadata_value eq $metadata_values[$i]) {
870 return $i;
871 }
872 }
873
874 return 0;
875}
876
877sub sort_leaf_items
878{
879 my $self = shift(@_);
880 my @OIDs = @{shift(@_)};
881# my $classifier_node = shift(@_);
882
883 # Sort leaf nodes and add to list
884 my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
885 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) {
886 my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
887 # Force a stable sort (Perl 5.6's sort isn't stable)
888 # !! The [0] bits aren't ideal (multiple metadata values) !!
889 @OIDs = @OIDs[ sort {
890 if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]} && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]})))
891 {
892 $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
893 }
894 else
895 {
896 $a <=> $b;
897 }
898 } 0..$#OIDs ];
899 }
900 if ($self->{'reverse_sort_leaf_nodes'}) {
901 #print STDERR "reversing\n";
902 return reverse @OIDs;
903 }
904 return @OIDs;
905}
906
907
908
909sub sort_metadata_values_array
910{
911 my $self = shift(@_);
912 my @metadata_values = @_;
913
914 if ($self->{'unicode_collator'}) {
915 return $self->{'unicode_collator'}->sort(@metadata_values);
916 }
917 else {
918 return sort(@metadata_values);
919 }
920}
921
922
9231;
Note: See TracBrowser for help on using the repository browser.