source: main/trunk/greenstone2/perllib/classify/List.pm@ 24147

Last change on this file since 24147 was 24147, checked in by ak19, 13 years ago

On Professor Witten's request List.pm no longer forces capitalisation of bookshelf names. The perlcode was doing this for display purposes, so it won't affect string comparisons and grouping of all case-variants into a single bucket. (Earlier, Diego also noted problems with the forced capitalisation, since CSS is unable to change the case -- lowercase or titlecase -- words that were capitalised by nature, as happens when List.pm outputs them that way.

  • Property svn:keywords set to Author Date Id Revision
File size: 36.6 KB
RevLine 
[10398]1###########################################################################
2#
[18568]3# List.pm -- A general and flexible list classifier with most of
[10398]4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
[13741]29# TO DO: - Remove punctuation from metadata values before sorting.
30# - Add an AZCompactList-style hlist option?
[10398]31#
32###########################################################################
33
[18568]34package List;
[10398]35
36
[17209]37use BaseClassifier;
[10398]38
39use strict;
40
41
42sub BEGIN {
[18568]43 @List::ISA = ('BaseClassifier');
[10398]44}
45
[18572]46my $partition_type_list =
47 [ { 'name' => "per_letter",
48 'desc' => "{List.level_partition.per_letter}" },
[20865]49 { 'name' => "approximate_size",
50 'desc' => "{List.level_partition.approximate_size}"},
[18572]51 { 'name' => "constant_size",
[18619]52 'desc' => "{List.level_partition.constant_size}" },
[18572]53 { 'name' => "none",
54 'desc' => "{List.level_partition.none}" } ];
[10398]55
[20825]56# following used to check types later on
57my $valid_partition_types = { 'per_letter' => 1,
58 'constant_size' => 1,
59 'per_letter_fixed_size' => 1,
[20865]60 'approximate_size' => 1,
[20825]61 'none' => 1};
62
[18619]63my $bookshelf_type_list =
64 [ { 'name' => "always",
65 'desc' => "{List.bookshelf_type.always}" },
66 { 'name' => "duplicate_only",
67 'desc' => "{List.bookshelf_type.duplicate_only}" },
68 { 'name' => "never",
[20008]69 'desc' => "{List.bookshelf_type.never}" } ];
[18619]70
[10398]71my $arguments =
72 [ { 'name' => "metadata",
[19234]73 'desc' => "{List.metadata}",
[10398]74 'type' => "metadata",
75 'reqd' => "yes" },
[12889]76
[10398]77 # The interesting options
[18619]78 { 'name' => "bookshelf_type",
[19234]79 'desc' => "{List.bookshelf_type}",
[18619]80 'type' => "enum",
81 'list' => $bookshelf_type_list,
[19645]82 'deft' => "never" },
[10498]83 { 'name' => "classify_sections",
[19234]84 'desc' => "{List.classify_sections}",
[10498]85 'type' => "flag" },
86 { 'name' => "partition_type_within_level",
[19234]87 'desc' => "{List.partition_type_within_level}",
[20679]88 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
[20008]89 'list' => $partition_type_list,
90 'deft' => "per_letter" },
[10498]91 { 'name' => "partition_size_within_level",
[19234]92 'desc' => "{List.partition_size_within_level}",
[20679]93 'type' => "string" }, # Must be string because multiple values can be specified (separated by '/')
[14084]94 { 'name' => "partition_name_length",
[19234]95 'desc' => "{List.partition_name_length}",
[14084]96 'type' => "string" },
[10498]97 { 'name' => "sort_leaf_nodes_using",
[19234]98 'desc' => "{List.sort_leaf_nodes_using}",
[10398]99 'type' => "metadata",
[10499]100 'deft' => "Title" },
[23302]101 { 'name' => "reverse_sort_leaf_nodes",
102 'desc' => "{List.reverse_sort_leaf_nodes}",
103 'type' => "flag"},
[13551]104 { 'name' => "sort_using_unicode_collation",
[19234]105 'desc' => "{List.sort_using_unicode_collation}",
[13551]106 'type' => "flag" },
[10499]107 { 'name' => "use_hlist_for",
[19234]108 'desc' => "{List.use_hlist_for}",
[18619]109 'type' => "string" },
110 { 'name' => "removeprefix",
111 'desc' => "{BasClas.removeprefix}",
112 'type' => "regexp" },
113 { 'name' => "removesuffix",
114 'desc' => "{BasClas.removesuffix}",
115 'type' => "regexp" } ];
[10398]116
[18568]117my $options = { 'name' => "List",
[19234]118 'desc' => "{List.desc}",
[10502]119 'abstract' => "no",
[18572]120 'inherits' => "yes",
[10398]121 'args' => $arguments };
122
123
124sub new
125{
126 my ($class) = shift(@_);
127 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
128 push(@$classifierslist, $class);
129
[17209]130 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
131 push(@{$hashArgOptLists->{"OptList"}}, $options);
[10398]132
[17209]133 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
[10398]134
135 if ($self->{'info_only'}) {
136 # don't worry about any options etc
137 return bless $self, $class;
138 }
139
140 # The metadata elements to use (required)
[12894]141 if (!$self->{'metadata'}) {
[18568]142 die "Error: No metadata fields specified for List.\n";
[10398]143 }
[24012]144
145 my @metadata_groups = split(/[\/]/, $self->{'metadata'});
[12889]146 $self->{'metadata_groups'} = \@metadata_groups;
[10398]147
[12894]148 # The classifier button name (default: the first metadata element specified)
149 if (!$self->{'buttonname'}) {
150 my $first_metadata_group = $metadata_groups[0];
[24012]151 my $first_metadata_element = (split(/[\;|,\/]/, $first_metadata_group))[0];
[12894]152 $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
[10398]153 }
154
[18619]155 # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
[12889]156 foreach my $metadata_group (@metadata_groups) {
[18619]157 $self->{$metadata_group . ".bookshelf_type"} = "always";
158 }
159 my $last_metadata_group = $metadata_groups[$#metadata_groups];
160 # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
161 $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
162
[12892]163 # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
[12889]164 foreach my $metadata_group (@metadata_groups) {
[12892]165 $self->{$metadata_group . ".list_type"} = "VList";
[10499]166 }
[12892]167 foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
168 $self->{$metadata_group . ".list_type"} = "HList";
[10499]169 }
170
[18619]171 # How the items are grouped into partitions (default: no partition)
172 # for each level (metadata group), separated by '/'
[12894]173 if (!$self->{"partition_type_within_level"}) {
[18619]174 foreach my $metadata_group (@metadata_groups) {
175 $self->{$metadata_group . ".partition_type_within_level"} = "none";
176 }
177 } else {
178 my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'});
[20825]179
180 my $first = 1;
[18619]181 foreach my $metadata_group (@metadata_groups) {
182 my $partition_type_within_levelelem = shift(@partition_type_within_levellist);
[20865]183 if (defined($partition_type_within_levelelem) && $partition_type_within_levelelem eq "per_letter_fixed_size") {
184 print STDERR "per letter fixed size, changing to approximate size\n";
185 $partition_type_within_levelelem = "approximate_size";
186 }
[20825]187 if (defined($partition_type_within_levelelem) && defined $valid_partition_types->{$partition_type_within_levelelem}) {
[18619]188 $self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem;
189 }
190 else {
[20825]191 if ($first) {
192 $self->{$metadata_group . ".partition_type_within_level"} = "none";
193 $first = 0;
194 } else {
195 $self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"};
196 }
197 if (defined($partition_type_within_levelelem)) {
198 # ie invalid entry
199 print STDERR "invalid partition type for level $metadata_group: $partition_type_within_levelelem, defaulting to ". $self->{$metadata_group . ".partition_type_within_level"} ."\n";
200 }
[18619]201 }
202 }
[10398]203 }
[20825]204
[10499]205 # The number of items in each partition
[12894]206 if (!$self->{'partition_size_within_level'}) {
[10398]207 # Default: 20
[12889]208 foreach my $metadata_group (@metadata_groups) {
209 $self->{$metadata_group . ".partition_size_within_level"} = 20;
[10398]210 }
211 }
212 else {
[12894]213 my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
[10398]214
[10498]215 # Assign values based on the partition_size_within_level parameter
[12889]216 foreach my $metadata_group (@metadata_groups) {
[10498]217 my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
218 if (defined($partition_size_within_levelelem)) {
[12889]219 $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
[10398]220 }
221 else {
[12889]222 $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
[10398]223 }
224 }
225 }
226
[18619]227 # The removeprefix and removesuffix expressions
228 if ($self->{'removeprefix'}) {
229 # If there are more than one expressions, use '' to quote each experession and '/' to separate
230 my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
231
232 foreach my $metadata_group (@metadata_groups) {
233 my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
234 if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
235 # Remove the other ' at the beginning and the end if there is any
236 $removeprefix_expr_within_levelelem =~ s/^'//;
237 $removeprefix_expr_within_levelelem =~ s/'$//;
238 # Remove the extra ^ at the beginning
239 $removeprefix_expr_within_levelelem =~ s/^\^//;
240 $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
241 } else {
242 $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
243 }
244 }
245 }
246 if ($self->{'removesuffix'}) {
247 my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
248
249 foreach my $metadata_group (@metadata_groups) {
250 my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
251 if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
252 $removesuffix_expr_within_levelelem =~ s/^'//;
253 $removesuffix_expr_within_levelelem =~ s/'$//;
254 # Remove the extra $ at the end
255 $removesuffix_expr_within_levelelem =~ s/\$$//;
256 $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
257 } else {
258 $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
259 }
260 }
261 }
262
[12894]263 # The metadata elements to use to sort the leaf nodes (default: Title)
264 my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
265 if ($self->{'sort_leaf_nodes_using'}) {
266 @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
[10398]267 }
[12894]268 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
[10398]269
[13551]270 # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
271 if ($self->{'sort_using_unicode_collation'}) {
[13791]272 # To use this you first need to download the allkeys.txt file from
273 # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
274 # Unicode/Collate directory.
[13551]275 require Unicode::Collate;
276 $self->{'unicode_collator'} = Unicode::Collate->new();
277 }
278
[23154]279 # An empty array for the document/section OIDs that we are classifying
[12894]280 $self->{'OIDs'} = [];
[23154]281 # A hash for all the doc ids that we have seen, so we don't classify something twice
282 $self->{'all_doc_OIDs'} = {};
[10398]283 return bless $self, $class;
284}
285
286
287sub init
288{
289 # Nothing to do...
290}
291
292
[12896]293# Called for each document in the collection
[10398]294sub classify
295{
296 my $self = shift(@_);
[23116]297 my ($doc_obj) = @_;
[10398]298
[23154]299 if (defined $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()}) {
300 print STDERR "Warning, List classifier has already seen document ".$doc_obj->get_OID().", not classifying again\n";
301 return;
302 }
303 $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()} = 1;
[12896]304 # If "-classify_sections" is set, classify every section of the document
[10398]305 if ($self->{'classify_sections'}) {
306 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
307 while (defined $section) {
[23116]308 $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section);
[10398]309 $section = $doc_obj->get_next_section($section);
310 }
311 }
[12896]312 # Otherwise just classify the top document section
[10398]313 else {
[23116]314 $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section());
[10398]315 }
[23154]316
[10398]317}
318
319sub classify_section
320{
321 my $self = shift(@_);
[23116]322 my ($doc_obj,$section_OID,$section) = @_;
[10398]323
[12889]324 my @metadata_groups = @{$self->{'metadata_groups'}};
[10398]325
[12896]326 # Only classify the section if it has a value for one of the metadata elements in the first group
327 my $classify_section = 0;
328 my $first_metadata_group = $metadata_groups[0];
[22175]329 my $remove_prefix_expr = $self->{$first_metadata_group . ".remove_prefix_expr"};
330 my $remove_suffix_expr = $self->{$first_metadata_group . ".remove_suffix_expr"};
[20008]331 foreach my $first_metadata_group_element (split(/\;|,/, $first_metadata_group)) {
[20424]332 my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
[20421]333 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element);
[18619]334
335 # Remove prefix/suffix if requested
[22175]336 if (defined ($first_metadata_group_element_value)) {
337 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
338 $first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
339 }
340
341 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
342 $first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
343 }
[18619]344 }
[12896]345 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
346 # This section must be included in the classifier
347 $classify_section = 1;
348 last;
[18619]349 }
[12896]350 }
[10398]351
[12896]352 # We're not classifying this section because it doesn't have the required metadata
353 return if (!$classify_section);
[18455]354
[12896]355 # Otherwise, include this section in the classifier
[23154]356
[12896]357 push(@{$self->{'OIDs'}}, $section_OID);
358
359 # Create a hash for the metadata values of each metadata element we're interested in
360 my %metadata_groups_done = ();
361 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
362 # Take care not to do a metadata group more than once
363 unless ($metadata_groups_done{$metadata_group}) {
[22175]364 my $remove_prefix_expr = $self->{$metadata_group . ".remove_prefix_expr"};
365 my $remove_suffix_expr = $self->{$metadata_group . ".remove_suffix_expr"};
[20008]366 foreach my $metadata_element (split(/\;|,/, $metadata_group)) {
[20424]367 my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);
368
[20421]369 my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
[12896]370 foreach my $metadata_value (@metadata_values) {
371 # Strip leading and trailing whitespace
372 $metadata_value =~ s/^\s*//;
373 $metadata_value =~ s/\s*$//;
[13550]374
[18619]375 # Remove prefix/suffix if requested
376 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
377 $metadata_value =~ s/^$remove_prefix_expr//;
378 }
379 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
380 $metadata_value =~ s/$remove_suffix_expr$//;
381 }
382
[24147]383 # Don't uppercase metadata here, user can change the display with CSS or javascript
384 # further on. Moreover, CSS is unable to turn letters that are capitals by default
385 # into lowercase, whereas letters that are lowercase by default can be titlecased or
386 # capitalised later with CSS if necessary.
[20865]387 # uppercase the metadata - makes the AZList nicer
[24147]388 #$metadata_value = uc($metadata_value);
[24016]389
390 # We are already working with unicode aware strings at this
391 # stage, so we no longer need to convert from utf8 to unicode
392 my $metadata_value_unicode_string = $metadata_value; # $self->convert_utf8_string_to_unicode_string($metadata_value);
[13550]393
394 # Add the metadata value into the list for this combination of metadata group and section
395 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $metadata_value_unicode_string);
[10398]396 }
[12896]397 last if (@metadata_values > 0);
[10398]398 }
399
[12896]400 $metadata_groups_done{$metadata_group} = 1;
[10398]401 }
402 }
403}
404
405
406sub get_classify_info
407{
408 my $self = shift(@_);
409
[12896]410 # The metadata groups to classify by
[12889]411 my @metadata_groups = @{$self->{'metadata_groups'}};
412 my $first_metadata_group = $metadata_groups[0];
[10398]413
[12896]414 # The OID values of the documents to include in the classifier
[12889]415 my @OIDs = @{$self->{'OIDs'}};
[10398]416
[12896]417 # Create the root node of the classification hierarchy
[12893]418 my %classifier_node = ( 'thistype' => "Invisible",
[12895]419 'childtype' => $self->{$first_metadata_group . ".list_type"},
[12894]420 'Title' => $self->{'buttonname'},
[13271]421 'contains' => [],
422 'mdtype' => $first_metadata_group );
[10398]423
[12895]424 # Recursively create the classification hierarchy, one level for each metadata group
[14173]425 $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
[12893]426 return \%classifier_node;
[10398]427}
428
429
[12895]430sub add_level
[10398]431{
432 my $self = shift(@_);
[12889]433 my @metadata_groups = @{shift(@_)};
434 my @OIDs = @{shift(@_)};
[12893]435 my $classifier_node = shift(@_);
[23154]436
[12889]437 my $metadata_group = $metadata_groups[0];
[23154]438
[13340]439 if (!defined($self->{$metadata_group . ".list"})) {
440 print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
441 return;
442 }
[10398]443
444 # Create a mapping from metadata value to OID
[14845]445 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
[18619]446 my %metadata_value_to_OIDs_hash = ();
[14845]447 foreach my $OID (@OIDs)
448 {
449 if ($OID_to_metadata_values_hash_ref->{$OID})
450 {
451 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
452 foreach my $metadata_value (@metadata_values)
453 {
454 push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
[10398]455 }
456 }
457 }
[24012]458 #print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
[10398]459
460 # Partition the values (if necessary)
[18619]461 my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
[20904]462 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
[10498]463 if ($partition_type_within_level =~ /^per_letter$/i) {
[10398]464 # Generate one hlist for each letter
[14845]465 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
466 my %metadata_value_to_OIDs_subhash = ();
[10398]467
[14845]468 my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
469 foreach my $metadata_value (@sortedmetadata_values) {
470 my $metadata_valuepartition = substr($metadata_value, 0, 1);
[10398]471
472 # Is this the start of a new partition?
[14845]473 if ($metadata_valuepartition ne $lastpartition) {
474 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
475 %metadata_value_to_OIDs_subhash = ();
476 $lastpartition = $metadata_valuepartition;
[10398]477 }
478
[14845]479 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
[10398]480 }
481
482 # Don't forget to add the last partition
[14845]483 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
[10398]484
485 # The partitions are stored in an HList
[12893]486 $classifier_node->{'childtype'} = "HList";
[10398]487 }
[20904]488 elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
[18619]489 # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size
490 # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible
491 # If a letter has more items than specified, split into several hlists.
492 # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
493 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
494 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
495 my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};
496
497 # Separate values by their first letter, each form a bucket, like the per_letter partition type
498 my $last_partition = substr($sortedmetadata_values[0], 0, 1);
499 my %partition_buckets = ();
500 my @metadata_values_in_bucket = ();
501 my $num_items_in_bucket = 0;
[24012]502 foreach my $metadata_value (@sortedmetadata_values) {
[18619]503 my $metadata_valuepartition = substr($metadata_value, 0, 1);
504 if ($metadata_valuepartition ne $last_partition) {
505 my @temp_array = @metadata_values_in_bucket;
506 # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
507 my %partition_info = ();
508 $partition_info{'metadata_values'} = \@temp_array;
509 $partition_info{'size'} = $num_items_in_bucket;
510 $partition_buckets{$last_partition} = \%partition_info;
511
512 @metadata_values_in_bucket = ($metadata_value);
513 $num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
514 $last_partition = $metadata_valuepartition;
515 } else {
516 $num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
517 push (@metadata_values_in_bucket, $metadata_value);
518 }
519 }
520 # Last one
521 my %partition_info = ();
522 $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
523 $partition_info{'size'} = $num_items_in_bucket;
524 $partition_buckets{$last_partition} = \%partition_info;
525
526 my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
527 for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) {
528 my $partition = $partition_keys[$i];
529 my $items_in_partition = $partition_buckets{$partition}->{'size'};
530 # Merge small buckets together, but keep the numeric bucket apart
531 if ($items_in_partition < $partition_size_within_level) {
532 my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'};
533 if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level
534 && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)
535 || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) {
536 foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) {
537 push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge);
538 }
539 $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition;
540 delete $partition_buckets{$partition};
541 }
542 }
543 }
544 @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
545
546 # Add partitions, and divide big bucket into several
547 my $last_partition_end = "";
548 my $partition_start = "";
549 foreach my $partition (@partition_keys) {
550 my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}});
551 my $items_in_partition = $partition_buckets{$partition}->{'size'};
552 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"});
553
554 if ($items_in_partition > $partition_size_within_level) {
555 my $items_done = 0;
556 my %metadata_values_to_OIDs_subhashes = ();
557 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
558 my $metadata_value = $metadata_values[$i];
559 # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
560 my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;
[10398]561
[18619]562 my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"});
563 my $partitionname = $partition_start;
564 if ($partitionend ne $partition_start) {
565 $partitionname = $partitionname . "-" . $partitionend;
566 }
567
568 # Start a new partition
569 if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) {
570 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
571 $last_partition_end = $partitionend;
572 $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
573 $items_done = 0;
574 %metadata_values_to_OIDs_subhashes = ();
575 }
576
577 # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
578 if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) {
579 my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
580 # Get the number of partitions needed for this value
581 my $num_splits = int($items_for_this_md_value / $partition_size_within_level);
582 $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits);
583
584 my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}};
585 for (my $i = 0; $i < $num_splits; $i++) {
586 my %OIDs_subhashes_for_this_value = ();
587 my @OIDs_for_this_partition = ();
588 for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) {
589 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
590 }
591
592 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
593 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) {
594 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
595 $items_done += scalar(@OIDs_for_this_partition);
596 next;
597 }
598
599 # Add an HList for this bucket
600 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
601 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
602 $last_partition_end = $partitionname_for_this_value;
603 }
604 next;
605 }
606
607 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
608 $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;
609
610 # The last partition
611 if($i == scalar(@metadata_values) - 1) {
612 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
613 }
614 }
615 }
616 else {
617 # The easier case, just add a partition
618 my %metadata_values_to_OIDs_subhashes = ();
619 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
620 my $metadata_value = $metadata_values[$i];
621 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
622 }
623 my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
624 my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"});
625 my $partitionname = $partition_start;
626 if ($partitionend ne $partition_start) {
627 $partitionname = $partitionname . "-" . $partitionend;
628 }
629 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
630 $last_partition_end = $partitionend;
631 }
632 }
[20865]633
634 # The partitions are stored in an HList
635 $classifier_node->{'childtype'} = "HList";
636
637 } # end approximate_size
[10398]638 else {
639 # Generate hlists of a certain size
[14845]640 if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
641 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
[10398]642 my $itemsdone = 0;
[14845]643 my %metadata_value_to_OIDs_subhash = ();
[10398]644 my $lastpartitionend = "";
645 my $partitionstart;
[14845]646 foreach my $metadata_value (@sortedmetadata_values) {
647 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
[10398]648 $itemsdone++;
[14845]649 my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
[10398]650
651 # Is this the start of a new partition?
652 if ($itemsinpartition == 1) {
[14845]653 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
[10398]654 }
655
656 # Is this the end of the partition?
[14845]657 if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) {
658 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
[10398]659 my $partitionname = $partitionstart;
660 if ($partitionend ne $partitionstart) {
661 $partitionname = $partitionname . "-" . $partitionend;
662 }
663
[14845]664 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
665 %metadata_value_to_OIDs_subhash = ();
[10398]666 $lastpartitionend = $partitionend;
667 }
668 }
669
670 # The partitions are stored in an HList
[12893]671 $classifier_node->{'childtype'} = "HList";
[10398]672 }
673
674 # Otherwise just add all the values to a VList
675 else {
[14845]676 $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
[10398]677 }
678 }
679}
680
681
[13550]682sub convert_utf8_string_to_unicode_string
[10398]683{
[14173]684 my $self = shift(@_);
[13550]685 my $utf8_string = shift(@_);
[10398]686
[13550]687 my $unicode_string = "";
688 foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
689 $unicode_string .= chr($unicode_value);
690 }
691 return $unicode_string;
[10398]692}
693
694
[13550]695sub convert_unicode_string_to_utf8_string
696{
[14173]697 my $self = shift(@_);
[13550]698 my $unicode_string = shift(@_);
699
700 my @unicode_array;
701 for (my $i = 0; $i < length($unicode_string); $i++) {
702 push(@unicode_array, ord(substr($unicode_string, $i, 1)));
703 }
704 return &unicode::unicode2utf8(\@unicode_array);
705}
706
707
[10398]708sub generate_partition_start
709{
[14173]710 my $self = shift(@_);
[14845]711 my $metadata_value = shift(@_);
[10398]712 my $lastpartitionend = shift(@_);
[14084]713 my $partition_name_length = shift(@_);
[10398]714
[14084]715 if ($partition_name_length) {
[14845]716 return substr($metadata_value, 0, $partition_name_length);
[14084]717 }
718
[14845]719 my $partitionstart = substr($metadata_value, 0, 1);
[10398]720 if ($partitionstart le $lastpartitionend) {
[14845]721 $partitionstart = substr($metadata_value, 0, 2);
[10398]722 # Give up after three characters
723 if ($partitionstart le $lastpartitionend) {
[14845]724 $partitionstart = substr($metadata_value, 0, 3);
[10398]725 }
726 }
727
728 return $partitionstart;
729}
730
731
732sub generate_partition_end
733{
[14173]734 my $self = shift(@_);
[14845]735 my $metadata_value = shift(@_);
[10398]736 my $partitionstart = shift(@_);
[14084]737 my $partition_name_length = shift(@_);
[10398]738
[14084]739 if ($partition_name_length) {
[14845]740 return substr($metadata_value, 0, $partition_name_length);
[14084]741 }
742
[14845]743 my $partitionend = substr($metadata_value, 0, length($partitionstart));
[10398]744 if ($partitionend gt $partitionstart) {
[14845]745 $partitionend = substr($metadata_value, 0, 1);
[10398]746 if ($partitionend le $partitionstart) {
[14845]747 $partitionend = substr($metadata_value, 0, 2);
[10398]748 # Give up after three characters
749 if ($partitionend le $partitionstart) {
[14845]750 $partitionend = substr($metadata_value, 0, 3);
[10398]751 }
752 }
753 }
754
755 return $partitionend;
756}
757
758
759sub add_hlist_partition
760{
761 my $self = shift(@_);
[12889]762 my @metadata_groups = @{shift(@_)};
[12893]763 my $classifier_node = shift(@_);
[10398]764 my $partitionname = shift(@_);
[14845]765 my $metadata_value_to_OIDs_hash_ref = shift(@_);
[10398]766
767 # Create an hlist partition
[24016]768 # Note that we don't need to convert from unicode-aware strings
769 # to utf8 here, as that is handled elsewhere in the code
770 my %child_classifier_node = ( 'Title' => $partitionname, #'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
[12893]771 'childtype' => "VList",
772 'contains' => [] );
[10398]773
774 # Add the children to the hlist partition
[14845]775 $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
[12893]776 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
[10398]777}
778
779
780sub add_vlist
781{
782 my $self = shift(@_);
[12889]783 my @metadata_groups = @{shift(@_)};
[12893]784 my $classifier_node = shift(@_);
[14845]785 my $metadata_value_to_OIDs_hash_ref = shift(@_);
[10398]786
[12889]787 my $metadata_group = shift(@metadata_groups);
[13287]788 $classifier_node->{'mdtype'} = $metadata_group;
[10398]789
790 # Create an entry in the vlist for each value
[14845]791 foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
792 {
793 my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
[18619]794 # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
795 if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
[13271]796 my $OID = $OIDs[0];
[21969]797 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
[13271]798 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
[21969]799 }
[18619]800 # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
801 elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
[21969]802 @OIDs = $self->sort_leaf_items(\@OIDs);
803 foreach my $OID (@OIDs) {
804 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
805 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
806 }
807
[10398]808 }
809 # Otherwise create a sublist (bookshelf) for the metadata value
[24012]810 else {
[24016]811 # Note that we don't need to convert from unicode-aware strings
812 # to utf8 here, as that is handled elsewhere in the code
813 my %child_classifier_node = ( 'Title' => $metadata_value, # 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
[12893]814 'childtype' => "VList",
[21969]815 'mdtype' => $metadata_group,
[12893]816 'contains' => [] );
[10398]817
[24012]818 #@OIDs = $self->sort_leaf_items(\@OIDs);
[10398]819 # If there are metadata elements remaining, recursively apply the process
[12889]820 if (@metadata_groups > 0) {
[24012]821 my $next_metadata_group = $metadata_groups[0];
[12895]822 $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
[24012]823
824 # separate metadata into those that below in the next/sub-metadata_group
825 # and those that below at the current level's metadata_group
826
827 my $OID_to_metadata_values_hash_ref = $self->{$next_metadata_group . ".list"};
828 my @current_level_OIDs = ();
829 my @next_level_OIDs = ();
830 foreach my $OID (@OIDs)
831 {
832 if ($OID_to_metadata_values_hash_ref->{$OID}) {
833 push(@next_level_OIDs, $OID);
834 } else {
835 push(@current_level_OIDs, $OID);
836 }
837 }
838 # recursively process those docs belonging to the sub-metadata_group
839 $self->add_level(\@metadata_groups, \@next_level_OIDs, \%child_classifier_node);
840
841 # For those docs that don't belong in the sub/next_metadata_group, but which belong
842 # at this level, just add the documents as children of this list at the current level
843 @current_level_OIDs = $self->sort_leaf_items(\@current_level_OIDs);
844 foreach my $current_level_OID (@current_level_OIDs) {
845 my $offset = $self->metadata_offset($metadata_group, $current_level_OID, $metadata_value);
846 push(@{$child_classifier_node{'contains'}}, { 'OID' => $current_level_OID , 'offset' => $offset });
847 }
[10398]848 }
849 # Otherwise just add the documents as children of this list
850 else {
[21969]851 @OIDs = $self->sort_leaf_items(\@OIDs);
852 foreach my $OID (@OIDs) {
853 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
854 push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
855 }
856
[10398]857 }
858
859 # Add the sublist to the list
[12893]860 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
[10398]861 }
862 }
863}
864
[21969]865sub metadata_offset
[18619]866{
867 my $self = shift(@_);
[21969]868 my $metadata_group = shift(@_);
869 my $OID = shift(@_);
870 my $metadata_value = shift(@_);
871
872 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
873 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
874 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
875 if ($metadata_value eq $metadata_values[$i]) {
876 return $i;
877 }
878 }
879
880 return 0;
881}
882
883sub sort_leaf_items
884{
885 my $self = shift(@_);
[18619]886 my @OIDs = @{shift(@_)};
[21969]887# my $classifier_node = shift(@_);
[18619]888
889 # Sort leaf nodes and add to list
[20825]890 my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
891 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) {
[18619]892 my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
893 # Force a stable sort (Perl 5.6's sort isn't stable)
894 # !! The [0] bits aren't ideal (multiple metadata values) !!
[22667]895 @OIDs = @OIDs[ sort {
896 if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]} && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]})))
897 {
898 $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
899 }
900 else
901 {
902 $a <=> $b;
903 }
904 } 0..$#OIDs ];
[18619]905 }
[23302]906 if ($self->{'reverse_sort_leaf_nodes'}) {
[24081]907 #print STDERR "reversing\n";
[23302]908 return reverse @OIDs;
909 }
[21969]910 return @OIDs;
[18619]911}
912
913
[21969]914
[13551]915sub sort_metadata_values_array
916{
917 my $self = shift(@_);
918 my @metadata_values = @_;
919
920 if ($self->{'unicode_collator'}) {
921 return $self->{'unicode_collator'}->sort(@metadata_values);
922 }
923 else {
924 return sort(@metadata_values);
925 }
926}
927
928
[10398]9291;
Note: See TracBrowser for help on using the repository browser.