root/gsdl/trunk/perllib/classify/List.pm @ 20825

Revision 20825, 33.8 KB (checked in by kjdon, 10 years ago)

check that the values of the partition_types_within_level option are valid

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# List.pm -- A general and flexible list classifier with most of
4#                   the abilities of AZCompactList, and better Unicode,
5#                   metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
29# TO DO: - Remove punctuation from metadata values before sorting.
30#        - Add an AZCompactList-style hlist option?
31#
32###########################################################################
33
34package List;
35
36
37use BaseClassifier;
38
39use strict;
40
41
42sub BEGIN {
43    @List::ISA = ('BaseClassifier');
44}
45
46my $partition_type_list =
47    [ { 'name' => "per_letter",
48    'desc' => "{List.level_partition.per_letter}" },
49      { 'name' => "constant_size",
50    'desc' => "{List.level_partition.constant_size}" },     
51      { 'name' => "per_letter_fixed_size",
52    'desc' => "{List.level_partition.per_letter_fixed_size}"},
53      { 'name' => "none",
54    'desc' => "{List.level_partition.none}" } ];
55
56# following used to check types later on
57my $valid_partition_types = { 'per_letter' => 1,
58                  'constant_size' => 1,
59                  'per_letter_fixed_size' => 1,
60                  'none' => 1};
61
62my $bookshelf_type_list =
63    [ { 'name' => "always",
64    'desc' => "{List.bookshelf_type.always}" },
65      { 'name' => "duplicate_only",
66    'desc' => "{List.bookshelf_type.duplicate_only}" },
67      { 'name' => "never",
68    'desc' => "{List.bookshelf_type.never}" } ]; 
69
70my $arguments =
71    [ { 'name' => "metadata",
72    'desc' => "{List.metadata}",
73    'type' => "metadata",
74    'reqd' => "yes" },
75
76      # The interesting options
77      { 'name' => "bookshelf_type",
78    'desc' => "{List.bookshelf_type}",
79    'type' => "enum",
80    'list' => $bookshelf_type_list,
81    'deft' => "never" },
82      { 'name' => "classify_sections",
83    'desc' => "{List.classify_sections}",
84    'type' => "flag" },
85      { 'name' => "partition_type_within_level",
86    'desc' => "{List.partition_type_within_level}",
87    'type' => "enumstring",  # Must be enumstring because multiple values can be specified (separated by '/')
88    'list' => $partition_type_list,
89    'deft' => "per_letter" },
90      { 'name' => "partition_size_within_level",
91    'desc' => "{List.partition_size_within_level}",
92    'type' => "string" },  # Must be string because multiple values can be specified (separated by '/')
93      { 'name' => "partition_name_length",
94    'desc' => "{List.partition_name_length}",
95    'type' => "string" },
96      { 'name' => "sort_leaf_nodes_using",
97    'desc' => "{List.sort_leaf_nodes_using}",
98    'type' => "metadata",
99    'deft' => "Title" },
100      { 'name' => "sort_using_unicode_collation",
101    'desc' => "{List.sort_using_unicode_collation}",
102    'type' => "flag" },
103      { 'name' => "use_hlist_for",
104    'desc' => "{List.use_hlist_for}",
105    'type' => "string" },
106      { 'name' => "removeprefix",
107    'desc' => "{BasClas.removeprefix}",
108    'type' => "regexp" },
109      { 'name' => "removesuffix",
110    'desc' => "{BasClas.removesuffix}",
111    'type' => "regexp" } ];
112
113my $options = { 'name'     => "List",
114        'desc'     => "{List.desc}",
115        'abstract' => "no",
116        'inherits' => "yes",
117        'args'     => $arguments };
118
119
120sub new
121{
122    my ($class) = shift(@_);
123    my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
124    push(@$classifierslist, $class);
125
126    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
127    push(@{$hashArgOptLists->{"OptList"}}, $options);
128
129    my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
130
131    if ($self->{'info_only'}) {
132    # don't worry about any options etc
133    return bless $self, $class;
134    }
135
136    # The metadata elements to use (required)
137    if (!$self->{'metadata'}) {
138    die "Error: No metadata fields specified for List.\n";
139    }
140    my @metadata_groups = split(/\//, $self->{'metadata'});
141    $self->{'metadata_groups'} = \@metadata_groups;
142
143    # The classifier button name (default: the first metadata element specified)
144    if (!$self->{'buttonname'}) {
145    my $first_metadata_group = $metadata_groups[0];
146    my $first_metadata_element = (split(/\;|,/, $first_metadata_group))[0];
147    $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
148    }
149
150    # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
151    foreach my $metadata_group (@metadata_groups) {
152    $self->{$metadata_group . ".bookshelf_type"} = "always";
153    }   
154    my $last_metadata_group = $metadata_groups[$#metadata_groups];
155    # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
156    $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
157       
158    # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
159    foreach my $metadata_group (@metadata_groups) {
160    $self->{$metadata_group . ".list_type"} = "VList";
161    }
162    foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
163    $self->{$metadata_group . ".list_type"} = "HList";
164    }
165
166    # How the items are grouped into partitions (default: no partition)
167    # for each level (metadata group), separated by '/'
168    if (!$self->{"partition_type_within_level"}) {
169    foreach my $metadata_group (@metadata_groups) {
170        $self->{$metadata_group . ".partition_type_within_level"} = "none";
171    }
172    } else {
173    my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'}); 
174       
175    my $first = 1;
176    foreach my $metadata_group (@metadata_groups) {
177        my $partition_type_within_levelelem = shift(@partition_type_within_levellist);
178        if (defined($partition_type_within_levelelem) && defined $valid_partition_types->{$partition_type_within_levelelem}) {
179        $self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem;
180        }
181        else {
182        if ($first) {
183            $self->{$metadata_group . ".partition_type_within_level"} = "none";
184            $first = 0;
185        } else {
186            $self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"};
187        }
188        if (defined($partition_type_within_levelelem)) {
189            # ie invalid entry
190            print STDERR "invalid partition type for level $metadata_group: $partition_type_within_levelelem, defaulting to ". $self->{$metadata_group . ".partition_type_within_level"} ."\n";
191        }
192        }
193    }
194    }
195   
196    # The number of items in each partition
197    if (!$self->{'partition_size_within_level'}) {
198    # Default: 20
199    foreach my $metadata_group (@metadata_groups) {
200        $self->{$metadata_group . ".partition_size_within_level"} = 20;
201    }
202    }
203    else {
204    my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
205
206    # Assign values based on the partition_size_within_level parameter
207    foreach my $metadata_group (@metadata_groups) {
208        my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
209        if (defined($partition_size_within_levelelem)) {
210        $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
211        }
212        else {
213        $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
214        }
215    }
216    }
217
218    # The removeprefix and removesuffix expressions
219    if ($self->{'removeprefix'}) {
220    # If there are more than one expressions, use '' to quote each experession and '/' to separate
221    my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
222
223    foreach my $metadata_group (@metadata_groups) {
224        my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
225        if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
226        # Remove the other ' at the beginning and the end if there is any
227        $removeprefix_expr_within_levelelem =~ s/^'//;
228        $removeprefix_expr_within_levelelem =~ s/'$//;
229        # Remove the extra ^ at the beginning
230        $removeprefix_expr_within_levelelem =~ s/^\^//;
231        $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
232        } else {
233        $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
234        }
235    }
236    }
237    if ($self->{'removesuffix'}) { 
238    my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
239
240    foreach my $metadata_group (@metadata_groups) {
241        my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
242        if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
243        $removesuffix_expr_within_levelelem =~ s/^'//;
244        $removesuffix_expr_within_levelelem =~ s/'$//;
245        # Remove the extra $ at the end
246        $removesuffix_expr_within_levelelem =~ s/\$$//;
247        $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
248        } else {
249        $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
250        }
251    }
252    }
253
254    # The metadata elements to use to sort the leaf nodes (default: Title)
255    my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
256    if ($self->{'sort_leaf_nodes_using'}) {
257    @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
258    }
259    $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
260
261    # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
262    if ($self->{'sort_using_unicode_collation'}) {
263    # To use this you first need to download the allkeys.txt file from
264        # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
265        # Unicode/Collate directory.
266    require Unicode::Collate;
267    $self->{'unicode_collator'} = Unicode::Collate->new();
268    }
269
270    # An empty array for the document OIDs
271    $self->{'OIDs'} = [];
272
273    return bless $self, $class;
274}
275
276
277sub init
278{
279    # Nothing to do...
280}
281
282
283# Called for each document in the collection
284sub classify
285{
286    my $self = shift(@_);
287    my ($doc_obj,$edit_mode) = @_;
288
289    # If "-classify_sections" is set, classify every section of the document
290    if ($self->{'classify_sections'}) {
291    my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
292    while (defined $section) {
293        $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section, $edit_mode);
294        $section = $doc_obj->get_next_section($section);
295    }
296    }
297    # Otherwise just classify the top document section
298    else {
299    $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section(), $edit_mode);
300    }
301}
302
303
304sub classify_section
305{
306    my $self = shift(@_);
307    my ($doc_obj,$section_OID,$section,$edit_mode) = @_;
308
309    my @metadata_groups = @{$self->{'metadata_groups'}};
310
311    # Only classify the section if it has a value for one of the metadata elements in the first group
312    my $classify_section = 0;
313    my $first_metadata_group = $metadata_groups[0];
314    foreach my $first_metadata_group_element (split(/\;|,/, $first_metadata_group)) {
315    my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
316    my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element); 
317   
318    # Remove prefix/suffix if requested
319    my $remove_prefix_expr = $self->{$first_metadata_group_element . ".remove_prefix_expr"};
320    if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {     
321        $first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
322    }
323    my $remove_suffix_expr = $self->{$first_metadata_group_element . ".remove_suffix_expr"};
324    if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
325        $first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
326    }
327
328    if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
329        # This section must be included in the classifier
330        $classify_section = 1;
331        last;
332    }
333    }
334
335    # We're not classifying this section because it doesn't have the required metadata
336    return if (!$classify_section);
337
338    if (($edit_mode eq "delete") || ($edit_mode eq "update")) {
339    $self->oid_array_delete($section_OID,'OIDs');
340    if ($edit_mode eq "delete") {
341        return;
342    }
343    }
344   
345    # Otherwise, include this section in the classifier
346    push(@{$self->{'OIDs'}}, $section_OID);
347
348    # Create a hash for the metadata values of each metadata element we're interested in
349    my %metadata_groups_done = ();
350    foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
351    # Take care not to do a metadata group more than once
352    unless ($metadata_groups_done{$metadata_group}) {
353        if ($edit_mode eq "update") {
354        # if we are updating, we delete all the old values before
355        # adding the new ones, otherwise, the section will end up in
356        # the classifier twice.
357        delete $self->{$metadata_group . ".list"}->{$section_OID};
358        }
359        foreach my $metadata_element (split(/\;|,/, $metadata_group)) {     
360        my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);
361       
362        my $remove_prefix_expr = $self->{$metadata_element . ".remove_prefix_expr"};
363        my $remove_suffix_expr = $self->{$metadata_element . ".remove_suffix_expr"};
364        my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
365        foreach my $metadata_value (@metadata_values) {
366            # Strip leading and trailing whitespace
367            $metadata_value =~ s/^\s*//;
368            $metadata_value =~ s/\s*$//;
369
370            # Remove prefix/suffix if requested
371            if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
372            $metadata_value =~ s/^$remove_prefix_expr//;
373            }
374            if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
375            $metadata_value =~ s/$remove_suffix_expr$//;
376            }
377
378            # Convert the metadata value from a UTF-8 string to a Unicode string
379            # This means that length() and substr() work properly
380            # We need to be careful to convert classifier node title values back to UTF-8, however
381            my $metadata_value_unicode_string = $self->convert_utf8_string_to_unicode_string($metadata_value);
382
383            # Add the metadata value into the list for this combination of metadata group and section
384            push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $metadata_value_unicode_string);
385        }
386        last if (@metadata_values > 0);
387        }
388
389        $metadata_groups_done{$metadata_group} = 1;
390    }
391    }
392}
393
394
395sub get_classify_info
396{
397    my $self = shift(@_);
398
399    # The metadata groups to classify by
400    my @metadata_groups = @{$self->{'metadata_groups'}};
401    my $first_metadata_group = $metadata_groups[0];
402
403    # The OID values of the documents to include in the classifier
404    my @OIDs = @{$self->{'OIDs'}};
405
406    # Create the root node of the classification hierarchy
407    my %classifier_node = ( 'thistype' => "Invisible",
408                'childtype' => $self->{$first_metadata_group . ".list_type"},
409                'Title' => $self->{'buttonname'},
410                'contains' => [],
411                'mdtype' => $first_metadata_group );
412
413    # Recursively create the classification hierarchy, one level for each metadata group
414    $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
415    return \%classifier_node;
416}
417
418
419sub add_level
420{
421    my $self = shift(@_);
422    my @metadata_groups = @{shift(@_)};
423    my @OIDs = @{shift(@_)};
424    my $classifier_node = shift(@_);
425    # print STDERR "\nAdding AZ list for " . $classifier_node->{'Title'} . "\n";
426
427    my $metadata_group = $metadata_groups[0];
428    # print STDERR "Processing metadata group: " . $metadata_group . "\n";
429    # print STDERR "Number of OID values: " . @OIDs . "\n";
430
431    if (!defined($self->{$metadata_group . ".list"})) {
432    print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
433    return;
434    }
435
436    # Create a mapping from metadata value to OID
437    my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
438    my %metadata_value_to_OIDs_hash = ();   
439    foreach my $OID (@OIDs)
440    {
441    if ($OID_to_metadata_values_hash_ref->{$OID})
442    {
443        my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
444        foreach my $metadata_value (@metadata_values)
445        {
446        push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
447        }
448    }
449    }
450    # print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
451
452    # Partition the values (if necessary)
453    my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
454    if ($partition_type_within_level =~ /^per_letter$/i) {
455    # Generate one hlist for each letter
456    my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
457    my %metadata_value_to_OIDs_subhash = ();
458
459    my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
460    foreach my $metadata_value (@sortedmetadata_values) {
461        my $metadata_valuepartition = substr($metadata_value, 0, 1);
462
463        # Is this the start of a new partition?
464        if ($metadata_valuepartition ne $lastpartition) {
465        $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
466        %metadata_value_to_OIDs_subhash = ();
467        $lastpartition = $metadata_valuepartition;
468        }
469
470        $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};       
471    }
472
473    # Don't forget to add the last partition
474    $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
475
476    # The partitions are stored in an HList
477    $classifier_node->{'childtype'} = "HList";
478    }
479    elsif ($partition_type_within_level =~ /^per_letter_fixed_size$/i) {
480    # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size
481    # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible
482    # If a letter has more items than specified, split into several hlists.
483    # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
484    my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};       
485    my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
486    my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};
487   
488    # Separate values by their first letter, each form a bucket, like the per_letter partition type
489    my $last_partition = substr($sortedmetadata_values[0], 0, 1);
490    my %partition_buckets = ();
491    my @metadata_values_in_bucket = ();
492    my $num_items_in_bucket = 0;
493    foreach my $metadata_value (@sortedmetadata_values) {       
494        my $metadata_valuepartition = substr($metadata_value, 0, 1);
495        if ($metadata_valuepartition ne $last_partition) {
496        my @temp_array = @metadata_values_in_bucket;
497        # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
498        my %partition_info = ();       
499        $partition_info{'metadata_values'} = \@temp_array;
500        $partition_info{'size'} = $num_items_in_bucket;     
501        $partition_buckets{$last_partition} = \%partition_info;     
502       
503        @metadata_values_in_bucket = ($metadata_value);
504        $num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
505        $last_partition = $metadata_valuepartition;
506        } else {
507        $num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
508        push (@metadata_values_in_bucket, $metadata_value);
509        }
510    }
511    # Last one
512    my %partition_info = ();
513    $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
514    $partition_info{'size'} = $num_items_in_bucket;
515    $partition_buckets{$last_partition} = \%partition_info;
516       
517    my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));   
518    for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) {
519        my $partition = $partition_keys[$i];
520        my $items_in_partition = $partition_buckets{$partition}->{'size'};             
521        # Merge small buckets together, but keep the numeric bucket apart
522        if ($items_in_partition < $partition_size_within_level) {
523        my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'};
524        if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level
525            && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)
526             || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) {
527            foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) {
528            push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge);
529            }           
530            $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition;
531            delete $partition_buckets{$partition};                     
532        }
533        }
534    }
535    @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
536   
537    # Add partitions, and divide big bucket into several
538    my $last_partition_end = "";
539    my $partition_start = "";
540    foreach my $partition (@partition_keys) {
541        my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}});
542        my $items_in_partition = $partition_buckets{$partition}->{'size'};
543        $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"});
544       
545        if ($items_in_partition > $partition_size_within_level) {   
546        my $items_done = 0;
547        my %metadata_values_to_OIDs_subhashes = ();
548        for (my $i = 0; $i < scalar(@metadata_values); $i++) {
549            my $metadata_value = $metadata_values[$i];
550            # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
551            my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;
552
553            my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"});
554            my $partitionname = $partition_start;
555            if ($partitionend ne $partition_start) {
556            $partitionname = $partitionname . "-" . $partitionend;
557            }
558           
559            # Start a new partition
560            if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) {
561            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
562            $last_partition_end = $partitionend;           
563            $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
564            $items_done = 0;
565            %metadata_values_to_OIDs_subhashes = ();
566            }
567           
568            # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
569                    if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) {
570            my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
571            # Get the number of partitions needed for this value
572            my $num_splits = int($items_for_this_md_value / $partition_size_within_level);
573            $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits);
574
575            my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}};
576            for (my $i = 0; $i < $num_splits; $i++) {
577                my %OIDs_subhashes_for_this_value = ();
578                my @OIDs_for_this_partition = ();
579                for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) {
580                push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
581                }
582               
583                # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
584                if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) {
585                $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
586                $items_done += scalar(@OIDs_for_this_partition);
587                next;
588                }
589               
590                # Add an HList for this bucket
591                $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
592                $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
593                $last_partition_end = $partitionname_for_this_value;
594            }
595            next;
596                    }
597                       
598            $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
599            $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;         
600           
601            # The last partition
602            if($i == scalar(@metadata_values) - 1) {
603            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
604            }
605        }       
606        }
607        else {
608        # The easier case, just add a partition
609        my %metadata_values_to_OIDs_subhashes = ();
610        for (my $i = 0; $i < scalar(@metadata_values); $i++) {
611            my $metadata_value = $metadata_values[$i];
612            $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};           
613        }
614        my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
615        my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"});
616        my $partitionname = $partition_start;
617        if ($partitionend ne $partition_start) {
618            $partitionname = $partitionname . "-" . $partitionend;
619        }
620        $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);           
621        $last_partition_end = $partitionend;       
622        }
623    }   
624    }
625    else {
626    # Generate hlists of a certain size
627    my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
628    if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
629        my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
630        my $itemsdone = 0;
631        my %metadata_value_to_OIDs_subhash = ();
632        my $lastpartitionend = "";
633        my $partitionstart;
634        foreach my $metadata_value (@sortedmetadata_values) {
635        $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
636        $itemsdone++;
637        my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
638
639        # Is this the start of a new partition?
640        if ($itemsinpartition == 1) {
641            $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
642        }
643
644        # Is this the end of the partition?
645        if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) {
646            my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
647            my $partitionname = $partitionstart;
648            if ($partitionend ne $partitionstart) {
649            $partitionname = $partitionname . "-" . $partitionend;
650            }
651
652            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
653            %metadata_value_to_OIDs_subhash = ();
654            $lastpartitionend = $partitionend;
655        }
656        }
657
658        # The partitions are stored in an HList
659        $classifier_node->{'childtype'} = "HList";
660    }
661
662    # Otherwise just add all the values to a VList
663    else {
664        $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
665    }
666    }
667}
668
669
670sub convert_utf8_string_to_unicode_string
671{
672    my $self = shift(@_);
673    my $utf8_string = shift(@_);
674
675    my $unicode_string = "";
676    foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
677    $unicode_string .= chr($unicode_value);
678    }
679    return $unicode_string;
680}
681
682
683sub convert_unicode_string_to_utf8_string
684{
685    my $self = shift(@_);
686    my $unicode_string = shift(@_);
687
688    my @unicode_array;
689    for (my $i = 0; $i < length($unicode_string); $i++) {
690    push(@unicode_array, ord(substr($unicode_string, $i, 1)));
691    }
692    return &unicode::unicode2utf8(\@unicode_array);
693}
694
695
696sub generate_partition_start
697{
698    my $self = shift(@_);
699    my $metadata_value = shift(@_);
700    my $lastpartitionend = shift(@_);
701    my $partition_name_length = shift(@_);
702
703    if ($partition_name_length) {
704    return substr($metadata_value, 0, $partition_name_length);
705    }
706
707    my $partitionstart = substr($metadata_value, 0, 1);
708    if ($partitionstart le $lastpartitionend) {
709    $partitionstart = substr($metadata_value, 0, 2);
710    # Give up after three characters
711    if ($partitionstart le $lastpartitionend) {
712        $partitionstart = substr($metadata_value, 0, 3);
713    }
714    }
715
716    return $partitionstart;
717}
718
719
720sub generate_partition_end
721{
722    my $self = shift(@_);
723    my $metadata_value = shift(@_);
724    my $partitionstart = shift(@_);
725    my $partition_name_length = shift(@_);
726
727    if ($partition_name_length) {
728    return substr($metadata_value, 0, $partition_name_length);
729    }
730
731    my $partitionend = substr($metadata_value, 0, length($partitionstart));
732    if ($partitionend gt $partitionstart) {
733    $partitionend = substr($metadata_value, 0, 1);
734    if ($partitionend le $partitionstart) {
735        $partitionend = substr($metadata_value, 0, 2);
736        # Give up after three characters
737        if ($partitionend le $partitionstart) {
738        $partitionend = substr($metadata_value, 0, 3);
739        }
740    }
741    }
742
743    return $partitionend;
744}
745
746
747sub add_hlist_partition
748{
749    my $self = shift(@_);
750    my @metadata_groups = @{shift(@_)};
751    my $classifier_node = shift(@_);
752    my $partitionname = shift(@_);
753    my $metadata_value_to_OIDs_hash_ref = shift(@_);
754
755    # Create an hlist partition
756    my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
757                  'childtype' => "VList",
758                  'contains' => [] );
759
760    # Add the children to the hlist partition
761    $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
762    push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
763}
764
765
766sub add_vlist
767{
768    my $self = shift(@_);
769    my @metadata_groups = @{shift(@_)};
770    my $classifier_node = shift(@_);
771    my $metadata_value_to_OIDs_hash_ref = shift(@_);
772
773    my $metadata_group = shift(@metadata_groups);
774    $classifier_node->{'mdtype'} = $metadata_group;
775
776    # Create an entry in the vlist for each value
777    foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
778    {
779    my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
780
781    # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
782    if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
783        my $OID = $OIDs[0];
784
785        # Find the offset of this metadata value
786        my $offset = 0;
787        my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
788        my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
789        for (my $i = 0; $i < scalar(@metadata_values); $i++) {
790        if ($metadata_value eq $metadata_values[$i]) {
791            $offset = $i;
792            last;
793        }
794        }
795        push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
796    }
797    # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
798    elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
799       $self->add_sorted_leaf_items(\@OIDs, $classifier_node);
800    }
801    # Otherwise create a sublist (bookshelf) for the metadata value
802    else {         
803        my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
804                      'childtype' => "VList",
805                      'contains' => [] );
806
807        # If there are metadata elements remaining, recursively apply the process
808        if (@metadata_groups > 0) {
809        my $next_metadata_group = $metadata_groups[0];
810        $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
811        $self->add_level(\@metadata_groups, \@OIDs, \%child_classifier_node);
812        }
813        # Otherwise just add the documents as children of this list
814        else {
815        $self->add_sorted_leaf_items(\@OIDs, \%child_classifier_node);
816        }
817
818        # Add the sublist to the list
819        push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
820    }
821    }
822}
823
824sub add_sorted_leaf_items
825{
826    my $self = shift(@_);
827    my @OIDs = @{shift(@_)};
828    my $classifier_node = shift(@_);   
829   
830    # Sort leaf nodes and add to list
831    my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
832    foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) {
833    my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
834    # Force a stable sort (Perl 5.6's sort isn't stable)
835    # !! The [0] bits aren't ideal (multiple metadata values) !!
836    @OIDs = @OIDs[ sort { $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0] || $a <=> $b; } 0..$#OIDs ];
837    }
838
839    foreach my $OID (@OIDs) {
840    push(@{$classifier_node->{'contains'}}, { 'OID' => $OID });
841    }
842}
843
844
845sub sort_metadata_values_array
846{
847    my $self = shift(@_);
848    my @metadata_values = @_;
849
850    if ($self->{'unicode_collator'}) {
851    return $self->{'unicode_collator'}->sort(@metadata_values);
852    }
853    else {
854    return sort(@metadata_values);
855    }
856}
857
858
8591;
Note: See TracBrowser for help on using the browser.