root/gsdl/trunk/perllib/classify/List.pm @ 20904

Revision 20904, 34.3 KB (checked in by kjdon, 11 years ago)

approximate size partition, don't do it if too few items

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# List.pm -- A general and flexible list classifier with most of
4#                   the abilities of AZCompactList, and better Unicode,
5#                   metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
29# TO DO: - Remove punctuation from metadata values before sorting.
30#        - Add an AZCompactList-style hlist option?
31#
32###########################################################################
33
34package List;
35
36
37use BaseClassifier;
38
39use strict;
40
41
42sub BEGIN {
43    @List::ISA = ('BaseClassifier');
44}
45
46my $partition_type_list =
47    [ { 'name' => "per_letter",
48    'desc' => "{List.level_partition.per_letter}" },
49      { 'name' => "approximate_size",
50    'desc' => "{List.level_partition.approximate_size}"},
51      { 'name' => "constant_size",
52    'desc' => "{List.level_partition.constant_size}" },     
53      { 'name' => "none",
54    'desc' => "{List.level_partition.none}" } ];
55
56# following used to check types later on
57my $valid_partition_types = { 'per_letter' => 1,
58                  'constant_size' => 1,
59                  'per_letter_fixed_size' => 1,
60                  'approximate_size' => 1,
61                  'none' => 1};
62
63my $bookshelf_type_list =
64    [ { 'name' => "always",
65    'desc' => "{List.bookshelf_type.always}" },
66      { 'name' => "duplicate_only",
67    'desc' => "{List.bookshelf_type.duplicate_only}" },
68      { 'name' => "never",
69    'desc' => "{List.bookshelf_type.never}" } ]; 
70
71my $arguments =
72    [ { 'name' => "metadata",
73    'desc' => "{List.metadata}",
74    'type' => "metadata",
75    'reqd' => "yes" },
76
77      # The interesting options
78      { 'name' => "bookshelf_type",
79    'desc' => "{List.bookshelf_type}",
80    'type' => "enum",
81    'list' => $bookshelf_type_list,
82    'deft' => "never" },
83      { 'name' => "classify_sections",
84    'desc' => "{List.classify_sections}",
85    'type' => "flag" },
86      { 'name' => "partition_type_within_level",
87    'desc' => "{List.partition_type_within_level}",
88    'type' => "enumstring",  # Must be enumstring because multiple values can be specified (separated by '/')
89    'list' => $partition_type_list,
90    'deft' => "per_letter" },
91      { 'name' => "partition_size_within_level",
92    'desc' => "{List.partition_size_within_level}",
93    'type' => "string" },  # Must be string because multiple values can be specified (separated by '/')
94      { 'name' => "partition_name_length",
95    'desc' => "{List.partition_name_length}",
96    'type' => "string" },
97      { 'name' => "sort_leaf_nodes_using",
98    'desc' => "{List.sort_leaf_nodes_using}",
99    'type' => "metadata",
100    'deft' => "Title" },
101      { 'name' => "sort_using_unicode_collation",
102    'desc' => "{List.sort_using_unicode_collation}",
103    'type' => "flag" },
104      { 'name' => "use_hlist_for",
105    'desc' => "{List.use_hlist_for}",
106    'type' => "string" },
107      { 'name' => "removeprefix",
108    'desc' => "{BasClas.removeprefix}",
109    'type' => "regexp" },
110      { 'name' => "removesuffix",
111    'desc' => "{BasClas.removesuffix}",
112    'type' => "regexp" } ];
113
114my $options = { 'name'     => "List",
115        'desc'     => "{List.desc}",
116        'abstract' => "no",
117        'inherits' => "yes",
118        'args'     => $arguments };
119
120
121sub new
122{
123    my ($class) = shift(@_);
124    my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
125    push(@$classifierslist, $class);
126
127    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
128    push(@{$hashArgOptLists->{"OptList"}}, $options);
129
130    my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
131
132    if ($self->{'info_only'}) {
133    # don't worry about any options etc
134    return bless $self, $class;
135    }
136
137    # The metadata elements to use (required)
138    if (!$self->{'metadata'}) {
139    die "Error: No metadata fields specified for List.\n";
140    }
141    my @metadata_groups = split(/\//, $self->{'metadata'});
142    $self->{'metadata_groups'} = \@metadata_groups;
143
144    # The classifier button name (default: the first metadata element specified)
145    if (!$self->{'buttonname'}) {
146    my $first_metadata_group = $metadata_groups[0];
147    my $first_metadata_element = (split(/\;|,/, $first_metadata_group))[0];
148    $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
149    }
150
151    # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
152    foreach my $metadata_group (@metadata_groups) {
153    $self->{$metadata_group . ".bookshelf_type"} = "always";
154    }   
155    my $last_metadata_group = $metadata_groups[$#metadata_groups];
156    # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
157    $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
158       
159    # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
160    foreach my $metadata_group (@metadata_groups) {
161    $self->{$metadata_group . ".list_type"} = "VList";
162    }
163    foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
164    $self->{$metadata_group . ".list_type"} = "HList";
165    }
166
167    # How the items are grouped into partitions (default: no partition)
168    # for each level (metadata group), separated by '/'
169    if (!$self->{"partition_type_within_level"}) {
170    foreach my $metadata_group (@metadata_groups) {
171        $self->{$metadata_group . ".partition_type_within_level"} = "none";
172    }
173    } else {
174    my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'}); 
175       
176    my $first = 1;
177    foreach my $metadata_group (@metadata_groups) {
178        my $partition_type_within_levelelem = shift(@partition_type_within_levellist);
179        if (defined($partition_type_within_levelelem) && $partition_type_within_levelelem eq "per_letter_fixed_size") {
180        print STDERR "per letter fixed size, changing to approximate size\n";
181        $partition_type_within_levelelem = "approximate_size";
182        }
183        if (defined($partition_type_within_levelelem) && defined $valid_partition_types->{$partition_type_within_levelelem}) {
184        $self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem;
185        }
186        else {
187        if ($first) {
188            $self->{$metadata_group . ".partition_type_within_level"} = "none";
189            $first = 0;
190        } else {
191            $self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"};
192        }
193        if (defined($partition_type_within_levelelem)) {
194            # ie invalid entry
195            print STDERR "invalid partition type for level $metadata_group: $partition_type_within_levelelem, defaulting to ". $self->{$metadata_group . ".partition_type_within_level"} ."\n";
196        }
197        }
198    }
199    }
200   
201    # The number of items in each partition
202    if (!$self->{'partition_size_within_level'}) {
203    # Default: 20
204    foreach my $metadata_group (@metadata_groups) {
205        $self->{$metadata_group . ".partition_size_within_level"} = 20;
206    }
207    }
208    else {
209    my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
210
211    # Assign values based on the partition_size_within_level parameter
212    foreach my $metadata_group (@metadata_groups) {
213        my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
214        if (defined($partition_size_within_levelelem)) {
215        $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
216        }
217        else {
218        $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
219        }
220    }
221    }
222
223    # The removeprefix and removesuffix expressions
224    if ($self->{'removeprefix'}) {
225    # If there are more than one expressions, use '' to quote each experession and '/' to separate
226    my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
227
228    foreach my $metadata_group (@metadata_groups) {
229        my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
230        if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
231        # Remove the other ' at the beginning and the end if there is any
232        $removeprefix_expr_within_levelelem =~ s/^'//;
233        $removeprefix_expr_within_levelelem =~ s/'$//;
234        # Remove the extra ^ at the beginning
235        $removeprefix_expr_within_levelelem =~ s/^\^//;
236        $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
237        } else {
238        $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
239        }
240    }
241    }
242    if ($self->{'removesuffix'}) { 
243    my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
244
245    foreach my $metadata_group (@metadata_groups) {
246        my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
247        if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
248        $removesuffix_expr_within_levelelem =~ s/^'//;
249        $removesuffix_expr_within_levelelem =~ s/'$//;
250        # Remove the extra $ at the end
251        $removesuffix_expr_within_levelelem =~ s/\$$//;
252        $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
253        } else {
254        $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
255        }
256    }
257    }
258
259    # The metadata elements to use to sort the leaf nodes (default: Title)
260    my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
261    if ($self->{'sort_leaf_nodes_using'}) {
262    @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
263    }
264    $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
265
266    # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
267    if ($self->{'sort_using_unicode_collation'}) {
268    # To use this you first need to download the allkeys.txt file from
269        # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
270        # Unicode/Collate directory.
271    require Unicode::Collate;
272    $self->{'unicode_collator'} = Unicode::Collate->new();
273    }
274
275    # An empty array for the document OIDs
276    $self->{'OIDs'} = [];
277
278    return bless $self, $class;
279}
280
281
282sub init
283{
284    # Nothing to do...
285}
286
287
288# Called for each document in the collection
289sub classify
290{
291    my $self = shift(@_);
292    my ($doc_obj,$edit_mode) = @_;
293
294    # If "-classify_sections" is set, classify every section of the document
295    if ($self->{'classify_sections'}) {
296    my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
297    while (defined $section) {
298        $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section, $edit_mode);
299        $section = $doc_obj->get_next_section($section);
300    }
301    }
302    # Otherwise just classify the top document section
303    else {
304    $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section(), $edit_mode);
305    }
306}
307
308
309sub classify_section
310{
311    my $self = shift(@_);
312    my ($doc_obj,$section_OID,$section,$edit_mode) = @_;
313
314    my @metadata_groups = @{$self->{'metadata_groups'}};
315
316    # Only classify the section if it has a value for one of the metadata elements in the first group
317    my $classify_section = 0;
318    my $first_metadata_group = $metadata_groups[0];
319    foreach my $first_metadata_group_element (split(/\;|,/, $first_metadata_group)) {
320    my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
321    my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element); 
322   
323    # Remove prefix/suffix if requested
324    my $remove_prefix_expr = $self->{$first_metadata_group_element . ".remove_prefix_expr"};
325    if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {     
326        $first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
327    }
328    my $remove_suffix_expr = $self->{$first_metadata_group_element . ".remove_suffix_expr"};
329    if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
330        $first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
331    }
332
333    if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
334        # This section must be included in the classifier
335        $classify_section = 1;
336        last;
337    }
338    }
339
340    # We're not classifying this section because it doesn't have the required metadata
341    return if (!$classify_section);
342
343    if (($edit_mode eq "delete") || ($edit_mode eq "update")) {
344    $self->oid_array_delete($section_OID,'OIDs');
345    if ($edit_mode eq "delete") {
346        return;
347    }
348    }
349   
350    # Otherwise, include this section in the classifier
351    push(@{$self->{'OIDs'}}, $section_OID);
352
353    # Create a hash for the metadata values of each metadata element we're interested in
354    my %metadata_groups_done = ();
355    foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
356    # Take care not to do a metadata group more than once
357    unless ($metadata_groups_done{$metadata_group}) {
358        if ($edit_mode eq "update") {
359        # if we are updating, we delete all the old values before
360        # adding the new ones, otherwise, the section will end up in
361        # the classifier twice.
362        delete $self->{$metadata_group . ".list"}->{$section_OID};
363        }
364        foreach my $metadata_element (split(/\;|,/, $metadata_group)) {     
365        my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);
366       
367        my $remove_prefix_expr = $self->{$metadata_element . ".remove_prefix_expr"};
368        my $remove_suffix_expr = $self->{$metadata_element . ".remove_suffix_expr"};
369        my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
370        foreach my $metadata_value (@metadata_values) {
371            # Strip leading and trailing whitespace
372            $metadata_value =~ s/^\s*//;
373            $metadata_value =~ s/\s*$//;
374
375            # Remove prefix/suffix if requested
376            if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
377            $metadata_value =~ s/^$remove_prefix_expr//;
378            }
379            if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
380            $metadata_value =~ s/$remove_suffix_expr$//;
381            }
382
383            # uppercase the metadata - makes the AZList nicer
384            $metadata_value = uc($metadata_value);
385            # Convert the metadata value from a UTF-8 string to a Unicode string
386            # This means that length() and substr() work properly
387            # We need to be careful to convert classifier node title values back to UTF-8, however
388            my $metadata_value_unicode_string = $self->convert_utf8_string_to_unicode_string($metadata_value);
389
390            # Add the metadata value into the list for this combination of metadata group and section
391            push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $metadata_value_unicode_string);
392        }
393        last if (@metadata_values > 0);
394        }
395
396        $metadata_groups_done{$metadata_group} = 1;
397    }
398    }
399}
400
401
402sub get_classify_info
403{
404    my $self = shift(@_);
405
406    # The metadata groups to classify by
407    my @metadata_groups = @{$self->{'metadata_groups'}};
408    my $first_metadata_group = $metadata_groups[0];
409
410    # The OID values of the documents to include in the classifier
411    my @OIDs = @{$self->{'OIDs'}};
412
413    # Create the root node of the classification hierarchy
414    my %classifier_node = ( 'thistype' => "Invisible",
415                'childtype' => $self->{$first_metadata_group . ".list_type"},
416                'Title' => $self->{'buttonname'},
417                'contains' => [],
418                'mdtype' => $first_metadata_group );
419
420    # Recursively create the classification hierarchy, one level for each metadata group
421    $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
422    return \%classifier_node;
423}
424
425
426sub add_level
427{
428    my $self = shift(@_);
429    my @metadata_groups = @{shift(@_)};
430    my @OIDs = @{shift(@_)};
431    my $classifier_node = shift(@_);
432    # print STDERR "\nAdding AZ list for " . $classifier_node->{'Title'} . "\n";
433
434    my $metadata_group = $metadata_groups[0];
435    # print STDERR "Processing metadata group: " . $metadata_group . "\n";
436    # print STDERR "Number of OID values: " . @OIDs . "\n";
437
438    if (!defined($self->{$metadata_group . ".list"})) {
439    print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
440    return;
441    }
442
443    # Create a mapping from metadata value to OID
444    my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
445    my %metadata_value_to_OIDs_hash = ();   
446    foreach my $OID (@OIDs)
447    {
448    if ($OID_to_metadata_values_hash_ref->{$OID})
449    {
450        my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
451        foreach my $metadata_value (@metadata_values)
452        {
453        push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
454        }
455    }
456    }
457    # print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
458
459    # Partition the values (if necessary)
460    my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
461    my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
462    if ($partition_type_within_level =~ /^per_letter$/i) {
463    # Generate one hlist for each letter
464    my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
465    my %metadata_value_to_OIDs_subhash = ();
466
467    my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
468    foreach my $metadata_value (@sortedmetadata_values) {
469        my $metadata_valuepartition = substr($metadata_value, 0, 1);
470
471        # Is this the start of a new partition?
472        if ($metadata_valuepartition ne $lastpartition) {
473        $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
474        %metadata_value_to_OIDs_subhash = ();
475        $lastpartition = $metadata_valuepartition;
476        }
477
478        $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};       
479    }
480
481    # Don't forget to add the last partition
482    $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
483
484    # The partitions are stored in an HList
485    $classifier_node->{'childtype'} = "HList";
486    }
487    elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
488    # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size
489    # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible
490    # If a letter has more items than specified, split into several hlists.
491    # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
492    my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};       
493    my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
494    my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};
495   
496    # Separate values by their first letter, each form a bucket, like the per_letter partition type
497    my $last_partition = substr($sortedmetadata_values[0], 0, 1);
498    my %partition_buckets = ();
499    my @metadata_values_in_bucket = ();
500    my $num_items_in_bucket = 0;
501    foreach my $metadata_value (@sortedmetadata_values) {       
502        my $metadata_valuepartition = substr($metadata_value, 0, 1);
503        if ($metadata_valuepartition ne $last_partition) {
504        my @temp_array = @metadata_values_in_bucket;
505        # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
506        my %partition_info = ();       
507        $partition_info{'metadata_values'} = \@temp_array;
508        $partition_info{'size'} = $num_items_in_bucket;     
509        $partition_buckets{$last_partition} = \%partition_info;     
510       
511        @metadata_values_in_bucket = ($metadata_value);
512        $num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
513        $last_partition = $metadata_valuepartition;
514        } else {
515        $num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
516        push (@metadata_values_in_bucket, $metadata_value);
517        }
518    }
519    # Last one
520    my %partition_info = ();
521    $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
522    $partition_info{'size'} = $num_items_in_bucket;
523    $partition_buckets{$last_partition} = \%partition_info;
524       
525    my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));   
526    for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) {
527        my $partition = $partition_keys[$i];
528        my $items_in_partition = $partition_buckets{$partition}->{'size'};             
529        # Merge small buckets together, but keep the numeric bucket apart
530        if ($items_in_partition < $partition_size_within_level) {
531        my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'};
532        if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level
533            && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)
534             || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) {
535            foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) {
536            push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge);
537            }           
538            $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition;
539            delete $partition_buckets{$partition};                     
540        }
541        }
542    }
543    @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
544   
545    # Add partitions, and divide big bucket into several
546    my $last_partition_end = "";
547    my $partition_start = "";
548    foreach my $partition (@partition_keys) {
549        my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}});
550        my $items_in_partition = $partition_buckets{$partition}->{'size'};
551        $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"});
552       
553        if ($items_in_partition > $partition_size_within_level) {   
554        my $items_done = 0;
555        my %metadata_values_to_OIDs_subhashes = ();
556        for (my $i = 0; $i < scalar(@metadata_values); $i++) {
557            my $metadata_value = $metadata_values[$i];
558            # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
559            my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;
560
561            my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"});
562            my $partitionname = $partition_start;
563            if ($partitionend ne $partition_start) {
564            $partitionname = $partitionname . "-" . $partitionend;
565            }
566           
567            # Start a new partition
568            if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) {
569            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
570            $last_partition_end = $partitionend;           
571            $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
572            $items_done = 0;
573            %metadata_values_to_OIDs_subhashes = ();
574            }
575           
576            # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
577                    if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) {
578            my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
579            # Get the number of partitions needed for this value
580            my $num_splits = int($items_for_this_md_value / $partition_size_within_level);
581            $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits);
582
583            my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}};
584            for (my $i = 0; $i < $num_splits; $i++) {
585                my %OIDs_subhashes_for_this_value = ();
586                my @OIDs_for_this_partition = ();
587                for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) {
588                push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
589                }
590               
591                # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
592                if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) {
593                $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
594                $items_done += scalar(@OIDs_for_this_partition);
595                next;
596                }
597               
598                # Add an HList for this bucket
599                $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
600                $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
601                $last_partition_end = $partitionname_for_this_value;
602            }
603            next;
604                    }
605                       
606            $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
607            $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;         
608           
609            # The last partition
610            if($i == scalar(@metadata_values) - 1) {
611            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
612            }
613        }       
614        }
615        else {
616        # The easier case, just add a partition
617        my %metadata_values_to_OIDs_subhashes = ();
618        for (my $i = 0; $i < scalar(@metadata_values); $i++) {
619            my $metadata_value = $metadata_values[$i];
620            $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};           
621        }
622        my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
623        my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"});
624        my $partitionname = $partition_start;
625        if ($partitionend ne $partition_start) {
626            $partitionname = $partitionname . "-" . $partitionend;
627        }
628        $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);           
629        $last_partition_end = $partitionend;       
630        }
631    }   
632   
633    # The partitions are stored in an HList
634    $classifier_node->{'childtype'} = "HList";
635
636    } # end approximate_size
637    else {
638    # Generate hlists of a certain size
639    if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
640        my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
641        my $itemsdone = 0;
642        my %metadata_value_to_OIDs_subhash = ();
643        my $lastpartitionend = "";
644        my $partitionstart;
645        foreach my $metadata_value (@sortedmetadata_values) {
646        $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
647        $itemsdone++;
648        my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
649
650        # Is this the start of a new partition?
651        if ($itemsinpartition == 1) {
652            $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
653        }
654
655        # Is this the end of the partition?
656        if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) {
657            my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
658            my $partitionname = $partitionstart;
659            if ($partitionend ne $partitionstart) {
660            $partitionname = $partitionname . "-" . $partitionend;
661            }
662
663            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
664            %metadata_value_to_OIDs_subhash = ();
665            $lastpartitionend = $partitionend;
666        }
667        }
668
669        # The partitions are stored in an HList
670        $classifier_node->{'childtype'} = "HList";
671    }
672
673    # Otherwise just add all the values to a VList
674    else {
675        $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
676    }
677    }
678}
679
680
681sub convert_utf8_string_to_unicode_string
682{
683    my $self = shift(@_);
684    my $utf8_string = shift(@_);
685
686    my $unicode_string = "";
687    foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
688    $unicode_string .= chr($unicode_value);
689    }
690    return $unicode_string;
691}
692
693
694sub convert_unicode_string_to_utf8_string
695{
696    my $self = shift(@_);
697    my $unicode_string = shift(@_);
698
699    my @unicode_array;
700    for (my $i = 0; $i < length($unicode_string); $i++) {
701    push(@unicode_array, ord(substr($unicode_string, $i, 1)));
702    }
703    return &unicode::unicode2utf8(\@unicode_array);
704}
705
706
707sub generate_partition_start
708{
709    my $self = shift(@_);
710    my $metadata_value = shift(@_);
711    my $lastpartitionend = shift(@_);
712    my $partition_name_length = shift(@_);
713
714    if ($partition_name_length) {
715    return substr($metadata_value, 0, $partition_name_length);
716    }
717
718    my $partitionstart = substr($metadata_value, 0, 1);
719    if ($partitionstart le $lastpartitionend) {
720    $partitionstart = substr($metadata_value, 0, 2);
721    # Give up after three characters
722    if ($partitionstart le $lastpartitionend) {
723        $partitionstart = substr($metadata_value, 0, 3);
724    }
725    }
726
727    return $partitionstart;
728}
729
730
731sub generate_partition_end
732{
733    my $self = shift(@_);
734    my $metadata_value = shift(@_);
735    my $partitionstart = shift(@_);
736    my $partition_name_length = shift(@_);
737
738    if ($partition_name_length) {
739    return substr($metadata_value, 0, $partition_name_length);
740    }
741
742    my $partitionend = substr($metadata_value, 0, length($partitionstart));
743    if ($partitionend gt $partitionstart) {
744    $partitionend = substr($metadata_value, 0, 1);
745    if ($partitionend le $partitionstart) {
746        $partitionend = substr($metadata_value, 0, 2);
747        # Give up after three characters
748        if ($partitionend le $partitionstart) {
749        $partitionend = substr($metadata_value, 0, 3);
750        }
751    }
752    }
753
754    return $partitionend;
755}
756
757
758sub add_hlist_partition
759{
760    my $self = shift(@_);
761    my @metadata_groups = @{shift(@_)};
762    my $classifier_node = shift(@_);
763    my $partitionname = shift(@_);
764    my $metadata_value_to_OIDs_hash_ref = shift(@_);
765
766    # Create an hlist partition
767    my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
768                  'childtype' => "VList",
769                  'contains' => [] );
770
771    # Add the children to the hlist partition
772    $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
773    push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
774}
775
776
777sub add_vlist
778{
779    my $self = shift(@_);
780    my @metadata_groups = @{shift(@_)};
781    my $classifier_node = shift(@_);
782    my $metadata_value_to_OIDs_hash_ref = shift(@_);
783
784    my $metadata_group = shift(@metadata_groups);
785    $classifier_node->{'mdtype'} = $metadata_group;
786
787    # Create an entry in the vlist for each value
788    foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
789    {
790    my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
791
792    # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
793    if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
794        my $OID = $OIDs[0];
795
796        # Find the offset of this metadata value
797        my $offset = 0;
798        my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
799        my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
800        for (my $i = 0; $i < scalar(@metadata_values); $i++) {
801        if ($metadata_value eq $metadata_values[$i]) {
802            $offset = $i;
803            last;
804        }
805        }
806        push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
807    }
808    # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
809    elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
810       $self->add_sorted_leaf_items(\@OIDs, $classifier_node);
811    }
812    # Otherwise create a sublist (bookshelf) for the metadata value
813    else {         
814        my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
815                      'childtype' => "VList",
816                      'contains' => [] );
817
818        # If there are metadata elements remaining, recursively apply the process
819        if (@metadata_groups > 0) {
820        my $next_metadata_group = $metadata_groups[0];
821        $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
822        $self->add_level(\@metadata_groups, \@OIDs, \%child_classifier_node);
823        }
824        # Otherwise just add the documents as children of this list
825        else {
826        $self->add_sorted_leaf_items(\@OIDs, \%child_classifier_node);
827        }
828
829        # Add the sublist to the list
830        push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
831    }
832    }
833}
834
835sub add_sorted_leaf_items
836{
837    my $self = shift(@_);
838    my @OIDs = @{shift(@_)};
839    my $classifier_node = shift(@_);   
840   
841    # Sort leaf nodes and add to list
842    my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
843    foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) {
844    my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
845    # Force a stable sort (Perl 5.6's sort isn't stable)
846    # !! The [0] bits aren't ideal (multiple metadata values) !!
847    @OIDs = @OIDs[ sort { $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0] || $a <=> $b; } 0..$#OIDs ];
848    }
849
850    foreach my $OID (@OIDs) {
851    push(@{$classifier_node->{'contains'}}, { 'OID' => $OID });
852    }
853}
854
855
856sub sort_metadata_values_array
857{
858    my $self = shift(@_);
859    my @metadata_values = @_;
860
861    if ($self->{'unicode_collator'}) {
862    return $self->{'unicode_collator'}->sort(@metadata_values);
863    }
864    else {
865    return sort(@metadata_values);
866    }
867}
868
869
8701;
Note: See TracBrowser for help on using the browser.