root/main/trunk/greenstone2/perllib/classify/List.pm @ 31567

Revision 31567, 40.2 KB (checked in by kjdon, 3 years ago)

added in alpha_numeric_sort - copied from AZlist I think, so if metadata values are numeric, then they will be sorted numerically, ie 10 comes after 9.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# List.pm -- A general and flexible list classifier with most of
4#                   the abilities of AZCompactList, and better Unicode,
5#                   metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
29# TO DO: - Remove punctuation from metadata values before sorting.
30#        - Add an AZCompactList-style hlist option?
31#
32###########################################################################
33
34package List;
35
36
37use BaseClassifier;
38
39use strict;
40
41
42sub BEGIN {
43    @List::ISA = ('BaseClassifier');
44}
45
46my $meta_select_type_list =
47    [
48      { 'name' => "firstvalue",
49    'desc' => "{List.metadata_selection.firstvalue}"},
50      { 'name' => "firstvalidmetadata",
51    'desc' => "{List.metadata_selection.firstvalidmetadata}"},
52      { 'name' => "allvalues",
53    'desc' => "{List.metadata_selection.allvalues}"} ];
54my $valid_meta_select_types = { 'firstvalue' => 1,
55                'firstvalidmetadata' => 1,
56                'allvalues' => 1 };
57my $partition_type_list =
58    [ { 'name' => "per_letter",
59    'desc' => "{List.level_partition.per_letter}" },
60      { 'name' => "approximate_size",
61    'desc' => "{List.level_partition.approximate_size}"},
62      { 'name' => "constant_size",
63    'desc' => "{List.level_partition.constant_size}" },     
64      { 'name' => "none",
65    'desc' => "{List.level_partition.none}" } ];
66
67# following used to check types later on
68my $valid_partition_types = { 'per_letter' => 1,
69                  'constant_size' => 1,
70                  'per_letter_fixed_size' => 1,
71                  'approximate_size' => 1,
72                  'none' => 1};
73
74my $bookshelf_type_list =
75    [ { 'name' => "always",
76    'desc' => "{List.bookshelf_type.always}" },
77      { 'name' => "duplicate_only",
78    'desc' => "{List.bookshelf_type.duplicate_only}" },
79      { 'name' => "never",
80    'desc' => "{List.bookshelf_type.never}" } ]; 
81
82my $arguments =
83    [ { 'name' => "metadata",
84    'desc' => "{List.metadata}",
85    'type' => "metadata",
86    'reqd' => "yes" },
87      { 'name' => "metadata_selection_mode",
88    'desc' => "{List.metadata_selection_mode}",
89    'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
90    'list' => $meta_select_type_list,
91    'deft' => "firstvalidmetadata" },
92      # The interesting options
93      { 'name' => "bookshelf_type",
94    'desc' => "{List.bookshelf_type}",
95    'type' => "enum",
96    'list' => $bookshelf_type_list,
97    'deft' => "never" },
98      { 'name' => "classify_sections",
99    'desc' => "{List.classify_sections}",
100    'type' => "flag" },
101      { 'name' => "partition_type_within_level",
102    'desc' => "{List.partition_type_within_level}",
103    'type' => "enumstring",  # Must be enumstring because multiple values can be specified (separated by '/')
104    'list' => $partition_type_list,
105    'deft' => "per_letter" },
106      { 'name' => "partition_size_within_level",
107    'desc' => "{List.partition_size_within_level}",
108    'type' => "string" },  # Must be string because multiple values can be specified (separated by '/')
109      { 'name' => "partition_name_length",
110    'desc' => "{List.partition_name_length}",
111    'type' => "string" },
112      { 'name' => "sort_leaf_nodes_using",
113    'desc' => "{List.sort_leaf_nodes_using}",
114    'type' => "metadata",
115    'deft' => "Title" },
116      { 'name' => "reverse_sort_leaf_nodes",
117    'desc' => "{List.reverse_sort_leaf_nodes}",
118    'type' => "flag"},
119      { 'name' => "sort_using_unicode_collation",
120    'desc' => "{List.sort_using_unicode_collation}",
121    'type' => "flag" },
122      { 'name' => "use_hlist_for",
123    'desc' => "{List.use_hlist_for}",
124    'type' => "string" },
125      {'name' => "filter_metadata",
126       'desc' => "{List.filter_metadata}",
127       'type' => "metadata"},
128      {'name' => "filter_regex",
129       'desc' => "{List.filter_regex}",
130       'type' => "regexp"},
131      { 'name' => "standardize_capitalization",
132    'desc' => "{List.standardize_capitalization}",
133    'type' => "flag"},
134      { 'name' => "removeprefix",
135    'desc' => "{BasClas.removeprefix}",
136    'type' => "regexp" },
137      { 'name' => "removesuffix",
138    'desc' => "{BasClas.removesuffix}",
139    'type' => "regexp" } ];
140
141my $options = { 'name'     => "List",
142        'desc'     => "{List.desc}",
143        'abstract' => "no",
144        'inherits' => "yes",
145        'args'     => $arguments };
146
147
148sub new
149{
150    my ($class) = shift(@_);
151    my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
152    push(@$classifierslist, $class);
153
154    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
155    push(@{$hashArgOptLists->{"OptList"}}, $options);
156
157    my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
158
159    if ($self->{'info_only'}) {
160    # don't worry about any options etc
161    return bless $self, $class;
162    }
163
164    # The metadata elements to use (required)
165    if (!$self->{'metadata'}) {
166    die "Error: No metadata fields specified for List.\n";
167    }
168
169    my @metadata_groups = split(/[\/]/, $self->{'metadata'});
170    $self->{'metadata_groups'} = \@metadata_groups;
171
172    # The classifier button name (default: the first metadata element specified)
173    if (!$self->{'buttonname'}) {
174    my $first_metadata_group = $metadata_groups[0];
175    my $first_metadata_element = (split(/[\;|,\/]/, $first_metadata_group))[0];
176    $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
177    }
178
179    # meta selection mode for each level
180    if (!$self->{'metadata_selection_mode'}) {
181    foreach my $metadata_group (@metadata_groups) {
182        $self->{$metadata_group . ".metadata_selection_mode"} = "firstvalidmetadata";
183    }
184    } else {
185    my @metadata_selection_mode_list = split(/\//, $self->{'metadata_selection_mode'});
186    foreach my $metadata_group (@metadata_groups) {
187        my $meta_select_mode = shift(@metadata_selection_mode_list);
188        if (defined($meta_select_mode) && defined $valid_meta_select_types->{$meta_select_mode}) {
189        $self->{$metadata_group . ".metadata_selection_mode"} = $meta_select_mode;
190        } else {
191        $self->{$metadata_group . ".metadata_selection_mode"} = "firstvalidmetadata";   
192        }
193    }
194    }   
195    # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
196    foreach my $metadata_group (@metadata_groups) {
197    $self->{$metadata_group . ".bookshelf_type"} = "always";
198    }   
199    my $last_metadata_group = $metadata_groups[$#metadata_groups];
200    # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
201    $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
202       
203    # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
204    foreach my $metadata_group (@metadata_groups) {
205    $self->{$metadata_group . ".list_type"} = "VList";
206    }
207    foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
208    $self->{$metadata_group . ".list_type"} = "HList";
209    }
210
211    # How the items are grouped into partitions (default: no partition)
212    # for each level (metadata group), separated by '/'
213    if (!$self->{"partition_type_within_level"}) {
214    foreach my $metadata_group (@metadata_groups) {
215        $self->{$metadata_group . ".partition_type_within_level"} = "none";
216    }
217    } else {
218    my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'}); 
219       
220    my $first = 1;
221    foreach my $metadata_group (@metadata_groups) {
222        my $partition_type_within_levelelem = shift(@partition_type_within_levellist);
223        if (defined($partition_type_within_levelelem) && $partition_type_within_levelelem eq "per_letter_fixed_size") {
224        print STDERR "per letter fixed size, changing to approximate size\n";
225        $partition_type_within_levelelem = "approximate_size";
226        }
227        if (defined($partition_type_within_levelelem) && defined $valid_partition_types->{$partition_type_within_levelelem}) {
228        $self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem;
229        }
230        else {
231        if ($first) {
232            $self->{$metadata_group . ".partition_type_within_level"} = "none";
233            $first = 0;
234        } else {
235            $self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"};
236        }
237        if (defined($partition_type_within_levelelem)) {
238            # ie invalid entry
239            print STDERR "invalid partition type for level $metadata_group: $partition_type_within_levelelem, defaulting to ". $self->{$metadata_group . ".partition_type_within_level"} ."\n";
240        }
241        }
242    }
243    }
244   
245    # The number of items in each partition
246    if (!$self->{'partition_size_within_level'}) {
247    # Default: 20
248    foreach my $metadata_group (@metadata_groups) {
249        $self->{$metadata_group . ".partition_size_within_level"} = 20;
250    }
251    }
252    else {
253    my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
254
255    # Assign values based on the partition_size_within_level parameter
256    foreach my $metadata_group (@metadata_groups) {
257        my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
258        if (defined($partition_size_within_levelelem)) {
259        $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
260        }
261        else {
262        $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
263        }
264    }
265    }
266
267    # The removeprefix and removesuffix expressions
268    if ($self->{'removeprefix'}) {
269    # If there are more than one expressions, use '' to quote each experession and '/' to separate
270    my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
271
272    foreach my $metadata_group (@metadata_groups) {
273        my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
274        if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
275        # Remove the other ' at the beginning and the end if there is any
276        $removeprefix_expr_within_levelelem =~ s/^'//;
277        $removeprefix_expr_within_levelelem =~ s/'$//;
278        # Remove the extra ^ at the beginning
279        $removeprefix_expr_within_levelelem =~ s/^\^//;
280        $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
281        } else {
282        $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
283        }
284    }
285    }
286    if ($self->{'removesuffix'}) { 
287    my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
288
289    foreach my $metadata_group (@metadata_groups) {
290        my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
291        if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
292        $removesuffix_expr_within_levelelem =~ s/^'//;
293        $removesuffix_expr_within_levelelem =~ s/'$//;
294        # Remove the extra $ at the end
295        $removesuffix_expr_within_levelelem =~ s/\$$//;
296        $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
297        } else {
298        $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
299        }
300    }
301    }
302
303    # The metadata elements to use to sort the leaf nodes (default: Title)
304    my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
305    if ($self->{'sort_leaf_nodes_using'}) {
306    @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
307    }
308    $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
309    foreach my $sort_group (@sort_leaf_nodes_using_metadata_groups) {
310    # set metadata_select_type, if not already set - might be already set if the same group was used in -metadata
311    if (!defined $self->{$sort_group . ".metadata_selection_mode"}) {
312        $self->{$sort_group . ".metadata_selection_mode"} = "firstvalue";
313    }
314    }
315    # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
316    if ($self->{'sort_using_unicode_collation'}) {
317    # To use this you first need to download the allkeys.txt file from
318        # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
319        # Unicode/Collate directory.
320    require Unicode::Collate;
321    $self->{'unicode_collator'} = Unicode::Collate->new();
322    }
323
324    # An empty array for the document/section OIDs that we are classifying
325    $self->{'OIDs'} = [];
326    # A hash for all the doc ids that we have seen, so we don't classify something twice
327    $self->{'all_doc_OIDs'} = {};
328    return bless $self, $class;
329}
330
331
332sub init
333{
334    # Nothing to do...
335}
336
337
338# Called for each document in the collection
339sub classify
340{
341    my $self = shift(@_);
342    my ($doc_obj) = @_;
343
344    if (defined $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()}) {
345    print STDERR "Warning, List classifier has already seen document ".$doc_obj->get_OID().", not classifying again\n";
346    return;
347    }
348    $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()} = 1;
349    # check against filter here
350    if ($self->{'filter_metadata'}) {
351    #print STDERR "filtering documents on $self->{'filter_metadata'}\n";
352    my $meta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'filter_metadata'});
353    return unless defined $meta;
354    if ($self->{'filter_regex'} ne "" && $meta !~ /$self->{'filter_regex'}/) {
355        #print STDERR "doesn't match regex\n";
356        return;
357
358    }
359    }
360    # if we get here, we have passed the test for filtering
361    # If "-classify_sections" is set, classify every section of the document
362    if ($self->{'classify_sections'}) {
363    my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
364    while (defined $section) {
365        $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section);
366        $section = $doc_obj->get_next_section($section);
367    }
368    }
369    # Otherwise just classify the top document section
370    else {
371    $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section());
372    }
373   
374}
375
376sub classify_section
377{
378    my $self = shift(@_);
379    my ($doc_obj,$section_OID,$section) = @_;
380
381    my @metadata_groups = @{$self->{'metadata_groups'}};
382
383   
384    # Only classify the section if it has a value for one of the metadata elements in the first group
385    my $classify_section = 0;
386    my $first_metadata_group = $metadata_groups[0];
387    my $remove_prefix_expr = $self->{$first_metadata_group . ".remove_prefix_expr"};
388    my $remove_suffix_expr = $self->{$first_metadata_group . ".remove_suffix_expr"};
389    foreach my $first_metadata_group_element (split(/\;|,/, $first_metadata_group)) {
390    my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
391    my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element); 
392   
393    # Remove prefix/suffix if requested
394    if (defined ($first_metadata_group_element_value)) {
395        if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {     
396        $first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
397        }
398       
399        if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
400        $first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
401        }
402    }
403    if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
404        # This section must be included in the classifier
405        $classify_section = 1;
406        last;
407    }
408    }
409
410    # We're not classifying this section because it doesn't have the required metadata
411    return if (!$classify_section);
412   
413    # Otherwise, include this section in the classifier
414   
415    push(@{$self->{'OIDs'}}, $section_OID);
416
417    # Create a hash for the metadata values of each metadata element we're interested in
418    my %metadata_groups_done = ();
419    foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
420    # Take care not to do a metadata group more than once
421    unless ($metadata_groups_done{$metadata_group}) {
422        my $remove_prefix_expr = $self->{$metadata_group . ".remove_prefix_expr"};
423        my $remove_suffix_expr = $self->{$metadata_group . ".remove_suffix_expr"};
424        foreach my $metadata_element (split(/\;|,/, $metadata_group)) {     
425        my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);
426       
427        my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
428        foreach my $metadata_value (@metadata_values) {
429            # Strip leading and trailing whitespace
430            $metadata_value =~ s/^\s*//;
431            $metadata_value =~ s/\s*$//;
432
433            # Remove prefix/suffix if requested
434            if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
435            $metadata_value =~ s/^$remove_prefix_expr//;
436            }
437            if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
438            $metadata_value =~ s/$remove_suffix_expr$//;
439            }
440
441            # lowercase metadata both for sorting meta (d/D under D), and to allow CSS to be able to
442            # text-transform the stored lowercase values as capitalize or uppercase (can't CSS
443            # text-transform if stored uppercase). 2 CSS text-transforms have been added to core.css
444            my $lc_metadata_value = lc($metadata_value);
445           
446            # We are already working with unicode aware strings at this
447            # stage, so we no longer need to convert from utf8 to unicode
448            #my $metadata_value_unicode_string = $metadata_value; # $self->convert_utf8_string_to_unicode_string($metadata_value);
449
450            # Add the metadata value into the list for this combination of metadata group and section
451            # text that we have some non-whitespace chars
452            if ($lc_metadata_value =~ /\S/) {
453           
454            push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $lc_metadata_value);
455           
456           
457            # add the actual value into the stored values so we can remember the case
458            if (!$self->{'standardize_capitalization'}) {
459            if (defined $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}) {
460                $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}++;
461            } else {
462                $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value} = 1;
463            }
464            }
465            last if ($self->{$metadata_group . ".metadata_selection_mode"} eq "firstvalue");
466            }
467        } # foreach metadatavalue
468        last if ((@metadata_values > 0) &&  $self->{$metadata_group . ".metadata_selection_mode"} =~ /^(firstvalue|firstvalidmetadata)$/ );
469        } # foreach metadata element
470
471        $metadata_groups_done{$metadata_group} = 1;
472    }
473    }
474}
475
476
477sub get_classify_info
478{
479    my $self = shift(@_);
480
481    # The metadata groups to classify by
482    my @metadata_groups = @{$self->{'metadata_groups'}};
483    my $first_metadata_group = $metadata_groups[0];
484
485    # The OID values of the documents to include in the classifier
486    my @OIDs = @{$self->{'OIDs'}};
487
488    # Create the root node of the classification hierarchy
489    my %classifier_node = ( 'thistype' => "Invisible",
490                'childtype' => $self->{$first_metadata_group . ".list_type"},
491                'Title' => $self->{'buttonname'},
492                'contains' => [],
493                'mdtype' => $first_metadata_group );
494
495    # Recursively create the classification hierarchy, one level for each metadata group
496    $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
497    return \%classifier_node;
498}
499
500
501sub add_level
502{
503    my $self = shift(@_);
504    my @metadata_groups = @{shift(@_)};
505    my @OIDs = @{shift(@_)};
506    my $classifier_node = shift(@_);
507   
508    my $metadata_group = $metadata_groups[0];
509   
510    if (!defined($self->{$metadata_group . ".list"})) {
511    print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
512    return;
513    }
514
515    # Create a mapping from metadata value to OID
516    my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
517    my %metadata_value_to_OIDs_hash = ();   
518    foreach my $OID (@OIDs)
519    {
520    if ($OID_to_metadata_values_hash_ref->{$OID})
521    {
522        my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
523        foreach my $metadata_value (@metadata_values)
524        {
525        push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
526        }
527    }
528    }
529    #print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
530
531    # Partition the values (if necessary)
532    my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
533    my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
534    if ($partition_type_within_level =~ /^per_letter$/i) {
535    # Generate one hlist for each letter
536    my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
537    my %metadata_value_to_OIDs_subhash = ();
538
539    my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
540    foreach my $metadata_value (@sortedmetadata_values) {
541        my $metadata_valuepartition = substr($metadata_value, 0, 1);
542
543        # Is this the start of a new partition?
544        if ($metadata_valuepartition ne $lastpartition) {
545        $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
546        %metadata_value_to_OIDs_subhash = ();
547        $lastpartition = $metadata_valuepartition;
548        }
549
550        $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};       
551    }
552
553    # Don't forget to add the last partition
554    $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
555
556    # The partitions are stored in an HList
557    $classifier_node->{'childtype'} = "HList";
558    }
559    elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
560    # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size
561    # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible
562    # If a letter has more items than specified, split into several hlists.
563    # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
564    my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};       
565    my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
566    my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};
567   
568    # Separate values by their first letter, each form a bucket, like the per_letter partition type
569    my $last_partition = substr($sortedmetadata_values[0], 0, 1);
570    my %partition_buckets = ();
571    my @metadata_values_in_bucket = ();
572    my $num_items_in_bucket = 0;
573    foreach my $metadata_value (@sortedmetadata_values) {
574        my $metadata_valuepartition = substr($metadata_value, 0, 1);
575        if ($metadata_valuepartition ne $last_partition) {
576        my @temp_array = @metadata_values_in_bucket;
577        # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
578        my %partition_info = ();       
579        $partition_info{'metadata_values'} = \@temp_array;
580        $partition_info{'size'} = $num_items_in_bucket;     
581        $partition_buckets{$last_partition} = \%partition_info;     
582       
583        @metadata_values_in_bucket = ($metadata_value);
584        $num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
585        $last_partition = $metadata_valuepartition;
586        } else {
587        $num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
588        push (@metadata_values_in_bucket, $metadata_value);
589        }
590    }
591    # Last one
592    my %partition_info = ();
593    $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
594    $partition_info{'size'} = $num_items_in_bucket;
595    $partition_buckets{$last_partition} = \%partition_info;
596       
597    my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));   
598    for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) {
599        my $partition = $partition_keys[$i];
600        my $items_in_partition = $partition_buckets{$partition}->{'size'};             
601        # Merge small buckets together, but keep the numeric bucket apart
602        if ($items_in_partition < $partition_size_within_level) {
603        my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'};
604        if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level
605            && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)
606             || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) {
607            foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) {
608            push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge);
609            }           
610            $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition;
611            delete $partition_buckets{$partition};                     
612        }
613        }
614    }
615    @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
616   
617    # Add partitions, and divide big bucket into several
618    my $last_partition_end = "";
619    my $partition_start = "";
620    foreach my $partition (@partition_keys) {
621        my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}});
622        my $items_in_partition = $partition_buckets{$partition}->{'size'};
623        $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"});
624       
625        if ($items_in_partition > $partition_size_within_level) {   
626        my $items_done = 0;
627        my %metadata_values_to_OIDs_subhashes = ();
628        for (my $i = 0; $i < scalar(@metadata_values); $i++) {
629            my $metadata_value = $metadata_values[$i];
630            # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
631            my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;
632
633            my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"});
634            my $partitionname = $partition_start;
635            if ($partitionend ne $partition_start) {
636            $partitionname = $partitionname . "-" . $partitionend;
637            }
638           
639            # Start a new partition
640            if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) {
641            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
642            $last_partition_end = $partitionend;           
643            $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
644            $items_done = 0;
645            %metadata_values_to_OIDs_subhashes = ();
646            }
647           
648            # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
649                    if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) {
650            my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
651            # Get the number of partitions needed for this value
652            my $num_splits = int($items_for_this_md_value / $partition_size_within_level);
653            $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits);
654
655            my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}};
656            for (my $i = 0; $i < $num_splits; $i++) {
657                my %OIDs_subhashes_for_this_value = ();
658                my @OIDs_for_this_partition = ();
659                for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) {
660                push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
661                }
662               
663                # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
664                if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) {
665                $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
666                $items_done += scalar(@OIDs_for_this_partition);
667                next;
668                }
669               
670                # Add an HList for this bucket
671                $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
672                $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
673                $last_partition_end = $partitionname_for_this_value;
674            }
675            next;
676                    }
677                       
678            $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
679            $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;         
680           
681            # The last partition
682            if($i == scalar(@metadata_values) - 1) {
683            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
684            }
685        }       
686        }
687        else {
688        # The easier case, just add a partition
689        my %metadata_values_to_OIDs_subhashes = ();
690        for (my $i = 0; $i < scalar(@metadata_values); $i++) {
691            my $metadata_value = $metadata_values[$i];
692            $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};           
693        }
694        my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
695        my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"});
696        my $partitionname = $partition_start;
697        if ($partitionend ne $partition_start) {
698            $partitionname = $partitionname . "-" . $partitionend;
699        }
700        $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);           
701        $last_partition_end = $partitionend;       
702        }
703    }   
704   
705    # The partitions are stored in an HList
706    $classifier_node->{'childtype'} = "HList";
707
708    } # end approximate_size
709    else {
710    # Generate hlists of a certain size
711    if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
712        my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
713        my $itemsdone = 0;
714        my %metadata_value_to_OIDs_subhash = ();
715        my $lastpartitionend = "";
716        my $partitionstart;
717        foreach my $metadata_value (@sortedmetadata_values) {
718        $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
719        $itemsdone++;
720        my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
721
722        # Is this the start of a new partition?
723        if ($itemsinpartition == 1) {
724            $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
725        }
726
727        # Is this the end of the partition?
728        if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) {
729            my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
730            my $partitionname = $partitionstart;
731            if ($partitionend ne $partitionstart) {
732            $partitionname = $partitionname . "-" . $partitionend;
733            }
734
735            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
736            %metadata_value_to_OIDs_subhash = ();
737            $lastpartitionend = $partitionend;
738        }
739        }
740
741        # The partitions are stored in an HList
742        $classifier_node->{'childtype'} = "HList";
743    }
744
745    # Otherwise just add all the values to a VList
746    else {
747        $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
748    }
749    }
750}
751
752
753sub generate_partition_start
754{
755    my $self = shift(@_);
756    my $metadata_value = shift(@_);
757    my $lastpartitionend = shift(@_);
758    my $partition_name_length = shift(@_);
759
760    if ($partition_name_length) {
761    return substr($metadata_value, 0, $partition_name_length);
762    }
763
764    my $partitionstart = substr($metadata_value, 0, 1);
765    if ($partitionstart le $lastpartitionend) {
766    $partitionstart = substr($metadata_value, 0, 2);
767    # Give up after three characters
768    if ($partitionstart le $lastpartitionend) {
769        $partitionstart = substr($metadata_value, 0, 3);
770    }
771    }
772
773    return $partitionstart;
774}
775
776
777sub generate_partition_end
778{
779    my $self = shift(@_);
780    my $metadata_value = shift(@_);
781    my $partitionstart = shift(@_);
782    my $partition_name_length = shift(@_);
783
784    if ($partition_name_length) {
785    return substr($metadata_value, 0, $partition_name_length);
786    }
787
788    my $partitionend = substr($metadata_value, 0, length($partitionstart));
789    if ($partitionend gt $partitionstart) {
790    $partitionend = substr($metadata_value, 0, 1);
791    if ($partitionend le $partitionstart) {
792        $partitionend = substr($metadata_value, 0, 2);
793        # Give up after three characters
794        if ($partitionend le $partitionstart) {
795        $partitionend = substr($metadata_value, 0, 3);
796        }
797    }
798    }
799
800    return $partitionend;
801}
802
803
804sub add_hlist_partition
805{
806    my $self = shift(@_);
807    my @metadata_groups = @{shift(@_)};
808    my $classifier_node = shift(@_);
809    my $partitionname = shift(@_);
810    my $metadata_value_to_OIDs_hash_ref = shift(@_);
811
812    # Create an hlist partition
813    # Note that we don't need to convert from unicode-aware strings
814    # to utf8 here, as that is handled elsewhere in the code
815    my %child_classifier_node = ( 'Title' => $partitionname, #'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
816                  'childtype' => "VList",
817                  'contains' => [] );
818
819    # Add the children to the hlist partition
820    $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
821    push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
822}
823
824
825sub add_vlist
826{
827    my $self = shift(@_);
828    my @metadata_groups = @{shift(@_)};
829    my $classifier_node = shift(@_);
830    my $metadata_value_to_OIDs_hash_ref = shift(@_);
831
832    my $metadata_group = shift(@metadata_groups);
833    $classifier_node->{'mdtype'} = $metadata_group;
834
835    # Create an entry in the vlist for each value
836    foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
837    {
838    my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
839    # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
840    if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
841        my $OID = $OIDs[0];
842        my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
843        push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
844    }
845    # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
846    elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
847        @OIDs = $self->sort_leaf_items(\@OIDs);
848        foreach my $OID (@OIDs) {
849        my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
850        push(@{$classifier_node->{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
851        }
852   
853    }
854    # Otherwise create a sublist (bookshelf) for the metadata value
855    else {
856        my $metadata_value_display = $self->get_metadata_value_display($metadata_group, $metadata_value);
857        # Note that we don't need to convert from unicode-aware strings
858        # to utf8 here, as that is handled elsewhere in the code
859        my %child_classifier_node = ( 'Title' => $metadata_value_display, # 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
860                      'childtype' => "VList",
861                      'mdtype' => $metadata_group,
862                      'contains' => [] );
863
864        #@OIDs = $self->sort_leaf_items(\@OIDs);
865        # If there are metadata elements remaining, recursively apply the process
866        if (@metadata_groups > 0) {
867        my $next_metadata_group = $metadata_groups[0];     
868        $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
869
870        # separate metadata into those that below in the next/sub-metadata_group
871        # and those that below at the current level's metadata_group
872
873        my $OID_to_metadata_values_hash_ref = $self->{$next_metadata_group . ".list"};
874        my @current_level_OIDs = ();
875        my @next_level_OIDs = ();
876        foreach my $OID (@OIDs)
877        {
878            if ($OID_to_metadata_values_hash_ref->{$OID}) {
879            push(@next_level_OIDs, $OID);
880            } else {
881            push(@current_level_OIDs, $OID);
882            }
883        }
884        # recursively process those docs belonging to the sub-metadata_group
885        $self->add_level(\@metadata_groups, \@next_level_OIDs, \%child_classifier_node);
886
887        # For those docs that don't belong in the sub/next_metadata_group, but which belong
888        # at this level, just add the documents as children of this list at the current level
889        @current_level_OIDs = $self->sort_leaf_items(\@current_level_OIDs);
890        foreach my $current_level_OID (@current_level_OIDs) {
891            my $offset = $self->metadata_offset($metadata_group, $current_level_OID, $metadata_value);
892            push(@{$child_classifier_node{'contains'}}, { 'OID' => $current_level_OID , 'offset' => $offset });
893        }       
894        }
895        # Otherwise just add the documents as children of this list
896        else {
897        @OIDs = $self->sort_leaf_items(\@OIDs);
898        foreach my $OID (@OIDs) {
899            my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
900            push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
901        }
902
903        }
904
905        # Add the sublist to the list
906        push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
907    }
908    }
909}
910
911sub metadata_offset
912{
913    my $self = shift(@_);
914    my $metadata_group = shift(@_);
915    my $OID = shift(@_);
916    my $metadata_value = shift(@_);
917   
918    my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
919    my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
920    for (my $i = 0; $i < scalar(@metadata_values); $i++) {
921    if ($metadata_value eq $metadata_values[$i]) {
922        return $i;
923    }
924    }
925   
926    return 0;
927}
928
929sub sort_leaf_items
930{
931    my $self = shift(@_);
932    my @OIDs = @{shift(@_)};
933#    my $classifier_node = shift(@_);
934   
935    # Sort leaf nodes and add to list
936    my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
937    foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) {
938    my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
939    # Force a stable sort (Perl 5.6's sort isn't stable)
940    # !! The [0] bits aren't ideal (multiple metadata values) !!
941    @OIDs = @OIDs[ sort {
942      if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]} && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]})))
943      {
944        $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
945      }
946      else
947      {
948        $a <=> $b;
949      }
950    } 0..$#OIDs ];
951    }
952    if ($self->{'reverse_sort_leaf_nodes'}) {
953    #print STDERR "reversing\n";
954    return reverse @OIDs;
955    }
956    return @OIDs;
957}
958
959
960
961sub sort_metadata_values_array
962{
963    my $self = shift(@_);
964    my @metadata_values = @_;
965
966    if ($self->{'unicode_collator'}) {
967    return $self->{'unicode_collator'}->sort(@metadata_values);
968    }
969    else {
970    return sort { $self->alpha_numeric_cmp($a,$b) }(@metadata_values);
971    }
972}
973
974
975sub alpha_numeric_cmp
976{
977    my ($self,$a,$b) = @_;
978
979    if ($a =~ m/^(\d+(\.\d+)?)/)
980    {
981        my $val_a = $1;
982        if ($b =~ m/^(\d+(\.\d+)?)/)
983        {
984            my $val_b = $1;
985            if ($val_a != $val_b)
986            {
987                return ($val_a <=> $val_b);
988            }
989        }
990    }
991   
992    return ($a cmp $b);
993}
994
995
996sub get_metadata_value_display {
997    my $self = shift(@_);
998    my ($metadata_group, $metadata_value) = @_;
999    return $metadata_value if $self->{'standardize_capitalization'};
1000    my $actual_values_hash = $self->{$metadata_group . ".actualvalues"}->{$metadata_value};
1001    my $display_value ="";
1002    my $max_count=0;
1003    foreach my $v (keys %$actual_values_hash) {
1004    if ($actual_values_hash->{$v} > $max_count) {
1005        $display_value = $v;
1006        $max_count = $actual_values_hash->{$v};
1007    }
1008    }
1009    return $display_value;
1010}
10111;
Note: See TracBrowser for help on using the browser.