root/main/trunk/greenstone2/perllib/classify/List.pm @ 31577

Revision 31577, 40.3 KB (checked in by kjdon, 2 years ago)

changed my mind. now declaring aStr and bStr variables, cos what if the a and b coming in are not the same as global a and b?? maybe there is room for things to go wrong? so lets explicitly declare the variables

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# List.pm -- A general and flexible list classifier with most of
4#                   the abilities of AZCompactList, and better Unicode,
5#                   metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
29# TO DO: - Remove punctuation from metadata values before sorting.
30#        - Add an AZCompactList-style hlist option?
31#
32###########################################################################
33
34
35package List;
36
37
38use BaseClassifier;
39
40use strict;
41
42
43sub BEGIN {
44    @List::ISA = ('BaseClassifier');
45}
46
47my $meta_select_type_list =
48    [
49      { 'name' => "firstvalue",
50    'desc' => "{List.metadata_selection.firstvalue}"},
51      { 'name' => "firstvalidmetadata",
52    'desc' => "{List.metadata_selection.firstvalidmetadata}"},
53      { 'name' => "allvalues",
54    'desc' => "{List.metadata_selection.allvalues}"} ];
55my $valid_meta_select_types = { 'firstvalue' => 1,
56                'firstvalidmetadata' => 1,
57                'allvalues' => 1 };
58my $partition_type_list =
59    [ { 'name' => "per_letter",
60    'desc' => "{List.level_partition.per_letter}" },
61      { 'name' => "approximate_size",
62    'desc' => "{List.level_partition.approximate_size}"},
63      { 'name' => "constant_size",
64    'desc' => "{List.level_partition.constant_size}" },     
65      { 'name' => "none",
66    'desc' => "{List.level_partition.none}" } ];
67
68# following used to check types later on
69my $valid_partition_types = { 'per_letter' => 1,
70                  'constant_size' => 1,
71                  'per_letter_fixed_size' => 1,
72                  'approximate_size' => 1,
73                  'none' => 1};
74
75my $bookshelf_type_list =
76    [ { 'name' => "always",
77    'desc' => "{List.bookshelf_type.always}" },
78      { 'name' => "duplicate_only",
79    'desc' => "{List.bookshelf_type.duplicate_only}" },
80      { 'name' => "never",
81    'desc' => "{List.bookshelf_type.never}" } ]; 
82
83my $arguments =
84    [ { 'name' => "metadata",
85    'desc' => "{List.metadata}",
86    'type' => "metadata",
87    'reqd' => "yes" },
88      { 'name' => "metadata_selection_mode",
89    'desc' => "{List.metadata_selection_mode}",
90    'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
91    'list' => $meta_select_type_list,
92    'deft' => "firstvalidmetadata" },
93      # The interesting options
94      { 'name' => "bookshelf_type",
95    'desc' => "{List.bookshelf_type}",
96    'type' => "enum",
97    'list' => $bookshelf_type_list,
98    'deft' => "never" },
99      { 'name' => "classify_sections",
100    'desc' => "{List.classify_sections}",
101    'type' => "flag" },
102      { 'name' => "partition_type_within_level",
103    'desc' => "{List.partition_type_within_level}",
104    'type' => "enumstring",  # Must be enumstring because multiple values can be specified (separated by '/')
105    'list' => $partition_type_list,
106    'deft' => "per_letter" },
107      { 'name' => "partition_size_within_level",
108    'desc' => "{List.partition_size_within_level}",
109    'type' => "string" },  # Must be string because multiple values can be specified (separated by '/')
110      { 'name' => "partition_name_length",
111    'desc' => "{List.partition_name_length}",
112    'type' => "string" },
113      { 'name' => "sort_leaf_nodes_using",
114    'desc' => "{List.sort_leaf_nodes_using}",
115    'type' => "metadata",
116    'deft' => "Title" },
117      { 'name' => "reverse_sort_leaf_nodes",
118    'desc' => "{List.reverse_sort_leaf_nodes}",
119    'type' => "flag"},
120      { 'name' => "sort_using_unicode_collation",
121    'desc' => "{List.sort_using_unicode_collation}",
122    'type' => "flag" },
123      { 'name' => "use_hlist_for",
124    'desc' => "{List.use_hlist_for}",
125    'type' => "string" },
126      {'name' => "filter_metadata",
127       'desc' => "{List.filter_metadata}",
128       'type' => "metadata"},
129      {'name' => "filter_regex",
130       'desc' => "{List.filter_regex}",
131       'type' => "regexp"},
132      { 'name' => "standardize_capitalization",
133    'desc' => "{List.standardize_capitalization}",
134    'type' => "flag"},
135      { 'name' => "removeprefix",
136    'desc' => "{BasClas.removeprefix}",
137    'type' => "regexp" },
138      { 'name' => "removesuffix",
139    'desc' => "{BasClas.removesuffix}",
140    'type' => "regexp" } ];
141
142my $options = { 'name'     => "List",
143        'desc'     => "{List.desc}",
144        'abstract' => "no",
145        'inherits' => "yes",
146        'args'     => $arguments };
147
148
149sub new
150{
151    my ($class) = shift(@_);
152    my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
153    push(@$classifierslist, $class);
154
155    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
156    push(@{$hashArgOptLists->{"OptList"}}, $options);
157
158    my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
159
160    if ($self->{'info_only'}) {
161    # don't worry about any options etc
162    return bless $self, $class;
163    }
164
165    # The metadata elements to use (required)
166    if (!$self->{'metadata'}) {
167    die "Error: No metadata fields specified for List.\n";
168    }
169
170    my @metadata_groups = split(/[\/]/, $self->{'metadata'});
171    $self->{'metadata_groups'} = \@metadata_groups;
172
173    # The classifier button name (default: the first metadata element specified)
174    if (!$self->{'buttonname'}) {
175    my $first_metadata_group = $metadata_groups[0];
176    my $first_metadata_element = (split(/[\;|,\/]/, $first_metadata_group))[0];
177    $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
178    }
179
180    # meta selection mode for each level
181    if (!$self->{'metadata_selection_mode'}) {
182    foreach my $metadata_group (@metadata_groups) {
183        $self->{$metadata_group . ".metadata_selection_mode"} = "firstvalidmetadata";
184    }
185    } else {
186    my @metadata_selection_mode_list = split(/\//, $self->{'metadata_selection_mode'});
187    foreach my $metadata_group (@metadata_groups) {
188        my $meta_select_mode = shift(@metadata_selection_mode_list);
189        if (defined($meta_select_mode) && defined $valid_meta_select_types->{$meta_select_mode}) {
190        $self->{$metadata_group . ".metadata_selection_mode"} = $meta_select_mode;
191        } else {
192        $self->{$metadata_group . ".metadata_selection_mode"} = "firstvalidmetadata";   
193        }
194    }
195    }   
196    # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
197    foreach my $metadata_group (@metadata_groups) {
198    $self->{$metadata_group . ".bookshelf_type"} = "always";
199    }   
200    my $last_metadata_group = $metadata_groups[$#metadata_groups];
201    # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
202    $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
203       
204    # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
205    foreach my $metadata_group (@metadata_groups) {
206    $self->{$metadata_group . ".list_type"} = "VList";
207    }
208    foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
209    $self->{$metadata_group . ".list_type"} = "HList";
210    }
211
212    # How the items are grouped into partitions (default: no partition)
213    # for each level (metadata group), separated by '/'
214    if (!$self->{"partition_type_within_level"}) {
215    foreach my $metadata_group (@metadata_groups) {
216        $self->{$metadata_group . ".partition_type_within_level"} = "none";
217    }
218    } else {
219    my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'}); 
220       
221    my $first = 1;
222    foreach my $metadata_group (@metadata_groups) {
223        my $partition_type_within_levelelem = shift(@partition_type_within_levellist);
224        if (defined($partition_type_within_levelelem) && $partition_type_within_levelelem eq "per_letter_fixed_size") {
225        print STDERR "per letter fixed size, changing to approximate size\n";
226        $partition_type_within_levelelem = "approximate_size";
227        }
228        if (defined($partition_type_within_levelelem) && defined $valid_partition_types->{$partition_type_within_levelelem}) {
229        $self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem;
230        }
231        else {
232        if ($first) {
233            $self->{$metadata_group . ".partition_type_within_level"} = "none";
234            $first = 0;
235        } else {
236            $self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"};
237        }
238        if (defined($partition_type_within_levelelem)) {
239            # ie invalid entry
240            print STDERR "invalid partition type for level $metadata_group: $partition_type_within_levelelem, defaulting to ". $self->{$metadata_group . ".partition_type_within_level"} ."\n";
241        }
242        }
243    }
244    }
245   
246    # The number of items in each partition
247    if (!$self->{'partition_size_within_level'}) {
248    # Default: 20
249    foreach my $metadata_group (@metadata_groups) {
250        $self->{$metadata_group . ".partition_size_within_level"} = 20;
251    }
252    }
253    else {
254    my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
255
256    # Assign values based on the partition_size_within_level parameter
257    foreach my $metadata_group (@metadata_groups) {
258        my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
259        if (defined($partition_size_within_levelelem)) {
260        $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
261        }
262        else {
263        $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
264        }
265    }
266    }
267
268    # The removeprefix and removesuffix expressions
269    if ($self->{'removeprefix'}) {
270    # If there are more than one expressions, use '' to quote each experession and '/' to separate
271    my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
272
273    foreach my $metadata_group (@metadata_groups) {
274        my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
275        if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
276        # Remove the other ' at the beginning and the end if there is any
277        $removeprefix_expr_within_levelelem =~ s/^'//;
278        $removeprefix_expr_within_levelelem =~ s/'$//;
279        # Remove the extra ^ at the beginning
280        $removeprefix_expr_within_levelelem =~ s/^\^//;
281        $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
282        } else {
283        $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
284        }
285    }
286    }
287    if ($self->{'removesuffix'}) { 
288    my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
289
290    foreach my $metadata_group (@metadata_groups) {
291        my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
292        if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
293        $removesuffix_expr_within_levelelem =~ s/^'//;
294        $removesuffix_expr_within_levelelem =~ s/'$//;
295        # Remove the extra $ at the end
296        $removesuffix_expr_within_levelelem =~ s/\$$//;
297        $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
298        } else {
299        $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
300        }
301    }
302    }
303
304    # The metadata elements to use to sort the leaf nodes (default: Title)
305    my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
306    if ($self->{'sort_leaf_nodes_using'}) {
307    @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
308    }
309    $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
310    foreach my $sort_group (@sort_leaf_nodes_using_metadata_groups) {
311    # set metadata_select_type, if not already set - might be already set if the same group was used in -metadata
312    if (!defined $self->{$sort_group . ".metadata_selection_mode"}) {
313        $self->{$sort_group . ".metadata_selection_mode"} = "firstvalue";
314    }
315    }
316    # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
317    if ($self->{'sort_using_unicode_collation'}) {
318    # To use this you first need to download the allkeys.txt file from
319        # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
320        # Unicode/Collate directory.
321    require Unicode::Collate;
322    $self->{'unicode_collator'} = Unicode::Collate->new();
323    }
324
325    # An empty array for the document/section OIDs that we are classifying
326    $self->{'OIDs'} = [];
327    # A hash for all the doc ids that we have seen, so we don't classify something twice
328    $self->{'all_doc_OIDs'} = {};
329    return bless $self, $class;
330}
331
332
333sub init
334{
335    # Nothing to do...
336}
337
338
339# Called for each document in the collection
340sub classify
341{
342    my $self = shift(@_);
343    my ($doc_obj) = @_;
344
345    if (defined $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()}) {
346    print STDERR "Warning, List classifier has already seen document ".$doc_obj->get_OID().", not classifying again\n";
347    return;
348    }
349    $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()} = 1;
350    # check against filter here
351    if ($self->{'filter_metadata'}) {
352    #print STDERR "filtering documents on $self->{'filter_metadata'}\n";
353    my $meta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'filter_metadata'});
354    return unless defined $meta;
355    if ($self->{'filter_regex'} ne "" && $meta !~ /$self->{'filter_regex'}/) {
356        #print STDERR "doesn't match regex\n";
357        return;
358
359    }
360    }
361    # if we get here, we have passed the test for filtering
362    # If "-classify_sections" is set, classify every section of the document
363    if ($self->{'classify_sections'}) {
364    my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
365    while (defined $section) {
366        $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section);
367        $section = $doc_obj->get_next_section($section);
368    }
369    }
370    # Otherwise just classify the top document section
371    else {
372    $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section());
373    }
374   
375}
376
377sub classify_section
378{
379    my $self = shift(@_);
380    my ($doc_obj,$section_OID,$section) = @_;
381
382    my @metadata_groups = @{$self->{'metadata_groups'}};
383
384   
385    # Only classify the section if it has a value for one of the metadata elements in the first group
386    my $classify_section = 0;
387    my $first_metadata_group = $metadata_groups[0];
388    my $remove_prefix_expr = $self->{$first_metadata_group . ".remove_prefix_expr"};
389    my $remove_suffix_expr = $self->{$first_metadata_group . ".remove_suffix_expr"};
390    foreach my $first_metadata_group_element (split(/\;|,/, $first_metadata_group)) {
391    my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
392    my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element); 
393   
394    # Remove prefix/suffix if requested
395    if (defined ($first_metadata_group_element_value)) {
396        if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {     
397        $first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
398        }
399       
400        if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
401        $first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
402        }
403    }
404    if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
405        # This section must be included in the classifier
406        $classify_section = 1;
407        last;
408    }
409    }
410
411    # We're not classifying this section because it doesn't have the required metadata
412    return if (!$classify_section);
413   
414    # Otherwise, include this section in the classifier
415   
416    push(@{$self->{'OIDs'}}, $section_OID);
417
418    # Create a hash for the metadata values of each metadata element we're interested in
419    my %metadata_groups_done = ();
420    foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
421    # Take care not to do a metadata group more than once
422    unless ($metadata_groups_done{$metadata_group}) {
423        my $remove_prefix_expr = $self->{$metadata_group . ".remove_prefix_expr"};
424        my $remove_suffix_expr = $self->{$metadata_group . ".remove_suffix_expr"};
425        foreach my $metadata_element (split(/\;|,/, $metadata_group)) {     
426        my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);
427       
428        my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
429        foreach my $metadata_value (@metadata_values) {
430            # Strip leading and trailing whitespace
431            $metadata_value =~ s/^\s*//;
432            $metadata_value =~ s/\s*$//;
433
434            # Remove prefix/suffix if requested
435            if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
436            $metadata_value =~ s/^$remove_prefix_expr//;
437            }
438            if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
439            $metadata_value =~ s/$remove_suffix_expr$//;
440            }
441
442            # lowercase metadata both for sorting meta (d/D under D), and to allow CSS to be able to
443            # text-transform the stored lowercase values as capitalize or uppercase (can't CSS
444            # text-transform if stored uppercase). 2 CSS text-transforms have been added to core.css
445            my $lc_metadata_value = lc($metadata_value);
446           
447            # We are already working with unicode aware strings at this
448            # stage, so we no longer need to convert from utf8 to unicode
449            #my $metadata_value_unicode_string = $metadata_value; # $self->convert_utf8_string_to_unicode_string($metadata_value);
450
451            # Add the metadata value into the list for this combination of metadata group and section
452            # text that we have some non-whitespace chars
453            if ($lc_metadata_value =~ /\S/) {
454           
455            push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $lc_metadata_value);
456           
457           
458            # add the actual value into the stored values so we can remember the case
459            if (!$self->{'standardize_capitalization'}) {
460            if (defined $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}) {
461                $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}++;
462            } else {
463                $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value} = 1;
464            }
465            }
466            last if ($self->{$metadata_group . ".metadata_selection_mode"} eq "firstvalue");
467            }
468        } # foreach metadatavalue
469        last if ((@metadata_values > 0) &&  $self->{$metadata_group . ".metadata_selection_mode"} =~ /^(firstvalue|firstvalidmetadata)$/ );
470        } # foreach metadata element
471
472        $metadata_groups_done{$metadata_group} = 1;
473    }
474    }
475}
476
477
478sub get_classify_info
479{
480    my $self = shift(@_);
481
482    # The metadata groups to classify by
483    my @metadata_groups = @{$self->{'metadata_groups'}};
484    my $first_metadata_group = $metadata_groups[0];
485
486    # The OID values of the documents to include in the classifier
487    my @OIDs = @{$self->{'OIDs'}};
488
489    # Create the root node of the classification hierarchy
490    my %classifier_node = ( 'thistype' => "Invisible",
491                'childtype' => $self->{$first_metadata_group . ".list_type"},
492                'Title' => $self->{'buttonname'},
493                'contains' => [],
494                'mdtype' => $first_metadata_group );
495
496    # Recursively create the classification hierarchy, one level for each metadata group
497    $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
498    return \%classifier_node;
499}
500
501
502sub add_level
503{
504    my $self = shift(@_);
505    my @metadata_groups = @{shift(@_)};
506    my @OIDs = @{shift(@_)};
507    my $classifier_node = shift(@_);
508   
509    my $metadata_group = $metadata_groups[0];
510   
511    if (!defined($self->{$metadata_group . ".list"})) {
512    print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
513    return;
514    }
515
516    # Create a mapping from metadata value to OID
517    my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
518    my %metadata_value_to_OIDs_hash = ();   
519    foreach my $OID (@OIDs)
520    {
521    if ($OID_to_metadata_values_hash_ref->{$OID})
522    {
523        my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
524        foreach my $metadata_value (@metadata_values)
525        {
526        push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
527        }
528    }
529    }
530    #print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
531
532    # Partition the values (if necessary)
533    my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
534    my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
535    if ($partition_type_within_level =~ /^per_letter$/i) {
536    # Generate one hlist for each letter
537    my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
538    my %metadata_value_to_OIDs_subhash = ();
539
540    my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
541    foreach my $metadata_value (@sortedmetadata_values) {
542        my $metadata_valuepartition = substr($metadata_value, 0, 1);
543
544        # Is this the start of a new partition?
545        if ($metadata_valuepartition ne $lastpartition) {
546        $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
547        %metadata_value_to_OIDs_subhash = ();
548        $lastpartition = $metadata_valuepartition;
549        }
550
551        $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};       
552    }
553
554    # Don't forget to add the last partition
555    $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
556
557    # The partitions are stored in an HList
558    $classifier_node->{'childtype'} = "HList";
559    }
560    elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
561    # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size
562    # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible
563    # If a letter has more items than specified, split into several hlists.
564    # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
565    my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};       
566    my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
567    my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};
568   
569    # Separate values by their first letter, each form a bucket, like the per_letter partition type
570    my $last_partition = substr($sortedmetadata_values[0], 0, 1);
571    my %partition_buckets = ();
572    my @metadata_values_in_bucket = ();
573    my $num_items_in_bucket = 0;
574    foreach my $metadata_value (@sortedmetadata_values) {
575        my $metadata_valuepartition = substr($metadata_value, 0, 1);
576        if ($metadata_valuepartition ne $last_partition) {
577        my @temp_array = @metadata_values_in_bucket;
578        # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
579        my %partition_info = ();       
580        $partition_info{'metadata_values'} = \@temp_array;
581        $partition_info{'size'} = $num_items_in_bucket;     
582        $partition_buckets{$last_partition} = \%partition_info;     
583       
584        @metadata_values_in_bucket = ($metadata_value);
585        $num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
586        $last_partition = $metadata_valuepartition;
587        } else {
588        $num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
589        push (@metadata_values_in_bucket, $metadata_value);
590        }
591    }
592    # Last one
593    my %partition_info = ();
594    $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
595    $partition_info{'size'} = $num_items_in_bucket;
596    $partition_buckets{$last_partition} = \%partition_info;
597       
598    my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));   
599    for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) {
600        my $partition = $partition_keys[$i];
601        my $items_in_partition = $partition_buckets{$partition}->{'size'};             
602        # Merge small buckets together, but keep the numeric bucket apart
603        if ($items_in_partition < $partition_size_within_level) {
604        my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'};
605        if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level
606            && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)
607             || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) {
608            foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) {
609            push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge);
610            }           
611            $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition;
612            delete $partition_buckets{$partition};                     
613        }
614        }
615    }
616    @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
617   
618    # Add partitions, and divide big bucket into several
619    my $last_partition_end = "";
620    my $partition_start = "";
621    foreach my $partition (@partition_keys) {
622        my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}});
623        my $items_in_partition = $partition_buckets{$partition}->{'size'};
624        $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"});
625       
626        if ($items_in_partition > $partition_size_within_level) {   
627        my $items_done = 0;
628        my %metadata_values_to_OIDs_subhashes = ();
629        for (my $i = 0; $i < scalar(@metadata_values); $i++) {
630            my $metadata_value = $metadata_values[$i];
631            # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
632            my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;
633
634            my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"});
635            my $partitionname = $partition_start;
636            if ($partitionend ne $partition_start) {
637            $partitionname = $partitionname . "-" . $partitionend;
638            }
639           
640            # Start a new partition
641            if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) {
642            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
643            $last_partition_end = $partitionend;           
644            $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
645            $items_done = 0;
646            %metadata_values_to_OIDs_subhashes = ();
647            }
648           
649            # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
650                    if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) {
651            my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
652            # Get the number of partitions needed for this value
653            my $num_splits = int($items_for_this_md_value / $partition_size_within_level);
654            $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits);
655
656            my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}};
657            for (my $i = 0; $i < $num_splits; $i++) {
658                my %OIDs_subhashes_for_this_value = ();
659                my @OIDs_for_this_partition = ();
660                for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) {
661                push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
662                }
663               
664                # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
665                if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) {
666                $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
667                $items_done += scalar(@OIDs_for_this_partition);
668                next;
669                }
670               
671                # Add an HList for this bucket
672                $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
673                $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
674                $last_partition_end = $partitionname_for_this_value;
675            }
676            next;
677                    }
678                       
679            $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
680            $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;         
681           
682            # The last partition
683            if($i == scalar(@metadata_values) - 1) {
684            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
685            }
686        }       
687        }
688        else {
689        # The easier case, just add a partition
690        my %metadata_values_to_OIDs_subhashes = ();
691        for (my $i = 0; $i < scalar(@metadata_values); $i++) {
692            my $metadata_value = $metadata_values[$i];
693            $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};           
694        }
695        my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
696        my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"});
697        my $partitionname = $partition_start;
698        if ($partitionend ne $partition_start) {
699            $partitionname = $partitionname . "-" . $partitionend;
700        }
701        $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);           
702        $last_partition_end = $partitionend;       
703        }
704    }   
705   
706    # The partitions are stored in an HList
707    $classifier_node->{'childtype'} = "HList";
708
709    } # end approximate_size
710    else {
711    # Generate hlists of a certain size
712    if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
713        my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
714        my $itemsdone = 0;
715        my %metadata_value_to_OIDs_subhash = ();
716        my $lastpartitionend = "";
717        my $partitionstart;
718        foreach my $metadata_value (@sortedmetadata_values) {
719        $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
720        $itemsdone++;
721        my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
722
723        # Is this the start of a new partition?
724        if ($itemsinpartition == 1) {
725            $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
726        }
727
728        # Is this the end of the partition?
729        if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) {
730            my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
731            my $partitionname = $partitionstart;
732            if ($partitionend ne $partitionstart) {
733            $partitionname = $partitionname . "-" . $partitionend;
734            }
735
736            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
737            %metadata_value_to_OIDs_subhash = ();
738            $lastpartitionend = $partitionend;
739        }
740        }
741
742        # The partitions are stored in an HList
743        $classifier_node->{'childtype'} = "HList";
744    }
745
746    # Otherwise just add all the values to a VList
747    else {
748        $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
749    }
750    }
751}
752
753
754sub generate_partition_start
755{
756    my $self = shift(@_);
757    my $metadata_value = shift(@_);
758    my $lastpartitionend = shift(@_);
759    my $partition_name_length = shift(@_);
760
761    if ($partition_name_length) {
762    return substr($metadata_value, 0, $partition_name_length);
763    }
764
765    my $partitionstart = substr($metadata_value, 0, 1);
766    if ($partitionstart le $lastpartitionend) {
767    $partitionstart = substr($metadata_value, 0, 2);
768    # Give up after three characters
769    if ($partitionstart le $lastpartitionend) {
770        $partitionstart = substr($metadata_value, 0, 3);
771    }
772    }
773
774    return $partitionstart;
775}
776
777
778sub generate_partition_end
779{
780    my $self = shift(@_);
781    my $metadata_value = shift(@_);
782    my $partitionstart = shift(@_);
783    my $partition_name_length = shift(@_);
784
785    if ($partition_name_length) {
786    return substr($metadata_value, 0, $partition_name_length);
787    }
788
789    my $partitionend = substr($metadata_value, 0, length($partitionstart));
790    if ($partitionend gt $partitionstart) {
791    $partitionend = substr($metadata_value, 0, 1);
792    if ($partitionend le $partitionstart) {
793        $partitionend = substr($metadata_value, 0, 2);
794        # Give up after three characters
795        if ($partitionend le $partitionstart) {
796        $partitionend = substr($metadata_value, 0, 3);
797        }
798    }
799    }
800
801    return $partitionend;
802}
803
804
805sub add_hlist_partition
806{
807    my $self = shift(@_);
808    my @metadata_groups = @{shift(@_)};
809    my $classifier_node = shift(@_);
810    my $partitionname = shift(@_);
811    my $metadata_value_to_OIDs_hash_ref = shift(@_);
812
813    # Create an hlist partition
814    # Note that we don't need to convert from unicode-aware strings
815    # to utf8 here, as that is handled elsewhere in the code
816    my %child_classifier_node = ( 'Title' => $partitionname, #'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
817                  'childtype' => "VList",
818                  'contains' => [] );
819
820    # Add the children to the hlist partition
821    $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
822    push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
823}
824
825
826sub add_vlist
827{
828    my $self = shift(@_);
829    my @metadata_groups = @{shift(@_)};
830    my $classifier_node = shift(@_);
831    my $metadata_value_to_OIDs_hash_ref = shift(@_);
832
833    my $metadata_group = shift(@metadata_groups);
834    $classifier_node->{'mdtype'} = $metadata_group;
835
836    # Create an entry in the vlist for each value
837    foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
838    {
839    my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
840    # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
841    if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
842        my $OID = $OIDs[0];
843        my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
844        push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
845    }
846    # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
847    elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
848        @OIDs = $self->sort_leaf_items(\@OIDs);
849        foreach my $OID (@OIDs) {
850        my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
851        push(@{$classifier_node->{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
852        }
853   
854    }
855    # Otherwise create a sublist (bookshelf) for the metadata value
856    else {
857        my $metadata_value_display = $self->get_metadata_value_display($metadata_group, $metadata_value);
858        # Note that we don't need to convert from unicode-aware strings
859        # to utf8 here, as that is handled elsewhere in the code
860        my %child_classifier_node = ( 'Title' => $metadata_value_display, # 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
861                      'childtype' => "VList",
862                      'mdtype' => $metadata_group,
863                      'contains' => [] );
864
865        #@OIDs = $self->sort_leaf_items(\@OIDs);
866        # If there are metadata elements remaining, recursively apply the process
867        if (@metadata_groups > 0) {
868        my $next_metadata_group = $metadata_groups[0];     
869        $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
870
871        # separate metadata into those that below in the next/sub-metadata_group
872        # and those that below at the current level's metadata_group
873
874        my $OID_to_metadata_values_hash_ref = $self->{$next_metadata_group . ".list"};
875        my @current_level_OIDs = ();
876        my @next_level_OIDs = ();
877        foreach my $OID (@OIDs)
878        {
879            if ($OID_to_metadata_values_hash_ref->{$OID}) {
880            push(@next_level_OIDs, $OID);
881            } else {
882            push(@current_level_OIDs, $OID);
883            }
884        }
885        # recursively process those docs belonging to the sub-metadata_group
886        $self->add_level(\@metadata_groups, \@next_level_OIDs, \%child_classifier_node);
887
888        # For those docs that don't belong in the sub/next_metadata_group, but which belong
889        # at this level, just add the documents as children of this list at the current level
890        @current_level_OIDs = $self->sort_leaf_items(\@current_level_OIDs);
891        foreach my $current_level_OID (@current_level_OIDs) {
892            my $offset = $self->metadata_offset($metadata_group, $current_level_OID, $metadata_value);
893            push(@{$child_classifier_node{'contains'}}, { 'OID' => $current_level_OID , 'offset' => $offset });
894        }       
895        }
896        # Otherwise just add the documents as children of this list
897        else {
898        @OIDs = $self->sort_leaf_items(\@OIDs);
899        foreach my $OID (@OIDs) {
900            my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
901            push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
902        }
903
904        }
905
906        # Add the sublist to the list
907        push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
908    }
909    }
910}
911
912sub metadata_offset
913{
914    my $self = shift(@_);
915    my $metadata_group = shift(@_);
916    my $OID = shift(@_);
917    my $metadata_value = shift(@_);
918   
919    my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
920    my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
921    for (my $i = 0; $i < scalar(@metadata_values); $i++) {
922    if ($metadata_value eq $metadata_values[$i]) {
923        return $i;
924    }
925    }
926   
927    return 0;
928}
929
930sub sort_leaf_items
931{
932    my $self = shift(@_);
933    my @OIDs = @{shift(@_)};
934#    my $classifier_node = shift(@_);
935   
936    # Sort leaf nodes and add to list
937    my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
938    foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) {
939    my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
940    # Force a stable sort (Perl 5.6's sort isn't stable)
941    # !! The [0] bits aren't ideal (multiple metadata values) !!
942    @OIDs = @OIDs[ sort {
943      if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]} && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]})))
944      {
945        $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
946      }
947      else
948      {
949        $a <=> $b;
950      }
951    } 0..$#OIDs ];
952    }
953    if ($self->{'reverse_sort_leaf_nodes'}) {
954    #print STDERR "reversing\n";
955    return reverse @OIDs;
956    }
957    return @OIDs;
958}
959
960
961
962sub sort_metadata_values_array
963{
964    my $self = shift(@_);
965    my @metadata_values = @_;
966
967    if ($self->{'unicode_collator'}) {
968    return $self->{'unicode_collator'}->sort(@metadata_values);
969    }
970    else {
971    return sort { $self->alpha_numeric_cmp($a,$b) }(@metadata_values);
972    }
973}
974
975# $a and $b args automatically passed in and shouldn't be declared
976sub alpha_numeric_cmp
977{
978    my $self = shift (@_);
979    my ($aStr, $bStr) = @_;
980    if ($aStr =~ m/^(\d+(\.\d+)?)/)
981    {
982        my $val_a = $1;
983        if ($bStr =~ m/^(\d+(\.\d+)?)/)
984        {
985            my $val_b = $1;
986            if ($val_a != $val_b)
987            {
988                return ($val_a <=> $val_b);
989            }
990        }
991    }
992   
993    return ($aStr cmp $bStr);
994}
995
996
997sub get_metadata_value_display {
998    my $self = shift(@_);
999    my ($metadata_group, $metadata_value) = @_;
1000    return $metadata_value if $self->{'standardize_capitalization'};
1001    my $actual_values_hash = $self->{$metadata_group . ".actualvalues"}->{$metadata_value};
1002    my $display_value ="";
1003    my $max_count=0;
1004    foreach my $v (keys %$actual_values_hash) {
1005    if ($actual_values_hash->{$v} > $max_count) {
1006        $display_value = $v;
1007        $max_count = $actual_values_hash->{$v};
1008    }
1009    }
1010    return $display_value;
1011}
10121;
Note: See TracBrowser for help on using the browser.