source: main/trunk/greenstone2/perllib/classify/List.pm@ 31567

Last change on this file since 31567 was 31567, checked in by kjdon, 7 years ago

added in alpha_numeric_sort - copied from AZlist I think, so if metadata values are numeric, then they will be sorted numerically, ie 10 comes after 9.

  • Property svn:keywords set to Author Date Id Revision
File size: 40.2 KB
RevLine 
[10398]1###########################################################################
2#
[18568]3# List.pm -- A general and flexible list classifier with most of
[10398]4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
[13741]29# TO DO: - Remove punctuation from metadata values before sorting.
30# - Add an AZCompactList-style hlist option?
[10398]31#
32###########################################################################
33
[18568]34package List;
[10398]35
36
[17209]37use BaseClassifier;
[10398]38
39use strict;
40
41
42sub BEGIN {
[18568]43 @List::ISA = ('BaseClassifier');
[10398]44}
45
[29094]46my $meta_select_type_list =
47 [
48 { 'name' => "firstvalue",
49 'desc' => "{List.metadata_selection.firstvalue}"},
50 { 'name' => "firstvalidmetadata",
51 'desc' => "{List.metadata_selection.firstvalidmetadata}"},
52 { 'name' => "allvalues",
53 'desc' => "{List.metadata_selection.allvalues}"} ];
54my $valid_meta_select_types = { 'firstvalue' => 1,
55 'firstvalidmetadata' => 1,
56 'allvalues' => 1 };
[18572]57my $partition_type_list =
58 [ { 'name' => "per_letter",
59 'desc' => "{List.level_partition.per_letter}" },
[20865]60 { 'name' => "approximate_size",
61 'desc' => "{List.level_partition.approximate_size}"},
[18572]62 { 'name' => "constant_size",
[18619]63 'desc' => "{List.level_partition.constant_size}" },
[18572]64 { 'name' => "none",
65 'desc' => "{List.level_partition.none}" } ];
[10398]66
[20825]67# following used to check types later on
68my $valid_partition_types = { 'per_letter' => 1,
69 'constant_size' => 1,
70 'per_letter_fixed_size' => 1,
[20865]71 'approximate_size' => 1,
[20825]72 'none' => 1};
73
[18619]74my $bookshelf_type_list =
75 [ { 'name' => "always",
76 'desc' => "{List.bookshelf_type.always}" },
77 { 'name' => "duplicate_only",
78 'desc' => "{List.bookshelf_type.duplicate_only}" },
79 { 'name' => "never",
[20008]80 'desc' => "{List.bookshelf_type.never}" } ];
[18619]81
[10398]82my $arguments =
83 [ { 'name' => "metadata",
[19234]84 'desc' => "{List.metadata}",
[10398]85 'type' => "metadata",
86 'reqd' => "yes" },
[29094]87 { 'name' => "metadata_selection_mode",
88 'desc' => "{List.metadata_selection_mode}",
89 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
90 'list' => $meta_select_type_list,
91 'deft' => "firstvalidmetadata" },
[10398]92 # The interesting options
[18619]93 { 'name' => "bookshelf_type",
[19234]94 'desc' => "{List.bookshelf_type}",
[18619]95 'type' => "enum",
96 'list' => $bookshelf_type_list,
[19645]97 'deft' => "never" },
[10498]98 { 'name' => "classify_sections",
[19234]99 'desc' => "{List.classify_sections}",
[10498]100 'type' => "flag" },
101 { 'name' => "partition_type_within_level",
[19234]102 'desc' => "{List.partition_type_within_level}",
[20679]103 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
[20008]104 'list' => $partition_type_list,
105 'deft' => "per_letter" },
[10498]106 { 'name' => "partition_size_within_level",
[19234]107 'desc' => "{List.partition_size_within_level}",
[20679]108 'type' => "string" }, # Must be string because multiple values can be specified (separated by '/')
[14084]109 { 'name' => "partition_name_length",
[19234]110 'desc' => "{List.partition_name_length}",
[14084]111 'type' => "string" },
[10498]112 { 'name' => "sort_leaf_nodes_using",
[19234]113 'desc' => "{List.sort_leaf_nodes_using}",
[10398]114 'type' => "metadata",
[10499]115 'deft' => "Title" },
[23302]116 { 'name' => "reverse_sort_leaf_nodes",
117 'desc' => "{List.reverse_sort_leaf_nodes}",
118 'type' => "flag"},
[13551]119 { 'name' => "sort_using_unicode_collation",
[19234]120 'desc' => "{List.sort_using_unicode_collation}",
[13551]121 'type' => "flag" },
[10499]122 { 'name' => "use_hlist_for",
[19234]123 'desc' => "{List.use_hlist_for}",
[18619]124 'type' => "string" },
[26267]125 {'name' => "filter_metadata",
126 'desc' => "{List.filter_metadata}",
127 'type' => "metadata"},
128 {'name' => "filter_regex",
129 'desc' => "{List.filter_regex}",
130 'type' => "regexp"},
[27098]131 { 'name' => "standardize_capitalization",
132 'desc' => "{List.standardize_capitalization}",
133 'type' => "flag"},
[18619]134 { 'name' => "removeprefix",
135 'desc' => "{BasClas.removeprefix}",
136 'type' => "regexp" },
137 { 'name' => "removesuffix",
138 'desc' => "{BasClas.removesuffix}",
139 'type' => "regexp" } ];
[10398]140
[18568]141my $options = { 'name' => "List",
[19234]142 'desc' => "{List.desc}",
[10502]143 'abstract' => "no",
[18572]144 'inherits' => "yes",
[10398]145 'args' => $arguments };
146
147
148sub new
149{
150 my ($class) = shift(@_);
151 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
152 push(@$classifierslist, $class);
153
[17209]154 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
155 push(@{$hashArgOptLists->{"OptList"}}, $options);
[10398]156
[17209]157 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
[10398]158
159 if ($self->{'info_only'}) {
160 # don't worry about any options etc
161 return bless $self, $class;
162 }
163
164 # The metadata elements to use (required)
[12894]165 if (!$self->{'metadata'}) {
[18568]166 die "Error: No metadata fields specified for List.\n";
[10398]167 }
[24012]168
169 my @metadata_groups = split(/[\/]/, $self->{'metadata'});
[12889]170 $self->{'metadata_groups'} = \@metadata_groups;
[10398]171
[12894]172 # The classifier button name (default: the first metadata element specified)
173 if (!$self->{'buttonname'}) {
174 my $first_metadata_group = $metadata_groups[0];
[24012]175 my $first_metadata_element = (split(/[\;|,\/]/, $first_metadata_group))[0];
[12894]176 $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
[10398]177 }
178
[29094]179 # meta selection mode for each level
180 if (!$self->{'metadata_selection_mode'}) {
181 foreach my $metadata_group (@metadata_groups) {
182 $self->{$metadata_group . ".metadata_selection_mode"} = "firstvalidmetadata";
183 }
184 } else {
185 my @metadata_selection_mode_list = split(/\//, $self->{'metadata_selection_mode'});
186 foreach my $metadata_group (@metadata_groups) {
187 my $meta_select_mode = shift(@metadata_selection_mode_list);
188 if (defined($meta_select_mode) && defined $valid_meta_select_types->{$meta_select_mode}) {
189 $self->{$metadata_group . ".metadata_selection_mode"} = $meta_select_mode;
190 } else {
191 $self->{$metadata_group . ".metadata_selection_mode"} = "firstvalidmetadata";
192 }
193 }
194 }
[18619]195 # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
[12889]196 foreach my $metadata_group (@metadata_groups) {
[18619]197 $self->{$metadata_group . ".bookshelf_type"} = "always";
198 }
199 my $last_metadata_group = $metadata_groups[$#metadata_groups];
200 # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
201 $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
202
[12892]203 # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
[12889]204 foreach my $metadata_group (@metadata_groups) {
[12892]205 $self->{$metadata_group . ".list_type"} = "VList";
[10499]206 }
[12892]207 foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
208 $self->{$metadata_group . ".list_type"} = "HList";
[10499]209 }
210
[18619]211 # How the items are grouped into partitions (default: no partition)
212 # for each level (metadata group), separated by '/'
[12894]213 if (!$self->{"partition_type_within_level"}) {
[18619]214 foreach my $metadata_group (@metadata_groups) {
215 $self->{$metadata_group . ".partition_type_within_level"} = "none";
216 }
217 } else {
218 my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'});
[20825]219
220 my $first = 1;
[18619]221 foreach my $metadata_group (@metadata_groups) {
222 my $partition_type_within_levelelem = shift(@partition_type_within_levellist);
[20865]223 if (defined($partition_type_within_levelelem) && $partition_type_within_levelelem eq "per_letter_fixed_size") {
224 print STDERR "per letter fixed size, changing to approximate size\n";
225 $partition_type_within_levelelem = "approximate_size";
226 }
[20825]227 if (defined($partition_type_within_levelelem) && defined $valid_partition_types->{$partition_type_within_levelelem}) {
[18619]228 $self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem;
229 }
230 else {
[20825]231 if ($first) {
232 $self->{$metadata_group . ".partition_type_within_level"} = "none";
233 $first = 0;
234 } else {
235 $self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"};
236 }
237 if (defined($partition_type_within_levelelem)) {
238 # ie invalid entry
239 print STDERR "invalid partition type for level $metadata_group: $partition_type_within_levelelem, defaulting to ". $self->{$metadata_group . ".partition_type_within_level"} ."\n";
240 }
[18619]241 }
242 }
[10398]243 }
[20825]244
[10499]245 # The number of items in each partition
[12894]246 if (!$self->{'partition_size_within_level'}) {
[10398]247 # Default: 20
[12889]248 foreach my $metadata_group (@metadata_groups) {
249 $self->{$metadata_group . ".partition_size_within_level"} = 20;
[10398]250 }
251 }
252 else {
[12894]253 my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
[10398]254
[10498]255 # Assign values based on the partition_size_within_level parameter
[12889]256 foreach my $metadata_group (@metadata_groups) {
[10498]257 my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
258 if (defined($partition_size_within_levelelem)) {
[12889]259 $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
[10398]260 }
261 else {
[12889]262 $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
[10398]263 }
264 }
265 }
266
[18619]267 # The removeprefix and removesuffix expressions
268 if ($self->{'removeprefix'}) {
269 # If there are more than one expressions, use '' to quote each experession and '/' to separate
270 my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
271
272 foreach my $metadata_group (@metadata_groups) {
273 my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
274 if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
275 # Remove the other ' at the beginning and the end if there is any
276 $removeprefix_expr_within_levelelem =~ s/^'//;
277 $removeprefix_expr_within_levelelem =~ s/'$//;
278 # Remove the extra ^ at the beginning
279 $removeprefix_expr_within_levelelem =~ s/^\^//;
280 $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
281 } else {
282 $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
283 }
284 }
285 }
286 if ($self->{'removesuffix'}) {
287 my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
288
289 foreach my $metadata_group (@metadata_groups) {
290 my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
291 if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
292 $removesuffix_expr_within_levelelem =~ s/^'//;
293 $removesuffix_expr_within_levelelem =~ s/'$//;
294 # Remove the extra $ at the end
295 $removesuffix_expr_within_levelelem =~ s/\$$//;
296 $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
297 } else {
298 $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
299 }
300 }
301 }
302
[12894]303 # The metadata elements to use to sort the leaf nodes (default: Title)
304 my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
305 if ($self->{'sort_leaf_nodes_using'}) {
306 @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
[10398]307 }
[12894]308 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
[29094]309 foreach my $sort_group (@sort_leaf_nodes_using_metadata_groups) {
310 # set metadata_select_type, if not already set - might be already set if the same group was used in -metadata
311 if (!defined $self->{$sort_group . ".metadata_selection_mode"}) {
312 $self->{$sort_group . ".metadata_selection_mode"} = "firstvalue";
313 }
314 }
[13551]315 # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
316 if ($self->{'sort_using_unicode_collation'}) {
[13791]317 # To use this you first need to download the allkeys.txt file from
318 # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
319 # Unicode/Collate directory.
[13551]320 require Unicode::Collate;
321 $self->{'unicode_collator'} = Unicode::Collate->new();
322 }
323
[23154]324 # An empty array for the document/section OIDs that we are classifying
[12894]325 $self->{'OIDs'} = [];
[23154]326 # A hash for all the doc ids that we have seen, so we don't classify something twice
327 $self->{'all_doc_OIDs'} = {};
[10398]328 return bless $self, $class;
329}
330
331
332sub init
333{
334 # Nothing to do...
335}
336
337
[12896]338# Called for each document in the collection
[10398]339sub classify
340{
341 my $self = shift(@_);
[23116]342 my ($doc_obj) = @_;
[10398]343
[23154]344 if (defined $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()}) {
345 print STDERR "Warning, List classifier has already seen document ".$doc_obj->get_OID().", not classifying again\n";
346 return;
347 }
348 $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()} = 1;
[26267]349 # check against filter here
350 if ($self->{'filter_metadata'}) {
351 #print STDERR "filtering documents on $self->{'filter_metadata'}\n";
352 my $meta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'filter_metadata'});
353 return unless defined $meta;
354 if ($self->{'filter_regex'} ne "" && $meta !~ /$self->{'filter_regex'}/) {
355 #print STDERR "doesn't match regex\n";
356 return;
357
358 }
359 }
360 # if we get here, we have passed the test for filtering
[12896]361 # If "-classify_sections" is set, classify every section of the document
[10398]362 if ($self->{'classify_sections'}) {
363 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
364 while (defined $section) {
[23116]365 $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section);
[10398]366 $section = $doc_obj->get_next_section($section);
367 }
368 }
[12896]369 # Otherwise just classify the top document section
[10398]370 else {
[23116]371 $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section());
[10398]372 }
[23154]373
[10398]374}
375
376sub classify_section
377{
378 my $self = shift(@_);
[23116]379 my ($doc_obj,$section_OID,$section) = @_;
[10398]380
[12889]381 my @metadata_groups = @{$self->{'metadata_groups'}};
[10398]382
[26267]383
[12896]384 # Only classify the section if it has a value for one of the metadata elements in the first group
385 my $classify_section = 0;
386 my $first_metadata_group = $metadata_groups[0];
[22175]387 my $remove_prefix_expr = $self->{$first_metadata_group . ".remove_prefix_expr"};
388 my $remove_suffix_expr = $self->{$first_metadata_group . ".remove_suffix_expr"};
[20008]389 foreach my $first_metadata_group_element (split(/\;|,/, $first_metadata_group)) {
[20424]390 my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
[20421]391 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element);
[18619]392
393 # Remove prefix/suffix if requested
[22175]394 if (defined ($first_metadata_group_element_value)) {
395 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
396 $first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
397 }
398
399 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
400 $first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
401 }
[18619]402 }
[12896]403 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
404 # This section must be included in the classifier
405 $classify_section = 1;
406 last;
[18619]407 }
[12896]408 }
[10398]409
[12896]410 # We're not classifying this section because it doesn't have the required metadata
411 return if (!$classify_section);
[18455]412
[12896]413 # Otherwise, include this section in the classifier
[23154]414
[12896]415 push(@{$self->{'OIDs'}}, $section_OID);
416
417 # Create a hash for the metadata values of each metadata element we're interested in
418 my %metadata_groups_done = ();
419 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
420 # Take care not to do a metadata group more than once
421 unless ($metadata_groups_done{$metadata_group}) {
[22175]422 my $remove_prefix_expr = $self->{$metadata_group . ".remove_prefix_expr"};
423 my $remove_suffix_expr = $self->{$metadata_group . ".remove_suffix_expr"};
[20008]424 foreach my $metadata_element (split(/\;|,/, $metadata_group)) {
[20424]425 my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);
426
[20421]427 my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
[12896]428 foreach my $metadata_value (@metadata_values) {
429 # Strip leading and trailing whitespace
430 $metadata_value =~ s/^\s*//;
431 $metadata_value =~ s/\s*$//;
[13550]432
[18619]433 # Remove prefix/suffix if requested
434 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
435 $metadata_value =~ s/^$remove_prefix_expr//;
436 }
437 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
438 $metadata_value =~ s/$remove_suffix_expr$//;
439 }
440
[26545]441 # lowercase metadata both for sorting meta (d/D under D), and to allow CSS to be able to
442 # text-transform the stored lowercase values as capitalize or uppercase (can't CSS
443 # text-transform if stored uppercase). 2 CSS text-transforms have been added to core.css
[27098]444 my $lc_metadata_value = lc($metadata_value);
[24016]445
446 # We are already working with unicode aware strings at this
447 # stage, so we no longer need to convert from utf8 to unicode
[27098]448 #my $metadata_value_unicode_string = $metadata_value; # $self->convert_utf8_string_to_unicode_string($metadata_value);
[13550]449
450 # Add the metadata value into the list for this combination of metadata group and section
[29094]451 # text that we have some non-whitespace chars
452 if ($lc_metadata_value =~ /\S/) {
453
454 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $lc_metadata_value);
455
456
[27098]457 # add the actual value into the stored values so we can remember the case
458 if (!$self->{'standardize_capitalization'}) {
459 if (defined $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}) {
460 $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}++;
461 } else {
462 $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value} = 1;
463 }
[29094]464 }
465 last if ($self->{$metadata_group . ".metadata_selection_mode"} eq "firstvalue");
466 }
467 } # foreach metadatavalue
468 last if ((@metadata_values > 0) && $self->{$metadata_group . ".metadata_selection_mode"} =~ /^(firstvalue|firstvalidmetadata)$/ );
469 } # foreach metadata element
[10398]470
[12896]471 $metadata_groups_done{$metadata_group} = 1;
[10398]472 }
473 }
474}
475
476
477sub get_classify_info
478{
479 my $self = shift(@_);
480
[12896]481 # The metadata groups to classify by
[12889]482 my @metadata_groups = @{$self->{'metadata_groups'}};
483 my $first_metadata_group = $metadata_groups[0];
[10398]484
[12896]485 # The OID values of the documents to include in the classifier
[12889]486 my @OIDs = @{$self->{'OIDs'}};
[10398]487
[12896]488 # Create the root node of the classification hierarchy
[12893]489 my %classifier_node = ( 'thistype' => "Invisible",
[12895]490 'childtype' => $self->{$first_metadata_group . ".list_type"},
[12894]491 'Title' => $self->{'buttonname'},
[13271]492 'contains' => [],
493 'mdtype' => $first_metadata_group );
[10398]494
[12895]495 # Recursively create the classification hierarchy, one level for each metadata group
[14173]496 $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
[12893]497 return \%classifier_node;
[10398]498}
499
500
[12895]501sub add_level
[10398]502{
503 my $self = shift(@_);
[12889]504 my @metadata_groups = @{shift(@_)};
505 my @OIDs = @{shift(@_)};
[12893]506 my $classifier_node = shift(@_);
[23154]507
[12889]508 my $metadata_group = $metadata_groups[0];
[23154]509
[13340]510 if (!defined($self->{$metadata_group . ".list"})) {
511 print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
512 return;
513 }
[10398]514
515 # Create a mapping from metadata value to OID
[14845]516 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
[18619]517 my %metadata_value_to_OIDs_hash = ();
[14845]518 foreach my $OID (@OIDs)
519 {
520 if ($OID_to_metadata_values_hash_ref->{$OID})
521 {
522 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
523 foreach my $metadata_value (@metadata_values)
524 {
525 push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
[10398]526 }
527 }
528 }
[24012]529 #print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
[10398]530
531 # Partition the values (if necessary)
[18619]532 my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
[20904]533 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
[10498]534 if ($partition_type_within_level =~ /^per_letter$/i) {
[10398]535 # Generate one hlist for each letter
[14845]536 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
537 my %metadata_value_to_OIDs_subhash = ();
[10398]538
[14845]539 my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
540 foreach my $metadata_value (@sortedmetadata_values) {
541 my $metadata_valuepartition = substr($metadata_value, 0, 1);
[10398]542
543 # Is this the start of a new partition?
[14845]544 if ($metadata_valuepartition ne $lastpartition) {
545 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
546 %metadata_value_to_OIDs_subhash = ();
547 $lastpartition = $metadata_valuepartition;
[10398]548 }
549
[14845]550 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
[10398]551 }
552
553 # Don't forget to add the last partition
[14845]554 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
[10398]555
556 # The partitions are stored in an HList
[12893]557 $classifier_node->{'childtype'} = "HList";
[10398]558 }
[20904]559 elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
[18619]560 # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size
561 # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible
562 # If a letter has more items than specified, split into several hlists.
563 # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
564 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
565 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
566 my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};
567
568 # Separate values by their first letter, each form a bucket, like the per_letter partition type
569 my $last_partition = substr($sortedmetadata_values[0], 0, 1);
570 my %partition_buckets = ();
571 my @metadata_values_in_bucket = ();
572 my $num_items_in_bucket = 0;
[24012]573 foreach my $metadata_value (@sortedmetadata_values) {
[18619]574 my $metadata_valuepartition = substr($metadata_value, 0, 1);
575 if ($metadata_valuepartition ne $last_partition) {
576 my @temp_array = @metadata_values_in_bucket;
577 # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
578 my %partition_info = ();
579 $partition_info{'metadata_values'} = \@temp_array;
580 $partition_info{'size'} = $num_items_in_bucket;
581 $partition_buckets{$last_partition} = \%partition_info;
582
583 @metadata_values_in_bucket = ($metadata_value);
584 $num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
585 $last_partition = $metadata_valuepartition;
586 } else {
587 $num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
588 push (@metadata_values_in_bucket, $metadata_value);
589 }
590 }
591 # Last one
592 my %partition_info = ();
593 $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
594 $partition_info{'size'} = $num_items_in_bucket;
595 $partition_buckets{$last_partition} = \%partition_info;
596
597 my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
598 for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) {
599 my $partition = $partition_keys[$i];
600 my $items_in_partition = $partition_buckets{$partition}->{'size'};
601 # Merge small buckets together, but keep the numeric bucket apart
602 if ($items_in_partition < $partition_size_within_level) {
603 my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'};
604 if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level
605 && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)
606 || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) {
607 foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) {
608 push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge);
609 }
610 $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition;
611 delete $partition_buckets{$partition};
612 }
613 }
614 }
615 @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
616
617 # Add partitions, and divide big bucket into several
618 my $last_partition_end = "";
619 my $partition_start = "";
620 foreach my $partition (@partition_keys) {
621 my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}});
622 my $items_in_partition = $partition_buckets{$partition}->{'size'};
623 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"});
624
625 if ($items_in_partition > $partition_size_within_level) {
626 my $items_done = 0;
627 my %metadata_values_to_OIDs_subhashes = ();
628 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
629 my $metadata_value = $metadata_values[$i];
630 # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
631 my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;
[10398]632
[18619]633 my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"});
634 my $partitionname = $partition_start;
635 if ($partitionend ne $partition_start) {
636 $partitionname = $partitionname . "-" . $partitionend;
637 }
638
639 # Start a new partition
640 if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) {
641 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
642 $last_partition_end = $partitionend;
643 $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
644 $items_done = 0;
645 %metadata_values_to_OIDs_subhashes = ();
646 }
647
648 # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
649 if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) {
650 my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
651 # Get the number of partitions needed for this value
652 my $num_splits = int($items_for_this_md_value / $partition_size_within_level);
653 $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits);
654
655 my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}};
656 for (my $i = 0; $i < $num_splits; $i++) {
657 my %OIDs_subhashes_for_this_value = ();
658 my @OIDs_for_this_partition = ();
659 for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) {
660 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
661 }
662
663 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
664 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) {
665 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
666 $items_done += scalar(@OIDs_for_this_partition);
667 next;
668 }
669
670 # Add an HList for this bucket
671 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
672 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
673 $last_partition_end = $partitionname_for_this_value;
674 }
675 next;
676 }
677
678 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
679 $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;
680
681 # The last partition
682 if($i == scalar(@metadata_values) - 1) {
683 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
684 }
685 }
[31567]686 }
[18619]687 else {
688 # The easier case, just add a partition
689 my %metadata_values_to_OIDs_subhashes = ();
690 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
691 my $metadata_value = $metadata_values[$i];
692 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
693 }
694 my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
695 my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"});
696 my $partitionname = $partition_start;
697 if ($partitionend ne $partition_start) {
698 $partitionname = $partitionname . "-" . $partitionend;
699 }
700 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
701 $last_partition_end = $partitionend;
702 }
703 }
[20865]704
705 # The partitions are stored in an HList
706 $classifier_node->{'childtype'} = "HList";
707
708 } # end approximate_size
[10398]709 else {
710 # Generate hlists of a certain size
[14845]711 if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
712 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
[10398]713 my $itemsdone = 0;
[14845]714 my %metadata_value_to_OIDs_subhash = ();
[10398]715 my $lastpartitionend = "";
716 my $partitionstart;
[14845]717 foreach my $metadata_value (@sortedmetadata_values) {
718 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
[10398]719 $itemsdone++;
[14845]720 my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
[10398]721
722 # Is this the start of a new partition?
723 if ($itemsinpartition == 1) {
[14845]724 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
[10398]725 }
726
727 # Is this the end of the partition?
[14845]728 if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) {
729 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
[10398]730 my $partitionname = $partitionstart;
731 if ($partitionend ne $partitionstart) {
732 $partitionname = $partitionname . "-" . $partitionend;
733 }
734
[14845]735 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
736 %metadata_value_to_OIDs_subhash = ();
[10398]737 $lastpartitionend = $partitionend;
738 }
739 }
740
741 # The partitions are stored in an HList
[12893]742 $classifier_node->{'childtype'} = "HList";
[10398]743 }
744
745 # Otherwise just add all the values to a VList
746 else {
[14845]747 $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
[10398]748 }
749 }
750}
751
752
753sub generate_partition_start
754{
[14173]755 my $self = shift(@_);
[14845]756 my $metadata_value = shift(@_);
[10398]757 my $lastpartitionend = shift(@_);
[14084]758 my $partition_name_length = shift(@_);
[10398]759
[14084]760 if ($partition_name_length) {
[14845]761 return substr($metadata_value, 0, $partition_name_length);
[14084]762 }
763
[14845]764 my $partitionstart = substr($metadata_value, 0, 1);
[10398]765 if ($partitionstart le $lastpartitionend) {
[14845]766 $partitionstart = substr($metadata_value, 0, 2);
[10398]767 # Give up after three characters
768 if ($partitionstart le $lastpartitionend) {
[14845]769 $partitionstart = substr($metadata_value, 0, 3);
[10398]770 }
771 }
772
773 return $partitionstart;
774}
775
776
777sub generate_partition_end
778{
[14173]779 my $self = shift(@_);
[14845]780 my $metadata_value = shift(@_);
[10398]781 my $partitionstart = shift(@_);
[14084]782 my $partition_name_length = shift(@_);
[10398]783
[14084]784 if ($partition_name_length) {
[14845]785 return substr($metadata_value, 0, $partition_name_length);
[14084]786 }
787
[14845]788 my $partitionend = substr($metadata_value, 0, length($partitionstart));
[10398]789 if ($partitionend gt $partitionstart) {
[14845]790 $partitionend = substr($metadata_value, 0, 1);
[10398]791 if ($partitionend le $partitionstart) {
[14845]792 $partitionend = substr($metadata_value, 0, 2);
[10398]793 # Give up after three characters
794 if ($partitionend le $partitionstart) {
[14845]795 $partitionend = substr($metadata_value, 0, 3);
[10398]796 }
797 }
798 }
799
800 return $partitionend;
801}
802
803
804sub add_hlist_partition
805{
806 my $self = shift(@_);
[12889]807 my @metadata_groups = @{shift(@_)};
[12893]808 my $classifier_node = shift(@_);
[10398]809 my $partitionname = shift(@_);
[14845]810 my $metadata_value_to_OIDs_hash_ref = shift(@_);
[10398]811
812 # Create an hlist partition
[24016]813 # Note that we don't need to convert from unicode-aware strings
814 # to utf8 here, as that is handled elsewhere in the code
815 my %child_classifier_node = ( 'Title' => $partitionname, #'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
[12893]816 'childtype' => "VList",
817 'contains' => [] );
[10398]818
819 # Add the children to the hlist partition
[14845]820 $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
[12893]821 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
[10398]822}
823
824
825sub add_vlist
826{
827 my $self = shift(@_);
[12889]828 my @metadata_groups = @{shift(@_)};
[12893]829 my $classifier_node = shift(@_);
[14845]830 my $metadata_value_to_OIDs_hash_ref = shift(@_);
[10398]831
[12889]832 my $metadata_group = shift(@metadata_groups);
[13287]833 $classifier_node->{'mdtype'} = $metadata_group;
[10398]834
835 # Create an entry in the vlist for each value
[14845]836 foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
837 {
838 my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
[18619]839 # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
840 if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
[13271]841 my $OID = $OIDs[0];
[21969]842 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
[13271]843 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
[21969]844 }
[18619]845 # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
846 elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
[21969]847 @OIDs = $self->sort_leaf_items(\@OIDs);
848 foreach my $OID (@OIDs) {
849 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
850 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
851 }
852
[10398]853 }
854 # Otherwise create a sublist (bookshelf) for the metadata value
[24012]855 else {
[27098]856 my $metadata_value_display = $self->get_metadata_value_display($metadata_group, $metadata_value);
[24016]857 # Note that we don't need to convert from unicode-aware strings
858 # to utf8 here, as that is handled elsewhere in the code
[27098]859 my %child_classifier_node = ( 'Title' => $metadata_value_display, # 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
[12893]860 'childtype' => "VList",
[21969]861 'mdtype' => $metadata_group,
[12893]862 'contains' => [] );
[10398]863
[24012]864 #@OIDs = $self->sort_leaf_items(\@OIDs);
[10398]865 # If there are metadata elements remaining, recursively apply the process
[12889]866 if (@metadata_groups > 0) {
[24012]867 my $next_metadata_group = $metadata_groups[0];
[12895]868 $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
[24012]869
870 # separate metadata into those that below in the next/sub-metadata_group
871 # and those that below at the current level's metadata_group
872
873 my $OID_to_metadata_values_hash_ref = $self->{$next_metadata_group . ".list"};
874 my @current_level_OIDs = ();
875 my @next_level_OIDs = ();
876 foreach my $OID (@OIDs)
877 {
878 if ($OID_to_metadata_values_hash_ref->{$OID}) {
879 push(@next_level_OIDs, $OID);
880 } else {
881 push(@current_level_OIDs, $OID);
882 }
883 }
884 # recursively process those docs belonging to the sub-metadata_group
885 $self->add_level(\@metadata_groups, \@next_level_OIDs, \%child_classifier_node);
886
887 # For those docs that don't belong in the sub/next_metadata_group, but which belong
888 # at this level, just add the documents as children of this list at the current level
889 @current_level_OIDs = $self->sort_leaf_items(\@current_level_OIDs);
890 foreach my $current_level_OID (@current_level_OIDs) {
891 my $offset = $self->metadata_offset($metadata_group, $current_level_OID, $metadata_value);
892 push(@{$child_classifier_node{'contains'}}, { 'OID' => $current_level_OID , 'offset' => $offset });
893 }
[10398]894 }
895 # Otherwise just add the documents as children of this list
896 else {
[21969]897 @OIDs = $self->sort_leaf_items(\@OIDs);
898 foreach my $OID (@OIDs) {
899 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
900 push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
901 }
902
[10398]903 }
904
905 # Add the sublist to the list
[12893]906 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
[10398]907 }
908 }
909}
910
[21969]911sub metadata_offset
[18619]912{
913 my $self = shift(@_);
[21969]914 my $metadata_group = shift(@_);
915 my $OID = shift(@_);
916 my $metadata_value = shift(@_);
917
918 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
919 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
920 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
921 if ($metadata_value eq $metadata_values[$i]) {
922 return $i;
923 }
924 }
925
926 return 0;
927}
928
929sub sort_leaf_items
930{
931 my $self = shift(@_);
[18619]932 my @OIDs = @{shift(@_)};
[21969]933# my $classifier_node = shift(@_);
[18619]934
935 # Sort leaf nodes and add to list
[20825]936 my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
937 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) {
[18619]938 my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
939 # Force a stable sort (Perl 5.6's sort isn't stable)
940 # !! The [0] bits aren't ideal (multiple metadata values) !!
[22667]941 @OIDs = @OIDs[ sort {
942 if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]} && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]})))
943 {
944 $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
945 }
946 else
947 {
948 $a <=> $b;
949 }
950 } 0..$#OIDs ];
[18619]951 }
[23302]952 if ($self->{'reverse_sort_leaf_nodes'}) {
[24081]953 #print STDERR "reversing\n";
[23302]954 return reverse @OIDs;
955 }
[21969]956 return @OIDs;
[18619]957}
958
959
[21969]960
[13551]961sub sort_metadata_values_array
962{
963 my $self = shift(@_);
964 my @metadata_values = @_;
965
966 if ($self->{'unicode_collator'}) {
967 return $self->{'unicode_collator'}->sort(@metadata_values);
968 }
969 else {
[31567]970 return sort { $self->alpha_numeric_cmp($a,$b) }(@metadata_values);
[13551]971 }
972}
973
[31567]974
975sub alpha_numeric_cmp
976{
977 my ($self,$a,$b) = @_;
978
979 if ($a =~ m/^(\d+(\.\d+)?)/)
980 {
981 my $val_a = $1;
982 if ($b =~ m/^(\d+(\.\d+)?)/)
983 {
984 my $val_b = $1;
985 if ($val_a != $val_b)
986 {
987 return ($val_a <=> $val_b);
988 }
989 }
990 }
991
992 return ($a cmp $b);
993}
994
995
[27098]996sub get_metadata_value_display {
997 my $self = shift(@_);
998 my ($metadata_group, $metadata_value) = @_;
999 return $metadata_value if $self->{'standardize_capitalization'};
1000 my $actual_values_hash = $self->{$metadata_group . ".actualvalues"}->{$metadata_value};
1001 my $display_value ="";
1002 my $max_count=0;
1003 foreach my $v (keys %$actual_values_hash) {
1004 if ($actual_values_hash->{$v} > $max_count) {
1005 $display_value = $v;
1006 $max_count = $actual_values_hash->{$v};
1007 }
1008 }
1009 return $display_value;
1010}
[10398]10111;
Note: See TracBrowser for help on using the repository browser.