source: main/trunk/greenstone2/perllib/classify/List.pm@ 31577

Last change on this file since 31577 was 31577, checked in by kjdon, 7 years ago

changed my mind. now declaring aStr and bStr variables, cos what if the a and b coming in are not the same as global a and b?? maybe there is room for things to go wrong? so lets explicitly declare the variables

  • Property svn:keywords set to Author Date Id Revision
File size: 40.3 KB
RevLine 
[10398]1###########################################################################
2#
[18568]3# List.pm -- A general and flexible list classifier with most of
[10398]4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
[13741]29# TO DO: - Remove punctuation from metadata values before sorting.
30# - Add an AZCompactList-style hlist option?
[10398]31#
32###########################################################################
33
[31576]34
[18568]35package List;
[10398]36
37
[17209]38use BaseClassifier;
[10398]39
40use strict;
41
42
43sub BEGIN {
[18568]44 @List::ISA = ('BaseClassifier');
[10398]45}
46
[29094]47my $meta_select_type_list =
48 [
49 { 'name' => "firstvalue",
50 'desc' => "{List.metadata_selection.firstvalue}"},
51 { 'name' => "firstvalidmetadata",
52 'desc' => "{List.metadata_selection.firstvalidmetadata}"},
53 { 'name' => "allvalues",
54 'desc' => "{List.metadata_selection.allvalues}"} ];
55my $valid_meta_select_types = { 'firstvalue' => 1,
56 'firstvalidmetadata' => 1,
57 'allvalues' => 1 };
[18572]58my $partition_type_list =
59 [ { 'name' => "per_letter",
60 'desc' => "{List.level_partition.per_letter}" },
[20865]61 { 'name' => "approximate_size",
62 'desc' => "{List.level_partition.approximate_size}"},
[18572]63 { 'name' => "constant_size",
[18619]64 'desc' => "{List.level_partition.constant_size}" },
[18572]65 { 'name' => "none",
66 'desc' => "{List.level_partition.none}" } ];
[10398]67
[20825]68# following used to check types later on
69my $valid_partition_types = { 'per_letter' => 1,
70 'constant_size' => 1,
71 'per_letter_fixed_size' => 1,
[20865]72 'approximate_size' => 1,
[20825]73 'none' => 1};
74
[18619]75my $bookshelf_type_list =
76 [ { 'name' => "always",
77 'desc' => "{List.bookshelf_type.always}" },
78 { 'name' => "duplicate_only",
79 'desc' => "{List.bookshelf_type.duplicate_only}" },
80 { 'name' => "never",
[20008]81 'desc' => "{List.bookshelf_type.never}" } ];
[18619]82
[10398]83my $arguments =
84 [ { 'name' => "metadata",
[19234]85 'desc' => "{List.metadata}",
[10398]86 'type' => "metadata",
87 'reqd' => "yes" },
[29094]88 { 'name' => "metadata_selection_mode",
89 'desc' => "{List.metadata_selection_mode}",
90 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
91 'list' => $meta_select_type_list,
92 'deft' => "firstvalidmetadata" },
[10398]93 # The interesting options
[18619]94 { 'name' => "bookshelf_type",
[19234]95 'desc' => "{List.bookshelf_type}",
[18619]96 'type' => "enum",
97 'list' => $bookshelf_type_list,
[19645]98 'deft' => "never" },
[10498]99 { 'name' => "classify_sections",
[19234]100 'desc' => "{List.classify_sections}",
[10498]101 'type' => "flag" },
102 { 'name' => "partition_type_within_level",
[19234]103 'desc' => "{List.partition_type_within_level}",
[20679]104 'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
[20008]105 'list' => $partition_type_list,
106 'deft' => "per_letter" },
[10498]107 { 'name' => "partition_size_within_level",
[19234]108 'desc' => "{List.partition_size_within_level}",
[20679]109 'type' => "string" }, # Must be string because multiple values can be specified (separated by '/')
[14084]110 { 'name' => "partition_name_length",
[19234]111 'desc' => "{List.partition_name_length}",
[14084]112 'type' => "string" },
[10498]113 { 'name' => "sort_leaf_nodes_using",
[19234]114 'desc' => "{List.sort_leaf_nodes_using}",
[10398]115 'type' => "metadata",
[10499]116 'deft' => "Title" },
[23302]117 { 'name' => "reverse_sort_leaf_nodes",
118 'desc' => "{List.reverse_sort_leaf_nodes}",
119 'type' => "flag"},
[13551]120 { 'name' => "sort_using_unicode_collation",
[19234]121 'desc' => "{List.sort_using_unicode_collation}",
[13551]122 'type' => "flag" },
[10499]123 { 'name' => "use_hlist_for",
[19234]124 'desc' => "{List.use_hlist_for}",
[18619]125 'type' => "string" },
[26267]126 {'name' => "filter_metadata",
127 'desc' => "{List.filter_metadata}",
128 'type' => "metadata"},
129 {'name' => "filter_regex",
130 'desc' => "{List.filter_regex}",
131 'type' => "regexp"},
[27098]132 { 'name' => "standardize_capitalization",
133 'desc' => "{List.standardize_capitalization}",
134 'type' => "flag"},
[18619]135 { 'name' => "removeprefix",
136 'desc' => "{BasClas.removeprefix}",
137 'type' => "regexp" },
138 { 'name' => "removesuffix",
139 'desc' => "{BasClas.removesuffix}",
140 'type' => "regexp" } ];
[10398]141
[18568]142my $options = { 'name' => "List",
[19234]143 'desc' => "{List.desc}",
[10502]144 'abstract' => "no",
[18572]145 'inherits' => "yes",
[10398]146 'args' => $arguments };
147
148
149sub new
150{
151 my ($class) = shift(@_);
152 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
153 push(@$classifierslist, $class);
154
[17209]155 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
156 push(@{$hashArgOptLists->{"OptList"}}, $options);
[10398]157
[17209]158 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
[10398]159
160 if ($self->{'info_only'}) {
161 # don't worry about any options etc
162 return bless $self, $class;
163 }
164
165 # The metadata elements to use (required)
[12894]166 if (!$self->{'metadata'}) {
[18568]167 die "Error: No metadata fields specified for List.\n";
[10398]168 }
[24012]169
170 my @metadata_groups = split(/[\/]/, $self->{'metadata'});
[12889]171 $self->{'metadata_groups'} = \@metadata_groups;
[10398]172
[12894]173 # The classifier button name (default: the first metadata element specified)
174 if (!$self->{'buttonname'}) {
175 my $first_metadata_group = $metadata_groups[0];
[24012]176 my $first_metadata_element = (split(/[\;|,\/]/, $first_metadata_group))[0];
[12894]177 $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
[10398]178 }
179
[29094]180 # meta selection mode for each level
181 if (!$self->{'metadata_selection_mode'}) {
182 foreach my $metadata_group (@metadata_groups) {
183 $self->{$metadata_group . ".metadata_selection_mode"} = "firstvalidmetadata";
184 }
185 } else {
186 my @metadata_selection_mode_list = split(/\//, $self->{'metadata_selection_mode'});
187 foreach my $metadata_group (@metadata_groups) {
188 my $meta_select_mode = shift(@metadata_selection_mode_list);
189 if (defined($meta_select_mode) && defined $valid_meta_select_types->{$meta_select_mode}) {
190 $self->{$metadata_group . ".metadata_selection_mode"} = $meta_select_mode;
191 } else {
192 $self->{$metadata_group . ".metadata_selection_mode"} = "firstvalidmetadata";
193 }
194 }
195 }
[18619]196 # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
[12889]197 foreach my $metadata_group (@metadata_groups) {
[18619]198 $self->{$metadata_group . ".bookshelf_type"} = "always";
199 }
200 my $last_metadata_group = $metadata_groups[$#metadata_groups];
201 # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
202 $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
203
[12892]204 # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
[12889]205 foreach my $metadata_group (@metadata_groups) {
[12892]206 $self->{$metadata_group . ".list_type"} = "VList";
[10499]207 }
[12892]208 foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
209 $self->{$metadata_group . ".list_type"} = "HList";
[10499]210 }
211
[18619]212 # How the items are grouped into partitions (default: no partition)
213 # for each level (metadata group), separated by '/'
[12894]214 if (!$self->{"partition_type_within_level"}) {
[18619]215 foreach my $metadata_group (@metadata_groups) {
216 $self->{$metadata_group . ".partition_type_within_level"} = "none";
217 }
218 } else {
219 my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'});
[20825]220
221 my $first = 1;
[18619]222 foreach my $metadata_group (@metadata_groups) {
223 my $partition_type_within_levelelem = shift(@partition_type_within_levellist);
[20865]224 if (defined($partition_type_within_levelelem) && $partition_type_within_levelelem eq "per_letter_fixed_size") {
225 print STDERR "per letter fixed size, changing to approximate size\n";
226 $partition_type_within_levelelem = "approximate_size";
227 }
[20825]228 if (defined($partition_type_within_levelelem) && defined $valid_partition_types->{$partition_type_within_levelelem}) {
[18619]229 $self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem;
230 }
231 else {
[20825]232 if ($first) {
233 $self->{$metadata_group . ".partition_type_within_level"} = "none";
234 $first = 0;
235 } else {
236 $self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"};
237 }
238 if (defined($partition_type_within_levelelem)) {
239 # ie invalid entry
240 print STDERR "invalid partition type for level $metadata_group: $partition_type_within_levelelem, defaulting to ". $self->{$metadata_group . ".partition_type_within_level"} ."\n";
241 }
[18619]242 }
243 }
[10398]244 }
[20825]245
[10499]246 # The number of items in each partition
[12894]247 if (!$self->{'partition_size_within_level'}) {
[10398]248 # Default: 20
[12889]249 foreach my $metadata_group (@metadata_groups) {
250 $self->{$metadata_group . ".partition_size_within_level"} = 20;
[10398]251 }
252 }
253 else {
[12894]254 my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
[10398]255
[10498]256 # Assign values based on the partition_size_within_level parameter
[12889]257 foreach my $metadata_group (@metadata_groups) {
[10498]258 my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
259 if (defined($partition_size_within_levelelem)) {
[12889]260 $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
[10398]261 }
262 else {
[12889]263 $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
[10398]264 }
265 }
266 }
267
[18619]268 # The removeprefix and removesuffix expressions
269 if ($self->{'removeprefix'}) {
270 # If there are more than one expressions, use '' to quote each experession and '/' to separate
271 my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
272
273 foreach my $metadata_group (@metadata_groups) {
274 my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
275 if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
276 # Remove the other ' at the beginning and the end if there is any
277 $removeprefix_expr_within_levelelem =~ s/^'//;
278 $removeprefix_expr_within_levelelem =~ s/'$//;
279 # Remove the extra ^ at the beginning
280 $removeprefix_expr_within_levelelem =~ s/^\^//;
281 $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
282 } else {
283 $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
284 }
285 }
286 }
287 if ($self->{'removesuffix'}) {
288 my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
289
290 foreach my $metadata_group (@metadata_groups) {
291 my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
292 if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
293 $removesuffix_expr_within_levelelem =~ s/^'//;
294 $removesuffix_expr_within_levelelem =~ s/'$//;
295 # Remove the extra $ at the end
296 $removesuffix_expr_within_levelelem =~ s/\$$//;
297 $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
298 } else {
299 $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
300 }
301 }
302 }
303
[12894]304 # The metadata elements to use to sort the leaf nodes (default: Title)
305 my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
306 if ($self->{'sort_leaf_nodes_using'}) {
307 @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
[10398]308 }
[12894]309 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
[29094]310 foreach my $sort_group (@sort_leaf_nodes_using_metadata_groups) {
311 # set metadata_select_type, if not already set - might be already set if the same group was used in -metadata
312 if (!defined $self->{$sort_group . ".metadata_selection_mode"}) {
313 $self->{$sort_group . ".metadata_selection_mode"} = "firstvalue";
314 }
315 }
[13551]316 # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
317 if ($self->{'sort_using_unicode_collation'}) {
[13791]318 # To use this you first need to download the allkeys.txt file from
319 # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
320 # Unicode/Collate directory.
[13551]321 require Unicode::Collate;
322 $self->{'unicode_collator'} = Unicode::Collate->new();
323 }
324
[23154]325 # An empty array for the document/section OIDs that we are classifying
[12894]326 $self->{'OIDs'} = [];
[23154]327 # A hash for all the doc ids that we have seen, so we don't classify something twice
328 $self->{'all_doc_OIDs'} = {};
[10398]329 return bless $self, $class;
330}
331
332
333sub init
334{
335 # Nothing to do...
336}
337
338
[12896]339# Called for each document in the collection
[10398]340sub classify
341{
342 my $self = shift(@_);
[23116]343 my ($doc_obj) = @_;
[10398]344
[23154]345 if (defined $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()}) {
346 print STDERR "Warning, List classifier has already seen document ".$doc_obj->get_OID().", not classifying again\n";
347 return;
348 }
349 $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()} = 1;
[26267]350 # check against filter here
351 if ($self->{'filter_metadata'}) {
352 #print STDERR "filtering documents on $self->{'filter_metadata'}\n";
353 my $meta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'filter_metadata'});
354 return unless defined $meta;
355 if ($self->{'filter_regex'} ne "" && $meta !~ /$self->{'filter_regex'}/) {
356 #print STDERR "doesn't match regex\n";
357 return;
358
359 }
360 }
361 # if we get here, we have passed the test for filtering
[12896]362 # If "-classify_sections" is set, classify every section of the document
[10398]363 if ($self->{'classify_sections'}) {
364 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
365 while (defined $section) {
[23116]366 $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section);
[10398]367 $section = $doc_obj->get_next_section($section);
368 }
369 }
[12896]370 # Otherwise just classify the top document section
[10398]371 else {
[23116]372 $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section());
[10398]373 }
[23154]374
[10398]375}
376
377sub classify_section
378{
379 my $self = shift(@_);
[23116]380 my ($doc_obj,$section_OID,$section) = @_;
[10398]381
[12889]382 my @metadata_groups = @{$self->{'metadata_groups'}};
[10398]383
[26267]384
[12896]385 # Only classify the section if it has a value for one of the metadata elements in the first group
386 my $classify_section = 0;
387 my $first_metadata_group = $metadata_groups[0];
[22175]388 my $remove_prefix_expr = $self->{$first_metadata_group . ".remove_prefix_expr"};
389 my $remove_suffix_expr = $self->{$first_metadata_group . ".remove_suffix_expr"};
[20008]390 foreach my $first_metadata_group_element (split(/\;|,/, $first_metadata_group)) {
[20424]391 my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
[20421]392 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element);
[18619]393
394 # Remove prefix/suffix if requested
[22175]395 if (defined ($first_metadata_group_element_value)) {
396 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
397 $first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
398 }
399
400 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
401 $first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
402 }
[18619]403 }
[12896]404 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
405 # This section must be included in the classifier
406 $classify_section = 1;
407 last;
[18619]408 }
[12896]409 }
[10398]410
[12896]411 # We're not classifying this section because it doesn't have the required metadata
412 return if (!$classify_section);
[18455]413
[12896]414 # Otherwise, include this section in the classifier
[23154]415
[12896]416 push(@{$self->{'OIDs'}}, $section_OID);
417
418 # Create a hash for the metadata values of each metadata element we're interested in
419 my %metadata_groups_done = ();
420 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
421 # Take care not to do a metadata group more than once
422 unless ($metadata_groups_done{$metadata_group}) {
[22175]423 my $remove_prefix_expr = $self->{$metadata_group . ".remove_prefix_expr"};
424 my $remove_suffix_expr = $self->{$metadata_group . ".remove_suffix_expr"};
[20008]425 foreach my $metadata_element (split(/\;|,/, $metadata_group)) {
[20424]426 my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);
427
[20421]428 my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
[12896]429 foreach my $metadata_value (@metadata_values) {
430 # Strip leading and trailing whitespace
431 $metadata_value =~ s/^\s*//;
432 $metadata_value =~ s/\s*$//;
[13550]433
[18619]434 # Remove prefix/suffix if requested
435 if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
436 $metadata_value =~ s/^$remove_prefix_expr//;
437 }
438 if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
439 $metadata_value =~ s/$remove_suffix_expr$//;
440 }
441
[26545]442 # lowercase metadata both for sorting meta (d/D under D), and to allow CSS to be able to
443 # text-transform the stored lowercase values as capitalize or uppercase (can't CSS
444 # text-transform if stored uppercase). 2 CSS text-transforms have been added to core.css
[27098]445 my $lc_metadata_value = lc($metadata_value);
[24016]446
447 # We are already working with unicode aware strings at this
448 # stage, so we no longer need to convert from utf8 to unicode
[27098]449 #my $metadata_value_unicode_string = $metadata_value; # $self->convert_utf8_string_to_unicode_string($metadata_value);
[13550]450
451 # Add the metadata value into the list for this combination of metadata group and section
[29094]452 # text that we have some non-whitespace chars
453 if ($lc_metadata_value =~ /\S/) {
454
455 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $lc_metadata_value);
456
457
[27098]458 # add the actual value into the stored values so we can remember the case
459 if (!$self->{'standardize_capitalization'}) {
460 if (defined $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}) {
461 $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value}++;
462 } else {
463 $self->{$metadata_group . ".actualvalues"}->{$lc_metadata_value}->{$metadata_value} = 1;
464 }
[29094]465 }
466 last if ($self->{$metadata_group . ".metadata_selection_mode"} eq "firstvalue");
467 }
468 } # foreach metadatavalue
469 last if ((@metadata_values > 0) && $self->{$metadata_group . ".metadata_selection_mode"} =~ /^(firstvalue|firstvalidmetadata)$/ );
470 } # foreach metadata element
[10398]471
[12896]472 $metadata_groups_done{$metadata_group} = 1;
[10398]473 }
474 }
475}
476
477
478sub get_classify_info
479{
480 my $self = shift(@_);
481
[12896]482 # The metadata groups to classify by
[12889]483 my @metadata_groups = @{$self->{'metadata_groups'}};
484 my $first_metadata_group = $metadata_groups[0];
[10398]485
[12896]486 # The OID values of the documents to include in the classifier
[12889]487 my @OIDs = @{$self->{'OIDs'}};
[10398]488
[12896]489 # Create the root node of the classification hierarchy
[12893]490 my %classifier_node = ( 'thistype' => "Invisible",
[12895]491 'childtype' => $self->{$first_metadata_group . ".list_type"},
[12894]492 'Title' => $self->{'buttonname'},
[13271]493 'contains' => [],
494 'mdtype' => $first_metadata_group );
[10398]495
[12895]496 # Recursively create the classification hierarchy, one level for each metadata group
[14173]497 $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
[12893]498 return \%classifier_node;
[10398]499}
500
501
[12895]502sub add_level
[10398]503{
504 my $self = shift(@_);
[12889]505 my @metadata_groups = @{shift(@_)};
506 my @OIDs = @{shift(@_)};
[12893]507 my $classifier_node = shift(@_);
[23154]508
[12889]509 my $metadata_group = $metadata_groups[0];
[23154]510
[13340]511 if (!defined($self->{$metadata_group . ".list"})) {
512 print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
513 return;
514 }
[10398]515
516 # Create a mapping from metadata value to OID
[14845]517 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
[18619]518 my %metadata_value_to_OIDs_hash = ();
[14845]519 foreach my $OID (@OIDs)
520 {
521 if ($OID_to_metadata_values_hash_ref->{$OID})
522 {
523 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
524 foreach my $metadata_value (@metadata_values)
525 {
526 push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
[10398]527 }
528 }
529 }
[24012]530 #print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
[10398]531
532 # Partition the values (if necessary)
[18619]533 my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
[20904]534 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
[10498]535 if ($partition_type_within_level =~ /^per_letter$/i) {
[10398]536 # Generate one hlist for each letter
[14845]537 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
538 my %metadata_value_to_OIDs_subhash = ();
[10398]539
[14845]540 my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
541 foreach my $metadata_value (@sortedmetadata_values) {
542 my $metadata_valuepartition = substr($metadata_value, 0, 1);
[10398]543
544 # Is this the start of a new partition?
[14845]545 if ($metadata_valuepartition ne $lastpartition) {
546 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
547 %metadata_value_to_OIDs_subhash = ();
548 $lastpartition = $metadata_valuepartition;
[10398]549 }
550
[14845]551 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
[10398]552 }
553
554 # Don't forget to add the last partition
[14845]555 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
[10398]556
557 # The partitions are stored in an HList
[12893]558 $classifier_node->{'childtype'} = "HList";
[10398]559 }
[20904]560 elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
[18619]561 # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size
562 # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible
563 # If a letter has more items than specified, split into several hlists.
564 # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
565 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
566 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
567 my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};
568
569 # Separate values by their first letter, each form a bucket, like the per_letter partition type
570 my $last_partition = substr($sortedmetadata_values[0], 0, 1);
571 my %partition_buckets = ();
572 my @metadata_values_in_bucket = ();
573 my $num_items_in_bucket = 0;
[24012]574 foreach my $metadata_value (@sortedmetadata_values) {
[18619]575 my $metadata_valuepartition = substr($metadata_value, 0, 1);
576 if ($metadata_valuepartition ne $last_partition) {
577 my @temp_array = @metadata_values_in_bucket;
578 # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
579 my %partition_info = ();
580 $partition_info{'metadata_values'} = \@temp_array;
581 $partition_info{'size'} = $num_items_in_bucket;
582 $partition_buckets{$last_partition} = \%partition_info;
583
584 @metadata_values_in_bucket = ($metadata_value);
585 $num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
586 $last_partition = $metadata_valuepartition;
587 } else {
588 $num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
589 push (@metadata_values_in_bucket, $metadata_value);
590 }
591 }
592 # Last one
593 my %partition_info = ();
594 $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
595 $partition_info{'size'} = $num_items_in_bucket;
596 $partition_buckets{$last_partition} = \%partition_info;
597
598 my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
599 for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) {
600 my $partition = $partition_keys[$i];
601 my $items_in_partition = $partition_buckets{$partition}->{'size'};
602 # Merge small buckets together, but keep the numeric bucket apart
603 if ($items_in_partition < $partition_size_within_level) {
604 my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'};
605 if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level
606 && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)
607 || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) {
608 foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) {
609 push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge);
610 }
611 $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition;
612 delete $partition_buckets{$partition};
613 }
614 }
615 }
616 @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
617
618 # Add partitions, and divide big bucket into several
619 my $last_partition_end = "";
620 my $partition_start = "";
621 foreach my $partition (@partition_keys) {
622 my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}});
623 my $items_in_partition = $partition_buckets{$partition}->{'size'};
624 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"});
625
626 if ($items_in_partition > $partition_size_within_level) {
627 my $items_done = 0;
628 my %metadata_values_to_OIDs_subhashes = ();
629 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
630 my $metadata_value = $metadata_values[$i];
631 # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
632 my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;
[10398]633
[18619]634 my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"});
635 my $partitionname = $partition_start;
636 if ($partitionend ne $partition_start) {
637 $partitionname = $partitionname . "-" . $partitionend;
638 }
639
640 # Start a new partition
641 if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) {
642 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
643 $last_partition_end = $partitionend;
644 $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
645 $items_done = 0;
646 %metadata_values_to_OIDs_subhashes = ();
647 }
648
649 # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
650 if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) {
651 my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
652 # Get the number of partitions needed for this value
653 my $num_splits = int($items_for_this_md_value / $partition_size_within_level);
654 $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits);
655
656 my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}};
657 for (my $i = 0; $i < $num_splits; $i++) {
658 my %OIDs_subhashes_for_this_value = ();
659 my @OIDs_for_this_partition = ();
660 for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) {
661 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
662 }
663
664 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
665 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) {
666 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
667 $items_done += scalar(@OIDs_for_this_partition);
668 next;
669 }
670
671 # Add an HList for this bucket
672 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
673 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
674 $last_partition_end = $partitionname_for_this_value;
675 }
676 next;
677 }
678
679 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
680 $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;
681
682 # The last partition
683 if($i == scalar(@metadata_values) - 1) {
684 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
685 }
686 }
[31567]687 }
[18619]688 else {
689 # The easier case, just add a partition
690 my %metadata_values_to_OIDs_subhashes = ();
691 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
692 my $metadata_value = $metadata_values[$i];
693 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
694 }
695 my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
696 my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"});
697 my $partitionname = $partition_start;
698 if ($partitionend ne $partition_start) {
699 $partitionname = $partitionname . "-" . $partitionend;
700 }
701 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
702 $last_partition_end = $partitionend;
703 }
704 }
[20865]705
706 # The partitions are stored in an HList
707 $classifier_node->{'childtype'} = "HList";
708
709 } # end approximate_size
[10398]710 else {
711 # Generate hlists of a certain size
[14845]712 if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
713 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
[10398]714 my $itemsdone = 0;
[14845]715 my %metadata_value_to_OIDs_subhash = ();
[10398]716 my $lastpartitionend = "";
717 my $partitionstart;
[14845]718 foreach my $metadata_value (@sortedmetadata_values) {
719 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
[10398]720 $itemsdone++;
[14845]721 my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
[10398]722
723 # Is this the start of a new partition?
724 if ($itemsinpartition == 1) {
[14845]725 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
[10398]726 }
727
728 # Is this the end of the partition?
[14845]729 if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) {
730 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
[10398]731 my $partitionname = $partitionstart;
732 if ($partitionend ne $partitionstart) {
733 $partitionname = $partitionname . "-" . $partitionend;
734 }
735
[14845]736 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
737 %metadata_value_to_OIDs_subhash = ();
[10398]738 $lastpartitionend = $partitionend;
739 }
740 }
741
742 # The partitions are stored in an HList
[12893]743 $classifier_node->{'childtype'} = "HList";
[10398]744 }
745
746 # Otherwise just add all the values to a VList
747 else {
[14845]748 $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
[10398]749 }
750 }
751}
752
753
754sub generate_partition_start
755{
[14173]756 my $self = shift(@_);
[14845]757 my $metadata_value = shift(@_);
[10398]758 my $lastpartitionend = shift(@_);
[14084]759 my $partition_name_length = shift(@_);
[10398]760
[14084]761 if ($partition_name_length) {
[14845]762 return substr($metadata_value, 0, $partition_name_length);
[14084]763 }
764
[14845]765 my $partitionstart = substr($metadata_value, 0, 1);
[10398]766 if ($partitionstart le $lastpartitionend) {
[14845]767 $partitionstart = substr($metadata_value, 0, 2);
[10398]768 # Give up after three characters
769 if ($partitionstart le $lastpartitionend) {
[14845]770 $partitionstart = substr($metadata_value, 0, 3);
[10398]771 }
772 }
773
774 return $partitionstart;
775}
776
777
778sub generate_partition_end
779{
[14173]780 my $self = shift(@_);
[14845]781 my $metadata_value = shift(@_);
[10398]782 my $partitionstart = shift(@_);
[14084]783 my $partition_name_length = shift(@_);
[10398]784
[14084]785 if ($partition_name_length) {
[14845]786 return substr($metadata_value, 0, $partition_name_length);
[14084]787 }
788
[14845]789 my $partitionend = substr($metadata_value, 0, length($partitionstart));
[10398]790 if ($partitionend gt $partitionstart) {
[14845]791 $partitionend = substr($metadata_value, 0, 1);
[10398]792 if ($partitionend le $partitionstart) {
[14845]793 $partitionend = substr($metadata_value, 0, 2);
[10398]794 # Give up after three characters
795 if ($partitionend le $partitionstart) {
[14845]796 $partitionend = substr($metadata_value, 0, 3);
[10398]797 }
798 }
799 }
800
801 return $partitionend;
802}
803
804
805sub add_hlist_partition
806{
807 my $self = shift(@_);
[12889]808 my @metadata_groups = @{shift(@_)};
[12893]809 my $classifier_node = shift(@_);
[10398]810 my $partitionname = shift(@_);
[14845]811 my $metadata_value_to_OIDs_hash_ref = shift(@_);
[10398]812
813 # Create an hlist partition
[24016]814 # Note that we don't need to convert from unicode-aware strings
815 # to utf8 here, as that is handled elsewhere in the code
816 my %child_classifier_node = ( 'Title' => $partitionname, #'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
[12893]817 'childtype' => "VList",
818 'contains' => [] );
[10398]819
820 # Add the children to the hlist partition
[14845]821 $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
[12893]822 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
[10398]823}
824
825
826sub add_vlist
827{
828 my $self = shift(@_);
[12889]829 my @metadata_groups = @{shift(@_)};
[12893]830 my $classifier_node = shift(@_);
[14845]831 my $metadata_value_to_OIDs_hash_ref = shift(@_);
[10398]832
[12889]833 my $metadata_group = shift(@metadata_groups);
[13287]834 $classifier_node->{'mdtype'} = $metadata_group;
[10398]835
836 # Create an entry in the vlist for each value
[14845]837 foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
838 {
839 my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
[18619]840 # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
841 if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
[13271]842 my $OID = $OIDs[0];
[21969]843 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
[13271]844 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
[21969]845 }
[18619]846 # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
847 elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
[21969]848 @OIDs = $self->sort_leaf_items(\@OIDs);
849 foreach my $OID (@OIDs) {
850 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
851 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
852 }
853
[10398]854 }
855 # Otherwise create a sublist (bookshelf) for the metadata value
[24012]856 else {
[27098]857 my $metadata_value_display = $self->get_metadata_value_display($metadata_group, $metadata_value);
[24016]858 # Note that we don't need to convert from unicode-aware strings
859 # to utf8 here, as that is handled elsewhere in the code
[27098]860 my %child_classifier_node = ( 'Title' => $metadata_value_display, # 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
[12893]861 'childtype' => "VList",
[21969]862 'mdtype' => $metadata_group,
[12893]863 'contains' => [] );
[10398]864
[24012]865 #@OIDs = $self->sort_leaf_items(\@OIDs);
[10398]866 # If there are metadata elements remaining, recursively apply the process
[12889]867 if (@metadata_groups > 0) {
[24012]868 my $next_metadata_group = $metadata_groups[0];
[12895]869 $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
[24012]870
871 # separate metadata into those that below in the next/sub-metadata_group
872 # and those that below at the current level's metadata_group
873
874 my $OID_to_metadata_values_hash_ref = $self->{$next_metadata_group . ".list"};
875 my @current_level_OIDs = ();
876 my @next_level_OIDs = ();
877 foreach my $OID (@OIDs)
878 {
879 if ($OID_to_metadata_values_hash_ref->{$OID}) {
880 push(@next_level_OIDs, $OID);
881 } else {
882 push(@current_level_OIDs, $OID);
883 }
884 }
885 # recursively process those docs belonging to the sub-metadata_group
886 $self->add_level(\@metadata_groups, \@next_level_OIDs, \%child_classifier_node);
887
888 # For those docs that don't belong in the sub/next_metadata_group, but which belong
889 # at this level, just add the documents as children of this list at the current level
890 @current_level_OIDs = $self->sort_leaf_items(\@current_level_OIDs);
891 foreach my $current_level_OID (@current_level_OIDs) {
892 my $offset = $self->metadata_offset($metadata_group, $current_level_OID, $metadata_value);
893 push(@{$child_classifier_node{'contains'}}, { 'OID' => $current_level_OID , 'offset' => $offset });
894 }
[10398]895 }
896 # Otherwise just add the documents as children of this list
897 else {
[21969]898 @OIDs = $self->sort_leaf_items(\@OIDs);
899 foreach my $OID (@OIDs) {
900 my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
901 push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
902 }
903
[10398]904 }
905
906 # Add the sublist to the list
[12893]907 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
[10398]908 }
909 }
910}
911
[21969]912sub metadata_offset
[18619]913{
914 my $self = shift(@_);
[21969]915 my $metadata_group = shift(@_);
916 my $OID = shift(@_);
917 my $metadata_value = shift(@_);
918
919 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
920 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
921 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
922 if ($metadata_value eq $metadata_values[$i]) {
923 return $i;
924 }
925 }
926
927 return 0;
928}
929
930sub sort_leaf_items
931{
932 my $self = shift(@_);
[18619]933 my @OIDs = @{shift(@_)};
[21969]934# my $classifier_node = shift(@_);
[18619]935
936 # Sort leaf nodes and add to list
[20825]937 my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
938 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) {
[18619]939 my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
940 # Force a stable sort (Perl 5.6's sort isn't stable)
941 # !! The [0] bits aren't ideal (multiple metadata values) !!
[22667]942 @OIDs = @OIDs[ sort {
943 if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]} && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]})))
944 {
945 $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
946 }
947 else
948 {
949 $a <=> $b;
950 }
951 } 0..$#OIDs ];
[18619]952 }
[23302]953 if ($self->{'reverse_sort_leaf_nodes'}) {
[24081]954 #print STDERR "reversing\n";
[23302]955 return reverse @OIDs;
956 }
[21969]957 return @OIDs;
[18619]958}
959
960
[21969]961
[13551]962sub sort_metadata_values_array
963{
964 my $self = shift(@_);
965 my @metadata_values = @_;
966
967 if ($self->{'unicode_collator'}) {
968 return $self->{'unicode_collator'}->sort(@metadata_values);
969 }
970 else {
[31567]971 return sort { $self->alpha_numeric_cmp($a,$b) }(@metadata_values);
[13551]972 }
973}
974
[31575]975# $a and $b args automatically passed in and shouldn't be declared
[31567]976sub alpha_numeric_cmp
977{
[31577]978 my $self = shift (@_);
979 my ($aStr, $bStr) = @_;
980 if ($aStr =~ m/^(\d+(\.\d+)?)/)
[31567]981 {
982 my $val_a = $1;
[31577]983 if ($bStr =~ m/^(\d+(\.\d+)?)/)
[31567]984 {
985 my $val_b = $1;
986 if ($val_a != $val_b)
987 {
988 return ($val_a <=> $val_b);
989 }
990 }
991 }
992
[31577]993 return ($aStr cmp $bStr);
[31567]994}
995
996
[27098]997sub get_metadata_value_display {
998 my $self = shift(@_);
999 my ($metadata_group, $metadata_value) = @_;
1000 return $metadata_value if $self->{'standardize_capitalization'};
1001 my $actual_values_hash = $self->{$metadata_group . ".actualvalues"}->{$metadata_value};
1002 my $display_value ="";
1003 my $max_count=0;
1004 foreach my $v (keys %$actual_values_hash) {
1005 if ($actual_values_hash->{$v} > $max_count) {
1006 $display_value = $v;
1007 $max_count = $actual_values_hash->{$v};
1008 }
1009 }
1010 return $display_value;
1011}
[10398]10121;
Note: See TracBrowser for help on using the repository browser.