Context Navigation

source: main/trunk/greenstone2/perllib/classify/List.pm@ 24081

Last change on this file since 24081 was 24081, checked in by max, 13 years ago
Hide debug
Property svn:keywords set to `Author Date Id Revision`
File size: 36.3 KB

Line
1	###########################################################################
2	#
3	# List.pm -- A general and flexible list classifier with most of
4	# the abilities of AZCompactList, and better Unicode,
5	# metadata and sorting capabilities.
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12	#
13	# Copyright (C) 2005 New Zealand Digital Library Project
14	#
15	# This program is free software; you can redistribute it and/or modify
16	# it under the terms of the GNU General Public License as published by
17	# the Free Software Foundation; either version 2 of the License, or
18	# (at your option) any later version.
19	#
20	# This program is distributed in the hope that it will be useful,
21	# but WITHOUT ANY WARRANTY; without even the implied warranty of
22	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23	# GNU General Public License for more details.
24	#
25	# You should have received a copy of the GNU General Public License
26	# along with this program; if not, write to the Free Software
27	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28	#
29	# TO DO: - Remove punctuation from metadata values before sorting.
30	# - Add an AZCompactList-style hlist option?
31	#
32	###########################################################################
33
34	package List;
35
36
37	use BaseClassifier;
38
39	use strict;
40
41
42	sub BEGIN {
43	@List::ISA = ('BaseClassifier');
44	}
45
46	my $partition_type_list =
47	[ { 'name' => "per_letter",
48	'desc' => "{List.level_partition.per_letter}" },
49	{ 'name' => "approximate_size",
50	'desc' => "{List.level_partition.approximate_size}"},
51	{ 'name' => "constant_size",
52	'desc' => "{List.level_partition.constant_size}" },
53	{ 'name' => "none",
54	'desc' => "{List.level_partition.none}" } ];
55
56	# following used to check types later on
57	my $valid_partition_types = { 'per_letter' => 1,
58	'constant_size' => 1,
59	'per_letter_fixed_size' => 1,
60	'approximate_size' => 1,
61	'none' => 1};
62
63	my $bookshelf_type_list =
64	[ { 'name' => "always",
65	'desc' => "{List.bookshelf_type.always}" },
66	{ 'name' => "duplicate_only",
67	'desc' => "{List.bookshelf_type.duplicate_only}" },
68	{ 'name' => "never",
69	'desc' => "{List.bookshelf_type.never}" } ];
70
71	my $arguments =
72	[ { 'name' => "metadata",
73	'desc' => "{List.metadata}",
74	'type' => "metadata",
75	'reqd' => "yes" },
76
77	# The interesting options
78	{ 'name' => "bookshelf_type",
79	'desc' => "{List.bookshelf_type}",
80	'type' => "enum",
81	'list' => $bookshelf_type_list,
82	'deft' => "never" },
83	{ 'name' => "classify_sections",
84	'desc' => "{List.classify_sections}",
85	'type' => "flag" },
86	{ 'name' => "partition_type_within_level",
87	'desc' => "{List.partition_type_within_level}",
88	'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
89	'list' => $partition_type_list,
90	'deft' => "per_letter" },
91	{ 'name' => "partition_size_within_level",
92	'desc' => "{List.partition_size_within_level}",
93	'type' => "string" }, # Must be string because multiple values can be specified (separated by '/')
94	{ 'name' => "partition_name_length",
95	'desc' => "{List.partition_name_length}",
96	'type' => "string" },
97	{ 'name' => "sort_leaf_nodes_using",
98	'desc' => "{List.sort_leaf_nodes_using}",
99	'type' => "metadata",
100	'deft' => "Title" },
101	{ 'name' => "reverse_sort_leaf_nodes",
102	'desc' => "{List.reverse_sort_leaf_nodes}",
103	'type' => "flag"},
104	{ 'name' => "sort_using_unicode_collation",
105	'desc' => "{List.sort_using_unicode_collation}",
106	'type' => "flag" },
107	{ 'name' => "use_hlist_for",
108	'desc' => "{List.use_hlist_for}",
109	'type' => "string" },
110	{ 'name' => "removeprefix",
111	'desc' => "{BasClas.removeprefix}",
112	'type' => "regexp" },
113	{ 'name' => "removesuffix",
114	'desc' => "{BasClas.removesuffix}",
115	'type' => "regexp" } ];
116
117	my $options = { 'name' => "List",
118	'desc' => "{List.desc}",
119	'abstract' => "no",
120	'inherits' => "yes",
121	'args' => $arguments };
122
123
124	sub new
125	{
126	my ($class) = shift(@_);
127	my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
128	push(@$classifierslist, $class);
129
130	push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
131	push(@{$hashArgOptLists->{"OptList"}}, $options);
132
133	my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
134
135	if ($self->{'info_only'}) {
136	# don't worry about any options etc
137	return bless $self, $class;
138	}
139
140	# The metadata elements to use (required)
141	if (!$self->{'metadata'}) {
142	die "Error: No metadata fields specified for List.\n";
143	}
144
145	my @metadata_groups = split(/[\/]/, $self->{'metadata'});
146	$self->{'metadata_groups'} = \@metadata_groups;
147
148	# The classifier button name (default: the first metadata element specified)
149	if (!$self->{'buttonname'}) {
150	my $first_metadata_group = $metadata_groups[0];
151	my $first_metadata_element = (split(/[\;\|,\/]/, $first_metadata_group))[0];
152	$self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
153	}
154
155	# Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
156	foreach my $metadata_group (@metadata_groups) {
157	$self->{$metadata_group . ".bookshelf_type"} = "always";
158	}
159	my $last_metadata_group = $metadata_groups[$#metadata_groups];
160	# Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
161	$self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
162
163	# Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
164	foreach my $metadata_group (@metadata_groups) {
165	$self->{$metadata_group . ".list_type"} = "VList";
166	}
167	foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
168	$self->{$metadata_group . ".list_type"} = "HList";
169	}
170
171	# How the items are grouped into partitions (default: no partition)
172	# for each level (metadata group), separated by '/'
173	if (!$self->{"partition_type_within_level"}) {
174	foreach my $metadata_group (@metadata_groups) {
175	$self->{$metadata_group . ".partition_type_within_level"} = "none";
176	}
177	} else {
178	my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'});
179
180	my $first = 1;
181	foreach my $metadata_group (@metadata_groups) {
182	my $partition_type_within_levelelem = shift(@partition_type_within_levellist);
183	if (defined($partition_type_within_levelelem) && $partition_type_within_levelelem eq "per_letter_fixed_size") {
184	print STDERR "per letter fixed size, changing to approximate size\n";
185	$partition_type_within_levelelem = "approximate_size";
186	}
187	if (defined($partition_type_within_levelelem) && defined $valid_partition_types->{$partition_type_within_levelelem}) {
188	$self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem;
189	}
190	else {
191	if ($first) {
192	$self->{$metadata_group . ".partition_type_within_level"} = "none";
193	$first = 0;
194	} else {
195	$self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"};
196	}
197	if (defined($partition_type_within_levelelem)) {
198	# ie invalid entry
199	print STDERR "invalid partition type for level $metadata_group: $partition_type_within_levelelem, defaulting to ". $self->{$metadata_group . ".partition_type_within_level"} ."\n";
200	}
201	}
202	}
203	}
204
205	# The number of items in each partition
206	if (!$self->{'partition_size_within_level'}) {
207	# Default: 20
208	foreach my $metadata_group (@metadata_groups) {
209	$self->{$metadata_group . ".partition_size_within_level"} = 20;
210	}
211	}
212	else {
213	my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
214
215	# Assign values based on the partition_size_within_level parameter
216	foreach my $metadata_group (@metadata_groups) {
217	my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
218	if (defined($partition_size_within_levelelem)) {
219	$self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
220	}
221	else {
222	$self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
223	}
224	}
225	}
226
227	# The removeprefix and removesuffix expressions
228	if ($self->{'removeprefix'}) {
229	# If there are more than one expressions, use '' to quote each experession and '/' to separate
230	my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
231
232	foreach my $metadata_group (@metadata_groups) {
233	my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
234	if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
235	# Remove the other ' at the beginning and the end if there is any
236	$removeprefix_expr_within_levelelem =~ s/^'//;
237	$removeprefix_expr_within_levelelem =~ s/'$//;
238	# Remove the extra ^ at the beginning
239	$removeprefix_expr_within_levelelem =~ s/^\^//;
240	$self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
241	} else {
242	$self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
243	}
244	}
245	}
246	if ($self->{'removesuffix'}) {
247	my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
248
249	foreach my $metadata_group (@metadata_groups) {
250	my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
251	if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
252	$removesuffix_expr_within_levelelem =~ s/^'//;
253	$removesuffix_expr_within_levelelem =~ s/'$//;
254	# Remove the extra $ at the end
255	$removesuffix_expr_within_levelelem =~ s/\$$//;
256	$self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
257	} else {
258	$self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
259	}
260	}
261	}
262
263	# The metadata elements to use to sort the leaf nodes (default: Title)
264	my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
265	if ($self->{'sort_leaf_nodes_using'}) {
266	@sort_leaf_nodes_using_metadata_groups = split(/\\|/, $self->{'sort_leaf_nodes_using'});
267	}
268	$self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
269
270	# Create an instance of the Unicode::Collate object if better Unicode sorting is desired
271	if ($self->{'sort_using_unicode_collation'}) {
272	# To use this you first need to download the allkeys.txt file from
273	# http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
274	# Unicode/Collate directory.
275	require Unicode::Collate;
276	$self->{'unicode_collator'} = Unicode::Collate->new();
277	}
278
279	# An empty array for the document/section OIDs that we are classifying
280	$self->{'OIDs'} = [];
281	# A hash for all the doc ids that we have seen, so we don't classify something twice
282	$self->{'all_doc_OIDs'} = {};
283	return bless $self, $class;
284	}
285
286
287	sub init
288	{
289	# Nothing to do...
290	}
291
292
293	# Called for each document in the collection
294	sub classify
295	{
296	my $self = shift(@_);
297	my ($doc_obj) = @_;
298
299	if (defined $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()}) {
300	print STDERR "Warning, List classifier has already seen document ".$doc_obj->get_OID().", not classifying again\n";
301	return;
302	}
303	$self->{'all_doc_OIDs'}->{$doc_obj->get_OID()} = 1;
304	# If "-classify_sections" is set, classify every section of the document
305	if ($self->{'classify_sections'}) {
306	my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
307	while (defined $section) {
308	$self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section);
309	$section = $doc_obj->get_next_section($section);
310	}
311	}
312	# Otherwise just classify the top document section
313	else {
314	$self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section());
315	}
316
317	}
318
319	sub classify_section
320	{
321	my $self = shift(@_);
322	my ($doc_obj,$section_OID,$section) = @_;
323
324	my @metadata_groups = @{$self->{'metadata_groups'}};
325
326	# Only classify the section if it has a value for one of the metadata elements in the first group
327	my $classify_section = 0;
328	my $first_metadata_group = $metadata_groups[0];
329	my $remove_prefix_expr = $self->{$first_metadata_group . ".remove_prefix_expr"};
330	my $remove_suffix_expr = $self->{$first_metadata_group . ".remove_suffix_expr"};
331	foreach my $first_metadata_group_element (split(/\;\|,/, $first_metadata_group)) {
332	my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
333	my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element);
334
335	# Remove prefix/suffix if requested
336	if (defined ($first_metadata_group_element_value)) {
337	if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
338	$first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
339	}
340
341	if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
342	$first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
343	}
344	}
345	if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
346	# This section must be included in the classifier
347	$classify_section = 1;
348	last;
349	}
350	}
351
352	# We're not classifying this section because it doesn't have the required metadata
353	return if (!$classify_section);
354
355	# Otherwise, include this section in the classifier
356
357	push(@{$self->{'OIDs'}}, $section_OID);
358
359	# Create a hash for the metadata values of each metadata element we're interested in
360	my %metadata_groups_done = ();
361	foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
362	# Take care not to do a metadata group more than once
363	unless ($metadata_groups_done{$metadata_group}) {
364	my $remove_prefix_expr = $self->{$metadata_group . ".remove_prefix_expr"};
365	my $remove_suffix_expr = $self->{$metadata_group . ".remove_suffix_expr"};
366	foreach my $metadata_element (split(/\;\|,/, $metadata_group)) {
367	my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);
368
369	my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
370	foreach my $metadata_value (@metadata_values) {
371	# Strip leading and trailing whitespace
372	$metadata_value =~ s/^\s*//;
373	$metadata_value =~ s/\s*$//;
374
375	# Remove prefix/suffix if requested
376	if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
377	$metadata_value =~ s/^$remove_prefix_expr//;
378	}
379	if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
380	$metadata_value =~ s/$remove_suffix_expr$//;
381	}
382
383	# uppercase the metadata - makes the AZList nicer
384	$metadata_value = uc($metadata_value);
385
386	# We are already working with unicode aware strings at this
387	# stage, so we no longer need to convert from utf8 to unicode
388	my $metadata_value_unicode_string = $metadata_value; # $self->convert_utf8_string_to_unicode_string($metadata_value);
389
390	# Add the metadata value into the list for this combination of metadata group and section
391	push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $metadata_value_unicode_string);
392	}
393	last if (@metadata_values > 0);
394	}
395
396	$metadata_groups_done{$metadata_group} = 1;
397	}
398	}
399	}
400
401
402	sub get_classify_info
403	{
404	my $self = shift(@_);
405
406	# The metadata groups to classify by
407	my @metadata_groups = @{$self->{'metadata_groups'}};
408	my $first_metadata_group = $metadata_groups[0];
409
410	# The OID values of the documents to include in the classifier
411	my @OIDs = @{$self->{'OIDs'}};
412
413	# Create the root node of the classification hierarchy
414	my %classifier_node = ( 'thistype' => "Invisible",
415	'childtype' => $self->{$first_metadata_group . ".list_type"},
416	'Title' => $self->{'buttonname'},
417	'contains' => [],
418	'mdtype' => $first_metadata_group );
419
420	# Recursively create the classification hierarchy, one level for each metadata group
421	$self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
422	return \%classifier_node;
423	}
424
425
426	sub add_level
427	{
428	my $self = shift(@_);
429	my @metadata_groups = @{shift(@_)};
430	my @OIDs = @{shift(@_)};
431	my $classifier_node = shift(@_);
432
433	my $metadata_group = $metadata_groups[0];
434
435	if (!defined($self->{$metadata_group . ".list"})) {
436	print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
437	return;
438	}
439
440	# Create a mapping from metadata value to OID
441	my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
442	my %metadata_value_to_OIDs_hash = ();
443	foreach my $OID (@OIDs)
444	{
445	if ($OID_to_metadata_values_hash_ref->{$OID})
446	{
447	my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
448	foreach my $metadata_value (@metadata_values)
449	{
450	push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
451	}
452	}
453	}
454	#print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
455
456	# Partition the values (if necessary)
457	my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
458	my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
459	if ($partition_type_within_level =~ /^per_letter$/i) {
460	# Generate one hlist for each letter
461	my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
462	my %metadata_value_to_OIDs_subhash = ();
463
464	my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
465	foreach my $metadata_value (@sortedmetadata_values) {
466	my $metadata_valuepartition = substr($metadata_value, 0, 1);
467
468	# Is this the start of a new partition?
469	if ($metadata_valuepartition ne $lastpartition) {
470	$self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
471	%metadata_value_to_OIDs_subhash = ();
472	$lastpartition = $metadata_valuepartition;
473	}
474
475	$metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
476	}
477
478	# Don't forget to add the last partition
479	$self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
480
481	# The partitions are stored in an HList
482	$classifier_node->{'childtype'} = "HList";
483	}
484	elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
485	# Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size
486	# If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible
487	# If a letter has more items than specified, split into several hlists.
488	# Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
489	my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
490	my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
491	my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};
492
493	# Separate values by their first letter, each form a bucket, like the per_letter partition type
494	my $last_partition = substr($sortedmetadata_values[0], 0, 1);
495	my %partition_buckets = ();
496	my @metadata_values_in_bucket = ();
497	my $num_items_in_bucket = 0;
498	foreach my $metadata_value (@sortedmetadata_values) {
499	my $metadata_valuepartition = substr($metadata_value, 0, 1);
500	if ($metadata_valuepartition ne $last_partition) {
501	my @temp_array = @metadata_values_in_bucket;
502	# Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
503	my %partition_info = ();
504	$partition_info{'metadata_values'} = \@temp_array;
505	$partition_info{'size'} = $num_items_in_bucket;
506	$partition_buckets{$last_partition} = \%partition_info;
507
508	@metadata_values_in_bucket = ($metadata_value);
509	$num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
510	$last_partition = $metadata_valuepartition;
511	} else {
512	$num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
513	push (@metadata_values_in_bucket, $metadata_value);
514	}
515	}
516	# Last one
517	my %partition_info = ();
518	$partition_info{'metadata_values'} = \@metadata_values_in_bucket;
519	$partition_info{'size'} = $num_items_in_bucket;
520	$partition_buckets{$last_partition} = \%partition_info;
521
522	my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
523	for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) {
524	my $partition = $partition_keys[$i];
525	my $items_in_partition = $partition_buckets{$partition}->{'size'};
526	# Merge small buckets together, but keep the numeric bucket apart
527	if ($items_in_partition < $partition_size_within_level) {
528	my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'};
529	if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level
530	&& !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)
531	\|\| ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) {
532	foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) {
533	push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge);
534	}
535	$partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition;
536	delete $partition_buckets{$partition};
537	}
538	}
539	}
540	@partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
541
542	# Add partitions, and divide big bucket into several
543	my $last_partition_end = "";
544	my $partition_start = "";
545	foreach my $partition (@partition_keys) {
546	my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}});
547	my $items_in_partition = $partition_buckets{$partition}->{'size'};
548	$partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"});
549
550	if ($items_in_partition > $partition_size_within_level) {
551	my $items_done = 0;
552	my %metadata_values_to_OIDs_subhashes = ();
553	for (my $i = 0; $i < scalar(@metadata_values); $i++) {
554	my $metadata_value = $metadata_values[$i];
555	# If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
556	my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;
557
558	my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"});
559	my $partitionname = $partition_start;
560	if ($partitionend ne $partition_start) {
561	$partitionname = $partitionname . "-" . $partitionend;
562	}
563
564	# Start a new partition
565	if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) {
566	$self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
567	$last_partition_end = $partitionend;
568	$partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
569	$items_done = 0;
570	%metadata_values_to_OIDs_subhashes = ();
571	}
572
573	# If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
574	if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) {
575	my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
576	# Get the number of partitions needed for this value
577	my $num_splits = int($items_for_this_md_value / $partition_size_within_level);
578	$num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits);
579
580	my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}};
581	for (my $i = 0; $i < $num_splits; $i++) {
582	my %OIDs_subhashes_for_this_value = ();
583	my @OIDs_for_this_partition = ();
584	for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) {
585	push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
586	}
587
588	# The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
589	if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) {
590	$metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
591	$items_done += scalar(@OIDs_for_this_partition);
592	next;
593	}
594
595	# Add an HList for this bucket
596	$OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
597	$self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
598	$last_partition_end = $partitionname_for_this_value;
599	}
600	next;
601	}
602
603	$metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
604	$items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;
605
606	# The last partition
607	if($i == scalar(@metadata_values) - 1) {
608	$self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
609	}
610	}
611	}
612	else {
613	# The easier case, just add a partition
614	my %metadata_values_to_OIDs_subhashes = ();
615	for (my $i = 0; $i < scalar(@metadata_values); $i++) {
616	my $metadata_value = $metadata_values[$i];
617	$metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
618	}
619	my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
620	my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"});
621	my $partitionname = $partition_start;
622	if ($partitionend ne $partition_start) {
623	$partitionname = $partitionname . "-" . $partitionend;
624	}
625	$self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
626	$last_partition_end = $partitionend;
627	}
628	}
629
630	# The partitions are stored in an HList
631	$classifier_node->{'childtype'} = "HList";
632
633	} # end approximate_size
634	else {
635	# Generate hlists of a certain size
636	if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
637	my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
638	my $itemsdone = 0;
639	my %metadata_value_to_OIDs_subhash = ();
640	my $lastpartitionend = "";
641	my $partitionstart;
642	foreach my $metadata_value (@sortedmetadata_values) {
643	$metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
644	$itemsdone++;
645	my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
646
647	# Is this the start of a new partition?
648	if ($itemsinpartition == 1) {
649	$partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
650	}
651
652	# Is this the end of the partition?
653	if ($itemsinpartition == $partition_size_within_level \|\| $itemsdone == @sortedmetadata_values) {
654	my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
655	my $partitionname = $partitionstart;
656	if ($partitionend ne $partitionstart) {
657	$partitionname = $partitionname . "-" . $partitionend;
658	}
659
660	$self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
661	%metadata_value_to_OIDs_subhash = ();
662	$lastpartitionend = $partitionend;
663	}
664	}
665
666	# The partitions are stored in an HList
667	$classifier_node->{'childtype'} = "HList";
668	}
669
670	# Otherwise just add all the values to a VList
671	else {
672	$self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
673	}
674	}
675	}
676
677
678	sub convert_utf8_string_to_unicode_string
679	{
680	my $self = shift(@_);
681	my $utf8_string = shift(@_);
682
683	my $unicode_string = "";
684	foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
685	$unicode_string .= chr($unicode_value);
686	}
687	return $unicode_string;
688	}
689
690
691	sub convert_unicode_string_to_utf8_string
692	{
693	my $self = shift(@_);
694	my $unicode_string = shift(@_);
695
696	my @unicode_array;
697	for (my $i = 0; $i < length($unicode_string); $i++) {
698	push(@unicode_array, ord(substr($unicode_string, $i, 1)));
699	}
700	return &unicode::unicode2utf8(\@unicode_array);
701	}
702
703
704	sub generate_partition_start
705	{
706	my $self = shift(@_);
707	my $metadata_value = shift(@_);
708	my $lastpartitionend = shift(@_);
709	my $partition_name_length = shift(@_);
710
711	if ($partition_name_length) {
712	return substr($metadata_value, 0, $partition_name_length);
713	}
714
715	my $partitionstart = substr($metadata_value, 0, 1);
716	if ($partitionstart le $lastpartitionend) {
717	$partitionstart = substr($metadata_value, 0, 2);
718	# Give up after three characters
719	if ($partitionstart le $lastpartitionend) {
720	$partitionstart = substr($metadata_value, 0, 3);
721	}
722	}
723
724	return $partitionstart;
725	}
726
727
728	sub generate_partition_end
729	{
730	my $self = shift(@_);
731	my $metadata_value = shift(@_);
732	my $partitionstart = shift(@_);
733	my $partition_name_length = shift(@_);
734
735	if ($partition_name_length) {
736	return substr($metadata_value, 0, $partition_name_length);
737	}
738
739	my $partitionend = substr($metadata_value, 0, length($partitionstart));
740	if ($partitionend gt $partitionstart) {
741	$partitionend = substr($metadata_value, 0, 1);
742	if ($partitionend le $partitionstart) {
743	$partitionend = substr($metadata_value, 0, 2);
744	# Give up after three characters
745	if ($partitionend le $partitionstart) {
746	$partitionend = substr($metadata_value, 0, 3);
747	}
748	}
749	}
750
751	return $partitionend;
752	}
753
754
755	sub add_hlist_partition
756	{
757	my $self = shift(@_);
758	my @metadata_groups = @{shift(@_)};
759	my $classifier_node = shift(@_);
760	my $partitionname = shift(@_);
761	my $metadata_value_to_OIDs_hash_ref = shift(@_);
762
763	# Create an hlist partition
764	# Note that we don't need to convert from unicode-aware strings
765	# to utf8 here, as that is handled elsewhere in the code
766	my %child_classifier_node = ( 'Title' => $partitionname, #'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
767	'childtype' => "VList",
768	'contains' => [] );
769
770	# Add the children to the hlist partition
771	$self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
772	push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
773	}
774
775
776	sub add_vlist
777	{
778	my $self = shift(@_);
779	my @metadata_groups = @{shift(@_)};
780	my $classifier_node = shift(@_);
781	my $metadata_value_to_OIDs_hash_ref = shift(@_);
782
783	my $metadata_group = shift(@metadata_groups);
784	$classifier_node->{'mdtype'} = $metadata_group;
785
786	# Create an entry in the vlist for each value
787	foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
788	{
789	my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
790	# If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
791	if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
792	my $OID = $OIDs[0];
793	my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
794	push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
795	}
796	# If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
797	elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
798	@OIDs = $self->sort_leaf_items(\@OIDs);
799	foreach my $OID (@OIDs) {
800	my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
801	push(@{$classifier_node->{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
802	}
803
804	}
805	# Otherwise create a sublist (bookshelf) for the metadata value
806	else {
807	# Note that we don't need to convert from unicode-aware strings
808	# to utf8 here, as that is handled elsewhere in the code
809	my %child_classifier_node = ( 'Title' => $metadata_value, # 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
810	'childtype' => "VList",
811	'mdtype' => $metadata_group,
812	'contains' => [] );
813
814	#@OIDs = $self->sort_leaf_items(\@OIDs);
815	# If there are metadata elements remaining, recursively apply the process
816	if (@metadata_groups > 0) {
817	my $next_metadata_group = $metadata_groups[0];
818	$child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
819
820	# separate metadata into those that below in the next/sub-metadata_group
821	# and those that below at the current level's metadata_group
822
823	my $OID_to_metadata_values_hash_ref = $self->{$next_metadata_group . ".list"};
824	my @current_level_OIDs = ();
825	my @next_level_OIDs = ();
826	foreach my $OID (@OIDs)
827	{
828	if ($OID_to_metadata_values_hash_ref->{$OID}) {
829	push(@next_level_OIDs, $OID);
830	} else {
831	push(@current_level_OIDs, $OID);
832	}
833	}
834	# recursively process those docs belonging to the sub-metadata_group
835	$self->add_level(\@metadata_groups, \@next_level_OIDs, \%child_classifier_node);
836
837	# For those docs that don't belong in the sub/next_metadata_group, but which belong
838	# at this level, just add the documents as children of this list at the current level
839	@current_level_OIDs = $self->sort_leaf_items(\@current_level_OIDs);
840	foreach my $current_level_OID (@current_level_OIDs) {
841	my $offset = $self->metadata_offset($metadata_group, $current_level_OID, $metadata_value);
842	push(@{$child_classifier_node{'contains'}}, { 'OID' => $current_level_OID , 'offset' => $offset });
843	}
844	}
845	# Otherwise just add the documents as children of this list
846	else {
847	@OIDs = $self->sort_leaf_items(\@OIDs);
848	foreach my $OID (@OIDs) {
849	my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
850	push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
851	}
852
853	}
854
855	# Add the sublist to the list
856	push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
857	}
858	}
859	}
860
861	sub metadata_offset
862	{
863	my $self = shift(@_);
864	my $metadata_group = shift(@_);
865	my $OID = shift(@_);
866	my $metadata_value = shift(@_);
867
868	my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
869	my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
870	for (my $i = 0; $i < scalar(@metadata_values); $i++) {
871	if ($metadata_value eq $metadata_values[$i]) {
872	return $i;
873	}
874	}
875
876	return 0;
877	}
878
879	sub sort_leaf_items
880	{
881	my $self = shift(@_);
882	my @OIDs = @{shift(@_)};
883	# my $classifier_node = shift(@_);
884
885	# Sort leaf nodes and add to list
886	my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
887	foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) {
888	my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
889	# Force a stable sort (Perl 5.6's sort isn't stable)
890	# !! The [0] bits aren't ideal (multiple metadata values) !!
891	@OIDs = @OIDs[ sort {
892	if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]} && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]})))
893	{
894	$OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
895	}
896	else
897	{
898	$a <=> $b;
899	}
900	} 0..$#OIDs ];
901	}
902	if ($self->{'reverse_sort_leaf_nodes'}) {
903	#print STDERR "reversing\n";
904	return reverse @OIDs;
905	}
906	return @OIDs;
907	}
908
909
910
911	sub sort_metadata_values_array
912	{
913	my $self = shift(@_);
914	my @metadata_values = @_;
915
916	if ($self->{'unicode_collator'}) {
917	return $self->{'unicode_collator'}->sort(@metadata_values);
918	}
919	else {
920	return sort(@metadata_values);
921	}
922	}
923
924
925	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: