Context Navigation

source: main/trunk/greenstone2/perllib/classify/List.pm@ 23154

Last change on this file since 23154 was 23154, checked in by kjdon, 14 years ago
store a hash of all doc oids, then check against this hash when asked to classify something, so that we don't classify anything twice, as this leads to the document being in the list 4 times
Property svn:keywords set to `Author Date Id Revision`
File size: 34.8 KB

Line
1	###########################################################################
2	#
3	# List.pm -- A general and flexible list classifier with most of
4	# the abilities of AZCompactList, and better Unicode,
5	# metadata and sorting capabilities.
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12	#
13	# Copyright (C) 2005 New Zealand Digital Library Project
14	#
15	# This program is free software; you can redistribute it and/or modify
16	# it under the terms of the GNU General Public License as published by
17	# the Free Software Foundation; either version 2 of the License, or
18	# (at your option) any later version.
19	#
20	# This program is distributed in the hope that it will be useful,
21	# but WITHOUT ANY WARRANTY; without even the implied warranty of
22	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23	# GNU General Public License for more details.
24	#
25	# You should have received a copy of the GNU General Public License
26	# along with this program; if not, write to the Free Software
27	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28	#
29	# TO DO: - Remove punctuation from metadata values before sorting.
30	# - Add an AZCompactList-style hlist option?
31	#
32	###########################################################################
33
34	package List;
35
36
37	use BaseClassifier;
38
39	use strict;
40
41
42	sub BEGIN {
43	@List::ISA = ('BaseClassifier');
44	}
45
46	my $partition_type_list =
47	[ { 'name' => "per_letter",
48	'desc' => "{List.level_partition.per_letter}" },
49	{ 'name' => "approximate_size",
50	'desc' => "{List.level_partition.approximate_size}"},
51	{ 'name' => "constant_size",
52	'desc' => "{List.level_partition.constant_size}" },
53	{ 'name' => "none",
54	'desc' => "{List.level_partition.none}" } ];
55
56	# following used to check types later on
57	my $valid_partition_types = { 'per_letter' => 1,
58	'constant_size' => 1,
59	'per_letter_fixed_size' => 1,
60	'approximate_size' => 1,
61	'none' => 1};
62
63	my $bookshelf_type_list =
64	[ { 'name' => "always",
65	'desc' => "{List.bookshelf_type.always}" },
66	{ 'name' => "duplicate_only",
67	'desc' => "{List.bookshelf_type.duplicate_only}" },
68	{ 'name' => "never",
69	'desc' => "{List.bookshelf_type.never}" } ];
70
71	my $arguments =
72	[ { 'name' => "metadata",
73	'desc' => "{List.metadata}",
74	'type' => "metadata",
75	'reqd' => "yes" },
76
77	# The interesting options
78	{ 'name' => "bookshelf_type",
79	'desc' => "{List.bookshelf_type}",
80	'type' => "enum",
81	'list' => $bookshelf_type_list,
82	'deft' => "never" },
83	{ 'name' => "classify_sections",
84	'desc' => "{List.classify_sections}",
85	'type' => "flag" },
86	{ 'name' => "partition_type_within_level",
87	'desc' => "{List.partition_type_within_level}",
88	'type' => "enumstring", # Must be enumstring because multiple values can be specified (separated by '/')
89	'list' => $partition_type_list,
90	'deft' => "per_letter" },
91	{ 'name' => "partition_size_within_level",
92	'desc' => "{List.partition_size_within_level}",
93	'type' => "string" }, # Must be string because multiple values can be specified (separated by '/')
94	{ 'name' => "partition_name_length",
95	'desc' => "{List.partition_name_length}",
96	'type' => "string" },
97	{ 'name' => "sort_leaf_nodes_using",
98	'desc' => "{List.sort_leaf_nodes_using}",
99	'type' => "metadata",
100	'deft' => "Title" },
101	{ 'name' => "sort_using_unicode_collation",
102	'desc' => "{List.sort_using_unicode_collation}",
103	'type' => "flag" },
104	{ 'name' => "use_hlist_for",
105	'desc' => "{List.use_hlist_for}",
106	'type' => "string" },
107	{ 'name' => "removeprefix",
108	'desc' => "{BasClas.removeprefix}",
109	'type' => "regexp" },
110	{ 'name' => "removesuffix",
111	'desc' => "{BasClas.removesuffix}",
112	'type' => "regexp" } ];
113
114	my $options = { 'name' => "List",
115	'desc' => "{List.desc}",
116	'abstract' => "no",
117	'inherits' => "yes",
118	'args' => $arguments };
119
120
121	sub new
122	{
123	my ($class) = shift(@_);
124	my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
125	push(@$classifierslist, $class);
126
127	push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
128	push(@{$hashArgOptLists->{"OptList"}}, $options);
129
130	my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
131
132	if ($self->{'info_only'}) {
133	# don't worry about any options etc
134	return bless $self, $class;
135	}
136
137	# The metadata elements to use (required)
138	if (!$self->{'metadata'}) {
139	die "Error: No metadata fields specified for List.\n";
140	}
141	my @metadata_groups = split(/\//, $self->{'metadata'});
142	$self->{'metadata_groups'} = \@metadata_groups;
143
144	# The classifier button name (default: the first metadata element specified)
145	if (!$self->{'buttonname'}) {
146	my $first_metadata_group = $metadata_groups[0];
147	my $first_metadata_element = (split(/\;\|,/, $first_metadata_group))[0];
148	$self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
149	}
150
151	# Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
152	foreach my $metadata_group (@metadata_groups) {
153	$self->{$metadata_group . ".bookshelf_type"} = "always";
154	}
155	my $last_metadata_group = $metadata_groups[$#metadata_groups];
156	# Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
157	$self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
158
159	# Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
160	foreach my $metadata_group (@metadata_groups) {
161	$self->{$metadata_group . ".list_type"} = "VList";
162	}
163	foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
164	$self->{$metadata_group . ".list_type"} = "HList";
165	}
166
167	# How the items are grouped into partitions (default: no partition)
168	# for each level (metadata group), separated by '/'
169	if (!$self->{"partition_type_within_level"}) {
170	foreach my $metadata_group (@metadata_groups) {
171	$self->{$metadata_group . ".partition_type_within_level"} = "none";
172	}
173	} else {
174	my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'});
175
176	my $first = 1;
177	foreach my $metadata_group (@metadata_groups) {
178	my $partition_type_within_levelelem = shift(@partition_type_within_levellist);
179	if (defined($partition_type_within_levelelem) && $partition_type_within_levelelem eq "per_letter_fixed_size") {
180	print STDERR "per letter fixed size, changing to approximate size\n";
181	$partition_type_within_levelelem = "approximate_size";
182	}
183	if (defined($partition_type_within_levelelem) && defined $valid_partition_types->{$partition_type_within_levelelem}) {
184	$self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem;
185	}
186	else {
187	if ($first) {
188	$self->{$metadata_group . ".partition_type_within_level"} = "none";
189	$first = 0;
190	} else {
191	$self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"};
192	}
193	if (defined($partition_type_within_levelelem)) {
194	# ie invalid entry
195	print STDERR "invalid partition type for level $metadata_group: $partition_type_within_levelelem, defaulting to ". $self->{$metadata_group . ".partition_type_within_level"} ."\n";
196	}
197	}
198	}
199	}
200
201	# The number of items in each partition
202	if (!$self->{'partition_size_within_level'}) {
203	# Default: 20
204	foreach my $metadata_group (@metadata_groups) {
205	$self->{$metadata_group . ".partition_size_within_level"} = 20;
206	}
207	}
208	else {
209	my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
210
211	# Assign values based on the partition_size_within_level parameter
212	foreach my $metadata_group (@metadata_groups) {
213	my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
214	if (defined($partition_size_within_levelelem)) {
215	$self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
216	}
217	else {
218	$self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
219	}
220	}
221	}
222
223	# The removeprefix and removesuffix expressions
224	if ($self->{'removeprefix'}) {
225	# If there are more than one expressions, use '' to quote each experession and '/' to separate
226	my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
227
228	foreach my $metadata_group (@metadata_groups) {
229	my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
230	if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
231	# Remove the other ' at the beginning and the end if there is any
232	$removeprefix_expr_within_levelelem =~ s/^'//;
233	$removeprefix_expr_within_levelelem =~ s/'$//;
234	# Remove the extra ^ at the beginning
235	$removeprefix_expr_within_levelelem =~ s/^\^//;
236	$self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
237	} else {
238	$self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
239	}
240	}
241	}
242	if ($self->{'removesuffix'}) {
243	my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
244
245	foreach my $metadata_group (@metadata_groups) {
246	my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
247	if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
248	$removesuffix_expr_within_levelelem =~ s/^'//;
249	$removesuffix_expr_within_levelelem =~ s/'$//;
250	# Remove the extra $ at the end
251	$removesuffix_expr_within_levelelem =~ s/\$$//;
252	$self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
253	} else {
254	$self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
255	}
256	}
257	}
258
259	# The metadata elements to use to sort the leaf nodes (default: Title)
260	my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
261	if ($self->{'sort_leaf_nodes_using'}) {
262	@sort_leaf_nodes_using_metadata_groups = split(/\\|/, $self->{'sort_leaf_nodes_using'});
263	}
264	$self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
265
266	# Create an instance of the Unicode::Collate object if better Unicode sorting is desired
267	if ($self->{'sort_using_unicode_collation'}) {
268	# To use this you first need to download the allkeys.txt file from
269	# http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
270	# Unicode/Collate directory.
271	require Unicode::Collate;
272	$self->{'unicode_collator'} = Unicode::Collate->new();
273	}
274
275	# An empty array for the document/section OIDs that we are classifying
276	$self->{'OIDs'} = [];
277	# A hash for all the doc ids that we have seen, so we don't classify something twice
278	$self->{'all_doc_OIDs'} = {};
279	return bless $self, $class;
280	}
281
282
283	sub init
284	{
285	# Nothing to do...
286	}
287
288
289	# Called for each document in the collection
290	sub classify
291	{
292	my $self = shift(@_);
293	my ($doc_obj) = @_;
294
295	if (defined $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()}) {
296	print STDERR "Warning, List classifier has already seen document ".$doc_obj->get_OID().", not classifying again\n";
297	return;
298	}
299	$self->{'all_doc_OIDs'}->{$doc_obj->get_OID()} = 1;
300	# If "-classify_sections" is set, classify every section of the document
301	if ($self->{'classify_sections'}) {
302	my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
303	while (defined $section) {
304	$self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section);
305	$section = $doc_obj->get_next_section($section);
306	}
307	}
308	# Otherwise just classify the top document section
309	else {
310	$self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section());
311	}
312
313	}
314
315	sub classify_section
316	{
317	my $self = shift(@_);
318	my ($doc_obj,$section_OID,$section) = @_;
319
320	my @metadata_groups = @{$self->{'metadata_groups'}};
321
322	# Only classify the section if it has a value for one of the metadata elements in the first group
323	my $classify_section = 0;
324	my $first_metadata_group = $metadata_groups[0];
325	my $remove_prefix_expr = $self->{$first_metadata_group . ".remove_prefix_expr"};
326	my $remove_suffix_expr = $self->{$first_metadata_group . ".remove_suffix_expr"};
327	foreach my $first_metadata_group_element (split(/\;\|,/, $first_metadata_group)) {
328	my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
329	my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element);
330
331	# Remove prefix/suffix if requested
332	if (defined ($first_metadata_group_element_value)) {
333	if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
334	$first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
335	}
336
337	if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
338	$first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
339	}
340	}
341	if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
342	# This section must be included in the classifier
343	$classify_section = 1;
344	last;
345	}
346	}
347
348	# We're not classifying this section because it doesn't have the required metadata
349	return if (!$classify_section);
350
351	# Otherwise, include this section in the classifier
352
353	push(@{$self->{'OIDs'}}, $section_OID);
354
355	# Create a hash for the metadata values of each metadata element we're interested in
356	my %metadata_groups_done = ();
357	foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
358	# Take care not to do a metadata group more than once
359	unless ($metadata_groups_done{$metadata_group}) {
360	my $remove_prefix_expr = $self->{$metadata_group . ".remove_prefix_expr"};
361	my $remove_suffix_expr = $self->{$metadata_group . ".remove_suffix_expr"};
362	foreach my $metadata_element (split(/\;\|,/, $metadata_group)) {
363	my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);
364
365	my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
366	foreach my $metadata_value (@metadata_values) {
367	# Strip leading and trailing whitespace
368	$metadata_value =~ s/^\s*//;
369	$metadata_value =~ s/\s*$//;
370
371	# Remove prefix/suffix if requested
372	if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
373	$metadata_value =~ s/^$remove_prefix_expr//;
374	}
375	if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
376	$metadata_value =~ s/$remove_suffix_expr$//;
377	}
378
379	# uppercase the metadata - makes the AZList nicer
380	$metadata_value = uc($metadata_value);
381	# Convert the metadata value from a UTF-8 string to a Unicode string
382	# This means that length() and substr() work properly
383	# We need to be careful to convert classifier node title values back to UTF-8, however
384	my $metadata_value_unicode_string = $self->convert_utf8_string_to_unicode_string($metadata_value);
385
386	# Add the metadata value into the list for this combination of metadata group and section
387	push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $metadata_value_unicode_string);
388	}
389	last if (@metadata_values > 0);
390	}
391
392	$metadata_groups_done{$metadata_group} = 1;
393	}
394	}
395	}
396
397
398	sub get_classify_info
399	{
400	my $self = shift(@_);
401
402	# The metadata groups to classify by
403	my @metadata_groups = @{$self->{'metadata_groups'}};
404	my $first_metadata_group = $metadata_groups[0];
405
406	# The OID values of the documents to include in the classifier
407	my @OIDs = @{$self->{'OIDs'}};
408
409	# Create the root node of the classification hierarchy
410	my %classifier_node = ( 'thistype' => "Invisible",
411	'childtype' => $self->{$first_metadata_group . ".list_type"},
412	'Title' => $self->{'buttonname'},
413	'contains' => [],
414	'mdtype' => $first_metadata_group );
415
416	# Recursively create the classification hierarchy, one level for each metadata group
417	$self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
418	return \%classifier_node;
419	}
420
421
422	sub add_level
423	{
424	my $self = shift(@_);
425	my @metadata_groups = @{shift(@_)};
426	my @OIDs = @{shift(@_)};
427	my $classifier_node = shift(@_);
428
429	my $metadata_group = $metadata_groups[0];
430
431	if (!defined($self->{$metadata_group . ".list"})) {
432	print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
433	return;
434	}
435
436	# Create a mapping from metadata value to OID
437	my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
438	my %metadata_value_to_OIDs_hash = ();
439	foreach my $OID (@OIDs)
440	{
441	if ($OID_to_metadata_values_hash_ref->{$OID})
442	{
443	my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
444	foreach my $metadata_value (@metadata_values)
445	{
446	push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
447	}
448	}
449	}
450	#print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
451
452	# Partition the values (if necessary)
453	my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
454	my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
455	if ($partition_type_within_level =~ /^per_letter$/i) {
456	# Generate one hlist for each letter
457	my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
458	my %metadata_value_to_OIDs_subhash = ();
459
460	my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
461	foreach my $metadata_value (@sortedmetadata_values) {
462	my $metadata_valuepartition = substr($metadata_value, 0, 1);
463
464	# Is this the start of a new partition?
465	if ($metadata_valuepartition ne $lastpartition) {
466	$self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
467	%metadata_value_to_OIDs_subhash = ();
468	$lastpartition = $metadata_valuepartition;
469	}
470
471	$metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
472	}
473
474	# Don't forget to add the last partition
475	$self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
476
477	# The partitions are stored in an HList
478	$classifier_node->{'childtype'} = "HList";
479	}
480	elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
481	# Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size
482	# If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible
483	# If a letter has more items than specified, split into several hlists.
484	# Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
485	my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
486	my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
487	my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};
488
489	# Separate values by their first letter, each form a bucket, like the per_letter partition type
490	my $last_partition = substr($sortedmetadata_values[0], 0, 1);
491	my %partition_buckets = ();
492	my @metadata_values_in_bucket = ();
493	my $num_items_in_bucket = 0;
494	foreach my $metadata_value (@sortedmetadata_values) {
495	my $metadata_valuepartition = substr($metadata_value, 0, 1);
496	if ($metadata_valuepartition ne $last_partition) {
497	my @temp_array = @metadata_values_in_bucket;
498	# Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
499	my %partition_info = ();
500	$partition_info{'metadata_values'} = \@temp_array;
501	$partition_info{'size'} = $num_items_in_bucket;
502	$partition_buckets{$last_partition} = \%partition_info;
503
504	@metadata_values_in_bucket = ($metadata_value);
505	$num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
506	$last_partition = $metadata_valuepartition;
507	} else {
508	$num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
509	push (@metadata_values_in_bucket, $metadata_value);
510	}
511	}
512	# Last one
513	my %partition_info = ();
514	$partition_info{'metadata_values'} = \@metadata_values_in_bucket;
515	$partition_info{'size'} = $num_items_in_bucket;
516	$partition_buckets{$last_partition} = \%partition_info;
517
518	my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
519	for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) {
520	my $partition = $partition_keys[$i];
521	my $items_in_partition = $partition_buckets{$partition}->{'size'};
522	# Merge small buckets together, but keep the numeric bucket apart
523	if ($items_in_partition < $partition_size_within_level) {
524	my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'};
525	if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level
526	&& !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)
527	\|\| ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) {
528	foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) {
529	push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge);
530	}
531	$partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition;
532	delete $partition_buckets{$partition};
533	}
534	}
535	}
536	@partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
537
538	# Add partitions, and divide big bucket into several
539	my $last_partition_end = "";
540	my $partition_start = "";
541	foreach my $partition (@partition_keys) {
542	my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}});
543	my $items_in_partition = $partition_buckets{$partition}->{'size'};
544	$partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"});
545
546	if ($items_in_partition > $partition_size_within_level) {
547	my $items_done = 0;
548	my %metadata_values_to_OIDs_subhashes = ();
549	for (my $i = 0; $i < scalar(@metadata_values); $i++) {
550	my $metadata_value = $metadata_values[$i];
551	# If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
552	my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;
553
554	my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"});
555	my $partitionname = $partition_start;
556	if ($partitionend ne $partition_start) {
557	$partitionname = $partitionname . "-" . $partitionend;
558	}
559
560	# Start a new partition
561	if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) {
562	$self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
563	$last_partition_end = $partitionend;
564	$partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
565	$items_done = 0;
566	%metadata_values_to_OIDs_subhashes = ();
567	}
568
569	# If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
570	if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) {
571	my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
572	# Get the number of partitions needed for this value
573	my $num_splits = int($items_for_this_md_value / $partition_size_within_level);
574	$num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits);
575
576	my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}};
577	for (my $i = 0; $i < $num_splits; $i++) {
578	my %OIDs_subhashes_for_this_value = ();
579	my @OIDs_for_this_partition = ();
580	for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) {
581	push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
582	}
583
584	# The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
585	if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) {
586	$metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
587	$items_done += scalar(@OIDs_for_this_partition);
588	next;
589	}
590
591	# Add an HList for this bucket
592	$OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
593	$self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
594	$last_partition_end = $partitionname_for_this_value;
595	}
596	next;
597	}
598
599	$metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
600	$items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;
601
602	# The last partition
603	if($i == scalar(@metadata_values) - 1) {
604	$self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
605	}
606	}
607	}
608	else {
609	# The easier case, just add a partition
610	my %metadata_values_to_OIDs_subhashes = ();
611	for (my $i = 0; $i < scalar(@metadata_values); $i++) {
612	my $metadata_value = $metadata_values[$i];
613	$metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
614	}
615	my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
616	my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"});
617	my $partitionname = $partition_start;
618	if ($partitionend ne $partition_start) {
619	$partitionname = $partitionname . "-" . $partitionend;
620	}
621	$self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
622	$last_partition_end = $partitionend;
623	}
624	}
625
626	# The partitions are stored in an HList
627	$classifier_node->{'childtype'} = "HList";
628
629	} # end approximate_size
630	else {
631	# Generate hlists of a certain size
632	if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
633	my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
634	my $itemsdone = 0;
635	my %metadata_value_to_OIDs_subhash = ();
636	my $lastpartitionend = "";
637	my $partitionstart;
638	foreach my $metadata_value (@sortedmetadata_values) {
639	$metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
640	$itemsdone++;
641	my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
642
643	# Is this the start of a new partition?
644	if ($itemsinpartition == 1) {
645	$partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
646	}
647
648	# Is this the end of the partition?
649	if ($itemsinpartition == $partition_size_within_level \|\| $itemsdone == @sortedmetadata_values) {
650	my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
651	my $partitionname = $partitionstart;
652	if ($partitionend ne $partitionstart) {
653	$partitionname = $partitionname . "-" . $partitionend;
654	}
655
656	$self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
657	%metadata_value_to_OIDs_subhash = ();
658	$lastpartitionend = $partitionend;
659	}
660	}
661
662	# The partitions are stored in an HList
663	$classifier_node->{'childtype'} = "HList";
664	}
665
666	# Otherwise just add all the values to a VList
667	else {
668	$self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
669	}
670	}
671	}
672
673
674	sub convert_utf8_string_to_unicode_string
675	{
676	my $self = shift(@_);
677	my $utf8_string = shift(@_);
678
679	my $unicode_string = "";
680	foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
681	$unicode_string .= chr($unicode_value);
682	}
683	return $unicode_string;
684	}
685
686
687	sub convert_unicode_string_to_utf8_string
688	{
689	my $self = shift(@_);
690	my $unicode_string = shift(@_);
691
692	my @unicode_array;
693	for (my $i = 0; $i < length($unicode_string); $i++) {
694	push(@unicode_array, ord(substr($unicode_string, $i, 1)));
695	}
696	return &unicode::unicode2utf8(\@unicode_array);
697	}
698
699
700	sub generate_partition_start
701	{
702	my $self = shift(@_);
703	my $metadata_value = shift(@_);
704	my $lastpartitionend = shift(@_);
705	my $partition_name_length = shift(@_);
706
707	if ($partition_name_length) {
708	return substr($metadata_value, 0, $partition_name_length);
709	}
710
711	my $partitionstart = substr($metadata_value, 0, 1);
712	if ($partitionstart le $lastpartitionend) {
713	$partitionstart = substr($metadata_value, 0, 2);
714	# Give up after three characters
715	if ($partitionstart le $lastpartitionend) {
716	$partitionstart = substr($metadata_value, 0, 3);
717	}
718	}
719
720	return $partitionstart;
721	}
722
723
724	sub generate_partition_end
725	{
726	my $self = shift(@_);
727	my $metadata_value = shift(@_);
728	my $partitionstart = shift(@_);
729	my $partition_name_length = shift(@_);
730
731	if ($partition_name_length) {
732	return substr($metadata_value, 0, $partition_name_length);
733	}
734
735	my $partitionend = substr($metadata_value, 0, length($partitionstart));
736	if ($partitionend gt $partitionstart) {
737	$partitionend = substr($metadata_value, 0, 1);
738	if ($partitionend le $partitionstart) {
739	$partitionend = substr($metadata_value, 0, 2);
740	# Give up after three characters
741	if ($partitionend le $partitionstart) {
742	$partitionend = substr($metadata_value, 0, 3);
743	}
744	}
745	}
746
747	return $partitionend;
748	}
749
750
751	sub add_hlist_partition
752	{
753	my $self = shift(@_);
754	my @metadata_groups = @{shift(@_)};
755	my $classifier_node = shift(@_);
756	my $partitionname = shift(@_);
757	my $metadata_value_to_OIDs_hash_ref = shift(@_);
758
759	# Create an hlist partition
760	my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
761	'childtype' => "VList",
762	'contains' => [] );
763
764	# Add the children to the hlist partition
765	$self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
766	push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
767	}
768
769
770	sub add_vlist
771	{
772	my $self = shift(@_);
773	my @metadata_groups = @{shift(@_)};
774	my $classifier_node = shift(@_);
775	my $metadata_value_to_OIDs_hash_ref = shift(@_);
776
777	my $metadata_group = shift(@metadata_groups);
778	$classifier_node->{'mdtype'} = $metadata_group;
779
780	# Create an entry in the vlist for each value
781	foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
782	{
783	my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
784	# If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
785	if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
786	my $OID = $OIDs[0];
787	my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
788	push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
789	}
790	# If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
791	elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
792	@OIDs = $self->sort_leaf_items(\@OIDs);
793	foreach my $OID (@OIDs) {
794	my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
795	push(@{$classifier_node->{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
796	}
797
798	}
799	# Otherwise create a sublist (bookshelf) for the metadata value
800	else {
801	my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
802	'childtype' => "VList",
803	'mdtype' => $metadata_group,
804	'contains' => [] );
805
806	# If there are metadata elements remaining, recursively apply the process
807	if (@metadata_groups > 0) {
808	my $next_metadata_group = $metadata_groups[0];
809	$child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
810	$self->add_level(\@metadata_groups, \@OIDs, \%child_classifier_node);
811	}
812	# Otherwise just add the documents as children of this list
813	else {
814	@OIDs = $self->sort_leaf_items(\@OIDs);
815	foreach my $OID (@OIDs) {
816	my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
817	push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
818	}
819
820	}
821
822	# Add the sublist to the list
823	push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
824	}
825	}
826	}
827
828	sub metadata_offset
829	{
830	my $self = shift(@_);
831	my $metadata_group = shift(@_);
832	my $OID = shift(@_);
833	my $metadata_value = shift(@_);
834
835	my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
836	my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
837	for (my $i = 0; $i < scalar(@metadata_values); $i++) {
838	if ($metadata_value eq $metadata_values[$i]) {
839	return $i;
840	}
841	}
842
843	return 0;
844	}
845
846	sub sort_leaf_items
847	{
848	my $self = shift(@_);
849	my @OIDs = @{shift(@_)};
850	# my $classifier_node = shift(@_);
851
852	# Sort leaf nodes and add to list
853	my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
854	foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) {
855	my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
856	# Force a stable sort (Perl 5.6's sort isn't stable)
857	# !! The [0] bits aren't ideal (multiple metadata values) !!
858	@OIDs = @OIDs[ sort {
859	if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]} && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]})))
860	{
861	$OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
862	}
863	else
864	{
865	$a <=> $b;
866	}
867	} 0..$#OIDs ];
868	}
869	return @OIDs;
870	}
871
872
873
874	sub sort_metadata_values_array
875	{
876	my $self = shift(@_);
877	my @metadata_values = @_;
878
879	if ($self->{'unicode_collator'}) {
880	return $self->{'unicode_collator'}->sort(@metadata_values);
881	}
882	else {
883	return sort(@metadata_values);
884	}
885	}
886
887
888	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: