Context Navigation

source: main/trunk/greenstone2/perllib/classify/List.pm@ 23249

Last change on this file since 23249 was 23249, checked in by ak19, 13 years ago
A useful debug version of the rm method which got added in when Dr Bainbridge fixed the mimetype file deleting issue (commit 23248).
Property svn:keywords set to `Author Date Id Revision`
File size: 34.9 KB

Rev	Line
[10398]	1	###########################################################################
	2	#
[18568]	3	# List.pm -- A general and flexible list classifier with most of
[10398]	4	# the abilities of AZCompactList, and better Unicode,
	5	# metadata and sorting capabilities.
	6	#
	7	# A component of the Greenstone digital library software
	8	# from the New Zealand Digital Library Project at the
	9	# University of Waikato, New Zealand.
	10	#
	11	# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
	12	#
	13	# Copyright (C) 2005 New Zealand Digital Library Project
	14	#
	15	# This program is free software; you can redistribute it and/or modify
	16	# it under the terms of the GNU General Public License as published by
	17	# the Free Software Foundation; either version 2 of the License, or
	18	# (at your option) any later version.
	19	#
	20	# This program is distributed in the hope that it will be useful,
	21	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	22	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	23	# GNU General Public License for more details.
	24	#
	25	# You should have received a copy of the GNU General Public License
	26	# along with this program; if not, write to the Free Software
	27	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	28	#
[13741]	29	# TO DO: - Remove punctuation from metadata values before sorting.
	30	# - Add an AZCompactList-style hlist option?
[10398]	31	#
	32	###########################################################################
	33
[18568]	34	package List;
[10398]	35
	36
[17209]	37	use BaseClassifier;
[10398]	38
	39	use strict;
	40
	41
	42	sub BEGIN {
[18568]	43	@List::ISA = ('BaseClassifier');
[10398]	44	}
	45
# Choices offered for the partition_type_within_level option.
my $partition_type_list = [
    { 'name' => "per_letter",
      'desc' => "{List.level_partition.per_letter}" },
    { 'name' => "approximate_size",
      'desc' => "{List.level_partition.approximate_size}" },
    { 'name' => "constant_size",
      'desc' => "{List.level_partition.constant_size}" },
    { 'name' => "none",
      'desc' => "{List.level_partition.none}" },
];

# Lookup table used by new() to validate each partition type supplied.
# 'per_letter_fixed_size' is accepted here although it is not offered in
# $partition_type_list; new() rewrites it to 'approximate_size'.
my $valid_partition_types = {
    map { $_ => 1 }
        ('per_letter', 'constant_size', 'per_letter_fixed_size', 'approximate_size', 'none')
};

# Choices offered for the bookshelf_type option.
my $bookshelf_type_list = [
    { 'name' => "always",
      'desc' => "{List.bookshelf_type.always}" },
    { 'name' => "duplicate_only",
      'desc' => "{List.bookshelf_type.duplicate_only}" },
    { 'name' => "never",
      'desc' => "{List.bookshelf_type.never}" },
];

# The arguments this classifier accepts.
my $arguments = [
    { 'name' => "metadata",
      'desc' => "{List.metadata}",
      'type' => "metadata",
      'reqd' => "yes" },

    # The interesting options
    { 'name' => "bookshelf_type",
      'desc' => "{List.bookshelf_type}",
      'type' => "enum",
      'list' => $bookshelf_type_list,
      'deft' => "never" },
    { 'name' => "classify_sections",
      'desc' => "{List.classify_sections}",
      'type' => "flag" },
    # Must be enumstring because multiple values can be specified (separated by '/')
    { 'name' => "partition_type_within_level",
      'desc' => "{List.partition_type_within_level}",
      'type' => "enumstring",
      'list' => $partition_type_list,
      'deft' => "per_letter" },
    # Must be string because multiple values can be specified (separated by '/')
    { 'name' => "partition_size_within_level",
      'desc' => "{List.partition_size_within_level}",
      'type' => "string" },
    { 'name' => "partition_name_length",
      'desc' => "{List.partition_name_length}",
      'type' => "string" },
    { 'name' => "sort_leaf_nodes_using",
      'desc' => "{List.sort_leaf_nodes_using}",
      'type' => "metadata",
      'deft' => "Title" },
    { 'name' => "sort_using_unicode_collation",
      'desc' => "{List.sort_using_unicode_collation}",
      'type' => "flag" },
    { 'name' => "use_hlist_for",
      'desc' => "{List.use_hlist_for}",
      'type' => "string" },
    { 'name' => "removeprefix",
      'desc' => "{BasClas.removeprefix}",
      'type' => "regexp" },
    { 'name' => "removesuffix",
      'desc' => "{BasClas.removesuffix}",
      'type' => "regexp" },
];

# Top-level description of the classifier, tying the argument list to it.
my $options = {
    'name'     => "List",
    'desc'     => "{List.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
	119
	120
	121	sub new
	122	{
	123	my ($class) = shift(@_);
	124	my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
	125	push(@$classifierslist, $class);
	126
[17209]	127	push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
	128	push(@{$hashArgOptLists->{"OptList"}}, $options);
[10398]	129
[17209]	130	my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
[10398]	131
	132	if ($self->{'info_only'}) {
	133	# don't worry about any options etc
	134	return bless $self, $class;
	135	}
	136
	137	# The metadata elements to use (required)
[12894]	138	if (!$self->{'metadata'}) {
[18568]	139	die "Error: No metadata fields specified for List.\n";
[10398]	140	}
[12894]	141	my @metadata_groups = split(/\//, $self->{'metadata'});
[12889]	142	$self->{'metadata_groups'} = \@metadata_groups;
[10398]	143
[12894]	144	# The classifier button name (default: the first metadata element specified)
	145	if (!$self->{'buttonname'}) {
	146	my $first_metadata_group = $metadata_groups[0];
[20008]	147	my $first_metadata_element = (split(/\;\|,/, $first_metadata_group))[0];
[12894]	148	$self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
[10398]	149	}
	150
[18619]	151	# Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
[12889]	152	foreach my $metadata_group (@metadata_groups) {
[18619]	153	$self->{$metadata_group . ".bookshelf_type"} = "always";
	154	}
	155	my $last_metadata_group = $metadata_groups[$#metadata_groups];
	156	# Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
	157	$self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
	158
[12892]	159	# Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
[12889]	160	foreach my $metadata_group (@metadata_groups) {
[12892]	161	$self->{$metadata_group . ".list_type"} = "VList";
[10499]	162	}
[12892]	163	foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
	164	$self->{$metadata_group . ".list_type"} = "HList";
[10499]	165	}
	166
[18619]	167	# How the items are grouped into partitions (default: no partition)
	168	# for each level (metadata group), separated by '/'
[12894]	169	if (!$self->{"partition_type_within_level"}) {
[18619]	170	foreach my $metadata_group (@metadata_groups) {
	171	$self->{$metadata_group . ".partition_type_within_level"} = "none";
	172	}
	173	} else {
	174	my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'});
[20825]	175
	176	my $first = 1;
[18619]	177	foreach my $metadata_group (@metadata_groups) {
	178	my $partition_type_within_levelelem = shift(@partition_type_within_levellist);
[20865]	179	if (defined($partition_type_within_levelelem) && $partition_type_within_levelelem eq "per_letter_fixed_size") {
	180	print STDERR "per letter fixed size, changing to approximate size\n";
	181	$partition_type_within_levelelem = "approximate_size";
	182	}
[20825]	183	if (defined($partition_type_within_levelelem) && defined $valid_partition_types->{$partition_type_within_levelelem}) {
[18619]	184	$self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem;
	185	}
	186	else {
[20825]	187	if ($first) {
	188	$self->{$metadata_group . ".partition_type_within_level"} = "none";
	189	$first = 0;
	190	} else {
	191	$self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"};
	192	}
	193	if (defined($partition_type_within_levelelem)) {
	194	# ie invalid entry
	195	print STDERR "invalid partition type for level $metadata_group: $partition_type_within_levelelem, defaulting to ". $self->{$metadata_group . ".partition_type_within_level"} ."\n";
	196	}
[18619]	197	}
	198	}
[10398]	199	}
[20825]	200
[10499]	201	# The number of items in each partition
[12894]	202	if (!$self->{'partition_size_within_level'}) {
[10398]	203	# Default: 20
[12889]	204	foreach my $metadata_group (@metadata_groups) {
	205	$self->{$metadata_group . ".partition_size_within_level"} = 20;
[10398]	206	}
	207	}
	208	else {
[12894]	209	my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
[10398]	210
[10498]	211	# Assign values based on the partition_size_within_level parameter
[12889]	212	foreach my $metadata_group (@metadata_groups) {
[10498]	213	my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
	214	if (defined($partition_size_within_levelelem)) {
[12889]	215	$self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
[10398]	216	}
	217	else {
[12889]	218	$self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
[10398]	219	}
	220	}
	221	}
	222
[18619]	223	# The removeprefix and removesuffix expressions
	224	if ($self->{'removeprefix'}) {
	225	# If there are more than one expressions, use '' to quote each experession and '/' to separate
	226	my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
	227
	228	foreach my $metadata_group (@metadata_groups) {
	229	my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
	230	if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
	231	# Remove the other ' at the beginning and the end if there is any
	232	$removeprefix_expr_within_levelelem =~ s/^'//;
	233	$removeprefix_expr_within_levelelem =~ s/'$//;
	234	# Remove the extra ^ at the beginning
	235	$removeprefix_expr_within_levelelem =~ s/^\^//;
	236	$self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
	237	} else {
	238	$self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
	239	}
	240	}
	241	}
	242	if ($self->{'removesuffix'}) {
	243	my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
	244
	245	foreach my $metadata_group (@metadata_groups) {
	246	my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
	247	if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
	248	$removesuffix_expr_within_levelelem =~ s/^'//;
	249	$removesuffix_expr_within_levelelem =~ s/'$//;
	250	# Remove the extra $ at the end
	251	$removesuffix_expr_within_levelelem =~ s/\$$//;
	252	$self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
	253	} else {
	254	$self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
	255	}
	256	}
	257	}
	258
[12894]	259	# The metadata elements to use to sort the leaf nodes (default: Title)
	260	my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
	261	if ($self->{'sort_leaf_nodes_using'}) {
	262	@sort_leaf_nodes_using_metadata_groups = split(/\\|/, $self->{'sort_leaf_nodes_using'});
[10398]	263	}
[12894]	264	$self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
[10398]	265
[13551]	266	# Create an instance of the Unicode::Collate object if better Unicode sorting is desired
	267	if ($self->{'sort_using_unicode_collation'}) {
[13791]	268	# To use this you first need to download the allkeys.txt file from
	269	# http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
	270	# Unicode/Collate directory.
[13551]	271	require Unicode::Collate;
[23249]	272	#print STDERR "*** What's in INC: ".join(", ", @INC) ."\n";
[13551]	273	$self->{'unicode_collator'} = Unicode::Collate->new();
[23249]	274	#print STDERR "Table name: ".$self->{'unicode_collator'}->{'table'}."\n";
[13551]	275	}
	276
[23154]	277	# An empty array for the document/section OIDs that we are classifying
[12894]	278	$self->{'OIDs'} = [];
[23154]	279	# A hash for all the doc ids that we have seen, so we don't classify something twice
	280	$self->{'all_doc_OIDs'} = {};
[10398]	281	return bless $self, $class;
	282	}
	283
	284
	285	sub init
	286	{
	287	# Nothing to do...
	288	}
	289
	290
# Called for each document in the collection.
#
# Records the document's OID so it is classified only once, then passes
# either every section below the top section (when "-classify_sections" is
# set) or just the top section to classify_section().
sub classify
{
    my ($self, $doc_obj) = @_;

    # A document that has been seen before is reported and skipped
    if (defined $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()}) {
        print STDERR "Warning, List classifier has already seen document ".$doc_obj->get_OID().", not classifying again\n";
        return;
    }
    $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()} = 1;

    if (!$self->{'classify_sections'}) {
        # Just classify the top document section
        $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section());
    }
    else {
        # Walk every section after the top one; its OID is "<doc OID>.<section>"
        for (my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
             defined $section;
             $section = $doc_obj->get_next_section($section)) {
            $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section);
        }
    }
}
	316
	317	sub classify_section
	318	{
	319	my $self = shift(@_);
[23116]	320	my ($doc_obj,$section_OID,$section) = @_;
[10398]	321
[12889]	322	my @metadata_groups = @{$self->{'metadata_groups'}};
[10398]	323
[12896]	324	# Only classify the section if it has a value for one of the metadata elements in the first group
	325	my $classify_section = 0;
	326	my $first_metadata_group = $metadata_groups[0];
[22175]	327	my $remove_prefix_expr = $self->{$first_metadata_group . ".remove_prefix_expr"};
	328	my $remove_suffix_expr = $self->{$first_metadata_group . ".remove_suffix_expr"};
[20008]	329	foreach my $first_metadata_group_element (split(/\;\|,/, $first_metadata_group)) {
[20424]	330	my $real_first_metadata_group_element = $self->strip_ex_from_metadata($first_metadata_group_element);
[20421]	331	my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $real_first_metadata_group_element);
[18619]	332
	333	# Remove prefix/suffix if requested
[22175]	334	if (defined ($first_metadata_group_element_value)) {
	335	if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
	336	$first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
	337	}
	338
	339	if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
	340	$first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
	341	}
[18619]	342	}
[12896]	343	if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
	344	# This section must be included in the classifier
	345	$classify_section = 1;
	346	last;
[18619]	347	}
[12896]	348	}
[10398]	349
[12896]	350	# We're not classifying this section because it doesn't have the required metadata
	351	return if (!$classify_section);
[18455]	352
[12896]	353	# Otherwise, include this section in the classifier
[23154]	354
[12896]	355	push(@{$self->{'OIDs'}}, $section_OID);
	356
	357	# Create a hash for the metadata values of each metadata element we're interested in
	358	my %metadata_groups_done = ();
	359	foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
	360	# Take care not to do a metadata group more than once
	361	unless ($metadata_groups_done{$metadata_group}) {
[22175]	362	my $remove_prefix_expr = $self->{$metadata_group . ".remove_prefix_expr"};
	363	my $remove_suffix_expr = $self->{$metadata_group . ".remove_suffix_expr"};
[20008]	364	foreach my $metadata_element (split(/\;\|,/, $metadata_group)) {
[20424]	365	my $real_metadata_element = $self->strip_ex_from_metadata($metadata_element);
	366
[20421]	367	my @metadata_values = @{$doc_obj->get_metadata($section, $real_metadata_element)};
[12896]	368	foreach my $metadata_value (@metadata_values) {
	369	# Strip leading and trailing whitespace
	370	$metadata_value =~ s/^\s*//;
	371	$metadata_value =~ s/\s*$//;
[13550]	372
[18619]	373	# Remove prefix/suffix if requested
	374	if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
	375	$metadata_value =~ s/^$remove_prefix_expr//;
	376	}
	377	if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
	378	$metadata_value =~ s/$remove_suffix_expr$//;
	379	}
	380
[20865]	381	# uppercase the metadata - makes the AZList nicer
	382	$metadata_value = uc($metadata_value);
[13550]	383	# Convert the metadata value from a UTF-8 string to a Unicode string
	384	# This means that length() and substr() work properly
	385	# We need to be careful to convert classifier node title values back to UTF-8, however
[14173]	386	my $metadata_value_unicode_string = $self->convert_utf8_string_to_unicode_string($metadata_value);
[13550]	387
	388	# Add the metadata value into the list for this combination of metadata group and section
	389	push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $metadata_value_unicode_string);
[10398]	390	}
[12896]	391	last if (@metadata_values > 0);
[10398]	392	}
	393
[12896]	394	$metadata_groups_done{$metadata_group} = 1;
[10398]	395	}
	396	}
	397	}
	398
	399
	400	sub get_classify_info
	401	{
	402	my $self = shift(@_);
	403
[12896]	404	# The metadata groups to classify by
[12889]	405	my @metadata_groups = @{$self->{'metadata_groups'}};
	406	my $first_metadata_group = $metadata_groups[0];
[10398]	407
[12896]	408	# The OID values of the documents to include in the classifier
[12889]	409	my @OIDs = @{$self->{'OIDs'}};
[10398]	410
[12896]	411	# Create the root node of the classification hierarchy
[12893]	412	my %classifier_node = ( 'thistype' => "Invisible",
[12895]	413	'childtype' => $self->{$first_metadata_group . ".list_type"},
[12894]	414	'Title' => $self->{'buttonname'},
[13271]	415	'contains' => [],
	416	'mdtype' => $first_metadata_group );
[10398]	417
[12895]	418	# Recursively create the classification hierarchy, one level for each metadata group
[14173]	419	$self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
[12893]	420	return \%classifier_node;
[10398]	421	}
	422
	423
# Builds one level of the classification hierarchy beneath $classifier_node.
#
# Arguments:
#   \@metadata_groups - the metadata groups still to be classified; the
#                       first one is the group handled at this level
#   \@OIDs            - OIDs of the documents/sections to place at this level
#   $classifier_node  - hash ref of the parent node; its 'childtype' is set
#                       to "HList" whenever partitions are generated
#
# The values stored in $self->{"<group>.list"} are inverted into a
# value => [OIDs] map.  Depending on "<group>.partition_type_within_level"
# that map is handed to add_hlist_partition() in pieces (per_letter,
# approximate_size, constant_size) or to add_vlist() as a whole.
# Prints a warning and returns when the group has no values at all.
sub add_level
{
    my $self = shift(@_);
    my @metadata_groups = @{shift(@_)};
    my @OIDs = @{shift(@_)};
    my $classifier_node = shift(@_);

    my $metadata_group = $metadata_groups[0];

    if (!defined($self->{$metadata_group . ".list"})) {
        print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
        return;
    }

    # Create a mapping from metadata value to OID
    my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
    my %metadata_value_to_OIDs_hash = ();
    foreach my $OID (@OIDs)
    {
        if ($OID_to_metadata_values_hash_ref->{$OID})
        {
            my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
            foreach my $metadata_value (@metadata_values)
            {
                push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
            }
        }
    }
    #print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";

    # Partition the values (if necessary)
    my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
    my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
    if ($partition_type_within_level =~ /^per_letter$/i) {
        # Generate one hlist for each letter
        my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
        my %metadata_value_to_OIDs_subhash = ();

        my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
        foreach my $metadata_value (@sortedmetadata_values) {
            my $metadata_valuepartition = substr($metadata_value, 0, 1);

            # Is this the start of a new partition?
            if ($metadata_valuepartition ne $lastpartition) {
                $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
                %metadata_value_to_OIDs_subhash = ();
                $lastpartition = $metadata_valuepartition;
            }

            $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
        }

        # Don't forget to add the last partition
        $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);

        # The partitions are stored in an HList
        $classifier_node->{'childtype'} = "HList";
    }
    elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
        # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size
        # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible
        # If a letter has more items than specified, split into several hlists.
        # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
        my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
        my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};

        # Separate values by their first letter, each form a bucket, like the per_letter partition type
        my $last_partition = substr($sortedmetadata_values[0], 0, 1);
        my %partition_buckets = ();
        my @metadata_values_in_bucket = ();
        my $num_items_in_bucket = 0;
        foreach my $metadata_value (@sortedmetadata_values) {
            my $metadata_valuepartition = substr($metadata_value, 0, 1);
            # One item per document when bookshelf_type is "never", otherwise
            # exactly one item per distinct metadata value
            my $items_for_this_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;
            if ($metadata_valuepartition ne $last_partition) {
                my @temp_array = @metadata_values_in_bucket;
                # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
                my %partition_info = ();
                $partition_info{'metadata_values'} = \@temp_array;
                $partition_info{'size'} = $num_items_in_bucket;
                $partition_buckets{$last_partition} = \%partition_info;

                @metadata_values_in_bucket = ($metadata_value);
                $num_items_in_bucket = $items_for_this_value;
                $last_partition = $metadata_valuepartition;
            } else {
                $num_items_in_bucket += $items_for_this_value;
                push (@metadata_values_in_bucket, $metadata_value);
            }
        }
        # Last one
        my %partition_info = ();
        $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
        $partition_info{'size'} = $num_items_in_bucket;
        $partition_buckets{$last_partition} = \%partition_info;

        my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
        for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) {
            my $partition = $partition_keys[$i];
            my $items_in_partition = $partition_buckets{$partition}->{'size'};
            # Merge small buckets together, but keep the numeric bucket apart
            if ($items_in_partition < $partition_size_within_level) {
                my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'};
                if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level
                    && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)
                         || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) {
                    foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) {
                        push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge);
                    }
                    $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition;
                    delete $partition_buckets{$partition};
                }
            }
        }
        @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));

        # Add partitions, and divide big bucket into several
        my $last_partition_end = "";
        my $partition_start = "";
        foreach my $partition (@partition_keys) {
            my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}});
            my $items_in_partition = $partition_buckets{$partition}->{'size'};
            $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"});

            if ($items_in_partition > $partition_size_within_level) {
                my $items_done = 0;
                my %metadata_values_to_OIDs_subhashes = ();
                for (my $i = 0; $i < scalar(@metadata_values); $i++) {
                    my $metadata_value = $metadata_values[$i];
                    # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
                    my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;

                    my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"});
                    my $partitionname = $partition_start;
                    if ($partitionend ne $partition_start) {
                        $partitionname = $partitionname . "-" . $partitionend;
                    }

                    # Start a new partition
                    if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) {
                        $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
                        $last_partition_end = $partitionend;
                        $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
                        $items_done = 0;
                        %metadata_values_to_OIDs_subhashes = ();
                    }

                    # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
                    if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) {
                        my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
                        # Get the number of partitions needed for this value
                        my $num_splits = int($items_for_this_md_value / $partition_size_within_level);
                        $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits);

                        my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}};
                        for (my $split = 0; $split < $num_splits; $split++) {
                            my %OIDs_subhashes_for_this_value = ();
                            my @OIDs_for_this_partition = ();
                            for (my $d = $split * $partition_size_within_level; $d < (($split+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($split+1) * $partition_size_within_level); $d++) {
                                push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
                            }

                            # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
                            if ($split == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) {
                                $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
                                $items_done += scalar(@OIDs_for_this_partition);
                                next;
                            }

                            # Add an HList for this bucket
                            $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
                            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
                            $last_partition_end = $partitionname_for_this_value;
                        }

                        # When this was the final value of the bucket there is no
                        # later value to merge the left-over chunk with, so it has
                        # to be emitted here or its documents would be lost
                        if ($i == scalar(@metadata_values) - 1 && %metadata_values_to_OIDs_subhashes) {
                            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%metadata_values_to_OIDs_subhashes);
                        }
                        next;
                    }

                    $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
                    $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;

                    # The last partition
                    if($i == scalar(@metadata_values) - 1) {
                        $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
                    }
                }
            }
            else {
                # The easier case, just add a partition
                my %metadata_values_to_OIDs_subhashes = ();
                for (my $i = 0; $i < scalar(@metadata_values); $i++) {
                    my $metadata_value = $metadata_values[$i];
                    $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
                }
                my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
                my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"});
                my $partitionname = $partition_start;
                if ($partitionend ne $partition_start) {
                    $partitionname = $partitionname . "-" . $partitionend;
                }
                $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
                $last_partition_end = $partitionend;
            }
        }

        # The partitions are stored in an HList
        $classifier_node->{'childtype'} = "HList";

    } # end approximate_size
    else {
        # Generate hlists of a certain size
        if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
            my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
            my $itemsdone = 0;
            my %metadata_value_to_OIDs_subhash = ();
            my $lastpartitionend = "";
            my $partitionstart;
            foreach my $metadata_value (@sortedmetadata_values) {
                $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
                $itemsdone++;
                my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);

                # Is this the start of a new partition?
                if ($itemsinpartition == 1) {
                    $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
                }

                # Is this the end of the partition?
                if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) {
                    my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
                    my $partitionname = $partitionstart;
                    if ($partitionend ne $partitionstart) {
                        $partitionname = $partitionname . "-" . $partitionend;
                    }

                    $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
                    %metadata_value_to_OIDs_subhash = ();
                    $lastpartitionend = $partitionend;
                }
            }

            # The partitions are stored in an HList
            $classifier_node->{'childtype'} = "HList";
        }

        # Otherwise just add all the values to a VList
        else {
            $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
        }
    }
}
	674
	675
# Builds a Perl character string from a UTF-8 encoded string.
#
# The input is handed to unicode::utf82unicode(), which returns a reference
# to an array of numeric values (presumably Unicode code points -- confirm
# against unicode.pm).  Each value is turned into a character with chr().
#
# Parameters:
#   $self        - the classifier object (not used by this method)
#   $utf8_string - the string to convert
#
# Returns the concatenation of the converted characters, or "" when
# unicode::utf82unicode() yields no values.
sub convert_utf8_string_to_unicode_string
{
    my ($self, $utf8_string) = @_;

    return join("", map { chr($_) } @{&unicode::utf82unicode($utf8_string)});
}
	687
	688
# Converts a Perl character string back to UTF-8.
#
# Each character of the input is mapped to its ordinal value with ord(), and
# a reference to the resulting array is passed to unicode::unicode2utf8(),
# whose return value is handed back unchanged.
#
# Parameters:
#   $self           - the classifier object (not used by this method)
#   $unicode_string - the string to convert
#
# Returns whatever unicode::unicode2utf8() returns for the array of ordinals.
sub convert_unicode_string_to_utf8_string
{
    my ($self, $unicode_string) = @_;

    # Splitting on the empty pattern yields one element per character
    my @unicode_values = map { ord($_) } split(//, $unicode_string);
    return &unicode::unicode2utf8(\@unicode_values);
}
	700
	701
# Works out the label used for the start of a partition.
#
# Parameters:
#   $self                  - the classifier object (not used by this method)
#   $metadata_value        - the first metadata value in the partition
#   $lastpartitionend      - the end label of the previous partition ("" if none)
#   $partition_name_length - if true, the fixed number of characters to use
#
# Returns a prefix of $metadata_value.  With a fixed name length the prefix is
# exactly that long.  Otherwise the shortest prefix of one to three characters
# that sorts after $lastpartitionend is used, and the three character prefix
# is returned if none of them does.
sub generate_partition_start
{
    my ($self, $metadata_value, $lastpartitionend, $partition_name_length) = @_;

    return substr($metadata_value, 0, $partition_name_length) if ($partition_name_length);

    my $partition_start;
    foreach my $prefix_length (1 .. 3) {
        $partition_start = substr($metadata_value, 0, $prefix_length);
        # Stop lengthening as soon as the label is distinguishable from the
        # previous partition's end (we give up after three characters)
        last unless ($partition_start le $lastpartitionend);
    }

    return $partition_start;
}
	724
	725
	726	sub generate_partition_end
	727	{
[14173]	728	my $self = shift(@_);
[14845]	729	my $metadata_value = shift(@_);
[10398]	730	my $partitionstart = shift(@_);
[14084]	731	my $partition_name_length = shift(@_);
[10398]	732
[14084]	733	if ($partition_name_length) {
[14845]	734	return substr($metadata_value, 0, $partition_name_length);
[14084]	735	}
	736
[14845]	737	my $partitionend = substr($metadata_value, 0, length($partitionstart));
[10398]	738	if ($partitionend gt $partitionstart) {
[14845]	739	$partitionend = substr($metadata_value, 0, 1);
[10398]	740	if ($partitionend le $partitionstart) {
[14845]	741	$partitionend = substr($metadata_value, 0, 2);
[10398]	742	# Give up after three characters
	743	if ($partitionend le $partitionstart) {
[14845]	744	$partitionend = substr($metadata_value, 0, 3);
[10398]	745	}
	746	}
	747	}
	748
	749	return $partitionend;
	750	}
	751
	752
	753	sub add_hlist_partition
	754	{
	755	my $self = shift(@_);
[12889]	756	my @metadata_groups = @{shift(@_)};
[12893]	757	my $classifier_node = shift(@_);
[10398]	758	my $partitionname = shift(@_);
[14845]	759	my $metadata_value_to_OIDs_hash_ref = shift(@_);
[10398]	760
	761	# Create an hlist partition
[14173]	762	my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
[12893]	763	'childtype' => "VList",
	764	'contains' => [] );
[10398]	765
	766	# Add the children to the hlist partition
[14845]	767	$self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
[12893]	768	push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
[10398]	769	}
	770
	771
	772	sub add_vlist
	773	{
	774	my $self = shift(@_);
[12889]	775	my @metadata_groups = @{shift(@_)};
[12893]	776	my $classifier_node = shift(@_);
[14845]	777	my $metadata_value_to_OIDs_hash_ref = shift(@_);
[10398]	778
[12889]	779	my $metadata_group = shift(@metadata_groups);
[13287]	780	$classifier_node->{'mdtype'} = $metadata_group;
[10398]	781
	782	# Create an entry in the vlist for each value
[14845]	783	foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
	784	{
	785	my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
[18619]	786	# If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
	787	if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
[13271]	788	my $OID = $OIDs[0];
[21969]	789	my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
[13271]	790	push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
[21969]	791	}
[18619]	792	# If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
	793	elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
[21969]	794	@OIDs = $self->sort_leaf_items(\@OIDs);
	795	foreach my $OID (@OIDs) {
	796	my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
	797	push(@{$classifier_node->{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
	798	}
	799
[10398]	800	}
	801	# Otherwise create a sublist (bookshelf) for the metadata value
[18619]	802	else {
[14845]	803	my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
[12893]	804	'childtype' => "VList",
[21969]	805	'mdtype' => $metadata_group,
[12893]	806	'contains' => [] );
[10398]	807
	808	# If there are metadata elements remaining, recursively apply the process
[12889]	809	if (@metadata_groups > 0) {
	810	my $next_metadata_group = $metadata_groups[0];
[12895]	811	$child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
[14173]	812	$self->add_level(\@metadata_groups, \@OIDs, \%child_classifier_node);
[10398]	813	}
	814	# Otherwise just add the documents as children of this list
	815	else {
[21969]	816	@OIDs = $self->sort_leaf_items(\@OIDs);
	817	foreach my $OID (@OIDs) {
	818	my $offset = $self->metadata_offset($metadata_group, $OID, $metadata_value);
	819	push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID , 'offset' => $offset });
	820	}
	821
[10398]	822	}
	823
	824	# Add the sublist to the list
[12893]	825	push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
[10398]	826	}
	827	}
	828	}
	829
# Finds the position of a metadata value among a document's values.
#
# Parameters:
#   $self           - the classifier object; $self->{"<group>.list"} is read as
#                     a hash ref mapping OID => array ref of metadata values
#   $metadata_group - the metadata group whose list is consulted
#   $OID            - the document identifier
#   $metadata_value - the value to look for (compared with eq)
#
# Returns the index of the first matching value, or 0 when there is no match.
sub metadata_offset
{
    my ($self, $metadata_group, $OID, $metadata_value) = @_;

    my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
    my @values_for_OID = @{$OID_to_metadata_values_hash_ref->{$OID}};

    my $position = 0;
    foreach my $candidate_value (@values_for_OID) {
        return $position if ($metadata_value eq $candidate_value);
        $position++;
    }

    # Not found: fall back to the first position
    return 0;
}
	847
	848	sub sort_leaf_items
	849	{
	850	my $self = shift(@_);
[18619]	851	my @OIDs = @{shift(@_)};
[21969]	852	# my $classifier_node = shift(@_);
[18619]	853
	854	# Sort leaf nodes and add to list
[20825]	855	my @sort_leaf_nodes_using_metadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
	856	foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_using_metadata_groups) {
[18619]	857	my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
	858	# Force a stable sort (Perl 5.6's sort isn't stable)
	859	# !! The [0] bits aren't ideal (multiple metadata values) !!
[22667]	860	@OIDs = @OIDs[ sort {
	861	if (defined($OID_to_metadata_values_hash_ref->{$OIDs[$a]} && defined($OID_to_metadata_values_hash_ref->{$OIDs[$b]})))
	862	{
	863	$OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0];
	864	}
	865	else
	866	{
	867	$a <=> $b;
	868	}
	869	} 0..$#OIDs ];
[18619]	870	}
[21969]	871	return @OIDs;
[18619]	872	}
	873
	874
[21969]	875
# Sorts a list of metadata values.
#
# Parameters:
#   $self            - the classifier object; if $self->{'unicode_collator'}
#                      is set, its sort() method does the sorting
#   @metadata_values - the values to sort
#
# Returns the sorted values: ordered by the collator when one is configured,
# otherwise by Perl's default string sort.
sub sort_metadata_values_array
{
    my ($self, @metadata_values) = @_;

    my $unicode_collator = $self->{'unicode_collator'};
    return $unicode_collator->sort(@metadata_values) if ($unicode_collator);

    return sort(@metadata_values);
}
	888
	889
[10398]	890	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: