source: trunk/gsdl/perllib/classify/GenericList.pm@ 14084

Last change on this file since 14084 was 14084, checked in by mdewsnip, 17 years ago

Added "partition_name_length" option to GenericList, many thanks to Jens Wille.

  • Property svn:keywords set to Author Date Id Revision
File size: 19.0 KB
RevLine 
[10398]1###########################################################################
2#
3# GenericList.pm -- A general and flexible list classifier with most of
4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
[13741]29# TO DO: - Remove punctuation from metadata values before sorting.
30# - Add an AZCompactList-style hlist option?
[10398]31#
32###########################################################################
33
34package GenericList;
35
36
37use BasClas;
38
39use strict;
40
41
42sub BEGIN {
43 @GenericList::ISA = ('BasClas');
44}
45
46
47my $arguments =
48 [ { 'name' => "metadata",
49 'desc' => "{GenericList.metadata}",
50 'type' => "metadata",
51 'reqd' => "yes" },
[12889]52
[10398]53 # The interesting options
[10498]54 { 'name' => "always_bookshelf_last_level",
55 'desc' => "{GenericList.always_bookshelf_last_level}",
56 'type' => "flag" },
57 { 'name' => "classify_sections",
58 'desc' => "{GenericList.classify_sections}",
59 'type' => "flag" },
60 { 'name' => "partition_type_within_level",
61 'desc' => "{GenericList.partition_type_within_level}",
[10398]62 'type' => "string",
63 'deft' => "none" },
[10498]64 { 'name' => "partition_size_within_level",
65 'desc' => "{GenericList.partition_size_within_level}",
[10398]66 'type' => "string" },
[14084]67 { 'name' => "partition_name_length",
68 'desc' => "{GenericList.partition_name_length}",
69 'type' => "string" },
[10498]70 { 'name' => "sort_leaf_nodes_using",
71 'desc' => "{GenericList.sort_leaf_nodes_using}",
[10398]72 'type' => "metadata",
[10499]73 'deft' => "Title" },
[13551]74 { 'name' => "sort_using_unicode_collation",
75 'desc' => "{GenericList.sort_using_unicode_collation}",
76 'type' => "flag" },
[10499]77 { 'name' => "use_hlist_for",
78 'desc' => "{GenericList.use_hlist_for}",
79 'type' => "string" } ];
[10398]80
81my $options = { 'name' => "GenericList",
82 'desc' => "{GenericList.desc}",
[10502]83 'abstract' => "no",
[10398]84 'inherits' => "Yes",
85 'args' => $arguments };
86
87
88sub new
89{
90 my ($class) = shift(@_);
91 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
92 push(@$classifierslist, $class);
93
94 if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
95 if (defined $options) { push(@{$hashArgOptLists->{"OptList"}}, $options); }
96
[12891]97 my $self = new BasClas($classifierslist, $inputargs, $hashArgOptLists);
[10398]98
99 if ($self->{'info_only'}) {
100 # don't worry about any options etc
101 return bless $self, $class;
102 }
103
104 # The metadata elements to use (required)
[12894]105 if (!$self->{'metadata'}) {
[10398]106 die "Error: No metadata fields specified for GenericList.\n";
107 }
[12894]108 my @metadata_groups = split(/\//, $self->{'metadata'});
[12889]109 $self->{'metadata_groups'} = \@metadata_groups;
[10398]110
[12894]111 # The classifier button name (default: the first metadata element specified)
112 if (!$self->{'buttonname'}) {
113 my $first_metadata_group = $metadata_groups[0];
114 my $first_metadata_element = (split(/\;/, $first_metadata_group))[0];
115 $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
[10398]116 }
117
118 # Whether to group single items into a bookshelf (must be true for all metadata fields except the last)
[12889]119 foreach my $metadata_group (@metadata_groups) {
120 $self->{$metadata_group . ".always_bookshelf"} = "t";
[10398]121 }
[10498]122 if (!$self->{'always_bookshelf_last_level'}) {
[12894]123 # Default: leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
[12889]124 my $last_metadata_group = $metadata_groups[$#metadata_groups];
125 $self->{$last_metadata_group . ".always_bookshelf"} = "f";
[10398]126 }
127
[12892]128 # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
[12889]129 foreach my $metadata_group (@metadata_groups) {
[12892]130 $self->{$metadata_group . ".list_type"} = "VList";
[10499]131 }
[12892]132 foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
133 $self->{$metadata_group . ".list_type"} = "HList";
[10499]134 }
135
[12894]136 # How the items are grouped into partitions (default: no partition)
137 if (!$self->{"partition_type_within_level"}) {
138 $self->{"partition_type_within_level"} = "none";
[10398]139 }
140
[10499]141 # The number of items in each partition
[12894]142 if (!$self->{'partition_size_within_level'}) {
[10398]143 # Default: 20
[12889]144 foreach my $metadata_group (@metadata_groups) {
145 $self->{$metadata_group . ".partition_size_within_level"} = 20;
[10398]146 }
147 }
148 else {
[12894]149 my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
[10398]150
[10498]151 # Assign values based on the partition_size_within_level parameter
[12889]152 foreach my $metadata_group (@metadata_groups) {
[10498]153 my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
154 if (defined($partition_size_within_levelelem)) {
[12889]155 $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
[10398]156 }
157 else {
[12889]158 $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
[10398]159 }
160 }
161 }
162
[12894]163 # The metadata elements to use to sort the leaf nodes (default: Title)
164 my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
165 if ($self->{'sort_leaf_nodes_using'}) {
166 @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
[10398]167 }
[12894]168 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
[10398]169
[13551]170 # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
171 if ($self->{'sort_using_unicode_collation'}) {
[13791]172 # To use this you first need to download the allkeys.txt file from
173 # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
174 # Unicode/Collate directory.
[13551]175 require Unicode::Collate;
176 $self->{'unicode_collator'} = Unicode::Collate->new();
177 }
178
[12894]179 # An empty array for the document OIDs
180 $self->{'OIDs'} = [];
181
[10398]182 return bless $self, $class;
183}
184
185
186sub init
187{
188 # Nothing to do...
189}
190
191
[12896]192# Called for each document in the collection
[10398]193sub classify
194{
195 my $self = shift(@_);
196 my $doc_obj = shift(@_);
197
[12896]198 # If "-classify_sections" is set, classify every section of the document
[10398]199 if ($self->{'classify_sections'}) {
200 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
201 while (defined $section) {
[12896]202 $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section);
[10398]203 $section = $doc_obj->get_next_section($section);
204 }
205 }
[12896]206 # Otherwise just classify the top document section
[10398]207 else {
[12896]208 $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section());
[10398]209 }
210}
211
212
213sub classify_section
214{
215 my $self = shift(@_);
216 my $doc_obj = shift(@_);
[12896]217 my $section_OID = shift(@_);
[10398]218 my $section = shift(@_);
219
[12889]220 my @metadata_groups = @{$self->{'metadata_groups'}};
[10398]221
[12896]222 # Only classify the section if it has a value for one of the metadata elements in the first group
223 my $classify_section = 0;
224 my $first_metadata_group = $metadata_groups[0];
225 foreach my $first_metadata_group_element (split(/\;/, $first_metadata_group)) {
226 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $first_metadata_group_element);
227 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
228 # This section must be included in the classifier
229 $classify_section = 1;
230 last;
231 }
232 }
[10398]233
[12896]234 # We're not classifying this section because it doesn't have the required metadata
235 return if (!$classify_section);
[10398]236
[12896]237 # Otherwise, include this section in the classifier
238 push(@{$self->{'OIDs'}}, $section_OID);
239
240 # Create a hash for the metadata values of each metadata element we're interested in
241 my %metadata_groups_done = ();
242 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
243 # Take care not to do a metadata group more than once
244 unless ($metadata_groups_done{$metadata_group}) {
245 foreach my $metadata_element (split(/\;/, $metadata_group)) {
246 my @metadata_values = @{$doc_obj->get_metadata($section, $metadata_element)};
247 foreach my $metadata_value (@metadata_values) {
248 # Strip leading and trailing whitespace
249 $metadata_value =~ s/^\s*//;
250 $metadata_value =~ s/\s*$//;
[13550]251
252 # Convert the metadata value from a UTF-8 string to a Unicode string
253 # This means that length() and substr() work properly
254 # We need to be careful to convert classifier node title values back to UTF-8, however
255 my $metadata_value_unicode_string = &convert_utf8_string_to_unicode_string($metadata_value);
256
257 # Add the metadata value into the list for this combination of metadata group and section
258 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $metadata_value_unicode_string);
[10398]259 }
[12896]260 last if (@metadata_values > 0);
[10398]261 }
262
[12896]263 $metadata_groups_done{$metadata_group} = 1;
[10398]264 }
265 }
266}
267
268
269sub get_classify_info
270{
271 my $self = shift(@_);
272
[12896]273 # The metadata groups to classify by
[12889]274 my @metadata_groups = @{$self->{'metadata_groups'}};
275 my $first_metadata_group = $metadata_groups[0];
[10398]276
[12896]277 # The OID values of the documents to include in the classifier
[12889]278 my @OIDs = @{$self->{'OIDs'}};
[10398]279
[12896]280 # Create the root node of the classification hierarchy
[12893]281 my %classifier_node = ( 'thistype' => "Invisible",
[12895]282 'childtype' => $self->{$first_metadata_group . ".list_type"},
[12894]283 'Title' => $self->{'buttonname'},
[13271]284 'contains' => [],
285 'mdtype' => $first_metadata_group );
[10398]286
[12895]287 # Recursively create the classification hierarchy, one level for each metadata group
288 &add_level($self, \@metadata_groups, \@OIDs, \%classifier_node);
[12893]289 return \%classifier_node;
[10398]290}
291
292
[12895]293sub add_level
[10398]294{
295 my $self = shift(@_);
[12889]296 my @metadata_groups = @{shift(@_)};
297 my @OIDs = @{shift(@_)};
[12893]298 my $classifier_node = shift(@_);
299 # print STDERR "\nAdding AZ list for " . $classifier_node->{'Title'} . "\n";
[10398]300
[12889]301 my $metadata_group = $metadata_groups[0];
302 # print STDERR "Processing metadata group: " . $metadata_group . "\n";
303 # print STDERR "Number of OID values: " . @OIDs . "\n";
[10398]304
[13340]305 if (!defined($self->{$metadata_group . ".list"})) {
306 print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
307 return;
308 }
[13272]309 my %OIDtometavalueshash = %{$self->{$metadata_group . ".list"}};
[10398]310
311 # Create a mapping from metadata value to OID
312 my %metavaluetoOIDhash = ();
[12889]313 foreach my $OID (@OIDs) {
[13272]314 if ($OIDtometavalueshash{$OID}) {
315 my @metavalues = @{$OIDtometavalueshash{$OID}};
[10398]316 foreach my $metavalue (@metavalues) {
317 push(@{$metavaluetoOIDhash{$metavalue}}, $OID);
318 }
319 }
320 }
321 # print STDERR "Number of distinct values: " . scalar(keys %metavaluetoOIDhash) . "\n";
322
323 # Partition the values (if necessary)
[10498]324 my $partition_type_within_level = $self->{"partition_type_within_level"};
325 if ($partition_type_within_level =~ /^per_letter$/i) {
[10398]326 # Generate one hlist for each letter
[13551]327 my @sortedmetavalues = &sort_metadata_values_array($self, keys(%metavaluetoOIDhash));
[10398]328 my %metavaluetoOIDsubhash = ();
329
[13550]330 my $lastpartition = substr($sortedmetavalues[0], 0, 1);
[10398]331 foreach my $metavalue (@sortedmetavalues) {
[13550]332 my $metavaluepartition = substr($metavalue, 0, 1);
[10398]333
334 # Is this the start of a new partition?
335 if ($metavaluepartition ne $lastpartition) {
[12893]336 &add_hlist_partition($self, \@metadata_groups, $classifier_node, $lastpartition, \%metavaluetoOIDsubhash);
[10398]337 %metavaluetoOIDsubhash = ();
338 $lastpartition = $metavaluepartition;
339 }
340
341 $metavaluetoOIDsubhash{$metavalue} = $metavaluetoOIDhash{$metavalue};
342 }
343
344 # Don't forget to add the last partition
[12893]345 &add_hlist_partition($self, \@metadata_groups, $classifier_node, $lastpartition, \%metavaluetoOIDsubhash);
[10398]346
347 # The partitions are stored in an HList
[12893]348 $classifier_node->{'childtype'} = "HList";
[10398]349 }
350
351 else {
352 # Generate hlists of a certain size
[12889]353 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
[10498]354 if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metavaluetoOIDhash) > $partition_size_within_level) {
[13551]355 my @sortedmetavalues = &sort_metadata_values_array($self, keys(%metavaluetoOIDhash));
[10398]356 my $itemsdone = 0;
357 my %metavaluetoOIDsubhash = ();
358 my $lastpartitionend = "";
359 my $partitionstart;
360 foreach my $metavalue (@sortedmetavalues) {
361 $metavaluetoOIDsubhash{$metavalue} = $metavaluetoOIDhash{$metavalue};
362 $itemsdone++;
363 my $itemsinpartition = scalar(keys %metavaluetoOIDsubhash);
364
365 # Is this the start of a new partition?
366 if ($itemsinpartition == 1) {
[14084]367 $partitionstart = &generate_partition_start($metavalue, $lastpartitionend, $self->{"partition_name_length"});
[10398]368 }
369
370 # Is this the end of the partition?
[10498]371 if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetavalues) {
[14084]372 my $partitionend = &generate_partition_end($metavalue, $partitionstart, $self->{"partition_name_length"});
[10398]373 my $partitionname = $partitionstart;
374 if ($partitionend ne $partitionstart) {
375 $partitionname = $partitionname . "-" . $partitionend;
376 }
377
[12893]378 &add_hlist_partition($self, \@metadata_groups, $classifier_node, $partitionname, \%metavaluetoOIDsubhash);
[10398]379 %metavaluetoOIDsubhash = ();
380 $lastpartitionend = $partitionend;
381 }
382 }
383
384 # The partitions are stored in an HList
[12893]385 $classifier_node->{'childtype'} = "HList";
[10398]386 }
387
388 # Otherwise just add all the values to a VList
389 else {
[12893]390 &add_vlist($self, \@metadata_groups, $classifier_node, \%metavaluetoOIDhash);
[10398]391 }
392 }
393}
394
395
[13550]396sub convert_utf8_string_to_unicode_string
[10398]397{
[13550]398 my $utf8_string = shift(@_);
[10398]399
[13550]400 my $unicode_string = "";
401 foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
402 $unicode_string .= chr($unicode_value);
403 }
404 return $unicode_string;
[10398]405}
406
407
[13550]408sub convert_unicode_string_to_utf8_string
409{
410 my $unicode_string = shift(@_);
411
412 my @unicode_array;
413 for (my $i = 0; $i < length($unicode_string); $i++) {
414 push(@unicode_array, ord(substr($unicode_string, $i, 1)));
415 }
416 return &unicode::unicode2utf8(\@unicode_array);
417}
418
419
[10398]420sub generate_partition_start
421{
422 my $metavalue = shift(@_);
423 my $lastpartitionend = shift(@_);
[14084]424 my $partition_name_length = shift(@_);
[10398]425
[14084]426 if ($partition_name_length) {
427 return substr($metavalue, 0, $partition_name_length);
428 }
429
[13550]430 my $partitionstart = substr($metavalue, 0, 1);
[10398]431 if ($partitionstart le $lastpartitionend) {
[13550]432 $partitionstart = substr($metavalue, 0, 2);
[10398]433 # Give up after three characters
434 if ($partitionstart le $lastpartitionend) {
[13550]435 $partitionstart = substr($metavalue, 0, 3);
[10398]436 }
437 }
438
439 return $partitionstart;
440}
441
442
443sub generate_partition_end
444{
445 my $metavalue = shift(@_);
446 my $partitionstart = shift(@_);
[14084]447 my $partition_name_length = shift(@_);
[10398]448
[14084]449 if ($partition_name_length) {
450 return substr($metavalue, 0, $partition_name_length);
451 }
452
[13550]453 my $partitionend = substr($metavalue, 0, length($partitionstart));
[10398]454 if ($partitionend gt $partitionstart) {
[13550]455 $partitionend = substr($metavalue, 0, 1);
[10398]456 if ($partitionend le $partitionstart) {
[13550]457 $partitionend = substr($metavalue, 0, 2);
[10398]458 # Give up after three characters
459 if ($partitionend le $partitionstart) {
[13550]460 $partitionend = substr($metavalue, 0, 3);
[10398]461 }
462 }
463 }
464
465 return $partitionend;
466}
467
468
469sub add_hlist_partition
470{
471 my $self = shift(@_);
[12889]472 my @metadata_groups = @{shift(@_)};
[12893]473 my $classifier_node = shift(@_);
[10398]474 my $partitionname = shift(@_);
475 my %metavaluetoOIDhash = %{shift(@_)};
476
477 # Create an hlist partition
[13550]478 my %child_classifier_node = ( 'Title' => &convert_unicode_string_to_utf8_string($partitionname),
[12893]479 'childtype' => "VList",
480 'contains' => [] );
[10398]481
482 # Add the children to the hlist partition
[12893]483 &add_vlist($self, \@metadata_groups, \%child_classifier_node, \%metavaluetoOIDhash);
484 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
[10398]485}
486
487
488sub add_vlist
489{
490 my $self = shift(@_);
[12889]491 my @metadata_groups = @{shift(@_)};
[12893]492 my $classifier_node = shift(@_);
[10398]493 my %metavaluetoOIDhash = %{shift(@_)};
494
[12889]495 my $metadata_group = shift(@metadata_groups);
[13287]496 $classifier_node->{'mdtype'} = $metadata_group;
[10398]497
498 # Create an entry in the vlist for each value
[13551]499 foreach my $metavalue (&sort_metadata_values_array($self, keys(%metavaluetoOIDhash))) {
[12889]500 my @OIDs = @{$metavaluetoOIDhash{$metavalue}};
[10398]501
[10498]502 # If there is only one item and 'always_bookshelf' is false, add the item to the list
[12889]503 if (@OIDs == 1 && $self->{$metadata_group . ".always_bookshelf"} eq "f") {
[13271]504 my $OID = $OIDs[0];
505
506 # Find the offset of this metadata value
507 my $offset = 0;
[13272]508 my %OIDtometavalueshash = %{$self->{$metadata_group . ".list"}};
509 my @metavalues = @{$OIDtometavalueshash{$OID}};
[13271]510 for (my $i = 0; $i < scalar(@metavalues); $i++) {
511 if ($metavalue eq $metavalues[$i]) {
512 $offset = $i;
513 last;
514 }
515 }
516 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
[10398]517 }
518
519 # Otherwise create a sublist (bookshelf) for the metadata value
520 else {
[13550]521 my %child_classifier_node = ( 'Title' => &convert_unicode_string_to_utf8_string($metavalue),
[12893]522 'childtype' => "VList",
523 'contains' => [] );
[10398]524
525 # If there are metadata elements remaining, recursively apply the process
[12889]526 if (@metadata_groups > 0) {
527 my $next_metadata_group = $metadata_groups[0];
[12895]528 $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
529 &add_level($self, \@metadata_groups, \@OIDs, \%child_classifier_node);
[10398]530 }
531 # Otherwise just add the documents as children of this list
532 else {
[10498]533 # Sort the leaf nodes by the metadata elements specified with -sort_leaf_nodes_using
[12894]534 my @sort_leaf_nodes_usingmetadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
[12889]535 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_usingmetadata_groups) {
[13272]536 my %OIDtometavalueshash = %{$self->{$sort_leaf_nodes_usingmetaelem . ".list"}};
[10398]537 # Force a stable sort (Perl 5.6's sort isn't stable)
538 # !! The [0] bits aren't ideal (multiple metadata values) !!
[13272]539 @OIDs = @OIDs[ sort { $OIDtometavalueshash{$OIDs[$a]}[0] cmp $OIDtometavalueshash{$OIDs[$b]}[0] || $a <=> $b; } 0..$#OIDs ];
[10398]540 }
541
[12889]542 foreach my $OID (@OIDs) {
[12893]543 push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID });
[10398]544 }
545 }
546
547 # Add the sublist to the list
[12893]548 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
[10398]549 }
550 }
551}
552
553
[13551]554sub sort_metadata_values_array
555{
556 my $self = shift(@_);
557 my @metadata_values = @_;
558
559 if ($self->{'unicode_collator'}) {
560 return $self->{'unicode_collator'}->sort(@metadata_values);
561 }
562 else {
563 return sort(@metadata_values);
564 }
565}
566
567
[10398]5681;
Note: See TracBrowser for help on using the repository browser.