source: gsdl/trunk/perllib/classify/List.pm@ 18568

Last change on this file since 18568 was 18568, checked in by kjdon, 15 years ago

file contents change for name change

  • Property svn:keywords set to Author Date Id Revision
File size: 19.8 KB
Line 
1###########################################################################
2#
3# List.pm -- A general and flexible list classifier with most of
4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
29# TO DO: - Remove punctuation from metadata values before sorting.
30# - Add an AZCompactList-style hlist option?
31#
32###########################################################################
33
34package List;
35
36
37use BaseClassifier;
38
39use strict;
40
41
42sub BEGIN {
43 @List::ISA = ('BaseClassifier');
44}
45
46
47my $arguments =
48 [ { 'name' => "metadata",
49 'desc' => "{GenericList.metadata}",
50 'type' => "metadata",
51 'reqd' => "yes" },
52
53 # The interesting options
54 { 'name' => "always_bookshelf_last_level",
55 'desc' => "{GenericList.always_bookshelf_last_level}",
56 'type' => "flag" },
57 { 'name' => "classify_sections",
58 'desc' => "{GenericList.classify_sections}",
59 'type' => "flag" },
60 { 'name' => "partition_type_within_level",
61 'desc' => "{GenericList.partition_type_within_level}",
62 'type' => "string",
63 'deft' => "none" },
64 { 'name' => "partition_size_within_level",
65 'desc' => "{GenericList.partition_size_within_level}",
66 'type' => "string" },
67 { 'name' => "partition_name_length",
68 'desc' => "{GenericList.partition_name_length}",
69 'type' => "string" },
70 { 'name' => "sort_leaf_nodes_using",
71 'desc' => "{GenericList.sort_leaf_nodes_using}",
72 'type' => "metadata",
73 'deft' => "Title" },
74 { 'name' => "sort_using_unicode_collation",
75 'desc' => "{GenericList.sort_using_unicode_collation}",
76 'type' => "flag" },
77 { 'name' => "use_hlist_for",
78 'desc' => "{GenericList.use_hlist_for}",
79 'type' => "string" } ];
80
81my $options = { 'name' => "List",
82 'desc' => "{GenericList.desc}",
83 'abstract' => "no",
84 'inherits' => "Yes",
85 'args' => $arguments };
86
87
88sub new
89{
90 my ($class) = shift(@_);
91 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
92 push(@$classifierslist, $class);
93
94 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
95 push(@{$hashArgOptLists->{"OptList"}}, $options);
96
97 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
98
99 if ($self->{'info_only'}) {
100 # don't worry about any options etc
101 return bless $self, $class;
102 }
103
104 # The metadata elements to use (required)
105 if (!$self->{'metadata'}) {
106 die "Error: No metadata fields specified for List.\n";
107 }
108 my @metadata_groups = split(/\//, $self->{'metadata'});
109 $self->{'metadata_groups'} = \@metadata_groups;
110
111 # The classifier button name (default: the first metadata element specified)
112 if (!$self->{'buttonname'}) {
113 my $first_metadata_group = $metadata_groups[0];
114 my $first_metadata_element = (split(/\;/, $first_metadata_group))[0];
115 $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
116 }
117
118 # Whether to group single items into a bookshelf (must be true for all metadata fields except the last)
119 foreach my $metadata_group (@metadata_groups) {
120 $self->{$metadata_group . ".always_bookshelf"} = "t";
121 }
122 if (!$self->{'always_bookshelf_last_level'}) {
123 # Default: leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
124 my $last_metadata_group = $metadata_groups[$#metadata_groups];
125 $self->{$last_metadata_group . ".always_bookshelf"} = "f";
126 }
127
128 # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
129 foreach my $metadata_group (@metadata_groups) {
130 $self->{$metadata_group . ".list_type"} = "VList";
131 }
132 foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
133 $self->{$metadata_group . ".list_type"} = "HList";
134 }
135
136 # How the items are grouped into partitions (default: no partition)
137 if (!$self->{"partition_type_within_level"}) {
138 $self->{"partition_type_within_level"} = "none";
139 }
140
141 # The number of items in each partition
142 if (!$self->{'partition_size_within_level'}) {
143 # Default: 20
144 foreach my $metadata_group (@metadata_groups) {
145 $self->{$metadata_group . ".partition_size_within_level"} = 20;
146 }
147 }
148 else {
149 my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
150
151 # Assign values based on the partition_size_within_level parameter
152 foreach my $metadata_group (@metadata_groups) {
153 my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
154 if (defined($partition_size_within_levelelem)) {
155 $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
156 }
157 else {
158 $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
159 }
160 }
161 }
162
163 # The metadata elements to use to sort the leaf nodes (default: Title)
164 my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
165 if ($self->{'sort_leaf_nodes_using'}) {
166 @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
167 }
168 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
169
170 # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
171 if ($self->{'sort_using_unicode_collation'}) {
172 # To use this you first need to download the allkeys.txt file from
173 # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
174 # Unicode/Collate directory.
175 require Unicode::Collate;
176 $self->{'unicode_collator'} = Unicode::Collate->new();
177 }
178
179 # An empty array for the document OIDs
180 $self->{'OIDs'} = [];
181
182 return bless $self, $class;
183}
184
185
186sub init
187{
188 # Nothing to do...
189}
190
191
192# Called for each document in the collection
193sub classify
194{
195 my $self = shift(@_);
196 my ($doc_obj,$edit_mode) = @_;
197
198 # If "-classify_sections" is set, classify every section of the document
199 if ($self->{'classify_sections'}) {
200 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
201 while (defined $section) {
202 $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section, $edit_mode);
203 $section = $doc_obj->get_next_section($section);
204 }
205 }
206 # Otherwise just classify the top document section
207 else {
208 $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section(), $edit_mode);
209 }
210}
211
212
213sub classify_section
214{
215 my $self = shift(@_);
216 my ($doc_obj,$section_OID,$section,$edit_mode) = @_;
217
218 my @metadata_groups = @{$self->{'metadata_groups'}};
219
220 # Only classify the section if it has a value for one of the metadata elements in the first group
221 my $classify_section = 0;
222 my $first_metadata_group = $metadata_groups[0];
223 foreach my $first_metadata_group_element (split(/\;/, $first_metadata_group)) {
224 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $first_metadata_group_element);
225 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
226 # This section must be included in the classifier
227 $classify_section = 1;
228 last;
229 }
230 }
231
232 # We're not classifying this section because it doesn't have the required metadata
233 return if (!$classify_section);
234
235
236 if (($edit_mode eq "delete") || ($edit_mode eq "update")) {
237 $self->oid_array_delete($section_OID,'OIDs');
238 if ($edit_mode eq "delete") {
239 return;
240 }
241 }
242
243 # Otherwise, include this section in the classifier
244 push(@{$self->{'OIDs'}}, $section_OID);
245
246 # Create a hash for the metadata values of each metadata element we're interested in
247 my %metadata_groups_done = ();
248 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
249 # Take care not to do a metadata group more than once
250 unless ($metadata_groups_done{$metadata_group}) {
251 foreach my $metadata_element (split(/\;/, $metadata_group)) {
252 my @metadata_values = @{$doc_obj->get_metadata($section, $metadata_element)};
253 foreach my $metadata_value (@metadata_values) {
254 # Strip leading and trailing whitespace
255 $metadata_value =~ s/^\s*//;
256 $metadata_value =~ s/\s*$//;
257
258 # Convert the metadata value from a UTF-8 string to a Unicode string
259 # This means that length() and substr() work properly
260 # We need to be careful to convert classifier node title values back to UTF-8, however
261 my $metadata_value_unicode_string = $self->convert_utf8_string_to_unicode_string($metadata_value);
262
263 # Add the metadata value into the list for this combination of metadata group and section
264 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $metadata_value_unicode_string);
265 }
266 last if (@metadata_values > 0);
267 }
268
269 $metadata_groups_done{$metadata_group} = 1;
270 }
271 }
272}
273
274
275sub get_classify_info
276{
277 my $self = shift(@_);
278
279 # The metadata groups to classify by
280 my @metadata_groups = @{$self->{'metadata_groups'}};
281 my $first_metadata_group = $metadata_groups[0];
282
283 # The OID values of the documents to include in the classifier
284 my @OIDs = @{$self->{'OIDs'}};
285
286 # Create the root node of the classification hierarchy
287 my %classifier_node = ( 'thistype' => "Invisible",
288 'childtype' => $self->{$first_metadata_group . ".list_type"},
289 'Title' => $self->{'buttonname'},
290 'contains' => [],
291 'mdtype' => $first_metadata_group );
292
293 # Recursively create the classification hierarchy, one level for each metadata group
294 $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
295 return \%classifier_node;
296}
297
298
299sub add_level
300{
301 my $self = shift(@_);
302 my @metadata_groups = @{shift(@_)};
303 my @OIDs = @{shift(@_)};
304 my $classifier_node = shift(@_);
305 # print STDERR "\nAdding AZ list for " . $classifier_node->{'Title'} . "\n";
306
307 my $metadata_group = $metadata_groups[0];
308 # print STDERR "Processing metadata group: " . $metadata_group . "\n";
309 # print STDERR "Number of OID values: " . @OIDs . "\n";
310
311 if (!defined($self->{$metadata_group . ".list"})) {
312 print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
313 return;
314 }
315
316 # Create a mapping from metadata value to OID
317 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
318 my %metadata_value_to_OIDs_hash = ();
319 foreach my $OID (@OIDs)
320 {
321 if ($OID_to_metadata_values_hash_ref->{$OID})
322 {
323 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
324 foreach my $metadata_value (@metadata_values)
325 {
326 push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
327 }
328 }
329 }
330 # print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
331
332 # Partition the values (if necessary)
333 my $partition_type_within_level = $self->{"partition_type_within_level"};
334 if ($partition_type_within_level =~ /^per_letter$/i) {
335 # Generate one hlist for each letter
336 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
337 my %metadata_value_to_OIDs_subhash = ();
338
339 my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
340 foreach my $metadata_value (@sortedmetadata_values) {
341 my $metadata_valuepartition = substr($metadata_value, 0, 1);
342
343 # Is this the start of a new partition?
344 if ($metadata_valuepartition ne $lastpartition) {
345 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
346 %metadata_value_to_OIDs_subhash = ();
347 $lastpartition = $metadata_valuepartition;
348 }
349
350 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
351 }
352
353 # Don't forget to add the last partition
354 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
355
356 # The partitions are stored in an HList
357 $classifier_node->{'childtype'} = "HList";
358 }
359
360 else {
361 # Generate hlists of a certain size
362 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
363 if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
364 my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
365 my $itemsdone = 0;
366 my %metadata_value_to_OIDs_subhash = ();
367 my $lastpartitionend = "";
368 my $partitionstart;
369 foreach my $metadata_value (@sortedmetadata_values) {
370 $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
371 $itemsdone++;
372 my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
373
374 # Is this the start of a new partition?
375 if ($itemsinpartition == 1) {
376 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
377 }
378
379 # Is this the end of the partition?
380 if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) {
381 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
382 my $partitionname = $partitionstart;
383 if ($partitionend ne $partitionstart) {
384 $partitionname = $partitionname . "-" . $partitionend;
385 }
386
387 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
388 %metadata_value_to_OIDs_subhash = ();
389 $lastpartitionend = $partitionend;
390 }
391 }
392
393 # The partitions are stored in an HList
394 $classifier_node->{'childtype'} = "HList";
395 }
396
397 # Otherwise just add all the values to a VList
398 else {
399 $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
400 }
401 }
402}
403
404
405sub convert_utf8_string_to_unicode_string
406{
407 my $self = shift(@_);
408 my $utf8_string = shift(@_);
409
410 my $unicode_string = "";
411 foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
412 $unicode_string .= chr($unicode_value);
413 }
414 return $unicode_string;
415}
416
417
418sub convert_unicode_string_to_utf8_string
419{
420 my $self = shift(@_);
421 my $unicode_string = shift(@_);
422
423 my @unicode_array;
424 for (my $i = 0; $i < length($unicode_string); $i++) {
425 push(@unicode_array, ord(substr($unicode_string, $i, 1)));
426 }
427 return &unicode::unicode2utf8(\@unicode_array);
428}
429
430
431sub generate_partition_start
432{
433 my $self = shift(@_);
434 my $metadata_value = shift(@_);
435 my $lastpartitionend = shift(@_);
436 my $partition_name_length = shift(@_);
437
438 if ($partition_name_length) {
439 return substr($metadata_value, 0, $partition_name_length);
440 }
441
442 my $partitionstart = substr($metadata_value, 0, 1);
443 if ($partitionstart le $lastpartitionend) {
444 $partitionstart = substr($metadata_value, 0, 2);
445 # Give up after three characters
446 if ($partitionstart le $lastpartitionend) {
447 $partitionstart = substr($metadata_value, 0, 3);
448 }
449 }
450
451 return $partitionstart;
452}
453
454
455sub generate_partition_end
456{
457 my $self = shift(@_);
458 my $metadata_value = shift(@_);
459 my $partitionstart = shift(@_);
460 my $partition_name_length = shift(@_);
461
462 if ($partition_name_length) {
463 return substr($metadata_value, 0, $partition_name_length);
464 }
465
466 my $partitionend = substr($metadata_value, 0, length($partitionstart));
467 if ($partitionend gt $partitionstart) {
468 $partitionend = substr($metadata_value, 0, 1);
469 if ($partitionend le $partitionstart) {
470 $partitionend = substr($metadata_value, 0, 2);
471 # Give up after three characters
472 if ($partitionend le $partitionstart) {
473 $partitionend = substr($metadata_value, 0, 3);
474 }
475 }
476 }
477
478 return $partitionend;
479}
480
481
482sub add_hlist_partition
483{
484 my $self = shift(@_);
485 my @metadata_groups = @{shift(@_)};
486 my $classifier_node = shift(@_);
487 my $partitionname = shift(@_);
488 my $metadata_value_to_OIDs_hash_ref = shift(@_);
489
490 # Create an hlist partition
491 my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
492 'childtype' => "VList",
493 'contains' => [] );
494
495 # Add the children to the hlist partition
496 $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
497 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
498}
499
500
501sub add_vlist
502{
503 my $self = shift(@_);
504 my @metadata_groups = @{shift(@_)};
505 my $classifier_node = shift(@_);
506 my $metadata_value_to_OIDs_hash_ref = shift(@_);
507
508 my $metadata_group = shift(@metadata_groups);
509 $classifier_node->{'mdtype'} = $metadata_group;
510
511 # Create an entry in the vlist for each value
512 foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
513 {
514 my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
515
516 # If there is only one item and 'always_bookshelf' is false, add the item to the list
517 if (@OIDs == 1 && $self->{$metadata_group . ".always_bookshelf"} eq "f") {
518 my $OID = $OIDs[0];
519
520 # Find the offset of this metadata value
521 my $offset = 0;
522 my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
523 my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
524 for (my $i = 0; $i < scalar(@metadata_values); $i++) {
525 if ($metadata_value eq $metadata_values[$i]) {
526 $offset = $i;
527 last;
528 }
529 }
530 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
531 }
532
533 # Otherwise create a sublist (bookshelf) for the metadata value
534 else {
535 my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
536 'childtype' => "VList",
537 'contains' => [] );
538
539 # If there are metadata elements remaining, recursively apply the process
540 if (@metadata_groups > 0) {
541 my $next_metadata_group = $metadata_groups[0];
542 $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
543 $self->add_level(\@metadata_groups, \@OIDs, \%child_classifier_node);
544 }
545 # Otherwise just add the documents as children of this list
546 else {
547 # Sort the leaf nodes by the metadata elements specified with -sort_leaf_nodes_using
548 my @sort_leaf_nodes_usingmetadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
549 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_usingmetadata_groups) {
550 my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
551 # Force a stable sort (Perl 5.6's sort isn't stable)
552 # !! The [0] bits aren't ideal (multiple metadata values) !!
553 @OIDs = @OIDs[ sort { $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0] || $a <=> $b; } 0..$#OIDs ];
554 }
555
556 foreach my $OID (@OIDs) {
557 push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID });
558 }
559 }
560
561 # Add the sublist to the list
562 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
563 }
564 }
565}
566
567
568sub sort_metadata_values_array
569{
570 my $self = shift(@_);
571 my @metadata_values = @_;
572
573 if ($self->{'unicode_collator'}) {
574 return $self->{'unicode_collator'}->sort(@metadata_values);
575 }
576 else {
577 return sort(@metadata_values);
578 }
579}
580
581
5821;
Note: See TracBrowser for help on using the repository browser.