source: trunk/gsdl/perllib/classify/GenericList.pm@ 13791

Last change on this file since 13791 was 13791, checked in by mdewsnip, 17 years ago

Added a comment about downloading the allkeys.txt file for using Unicode::Collate.

  • Property svn:keywords set to Author Date Id Revision
File size: 18.6 KB
Line 
1###########################################################################
2#
3# GenericList.pm -- A general and flexible list classifier with most of
4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
29# TO DO: - Remove punctuation from metadata values before sorting.
30# - Add an AZCompactList-style hlist option?
31#
32###########################################################################
33
34package GenericList;
35
36
37use BasClas;
38
39use strict;
40
41
42sub BEGIN {
43 @GenericList::ISA = ('BasClas');
44}
45
46
47my $arguments =
48 [ { 'name' => "metadata",
49 'desc' => "{GenericList.metadata}",
50 'type' => "metadata",
51 'reqd' => "yes" },
52
53 # The interesting options
54 { 'name' => "always_bookshelf_last_level",
55 'desc' => "{GenericList.always_bookshelf_last_level}",
56 'type' => "flag" },
57 { 'name' => "classify_sections",
58 'desc' => "{GenericList.classify_sections}",
59 'type' => "flag" },
60 { 'name' => "partition_type_within_level",
61 'desc' => "{GenericList.partition_type_within_level}",
62 'type' => "string",
63 'deft' => "none" },
64 { 'name' => "partition_size_within_level",
65 'desc' => "{GenericList.partition_size_within_level}",
66 'type' => "string" },
67 { 'name' => "sort_leaf_nodes_using",
68 'desc' => "{GenericList.sort_leaf_nodes_using}",
69 'type' => "metadata",
70 'deft' => "Title" },
71 { 'name' => "sort_using_unicode_collation",
72 'desc' => "{GenericList.sort_using_unicode_collation}",
73 'type' => "flag" },
74 { 'name' => "use_hlist_for",
75 'desc' => "{GenericList.use_hlist_for}",
76 'type' => "string" } ];
77
78my $options = { 'name' => "GenericList",
79 'desc' => "{GenericList.desc}",
80 'abstract' => "no",
81 'inherits' => "Yes",
82 'args' => $arguments };
83
84
85sub new
86{
87 my ($class) = shift(@_);
88 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
89 push(@$classifierslist, $class);
90
91 if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
92 if (defined $options) { push(@{$hashArgOptLists->{"OptList"}}, $options); }
93
94 my $self = new BasClas($classifierslist, $inputargs, $hashArgOptLists);
95
96 if ($self->{'info_only'}) {
97 # don't worry about any options etc
98 return bless $self, $class;
99 }
100
101 # The metadata elements to use (required)
102 if (!$self->{'metadata'}) {
103 die "Error: No metadata fields specified for GenericList.\n";
104 }
105 my @metadata_groups = split(/\//, $self->{'metadata'});
106 $self->{'metadata_groups'} = \@metadata_groups;
107
108 # The classifier button name (default: the first metadata element specified)
109 if (!$self->{'buttonname'}) {
110 my $first_metadata_group = $metadata_groups[0];
111 my $first_metadata_element = (split(/\;/, $first_metadata_group))[0];
112 $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
113 }
114
115 # Whether to group single items into a bookshelf (must be true for all metadata fields except the last)
116 foreach my $metadata_group (@metadata_groups) {
117 $self->{$metadata_group . ".always_bookshelf"} = "t";
118 }
119 if (!$self->{'always_bookshelf_last_level'}) {
120 # Default: leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
121 my $last_metadata_group = $metadata_groups[$#metadata_groups];
122 $self->{$last_metadata_group . ".always_bookshelf"} = "f";
123 }
124
125 # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
126 foreach my $metadata_group (@metadata_groups) {
127 $self->{$metadata_group . ".list_type"} = "VList";
128 }
129 foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
130 $self->{$metadata_group . ".list_type"} = "HList";
131 }
132
133 # How the items are grouped into partitions (default: no partition)
134 if (!$self->{"partition_type_within_level"}) {
135 $self->{"partition_type_within_level"} = "none";
136 }
137
138 # The number of items in each partition
139 if (!$self->{'partition_size_within_level'}) {
140 # Default: 20
141 foreach my $metadata_group (@metadata_groups) {
142 $self->{$metadata_group . ".partition_size_within_level"} = 20;
143 }
144 }
145 else {
146 my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
147
148 # Assign values based on the partition_size_within_level parameter
149 foreach my $metadata_group (@metadata_groups) {
150 my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
151 if (defined($partition_size_within_levelelem)) {
152 $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
153 }
154 else {
155 $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
156 }
157 }
158 }
159
160 # The metadata elements to use to sort the leaf nodes (default: Title)
161 my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
162 if ($self->{'sort_leaf_nodes_using'}) {
163 @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
164 }
165 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
166
167 # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
168 if ($self->{'sort_using_unicode_collation'}) {
169 # To use this you first need to download the allkeys.txt file from
170 # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
171 # Unicode/Collate directory.
172 require Unicode::Collate;
173 $self->{'unicode_collator'} = Unicode::Collate->new();
174 }
175
176 # An empty array for the document OIDs
177 $self->{'OIDs'} = [];
178
179 return bless $self, $class;
180}
181
182
183sub init
184{
185 # Nothing to do...
186}
187
188
189# Called for each document in the collection
190sub classify
191{
192 my $self = shift(@_);
193 my $doc_obj = shift(@_);
194
195 # If "-classify_sections" is set, classify every section of the document
196 if ($self->{'classify_sections'}) {
197 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
198 while (defined $section) {
199 $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section);
200 $section = $doc_obj->get_next_section($section);
201 }
202 }
203 # Otherwise just classify the top document section
204 else {
205 $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section());
206 }
207}
208
209
210sub classify_section
211{
212 my $self = shift(@_);
213 my $doc_obj = shift(@_);
214 my $section_OID = shift(@_);
215 my $section = shift(@_);
216
217 my @metadata_groups = @{$self->{'metadata_groups'}};
218
219 # Only classify the section if it has a value for one of the metadata elements in the first group
220 my $classify_section = 0;
221 my $first_metadata_group = $metadata_groups[0];
222 foreach my $first_metadata_group_element (split(/\;/, $first_metadata_group)) {
223 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $first_metadata_group_element);
224 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
225 # This section must be included in the classifier
226 $classify_section = 1;
227 last;
228 }
229 }
230
231 # We're not classifying this section because it doesn't have the required metadata
232 return if (!$classify_section);
233
234 # Otherwise, include this section in the classifier
235 push(@{$self->{'OIDs'}}, $section_OID);
236
237 # Create a hash for the metadata values of each metadata element we're interested in
238 my %metadata_groups_done = ();
239 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
240 # Take care not to do a metadata group more than once
241 unless ($metadata_groups_done{$metadata_group}) {
242 foreach my $metadata_element (split(/\;/, $metadata_group)) {
243 my @metadata_values = @{$doc_obj->get_metadata($section, $metadata_element)};
244 foreach my $metadata_value (@metadata_values) {
245 # Strip leading and trailing whitespace
246 $metadata_value =~ s/^\s*//;
247 $metadata_value =~ s/\s*$//;
248
249 # Convert the metadata value from a UTF-8 string to a Unicode string
250 # This means that length() and substr() work properly
251 # We need to be careful to convert classifier node title values back to UTF-8, however
252 my $metadata_value_unicode_string = &convert_utf8_string_to_unicode_string($metadata_value);
253
254 # Add the metadata value into the list for this combination of metadata group and section
255 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $metadata_value_unicode_string);
256 }
257 last if (@metadata_values > 0);
258 }
259
260 $metadata_groups_done{$metadata_group} = 1;
261 }
262 }
263}
264
265
266sub get_classify_info
267{
268 my $self = shift(@_);
269
270 # The metadata groups to classify by
271 my @metadata_groups = @{$self->{'metadata_groups'}};
272 my $first_metadata_group = $metadata_groups[0];
273
274 # The OID values of the documents to include in the classifier
275 my @OIDs = @{$self->{'OIDs'}};
276
277 # Create the root node of the classification hierarchy
278 my %classifier_node = ( 'thistype' => "Invisible",
279 'childtype' => $self->{$first_metadata_group . ".list_type"},
280 'Title' => $self->{'buttonname'},
281 'contains' => [],
282 'mdtype' => $first_metadata_group );
283
284 # Recursively create the classification hierarchy, one level for each metadata group
285 &add_level($self, \@metadata_groups, \@OIDs, \%classifier_node);
286 return \%classifier_node;
287}
288
289
290sub add_level
291{
292 my $self = shift(@_);
293 my @metadata_groups = @{shift(@_)};
294 my @OIDs = @{shift(@_)};
295 my $classifier_node = shift(@_);
296 # print STDERR "\nAdding AZ list for " . $classifier_node->{'Title'} . "\n";
297
298 my $metadata_group = $metadata_groups[0];
299 # print STDERR "Processing metadata group: " . $metadata_group . "\n";
300 # print STDERR "Number of OID values: " . @OIDs . "\n";
301
302 if (!defined($self->{$metadata_group . ".list"})) {
303 print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
304 return;
305 }
306 my %OIDtometavalueshash = %{$self->{$metadata_group . ".list"}};
307
308 # Create a mapping from metadata value to OID
309 my %metavaluetoOIDhash = ();
310 foreach my $OID (@OIDs) {
311 if ($OIDtometavalueshash{$OID}) {
312 my @metavalues = @{$OIDtometavalueshash{$OID}};
313 foreach my $metavalue (@metavalues) {
314 push(@{$metavaluetoOIDhash{$metavalue}}, $OID);
315 }
316 }
317 }
318 # print STDERR "Number of distinct values: " . scalar(keys %metavaluetoOIDhash) . "\n";
319
320 # Partition the values (if necessary)
321 my $partition_type_within_level = $self->{"partition_type_within_level"};
322 if ($partition_type_within_level =~ /^per_letter$/i) {
323 # Generate one hlist for each letter
324 my @sortedmetavalues = &sort_metadata_values_array($self, keys(%metavaluetoOIDhash));
325 my %metavaluetoOIDsubhash = ();
326
327 my $lastpartition = substr($sortedmetavalues[0], 0, 1);
328 foreach my $metavalue (@sortedmetavalues) {
329 my $metavaluepartition = substr($metavalue, 0, 1);
330
331 # Is this the start of a new partition?
332 if ($metavaluepartition ne $lastpartition) {
333 &add_hlist_partition($self, \@metadata_groups, $classifier_node, $lastpartition, \%metavaluetoOIDsubhash);
334 %metavaluetoOIDsubhash = ();
335 $lastpartition = $metavaluepartition;
336 }
337
338 $metavaluetoOIDsubhash{$metavalue} = $metavaluetoOIDhash{$metavalue};
339 }
340
341 # Don't forget to add the last partition
342 &add_hlist_partition($self, \@metadata_groups, $classifier_node, $lastpartition, \%metavaluetoOIDsubhash);
343
344 # The partitions are stored in an HList
345 $classifier_node->{'childtype'} = "HList";
346 }
347
348 else {
349 # Generate hlists of a certain size
350 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
351 if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metavaluetoOIDhash) > $partition_size_within_level) {
352 my @sortedmetavalues = &sort_metadata_values_array($self, keys(%metavaluetoOIDhash));
353 my $itemsdone = 0;
354 my %metavaluetoOIDsubhash = ();
355 my $lastpartitionend = "";
356 my $partitionstart;
357 foreach my $metavalue (@sortedmetavalues) {
358 $metavaluetoOIDsubhash{$metavalue} = $metavaluetoOIDhash{$metavalue};
359 $itemsdone++;
360 my $itemsinpartition = scalar(keys %metavaluetoOIDsubhash);
361
362 # Is this the start of a new partition?
363 if ($itemsinpartition == 1) {
364 $partitionstart = &generate_partition_start($metavalue, $lastpartitionend);
365 }
366
367 # Is this the end of the partition?
368 if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetavalues) {
369 my $partitionend = &generate_partition_end($metavalue, $partitionstart);
370 my $partitionname = $partitionstart;
371 if ($partitionend ne $partitionstart) {
372 $partitionname = $partitionname . "-" . $partitionend;
373 }
374
375 &add_hlist_partition($self, \@metadata_groups, $classifier_node, $partitionname, \%metavaluetoOIDsubhash);
376 %metavaluetoOIDsubhash = ();
377 $lastpartitionend = $partitionend;
378 }
379 }
380
381 # The partitions are stored in an HList
382 $classifier_node->{'childtype'} = "HList";
383 }
384
385 # Otherwise just add all the values to a VList
386 else {
387 &add_vlist($self, \@metadata_groups, $classifier_node, \%metavaluetoOIDhash);
388 }
389 }
390}
391
392
393sub convert_utf8_string_to_unicode_string
394{
395 my $utf8_string = shift(@_);
396
397 my $unicode_string = "";
398 foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
399 $unicode_string .= chr($unicode_value);
400 }
401 return $unicode_string;
402}
403
404
405sub convert_unicode_string_to_utf8_string
406{
407 my $unicode_string = shift(@_);
408
409 my @unicode_array;
410 for (my $i = 0; $i < length($unicode_string); $i++) {
411 push(@unicode_array, ord(substr($unicode_string, $i, 1)));
412 }
413 return &unicode::unicode2utf8(\@unicode_array);
414}
415
416
417sub generate_partition_start
418{
419 my $metavalue = shift(@_);
420 my $lastpartitionend = shift(@_);
421
422 my $partitionstart = substr($metavalue, 0, 1);
423 if ($partitionstart le $lastpartitionend) {
424 $partitionstart = substr($metavalue, 0, 2);
425 # Give up after three characters
426 if ($partitionstart le $lastpartitionend) {
427 $partitionstart = substr($metavalue, 0, 3);
428 }
429 }
430
431 return $partitionstart;
432}
433
434
435sub generate_partition_end
436{
437 my $metavalue = shift(@_);
438 my $partitionstart = shift(@_);
439
440 my $partitionend = substr($metavalue, 0, length($partitionstart));
441 if ($partitionend gt $partitionstart) {
442 $partitionend = substr($metavalue, 0, 1);
443 if ($partitionend le $partitionstart) {
444 $partitionend = substr($metavalue, 0, 2);
445 # Give up after three characters
446 if ($partitionend le $partitionstart) {
447 $partitionend = substr($metavalue, 0, 3);
448 }
449 }
450 }
451
452 return $partitionend;
453}
454
455
456sub add_hlist_partition
457{
458 my $self = shift(@_);
459 my @metadata_groups = @{shift(@_)};
460 my $classifier_node = shift(@_);
461 my $partitionname = shift(@_);
462 my %metavaluetoOIDhash = %{shift(@_)};
463
464 # Create an hlist partition
465 my %child_classifier_node = ( 'Title' => &convert_unicode_string_to_utf8_string($partitionname),
466 'childtype' => "VList",
467 'contains' => [] );
468
469 # Add the children to the hlist partition
470 &add_vlist($self, \@metadata_groups, \%child_classifier_node, \%metavaluetoOIDhash);
471 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
472}
473
474
475sub add_vlist
476{
477 my $self = shift(@_);
478 my @metadata_groups = @{shift(@_)};
479 my $classifier_node = shift(@_);
480 my %metavaluetoOIDhash = %{shift(@_)};
481
482 my $metadata_group = shift(@metadata_groups);
483 $classifier_node->{'mdtype'} = $metadata_group;
484
485 # Create an entry in the vlist for each value
486 foreach my $metavalue (&sort_metadata_values_array($self, keys(%metavaluetoOIDhash))) {
487 my @OIDs = @{$metavaluetoOIDhash{$metavalue}};
488
489 # If there is only one item and 'always_bookshelf' is false, add the item to the list
490 if (@OIDs == 1 && $self->{$metadata_group . ".always_bookshelf"} eq "f") {
491 my $OID = $OIDs[0];
492
493 # Find the offset of this metadata value
494 my $offset = 0;
495 my %OIDtometavalueshash = %{$self->{$metadata_group . ".list"}};
496 my @metavalues = @{$OIDtometavalueshash{$OID}};
497 for (my $i = 0; $i < scalar(@metavalues); $i++) {
498 if ($metavalue eq $metavalues[$i]) {
499 $offset = $i;
500 last;
501 }
502 }
503 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
504 }
505
506 # Otherwise create a sublist (bookshelf) for the metadata value
507 else {
508 my %child_classifier_node = ( 'Title' => &convert_unicode_string_to_utf8_string($metavalue),
509 'childtype' => "VList",
510 'contains' => [] );
511
512 # If there are metadata elements remaining, recursively apply the process
513 if (@metadata_groups > 0) {
514 my $next_metadata_group = $metadata_groups[0];
515 $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
516 &add_level($self, \@metadata_groups, \@OIDs, \%child_classifier_node);
517 }
518 # Otherwise just add the documents as children of this list
519 else {
520 # Sort the leaf nodes by the metadata elements specified with -sort_leaf_nodes_using
521 my @sort_leaf_nodes_usingmetadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
522 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_usingmetadata_groups) {
523 my %OIDtometavalueshash = %{$self->{$sort_leaf_nodes_usingmetaelem . ".list"}};
524 # Force a stable sort (Perl 5.6's sort isn't stable)
525 # !! The [0] bits aren't ideal (multiple metadata values) !!
526 @OIDs = @OIDs[ sort { $OIDtometavalueshash{$OIDs[$a]}[0] cmp $OIDtometavalueshash{$OIDs[$b]}[0] || $a <=> $b; } 0..$#OIDs ];
527 }
528
529 foreach my $OID (@OIDs) {
530 push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID });
531 }
532 }
533
534 # Add the sublist to the list
535 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
536 }
537 }
538}
539
540
541sub sort_metadata_values_array
542{
543 my $self = shift(@_);
544 my @metadata_values = @_;
545
546 if ($self->{'unicode_collator'}) {
547 return $self->{'unicode_collator'}->sort(@metadata_values);
548 }
549 else {
550 return sort(@metadata_values);
551 }
552}
553
554
5551;
Note: See TracBrowser for help on using the repository browser.