source: trunk/gsdl/perllib/classify/GenericList.pm@ 13551

Last change on this file since 13551 was 13551, checked in by mdewsnip, 17 years ago

Added a new option: sort_using_unicode_collation, which uses the Unicode::Collate module to sort Unicode values better (eg. e with acute is sorted with the other e values, not after everything else).

  • Property svn:keywords set to Author Date Id Revision
File size: 18.3 KB
Line 
1###########################################################################
2#
3# GenericList.pm -- A general and flexible list classifier with most of
4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
29# TO DO: Remove punctuation from metadata values before sorting.
30#
31###########################################################################
32
33package GenericList;
34
35
36use BasClas;
37
38use strict;
39
40
41sub BEGIN {
42 @GenericList::ISA = ('BasClas');
43}
44
45
46my $arguments =
47 [ { 'name' => "metadata",
48 'desc' => "{GenericList.metadata}",
49 'type' => "metadata",
50 'reqd' => "yes" },
51
52 # The interesting options
53 { 'name' => "always_bookshelf_last_level",
54 'desc' => "{GenericList.always_bookshelf_last_level}",
55 'type' => "flag" },
56 { 'name' => "classify_sections",
57 'desc' => "{GenericList.classify_sections}",
58 'type' => "flag" },
59 { 'name' => "partition_type_within_level",
60 'desc' => "{GenericList.partition_type_within_level}",
61 'type' => "string",
62 'deft' => "none" },
63 { 'name' => "partition_size_within_level",
64 'desc' => "{GenericList.partition_size_within_level}",
65 'type' => "string" },
66 { 'name' => "sort_leaf_nodes_using",
67 'desc' => "{GenericList.sort_leaf_nodes_using}",
68 'type' => "metadata",
69 'deft' => "Title" },
70 { 'name' => "sort_using_unicode_collation",
71 'desc' => "{GenericList.sort_using_unicode_collation}",
72 'type' => "flag" },
73 { 'name' => "use_hlist_for",
74 'desc' => "{GenericList.use_hlist_for}",
75 'type' => "string" } ];
76
77my $options = { 'name' => "GenericList",
78 'desc' => "{GenericList.desc}",
79 'abstract' => "no",
80 'inherits' => "Yes",
81 'args' => $arguments };
82
83
84sub new
85{
86 my ($class) = shift(@_);
87 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
88 push(@$classifierslist, $class);
89
90 if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
91 if (defined $options) { push(@{$hashArgOptLists->{"OptList"}}, $options); }
92
93 my $self = new BasClas($classifierslist, $inputargs, $hashArgOptLists);
94
95 if ($self->{'info_only'}) {
96 # don't worry about any options etc
97 return bless $self, $class;
98 }
99
100 # The metadata elements to use (required)
101 if (!$self->{'metadata'}) {
102 die "Error: No metadata fields specified for GenericList.\n";
103 }
104 my @metadata_groups = split(/\//, $self->{'metadata'});
105 $self->{'metadata_groups'} = \@metadata_groups;
106
107 # The classifier button name (default: the first metadata element specified)
108 if (!$self->{'buttonname'}) {
109 my $first_metadata_group = $metadata_groups[0];
110 my $first_metadata_element = (split(/\;/, $first_metadata_group))[0];
111 $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
112 }
113
114 # Whether to group single items into a bookshelf (must be true for all metadata fields except the last)
115 foreach my $metadata_group (@metadata_groups) {
116 $self->{$metadata_group . ".always_bookshelf"} = "t";
117 }
118 if (!$self->{'always_bookshelf_last_level'}) {
119 # Default: leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
120 my $last_metadata_group = $metadata_groups[$#metadata_groups];
121 $self->{$last_metadata_group . ".always_bookshelf"} = "f";
122 }
123
124 # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
125 foreach my $metadata_group (@metadata_groups) {
126 $self->{$metadata_group . ".list_type"} = "VList";
127 }
128 foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
129 $self->{$metadata_group . ".list_type"} = "HList";
130 }
131
132 # How the items are grouped into partitions (default: no partition)
133 if (!$self->{"partition_type_within_level"}) {
134 $self->{"partition_type_within_level"} = "none";
135 }
136
137 # The number of items in each partition
138 if (!$self->{'partition_size_within_level'}) {
139 # Default: 20
140 foreach my $metadata_group (@metadata_groups) {
141 $self->{$metadata_group . ".partition_size_within_level"} = 20;
142 }
143 }
144 else {
145 my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
146
147 # Assign values based on the partition_size_within_level parameter
148 foreach my $metadata_group (@metadata_groups) {
149 my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
150 if (defined($partition_size_within_levelelem)) {
151 $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
152 }
153 else {
154 $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
155 }
156 }
157 }
158
159 # The metadata elements to use to sort the leaf nodes (default: Title)
160 my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
161 if ($self->{'sort_leaf_nodes_using'}) {
162 @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
163 }
164 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
165
166 # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
167 if ($self->{'sort_using_unicode_collation'}) {
168 require Unicode::Collate;
169 $self->{'unicode_collator'} = Unicode::Collate->new();
170 }
171
172 # An empty array for the document OIDs
173 $self->{'OIDs'} = [];
174
175 return bless $self, $class;
176}
177
178
179sub init
180{
181 # Nothing to do...
182}
183
184
185# Called for each document in the collection
186sub classify
187{
188 my $self = shift(@_);
189 my $doc_obj = shift(@_);
190
191 # If "-classify_sections" is set, classify every section of the document
192 if ($self->{'classify_sections'}) {
193 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
194 while (defined $section) {
195 $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section);
196 $section = $doc_obj->get_next_section($section);
197 }
198 }
199 # Otherwise just classify the top document section
200 else {
201 $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section());
202 }
203}
204
205
206sub classify_section
207{
208 my $self = shift(@_);
209 my $doc_obj = shift(@_);
210 my $section_OID = shift(@_);
211 my $section = shift(@_);
212
213 my @metadata_groups = @{$self->{'metadata_groups'}};
214
215 # Only classify the section if it has a value for one of the metadata elements in the first group
216 my $classify_section = 0;
217 my $first_metadata_group = $metadata_groups[0];
218 foreach my $first_metadata_group_element (split(/\;/, $first_metadata_group)) {
219 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $first_metadata_group_element);
220 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
221 # This section must be included in the classifier
222 $classify_section = 1;
223 last;
224 }
225 }
226
227 # We're not classifying this section because it doesn't have the required metadata
228 return if (!$classify_section);
229
230 # Otherwise, include this section in the classifier
231 push(@{$self->{'OIDs'}}, $section_OID);
232
233 # Create a hash for the metadata values of each metadata element we're interested in
234 my %metadata_groups_done = ();
235 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
236 # Take care not to do a metadata group more than once
237 unless ($metadata_groups_done{$metadata_group}) {
238 foreach my $metadata_element (split(/\;/, $metadata_group)) {
239 my @metadata_values = @{$doc_obj->get_metadata($section, $metadata_element)};
240 foreach my $metadata_value (@metadata_values) {
241 # Strip leading and trailing whitespace
242 $metadata_value =~ s/^\s*//;
243 $metadata_value =~ s/\s*$//;
244
245 # Convert the metadata value from a UTF-8 string to a Unicode string
246 # This means that length() and substr() work properly
247 # We need to be careful to convert classifier node title values back to UTF-8, however
248 my $metadata_value_unicode_string = &convert_utf8_string_to_unicode_string($metadata_value);
249
250 # Add the metadata value into the list for this combination of metadata group and section
251 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $metadata_value_unicode_string);
252 }
253 last if (@metadata_values > 0);
254 }
255
256 $metadata_groups_done{$metadata_group} = 1;
257 }
258 }
259}
260
261
262sub get_classify_info
263{
264 my $self = shift(@_);
265
266 # The metadata groups to classify by
267 my @metadata_groups = @{$self->{'metadata_groups'}};
268 my $first_metadata_group = $metadata_groups[0];
269
270 # The OID values of the documents to include in the classifier
271 my @OIDs = @{$self->{'OIDs'}};
272
273 # Create the root node of the classification hierarchy
274 my %classifier_node = ( 'thistype' => "Invisible",
275 'childtype' => $self->{$first_metadata_group . ".list_type"},
276 'Title' => $self->{'buttonname'},
277 'contains' => [],
278 'mdtype' => $first_metadata_group );
279
280 # Recursively create the classification hierarchy, one level for each metadata group
281 &add_level($self, \@metadata_groups, \@OIDs, \%classifier_node);
282 return \%classifier_node;
283}
284
285
286sub add_level
287{
288 my $self = shift(@_);
289 my @metadata_groups = @{shift(@_)};
290 my @OIDs = @{shift(@_)};
291 my $classifier_node = shift(@_);
292 # print STDERR "\nAdding AZ list for " . $classifier_node->{'Title'} . "\n";
293
294 my $metadata_group = $metadata_groups[0];
295 # print STDERR "Processing metadata group: " . $metadata_group . "\n";
296 # print STDERR "Number of OID values: " . @OIDs . "\n";
297
298 if (!defined($self->{$metadata_group . ".list"})) {
299 print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
300 return;
301 }
302 my %OIDtometavalueshash = %{$self->{$metadata_group . ".list"}};
303
304 # Create a mapping from metadata value to OID
305 my %metavaluetoOIDhash = ();
306 foreach my $OID (@OIDs) {
307 if ($OIDtometavalueshash{$OID}) {
308 my @metavalues = @{$OIDtometavalueshash{$OID}};
309 foreach my $metavalue (@metavalues) {
310 push(@{$metavaluetoOIDhash{$metavalue}}, $OID);
311 }
312 }
313 }
314 # print STDERR "Number of distinct values: " . scalar(keys %metavaluetoOIDhash) . "\n";
315
316 # Partition the values (if necessary)
317 my $partition_type_within_level = $self->{"partition_type_within_level"};
318 if ($partition_type_within_level =~ /^per_letter$/i) {
319 # Generate one hlist for each letter
320 my @sortedmetavalues = &sort_metadata_values_array($self, keys(%metavaluetoOIDhash));
321 my %metavaluetoOIDsubhash = ();
322
323 my $lastpartition = substr($sortedmetavalues[0], 0, 1);
324 foreach my $metavalue (@sortedmetavalues) {
325 my $metavaluepartition = substr($metavalue, 0, 1);
326
327 # Is this the start of a new partition?
328 if ($metavaluepartition ne $lastpartition) {
329 &add_hlist_partition($self, \@metadata_groups, $classifier_node, $lastpartition, \%metavaluetoOIDsubhash);
330 %metavaluetoOIDsubhash = ();
331 $lastpartition = $metavaluepartition;
332 }
333
334 $metavaluetoOIDsubhash{$metavalue} = $metavaluetoOIDhash{$metavalue};
335 }
336
337 # Don't forget to add the last partition
338 &add_hlist_partition($self, \@metadata_groups, $classifier_node, $lastpartition, \%metavaluetoOIDsubhash);
339
340 # The partitions are stored in an HList
341 $classifier_node->{'childtype'} = "HList";
342 }
343
344 else {
345 # Generate hlists of a certain size
346 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
347 if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metavaluetoOIDhash) > $partition_size_within_level) {
348 my @sortedmetavalues = &sort_metadata_values_array($self, keys(%metavaluetoOIDhash));
349 my $itemsdone = 0;
350 my %metavaluetoOIDsubhash = ();
351 my $lastpartitionend = "";
352 my $partitionstart;
353 foreach my $metavalue (@sortedmetavalues) {
354 $metavaluetoOIDsubhash{$metavalue} = $metavaluetoOIDhash{$metavalue};
355 $itemsdone++;
356 my $itemsinpartition = scalar(keys %metavaluetoOIDsubhash);
357
358 # Is this the start of a new partition?
359 if ($itemsinpartition == 1) {
360 $partitionstart = &generate_partition_start($metavalue, $lastpartitionend);
361 }
362
363 # Is this the end of the partition?
364 if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetavalues) {
365 my $partitionend = &generate_partition_end($metavalue, $partitionstart);
366 my $partitionname = $partitionstart;
367 if ($partitionend ne $partitionstart) {
368 $partitionname = $partitionname . "-" . $partitionend;
369 }
370
371 &add_hlist_partition($self, \@metadata_groups, $classifier_node, $partitionname, \%metavaluetoOIDsubhash);
372 %metavaluetoOIDsubhash = ();
373 $lastpartitionend = $partitionend;
374 }
375 }
376
377 # The partitions are stored in an HList
378 $classifier_node->{'childtype'} = "HList";
379 }
380
381 # Otherwise just add all the values to a VList
382 else {
383 &add_vlist($self, \@metadata_groups, $classifier_node, \%metavaluetoOIDhash);
384 }
385 }
386}
387
388
389sub convert_utf8_string_to_unicode_string
390{
391 my $utf8_string = shift(@_);
392
393 my $unicode_string = "";
394 foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
395 $unicode_string .= chr($unicode_value);
396 }
397 return $unicode_string;
398}
399
400
401sub convert_unicode_string_to_utf8_string
402{
403 my $unicode_string = shift(@_);
404
405 my @unicode_array;
406 for (my $i = 0; $i < length($unicode_string); $i++) {
407 push(@unicode_array, ord(substr($unicode_string, $i, 1)));
408 }
409 return &unicode::unicode2utf8(\@unicode_array);
410}
411
412
413sub generate_partition_start
414{
415 my $metavalue = shift(@_);
416 my $lastpartitionend = shift(@_);
417
418 my $partitionstart = substr($metavalue, 0, 1);
419 if ($partitionstart le $lastpartitionend) {
420 $partitionstart = substr($metavalue, 0, 2);
421 # Give up after three characters
422 if ($partitionstart le $lastpartitionend) {
423 $partitionstart = substr($metavalue, 0, 3);
424 }
425 }
426
427 return $partitionstart;
428}
429
430
431sub generate_partition_end
432{
433 my $metavalue = shift(@_);
434 my $partitionstart = shift(@_);
435
436 my $partitionend = substr($metavalue, 0, length($partitionstart));
437 if ($partitionend gt $partitionstart) {
438 $partitionend = substr($metavalue, 0, 1);
439 if ($partitionend le $partitionstart) {
440 $partitionend = substr($metavalue, 0, 2);
441 # Give up after three characters
442 if ($partitionend le $partitionstart) {
443 $partitionend = substr($metavalue, 0, 3);
444 }
445 }
446 }
447
448 return $partitionend;
449}
450
451
452sub add_hlist_partition
453{
454 my $self = shift(@_);
455 my @metadata_groups = @{shift(@_)};
456 my $classifier_node = shift(@_);
457 my $partitionname = shift(@_);
458 my %metavaluetoOIDhash = %{shift(@_)};
459
460 # Create an hlist partition
461 my %child_classifier_node = ( 'Title' => &convert_unicode_string_to_utf8_string($partitionname),
462 'childtype' => "VList",
463 'contains' => [] );
464
465 # Add the children to the hlist partition
466 &add_vlist($self, \@metadata_groups, \%child_classifier_node, \%metavaluetoOIDhash);
467 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
468}
469
470
471sub add_vlist
472{
473 my $self = shift(@_);
474 my @metadata_groups = @{shift(@_)};
475 my $classifier_node = shift(@_);
476 my %metavaluetoOIDhash = %{shift(@_)};
477
478 my $metadata_group = shift(@metadata_groups);
479 $classifier_node->{'mdtype'} = $metadata_group;
480
481 # Create an entry in the vlist for each value
482 foreach my $metavalue (&sort_metadata_values_array($self, keys(%metavaluetoOIDhash))) {
483 my @OIDs = @{$metavaluetoOIDhash{$metavalue}};
484
485 # If there is only one item and 'always_bookshelf' is false, add the item to the list
486 if (@OIDs == 1 && $self->{$metadata_group . ".always_bookshelf"} eq "f") {
487 my $OID = $OIDs[0];
488
489 # Find the offset of this metadata value
490 my $offset = 0;
491 my %OIDtometavalueshash = %{$self->{$metadata_group . ".list"}};
492 my @metavalues = @{$OIDtometavalueshash{$OID}};
493 for (my $i = 0; $i < scalar(@metavalues); $i++) {
494 if ($metavalue eq $metavalues[$i]) {
495 $offset = $i;
496 last;
497 }
498 }
499 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
500 }
501
502 # Otherwise create a sublist (bookshelf) for the metadata value
503 else {
504 my %child_classifier_node = ( 'Title' => &convert_unicode_string_to_utf8_string($metavalue),
505 'childtype' => "VList",
506 'contains' => [] );
507
508 # If there are metadata elements remaining, recursively apply the process
509 if (@metadata_groups > 0) {
510 my $next_metadata_group = $metadata_groups[0];
511 $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
512 &add_level($self, \@metadata_groups, \@OIDs, \%child_classifier_node);
513 }
514 # Otherwise just add the documents as children of this list
515 else {
516 # Sort the leaf nodes by the metadata elements specified with -sort_leaf_nodes_using
517 my @sort_leaf_nodes_usingmetadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
518 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_usingmetadata_groups) {
519 my %OIDtometavalueshash = %{$self->{$sort_leaf_nodes_usingmetaelem . ".list"}};
520 # Force a stable sort (Perl 5.6's sort isn't stable)
521 # !! The [0] bits aren't ideal (multiple metadata values) !!
522 @OIDs = @OIDs[ sort { $OIDtometavalueshash{$OIDs[$a]}[0] cmp $OIDtometavalueshash{$OIDs[$b]}[0] || $a <=> $b; } 0..$#OIDs ];
523 }
524
525 foreach my $OID (@OIDs) {
526 push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID });
527 }
528 }
529
530 # Add the sublist to the list
531 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
532 }
533 }
534}
535
536
537sub sort_metadata_values_array
538{
539 my $self = shift(@_);
540 my @metadata_values = @_;
541
542 if ($self->{'unicode_collator'}) {
543 return $self->{'unicode_collator'}->sort(@metadata_values);
544 }
545 else {
546 return sort(@metadata_values);
547 }
548}
549
550
5511;
Note: See TracBrowser for help on using the repository browser.