source: trunk/gsdl/perllib/classify/GenericList.pm@ 13340

Last change on this file since 13340 was 13340, checked in by mdewsnip, 17 years ago

Added check to prevent error when attempting to build a GenericList classifier on an invalid metadata element.

  • Property svn:keywords set to Author Date Id Revision
File size: 16.9 KB
Line 
1###########################################################################
2#
3# GenericList.pm -- A general and flexible list classifier with most of
4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
29# TO DO: Remove punctuation from metadata values before sorting.
30#
31###########################################################################
32
33package GenericList;
34
35
36use BasClas;
37
38use strict;
39
40
41sub BEGIN {
42 @GenericList::ISA = ('BasClas');
43}
44
45
46my $arguments =
47 [ { 'name' => "metadata",
48 'desc' => "{GenericList.metadata}",
49 'type' => "metadata",
50 'reqd' => "yes" },
51
52 # The interesting options
53 { 'name' => "always_bookshelf_last_level",
54 'desc' => "{GenericList.always_bookshelf_last_level}",
55 'type' => "flag" },
56 { 'name' => "classify_sections",
57 'desc' => "{GenericList.classify_sections}",
58 'type' => "flag" },
59 { 'name' => "partition_type_within_level",
60 'desc' => "{GenericList.partition_type_within_level}",
61 'type' => "string",
62 'deft' => "none" },
63 { 'name' => "partition_size_within_level",
64 'desc' => "{GenericList.partition_size_within_level}",
65 'type' => "string" },
66 { 'name' => "sort_leaf_nodes_using",
67 'desc' => "{GenericList.sort_leaf_nodes_using}",
68 'type' => "metadata",
69 'deft' => "Title" },
70 { 'name' => "use_hlist_for",
71 'desc' => "{GenericList.use_hlist_for}",
72 'type' => "string" } ];
73
74my $options = { 'name' => "GenericList",
75 'desc' => "{GenericList.desc}",
76 'abstract' => "no",
77 'inherits' => "Yes",
78 'args' => $arguments };
79
80
81sub new
82{
83 my ($class) = shift(@_);
84 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
85 push(@$classifierslist, $class);
86
87 if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
88 if (defined $options) { push(@{$hashArgOptLists->{"OptList"}}, $options); }
89
90 my $self = new BasClas($classifierslist, $inputargs, $hashArgOptLists);
91
92 if ($self->{'info_only'}) {
93 # don't worry about any options etc
94 return bless $self, $class;
95 }
96
97 # The metadata elements to use (required)
98 if (!$self->{'metadata'}) {
99 die "Error: No metadata fields specified for GenericList.\n";
100 }
101 my @metadata_groups = split(/\//, $self->{'metadata'});
102 $self->{'metadata_groups'} = \@metadata_groups;
103
104 # The classifier button name (default: the first metadata element specified)
105 if (!$self->{'buttonname'}) {
106 my $first_metadata_group = $metadata_groups[0];
107 my $first_metadata_element = (split(/\;/, $first_metadata_group))[0];
108 $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
109 }
110
111 # Whether to group single items into a bookshelf (must be true for all metadata fields except the last)
112 foreach my $metadata_group (@metadata_groups) {
113 $self->{$metadata_group . ".always_bookshelf"} = "t";
114 }
115 if (!$self->{'always_bookshelf_last_level'}) {
116 # Default: leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
117 my $last_metadata_group = $metadata_groups[$#metadata_groups];
118 $self->{$last_metadata_group . ".always_bookshelf"} = "f";
119 }
120
121 # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
122 foreach my $metadata_group (@metadata_groups) {
123 $self->{$metadata_group . ".list_type"} = "VList";
124 }
125 foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
126 $self->{$metadata_group . ".list_type"} = "HList";
127 }
128
129 # How the items are grouped into partitions (default: no partition)
130 if (!$self->{"partition_type_within_level"}) {
131 $self->{"partition_type_within_level"} = "none";
132 }
133
134 # The number of items in each partition
135 if (!$self->{'partition_size_within_level'}) {
136 # Default: 20
137 foreach my $metadata_group (@metadata_groups) {
138 $self->{$metadata_group . ".partition_size_within_level"} = 20;
139 }
140 }
141 else {
142 my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
143
144 # Assign values based on the partition_size_within_level parameter
145 foreach my $metadata_group (@metadata_groups) {
146 my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
147 if (defined($partition_size_within_levelelem)) {
148 $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
149 }
150 else {
151 $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
152 }
153 }
154 }
155
156 # The metadata elements to use to sort the leaf nodes (default: Title)
157 my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
158 if ($self->{'sort_leaf_nodes_using'}) {
159 @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
160 }
161 $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
162
163 # An empty array for the document OIDs
164 $self->{'OIDs'} = [];
165
166 return bless $self, $class;
167}
168
169
170sub init
171{
172 # Nothing to do...
173}
174
175
176# Called for each document in the collection
177sub classify
178{
179 my $self = shift(@_);
180 my $doc_obj = shift(@_);
181
182 # If "-classify_sections" is set, classify every section of the document
183 if ($self->{'classify_sections'}) {
184 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
185 while (defined $section) {
186 $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section);
187 $section = $doc_obj->get_next_section($section);
188 }
189 }
190 # Otherwise just classify the top document section
191 else {
192 $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section());
193 }
194}
195
196
197sub classify_section
198{
199 my $self = shift(@_);
200 my $doc_obj = shift(@_);
201 my $section_OID = shift(@_);
202 my $section = shift(@_);
203
204 my @metadata_groups = @{$self->{'metadata_groups'}};
205
206 # Only classify the section if it has a value for one of the metadata elements in the first group
207 my $classify_section = 0;
208 my $first_metadata_group = $metadata_groups[0];
209 foreach my $first_metadata_group_element (split(/\;/, $first_metadata_group)) {
210 my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $first_metadata_group_element);
211 if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
212 # This section must be included in the classifier
213 $classify_section = 1;
214 last;
215 }
216 }
217
218 # We're not classifying this section because it doesn't have the required metadata
219 return if (!$classify_section);
220
221 # Otherwise, include this section in the classifier
222 push(@{$self->{'OIDs'}}, $section_OID);
223
224 # Create a hash for the metadata values of each metadata element we're interested in
225 my %metadata_groups_done = ();
226 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
227 # Take care not to do a metadata group more than once
228 unless ($metadata_groups_done{$metadata_group}) {
229 foreach my $metadata_element (split(/\;/, $metadata_group)) {
230 my @metadata_values = @{$doc_obj->get_metadata($section, $metadata_element)};
231 foreach my $metadata_value (@metadata_values) {
232 # Strip leading and trailing whitespace
233 $metadata_value =~ s/^\s*//;
234 $metadata_value =~ s/\s*$//;
235 push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $metadata_value);
236 }
237 last if (@metadata_values > 0);
238 }
239
240 $metadata_groups_done{$metadata_group} = 1;
241 }
242 }
243}
244
245
246sub get_classify_info
247{
248 my $self = shift(@_);
249
250 # The metadata groups to classify by
251 my @metadata_groups = @{$self->{'metadata_groups'}};
252 my $first_metadata_group = $metadata_groups[0];
253
254 # The OID values of the documents to include in the classifier
255 my @OIDs = @{$self->{'OIDs'}};
256
257 # Create the root node of the classification hierarchy
258 my %classifier_node = ( 'thistype' => "Invisible",
259 'childtype' => $self->{$first_metadata_group . ".list_type"},
260 'Title' => $self->{'buttonname'},
261 'contains' => [],
262 'mdtype' => $first_metadata_group );
263
264 # Recursively create the classification hierarchy, one level for each metadata group
265 &add_level($self, \@metadata_groups, \@OIDs, \%classifier_node);
266 return \%classifier_node;
267}
268
269
270sub add_level
271{
272 my $self = shift(@_);
273 my @metadata_groups = @{shift(@_)};
274 my @OIDs = @{shift(@_)};
275 my $classifier_node = shift(@_);
276 # print STDERR "\nAdding AZ list for " . $classifier_node->{'Title'} . "\n";
277
278 my $metadata_group = $metadata_groups[0];
279 # print STDERR "Processing metadata group: " . $metadata_group . "\n";
280 # print STDERR "Number of OID values: " . @OIDs . "\n";
281
282 if (!defined($self->{$metadata_group . ".list"})) {
283 print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
284 return;
285 }
286 my %OIDtometavalueshash = %{$self->{$metadata_group . ".list"}};
287
288 # Create a mapping from metadata value to OID
289 my %metavaluetoOIDhash = ();
290 foreach my $OID (@OIDs) {
291 if ($OIDtometavalueshash{$OID}) {
292 my @metavalues = @{$OIDtometavalueshash{$OID}};
293 foreach my $metavalue (@metavalues) {
294 push(@{$metavaluetoOIDhash{$metavalue}}, $OID);
295 }
296 }
297 }
298 # print STDERR "Number of distinct values: " . scalar(keys %metavaluetoOIDhash) . "\n";
299
300 # Partition the values (if necessary)
301 my $partition_type_within_level = $self->{"partition_type_within_level"};
302 if ($partition_type_within_level =~ /^per_letter$/i) {
303 # Generate one hlist for each letter
304 my @sortedmetavalues = sort(keys %metavaluetoOIDhash);
305 my %metavaluetoOIDsubhash = ();
306
307 my $lastpartition = &unicode::substr($sortedmetavalues[0], 0, 1);
308 foreach my $metavalue (@sortedmetavalues) {
309 my $metavaluepartition = &unicode::substr($metavalue, 0, 1);
310
311 # Is this the start of a new partition?
312 if ($metavaluepartition ne $lastpartition) {
313 &add_hlist_partition($self, \@metadata_groups, $classifier_node, $lastpartition, \%metavaluetoOIDsubhash);
314 %metavaluetoOIDsubhash = ();
315 $lastpartition = $metavaluepartition;
316 }
317
318 $metavaluetoOIDsubhash{$metavalue} = $metavaluetoOIDhash{$metavalue};
319 }
320
321 # Don't forget to add the last partition
322 &add_hlist_partition($self, \@metadata_groups, $classifier_node, $lastpartition, \%metavaluetoOIDsubhash);
323
324 # The partitions are stored in an HList
325 $classifier_node->{'childtype'} = "HList";
326 }
327
328 else {
329 # Generate hlists of a certain size
330 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
331 if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metavaluetoOIDhash) > $partition_size_within_level) {
332 my @sortedmetavalues = sort(keys %metavaluetoOIDhash);
333 my $itemsdone = 0;
334 my %metavaluetoOIDsubhash = ();
335 my $lastpartitionend = "";
336 my $partitionstart;
337 foreach my $metavalue (@sortedmetavalues) {
338 $metavaluetoOIDsubhash{$metavalue} = $metavaluetoOIDhash{$metavalue};
339 $itemsdone++;
340 my $itemsinpartition = scalar(keys %metavaluetoOIDsubhash);
341
342 # Is this the start of a new partition?
343 if ($itemsinpartition == 1) {
344 $partitionstart = &generate_partition_start($metavalue, $lastpartitionend);
345 }
346
347 # Is this the end of the partition?
348 if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetavalues) {
349 my $partitionend = &generate_partition_end($metavalue, $partitionstart);
350 my $partitionname = $partitionstart;
351 if ($partitionend ne $partitionstart) {
352 $partitionname = $partitionname . "-" . $partitionend;
353 }
354
355 &add_hlist_partition($self, \@metadata_groups, $classifier_node, $partitionname, \%metavaluetoOIDsubhash);
356 %metavaluetoOIDsubhash = ();
357 $lastpartitionend = $partitionend;
358 }
359 }
360
361 # The partitions are stored in an HList
362 $classifier_node->{'childtype'} = "HList";
363 }
364
365 # Otherwise just add all the values to a VList
366 else {
367 &add_vlist($self, \@metadata_groups, $classifier_node, \%metavaluetoOIDhash);
368 }
369 }
370}
371
372
373sub unicode_length
374{
375 my $utf8string = shift(@_);
376
377 my @unicodestring = @{&unicode::utf82unicode($utf8string)};
378 return scalar(@unicodestring);
379}
380
381
382sub generate_partition_start
383{
384 my $metavalue = shift(@_);
385 my $lastpartitionend = shift(@_);
386
387 my $partitionstart = &unicode::substr($metavalue, 0, 1);
388 if ($partitionstart le $lastpartitionend) {
389 $partitionstart = &unicode::substr($metavalue, 0, 2);
390 # Give up after three characters
391 if ($partitionstart le $lastpartitionend) {
392 $partitionstart = &unicode::substr($metavalue, 0, 3);
393 }
394 }
395
396 return $partitionstart;
397}
398
399
400sub generate_partition_end
401{
402 my $metavalue = shift(@_);
403 my $partitionstart = shift(@_);
404
405 my $partitionend = &unicode::substr($metavalue, 0, &unicode_length($partitionstart));
406 if ($partitionend gt $partitionstart) {
407 $partitionend = &unicode::substr($metavalue, 0, 1);
408 if ($partitionend le $partitionstart) {
409 $partitionend = &unicode::substr($metavalue, 0, 2);
410 # Give up after three characters
411 if ($partitionend le $partitionstart) {
412 $partitionend = &unicode::substr($metavalue, 0, 3);
413 }
414 }
415 }
416
417 return $partitionend;
418}
419
420
421sub add_hlist_partition
422{
423 my $self = shift(@_);
424 my @metadata_groups = @{shift(@_)};
425 my $classifier_node = shift(@_);
426 my $partitionname = shift(@_);
427 my %metavaluetoOIDhash = %{shift(@_)};
428
429 # Create an hlist partition
430 my %child_classifier_node = ( 'Title' => $partitionname,
431 'childtype' => "VList",
432 'contains' => [] );
433
434 # Add the children to the hlist partition
435 &add_vlist($self, \@metadata_groups, \%child_classifier_node, \%metavaluetoOIDhash);
436 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
437}
438
439
440sub add_vlist
441{
442 my $self = shift(@_);
443 my @metadata_groups = @{shift(@_)};
444 my $classifier_node = shift(@_);
445 my %metavaluetoOIDhash = %{shift(@_)};
446
447 my $metadata_group = shift(@metadata_groups);
448 $classifier_node->{'mdtype'} = $metadata_group;
449
450 # Create an entry in the vlist for each value
451 foreach my $metavalue (sort(keys %metavaluetoOIDhash)) {
452 my @OIDs = @{$metavaluetoOIDhash{$metavalue}};
453
454 # If there is only one item and 'always_bookshelf' is false, add the item to the list
455 if (@OIDs == 1 && $self->{$metadata_group . ".always_bookshelf"} eq "f") {
456 my $OID = $OIDs[0];
457
458 # Find the offset of this metadata value
459 my $offset = 0;
460 my %OIDtometavalueshash = %{$self->{$metadata_group . ".list"}};
461 my @metavalues = @{$OIDtometavalueshash{$OID}};
462 for (my $i = 0; $i < scalar(@metavalues); $i++) {
463 if ($metavalue eq $metavalues[$i]) {
464 $offset = $i;
465 last;
466 }
467 }
468 push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
469 }
470
471 # Otherwise create a sublist (bookshelf) for the metadata value
472 else {
473 my %child_classifier_node = ( 'Title' => $metavalue,
474 'childtype' => "VList",
475 'contains' => [] );
476
477 # If there are metadata elements remaining, recursively apply the process
478 if (@metadata_groups > 0) {
479 my $next_metadata_group = $metadata_groups[0];
480 $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
481 &add_level($self, \@metadata_groups, \@OIDs, \%child_classifier_node);
482 }
483 # Otherwise just add the documents as children of this list
484 else {
485 # Sort the leaf nodes by the metadata elements specified with -sort_leaf_nodes_using
486 my @sort_leaf_nodes_usingmetadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
487 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_usingmetadata_groups) {
488 my %OIDtometavalueshash = %{$self->{$sort_leaf_nodes_usingmetaelem . ".list"}};
489 # Force a stable sort (Perl 5.6's sort isn't stable)
490 # !! The [0] bits aren't ideal (multiple metadata values) !!
491 @OIDs = @OIDs[ sort { $OIDtometavalueshash{$OIDs[$a]}[0] cmp $OIDtometavalueshash{$OIDs[$b]}[0] || $a <=> $b; } 0..$#OIDs ];
492 }
493
494 foreach my $OID (@OIDs) {
495 push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID });
496 }
497 }
498
499 # Add the sublist to the list
500 push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
501 }
502 }
503}
504
505
5061;
Note: See TracBrowser for help on using the repository browser.