source: trunk/gsdl/perllib/classify/GenericList.pm@ 12891

Last change on this file since 12891 was 12891, checked in by mdewsnip, 18 years ago

Tidied up that horrible long line in the new() function of every classifier.

  • Property svn:keywords set to Author Date Id Revision
File size: 15.9 KB
Line 
1###########################################################################
2#
3# GenericList.pm -- A general and flexible list classifier with most of
4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
29# TO DO: Remove punctuation from metadata values before sorting.
30#
31###########################################################################
32
33package GenericList;
34
35
36use BasClas;
37
38use strict;
39
40
41sub BEGIN {
42 @GenericList::ISA = ('BasClas');
43}
44
45
46my $arguments =
47 [ { 'name' => "metadata",
48 'desc' => "{GenericList.metadata}",
49 'type' => "metadata",
50 'reqd' => "yes" },
51
52 # The interesting options
53 { 'name' => "always_bookshelf_last_level",
54 'desc' => "{GenericList.always_bookshelf_last_level}",
55 'type' => "flag" },
56 { 'name' => "classify_sections",
57 'desc' => "{GenericList.classify_sections}",
58 'type' => "flag" },
59 { 'name' => "partition_type_within_level",
60 'desc' => "{GenericList.partition_type_within_level}",
61 'type' => "string",
62 'deft' => "none" },
63 { 'name' => "partition_size_within_level",
64 'desc' => "{GenericList.partition_size_within_level}",
65 'type' => "string" },
66 { 'name' => "sort_leaf_nodes_using",
67 'desc' => "{GenericList.sort_leaf_nodes_using}",
68 'type' => "metadata",
69 'deft' => "Title" },
70 { 'name' => "use_hlist_for",
71 'desc' => "{GenericList.use_hlist_for}",
72 'type' => "string" } ];
73
74my $options = { 'name' => "GenericList",
75 'desc' => "{GenericList.desc}",
76 'abstract' => "no",
77 'inherits' => "Yes",
78 'args' => $arguments };
79
80
81sub new
82{
83 my ($class) = shift(@_);
84 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
85 push(@$classifierslist, $class);
86
87 if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
88 if (defined $options) { push(@{$hashArgOptLists->{"OptList"}}, $options); }
89
90 my $self = new BasClas($classifierslist, $inputargs, $hashArgOptLists);
91 $self->{'OIDs'} = [];
92
93 if ($self->{'info_only'}) {
94 # don't worry about any options etc
95 return bless $self, $class;
96 }
97
98 my $metadata = $self->{'metadata'};
99 my $buttonname = $self->{'buttonname'};
100 my $partition_type_within_level = $self->{'partition_type_within_level'};
101 my $partition_size_within_level = $self->{'partition_size_within_level'};
102 my $sort_leaf_nodes_using = $self->{'sort_leaf_nodes_using'};
103 my $use_hlist_for = $self->{'use_hlist_for'};
104
105 # The metadata elements to use (required)
106 # print STDERR "Creating new GenericList for $metadata...\n";
107 if (!$metadata) {
108 die "Error: No metadata fields specified for GenericList.\n";
109 }
110 my @metadata_groups = split(/\//, $metadata);
111 $self->{'metadata_groups'} = \@metadata_groups;
112
113 # The classifier button name
114 if (!$buttonname) {
115 # Default: the first metadata element specified
116 my $firstmetagroupfirstelem = (split(/\;/, $metadata_groups[0]))[0];
117 $buttonname = $self->generate_title_from_metadata($firstmetagroupfirstelem);
118 }
119 $self->{'title'} = $buttonname;
120
121 # Whether to group single items into a bookshelf (must be true for all metadata fields except the last)
122 foreach my $metadata_group (@metadata_groups) {
123 $self->{$metadata_group . ".always_bookshelf"} = "t";
124 }
125 if (!$self->{'always_bookshelf_last_level'}) {
126 # Default: leave leafnodes ungrouped (equivalent to AZCompactList -mingroup 2)
127 my $last_metadata_group = $metadata_groups[$#metadata_groups];
128 $self->{$last_metadata_group . ".always_bookshelf"} = "f";
129 }
130
131 # Whether to use an hlist or a vlist for each level in the hierarchy
132 foreach my $metadata_group (@metadata_groups) {
133 $self->{$metadata_group . ".use_hlist"} = "f";
134 }
135 foreach my $metadata_group (split(/\,/, $use_hlist_for)) {
136 $self->{$metadata_group . ".use_hlist"} = "t";
137 }
138
139 # How the items are grouped into partitions
140 if (!$partition_type_within_level) {
141 # Default: none
142 $partition_type_within_level = "none";
143 }
144 $self->{"partition_type_within_level"} = $partition_type_within_level;
145
146 # The number of items in each partition
147 if (!$partition_size_within_level) {
148 # Default: 20
149 foreach my $metadata_group (@metadata_groups) {
150 $self->{$metadata_group . ".partition_size_within_level"} = 20;
151 }
152 }
153 else {
154 my @partition_size_within_levellist = split(/\//, $partition_size_within_level);
155
156 # Assign values based on the partition_size_within_level parameter
157 foreach my $metadata_group (@metadata_groups) {
158 my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
159 if (defined($partition_size_within_levelelem)) {
160 $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
161 }
162 else {
163 $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
164 }
165 }
166 }
167
168 # The metadata elements to use to sort the leaf nodes
169 my @sort_leaf_nodes_usingmetadata_groups = ( "Title" );
170 if ($sort_leaf_nodes_using) {
171 @sort_leaf_nodes_usingmetadata_groups = split(/\|/, $sort_leaf_nodes_using);
172 }
173 $self->{'sort_leaf_nodes_usingmetadata_groups'} = \@sort_leaf_nodes_usingmetadata_groups;
174
175 return bless $self, $class;
176}
177
178
179sub init
180{
181 # Nothing to do...
182}
183
184
185sub classify
186{
187 my $self = shift(@_);
188 my $doc_obj = shift(@_);
189
190 my $doc_OID = $doc_obj->get_OID();
191
192 if ($self->{'classify_sections'}) {
193 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
194 while (defined $section) {
195 $self->classify_section($doc_obj, $doc_OID . ".$section", $section);
196 $section = $doc_obj->get_next_section($section);
197 }
198 }
199 else {
200 $self->classify_section($doc_obj, $doc_OID, $doc_obj->get_top_section());
201 }
202}
203
204
205sub classify_section
206{
207 my $self = shift(@_);
208 my $doc_obj = shift(@_);
209 my $doc_OID = shift(@_);
210 my $section = shift(@_);
211
212 my @metadata_groups = @{$self->{'metadata_groups'}};
213
214 # Only classify the document if it has a value for one of the metadata elements in the first group
215 foreach my $firstmetagroupelem (split(/\;/, $metadata_groups[0])) {
216 my $firstmetagroupelemvalue = $doc_obj->get_metadata_element($section, $firstmetagroupelem);
217 if (defined($firstmetagroupelemvalue) && $firstmetagroupelemvalue ne "") {
218 push(@{$self->{'OIDs'}}, $doc_OID);
219
220 # Create a hash for the metadata values of each metadata element we're interested in
221 my %metagroupsdone = ();
222 foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_usingmetadata_groups'}}) {
223 # Take care not to do a metadata group more than once
224 unless ($metagroupsdone{$metadata_group}) {
225 foreach my $metaelem (split(/\;/, $metadata_group)) {
226 my @metavalues = @{$doc_obj->get_metadata($section, $metaelem)};
227 foreach my $metavalue (@metavalues) {
228 # Strip leading and trailing whitespace
229 $metavalue =~ s/^\s*//;
230 $metavalue =~ s/\s*$//;
231 push(@{$self->{$metadata_group . ".list"}->{$doc_OID}}, $metavalue);
232 }
233 last if (@metavalues > 0);
234 }
235
236 $metagroupsdone{$metadata_group} = 1;
237 }
238 }
239
240 last;
241 }
242 }
243}
244
245
246sub get_classify_info
247{
248 my $self = shift(@_);
249
250 # The metadata elements to classify by
251 my @metadata_groups = @{$self->{'metadata_groups'}};
252 my $first_metadata_group = $metadata_groups[0];
253
254 # The OID values of the documents to include in the classification
255 my @OIDs = @{$self->{'OIDs'}};
256
257 # The root node of the classification hierarchy
258 my $childtype = (($self->{$first_metadata_group . ".use_hlist"} eq "t") ? "HList" : "VList");
259 my %classifyinfo = ( 'thistype' => "Invisible",
260 'childtype' => $childtype,
261 'Title' => $self->{'title'},
262 'contains' => [] );
263
264 # Recursively create the classification hierarchy, one level for each metadata element
265 &add_az_list($self, \@metadata_groups, \@OIDs, \%classifyinfo);
266 return \%classifyinfo;
267}
268
269
270sub add_az_list
271{
272 my $self = shift(@_);
273 my @metadata_groups = @{shift(@_)};
274 my @OIDs = @{shift(@_)};
275 my $classifyinfo = shift(@_);
276 # print STDERR "\nAdding AZ list for " . $classifyinfo->{'Title'} . "\n";
277
278 my $metadata_group = $metadata_groups[0];
279 # print STDERR "Processing metadata group: " . $metadata_group . "\n";
280 # print STDERR "Number of OID values: " . @OIDs . "\n";
281
282 my %OIDtometavaluehash = %{$self->{$metadata_group . ".list"}};
283
284 # Create a mapping from metadata value to OID
285 my %metavaluetoOIDhash = ();
286 foreach my $OID (@OIDs) {
287 if ($OIDtometavaluehash{$OID}) {
288 my @metavalues = @{$OIDtometavaluehash{$OID}};
289 foreach my $metavalue (@metavalues) {
290 push(@{$metavaluetoOIDhash{$metavalue}}, $OID);
291 }
292 }
293 }
294 # print STDERR "Number of distinct values: " . scalar(keys %metavaluetoOIDhash) . "\n";
295
296 # Partition the values (if necessary)
297 my $partition_type_within_level = $self->{"partition_type_within_level"};
298 if ($partition_type_within_level =~ /^per_letter$/i) {
299 # Generate one hlist for each letter
300 my @sortedmetavalues = sort(keys %metavaluetoOIDhash);
301 my %metavaluetoOIDsubhash = ();
302
303 my $lastpartition = &unicode::substr($sortedmetavalues[0], 0, 1);
304 foreach my $metavalue (@sortedmetavalues) {
305 my $metavaluepartition = &unicode::substr($metavalue, 0, 1);
306
307 # Is this the start of a new partition?
308 if ($metavaluepartition ne $lastpartition) {
309 &add_hlist_partition($self, \@metadata_groups, $classifyinfo, $lastpartition, \%metavaluetoOIDsubhash);
310 %metavaluetoOIDsubhash = ();
311 $lastpartition = $metavaluepartition;
312 }
313
314 $metavaluetoOIDsubhash{$metavalue} = $metavaluetoOIDhash{$metavalue};
315 }
316
317 # Don't forget to add the last partition
318 &add_hlist_partition($self, \@metadata_groups, $classifyinfo, $lastpartition, \%metavaluetoOIDsubhash);
319
320 # The partitions are stored in an HList
321 $classifyinfo->{'childtype'} = "HList";
322 }
323
324 else {
325 # Generate hlists of a certain size
326 my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
327 if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metavaluetoOIDhash) > $partition_size_within_level) {
328 my @sortedmetavalues = sort(keys %metavaluetoOIDhash);
329 my $itemsdone = 0;
330 my %metavaluetoOIDsubhash = ();
331 my $lastpartitionend = "";
332 my $partitionstart;
333 foreach my $metavalue (@sortedmetavalues) {
334 $metavaluetoOIDsubhash{$metavalue} = $metavaluetoOIDhash{$metavalue};
335 $itemsdone++;
336 my $itemsinpartition = scalar(keys %metavaluetoOIDsubhash);
337
338 # Is this the start of a new partition?
339 if ($itemsinpartition == 1) {
340 $partitionstart = &generate_partition_start($metavalue, $lastpartitionend);
341 }
342
343 # Is this the end of the partition?
344 if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetavalues) {
345 my $partitionend = &generate_partition_end($metavalue, $partitionstart);
346 my $partitionname = $partitionstart;
347 if ($partitionend ne $partitionstart) {
348 $partitionname = $partitionname . "-" . $partitionend;
349 }
350
351 &add_hlist_partition($self, \@metadata_groups, $classifyinfo, $partitionname, \%metavaluetoOIDsubhash);
352 %metavaluetoOIDsubhash = ();
353 $lastpartitionend = $partitionend;
354 }
355 }
356
357 # The partitions are stored in an HList
358 $classifyinfo->{'childtype'} = "HList";
359 }
360
361 # Otherwise just add all the values to a VList
362 else {
363 &add_vlist($self, \@metadata_groups, $classifyinfo, \%metavaluetoOIDhash);
364 }
365 }
366}
367
368
369sub unicode_length
370{
371 my $utf8string = shift(@_);
372
373 my @unicodestring = @{&unicode::utf82unicode($utf8string)};
374 return scalar(@unicodestring);
375}
376
377
378sub generate_partition_start
379{
380 my $metavalue = shift(@_);
381 my $lastpartitionend = shift(@_);
382
383 my $partitionstart = &unicode::substr($metavalue, 0, 1);
384 if ($partitionstart le $lastpartitionend) {
385 $partitionstart = &unicode::substr($metavalue, 0, 2);
386 # Give up after three characters
387 if ($partitionstart le $lastpartitionend) {
388 $partitionstart = &unicode::substr($metavalue, 0, 3);
389 }
390 }
391
392 return $partitionstart;
393}
394
395
396sub generate_partition_end
397{
398 my $metavalue = shift(@_);
399 my $partitionstart = shift(@_);
400
401 my $partitionend = &unicode::substr($metavalue, 0, &unicode_length($partitionstart));
402 if ($partitionend gt $partitionstart) {
403 $partitionend = &unicode::substr($metavalue, 0, 1);
404 if ($partitionend le $partitionstart) {
405 $partitionend = &unicode::substr($metavalue, 0, 2);
406 # Give up after three characters
407 if ($partitionend le $partitionstart) {
408 $partitionend = &unicode::substr($metavalue, 0, 3);
409 }
410 }
411 }
412
413 return $partitionend;
414}
415
416
417sub add_hlist_partition
418{
419 my $self = shift(@_);
420 my @metadata_groups = @{shift(@_)};
421 my $classifyinfo = shift(@_);
422 my $partitionname = shift(@_);
423 my %metavaluetoOIDhash = %{shift(@_)};
424
425 # Create an hlist partition
426 my %subclassifyinfo = ( 'Title' => $partitionname,
427 'childtype' => "VList",
428 'contains' => [] );
429
430 # Add the children to the hlist partition
431 &add_vlist($self, \@metadata_groups, \%subclassifyinfo, \%metavaluetoOIDhash);
432 push(@{$classifyinfo->{'contains'}}, \%subclassifyinfo);
433}
434
435
436sub add_vlist
437{
438 my $self = shift(@_);
439 my @metadata_groups = @{shift(@_)};
440 my $classifyinfo = shift(@_);
441 my %metavaluetoOIDhash = %{shift(@_)};
442
443 my $metadata_group = shift(@metadata_groups);
444
445 # Create an entry in the vlist for each value
446 foreach my $metavalue (sort(keys %metavaluetoOIDhash)) {
447 my @OIDs = @{$metavaluetoOIDhash{$metavalue}};
448
449 # If there is only one item and 'always_bookshelf' is false, add the item to the list
450 if (@OIDs == 1 && $self->{$metadata_group . ".always_bookshelf"} eq "f") {
451 push(@{$classifyinfo->{'contains'}}, { 'OID' => $OIDs[0] });
452 }
453
454 # Otherwise create a sublist (bookshelf) for the metadata value
455 else {
456 my %subclassifyinfo = ( 'Title' => $metavalue,
457 'childtype' => "VList",
458 'contains' => [] );
459
460 # If there are metadata elements remaining, recursively apply the process
461 if (@metadata_groups > 0) {
462 my $next_metadata_group = $metadata_groups[0];
463 my $childtype = (($self->{$next_metadata_group . ".use_hlist"} eq "t") ? "HList" : "VList");
464 $subclassifyinfo{'childtype'} = $childtype;
465 &add_az_list($self, \@metadata_groups, \@OIDs, \%subclassifyinfo);
466 }
467 # Otherwise just add the documents as children of this list
468 else {
469 # Sort the leaf nodes by the metadata elements specified with -sort_leaf_nodes_using
470 my @sort_leaf_nodes_usingmetadata_groups = @{$self->{'sort_leaf_nodes_usingmetadata_groups'}};
471 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_usingmetadata_groups) {
472 my %OIDtometavaluehash = %{$self->{$sort_leaf_nodes_usingmetaelem . ".list"}};
473 # Force a stable sort (Perl 5.6's sort isn't stable)
474 # !! The [0] bits aren't ideal (multiple metadata values) !!
475 @OIDs = @OIDs[ sort { $OIDtometavaluehash{$OIDs[$a]}[0] cmp $OIDtometavaluehash{$OIDs[$b]}[0] || $a <=> $b; } 0..$#OIDs ];
476 }
477
478 foreach my $OID (@OIDs) {
479 push(@{$subclassifyinfo{'contains'}}, { 'OID' => $OID });
480 }
481 }
482
483 # Add the sublist to the list
484 push(@{$classifyinfo->{'contains'}}, \%subclassifyinfo);
485 }
486 }
487}
488
489
4901;
Note: See TracBrowser for help on using the repository browser.