source: trunk/gsdl/perllib/classify/GenericList.pm@ 11293

Last change on this file since 11293 was 10505, checked in by mdewsnip, 19 years ago

Fixed a bug in the new "-use_hlist_for" stuff. Hope I haven't messed this up!

  • Property svn:keywords set to Author Date Id Revision
File size: 15.8 KB
Line 
1###########################################################################
2#
3# GenericList.pm -- A general and flexible list classifier with most of
4# the abilities of AZCompactList, and better Unicode,
5# metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
29# TO DO: Remove punctuation from metadata values before sorting.
30#
31###########################################################################
32
33package GenericList;
34
35
36use BasClas;
37
38use strict;
39no strict 'refs'; # Allow filehandles to be variables and viceversa
40
41
42sub BEGIN {
43 @GenericList::ISA = ('BasClas');
44}
45
46
47my $arguments =
48 [ { 'name' => "metadata",
49 'desc' => "{GenericList.metadata}",
50 'type' => "metadata",
51 'reqd' => "yes" },
52 { 'name' => "buttonname",
53 'desc' => "{BasClas.buttonname}",
54 'type' => "string" },
55
56 # The interesting options
57 { 'name' => "always_bookshelf_last_level",
58 'desc' => "{GenericList.always_bookshelf_last_level}",
59 'type' => "flag" },
60 { 'name' => "classify_sections",
61 'desc' => "{GenericList.classify_sections}",
62 'type' => "flag" },
63 { 'name' => "partition_type_within_level",
64 'desc' => "{GenericList.partition_type_within_level}",
65 'type' => "string",
66 'deft' => "none" },
67 { 'name' => "partition_size_within_level",
68 'desc' => "{GenericList.partition_size_within_level}",
69 'type' => "string" },
70 { 'name' => "sort_leaf_nodes_using",
71 'desc' => "{GenericList.sort_leaf_nodes_using}",
72 'type' => "metadata",
73 'deft' => "Title" },
74 { 'name' => "use_hlist_for",
75 'desc' => "{GenericList.use_hlist_for}",
76 'type' => "string" } ];
77
78my $options = { 'name' => "GenericList",
79 'desc' => "{GenericList.desc}",
80 'abstract' => "no",
81 'inherits' => "Yes",
82 'args' => $arguments };
83
84
85sub new
86{
87 my ($class) = shift(@_);
88 my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
89 push(@$classifierslist, $class);
90
91 if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
92 if (defined $options) { push(@{$hashArgOptLists->{"OptList"}}, $options); }
93
94 my $self = (defined $hashArgOptLists) ? new BasClas($classifierslist, $inputargs, $hashArgOptLists) : new BasClas($classifierslist, $inputargs);
95 $self->{'OIDlist'} = [];
96
97 if ($self->{'info_only'}) {
98 # don't worry about any options etc
99 return bless $self, $class;
100 }
101
102 my $metadata = $self->{'metadata'};
103 my $buttonname = $self->{'buttonname'};
104 my $partition_type_within_level = $self->{'partition_type_within_level'};
105 my $partition_size_within_level = $self->{'partition_size_within_level'};
106 my $sort_leaf_nodes_using = $self->{'sort_leaf_nodes_using'};
107 my $use_hlist_for = $self->{'use_hlist_for'};
108
109 # The metadata elements to use (required)
110 # print STDERR "Creating new GenericList for $metadata...\n";
111 if (!$metadata) {
112 die "Error: No metadata fields specified for GenericList.\n";
113 }
114 my @metalist = split(/\//, $metadata);
115 $self->{'metalist'} = \@metalist;
116
117 # The classifier button name
118 if (!$buttonname) {
119 # Default: the first metadata element specified
120 my $firstmetagroupfirstelem = (split(/\;/, $metalist[0]))[0];
121 $buttonname = $self->generate_title_from_metadata($firstmetagroupfirstelem);
122 }
123 $self->{'title'} = $buttonname;
124
125 # Whether to group single items into a bookshelf (must be true for all metadata fields except the last)
126 foreach my $metagroup (@metalist) {
127 $self->{$metagroup . ".always_bookshelf"} = "t";
128 }
129 if (!$self->{'always_bookshelf_last_level'}) {
130 # Default: leave leafnodes ungrouped (equivalent to AZCompactList -mingroup 2)
131 my $lastmetagroup = $metalist[$#metalist];
132 $self->{$lastmetagroup . ".always_bookshelf"} = "f";
133 }
134
135 # Whether to use an hlist or a vlist for each level in the hierarchy
136 foreach my $metagroup (@metalist) {
137 $self->{$metagroup . ".use_hlist"} = "f";
138 }
139 foreach my $metagroup (split(/\,/, $use_hlist_for)) {
140 $self->{$metagroup . ".use_hlist"} = "t";
141 }
142
143 # How the items are grouped into partitions
144 if (!$partition_type_within_level) {
145 # Default: none
146 $partition_type_within_level = "none";
147 }
148 $self->{"partition_type_within_level"} = $partition_type_within_level;
149
150 # The number of items in each partition
151 if (!$partition_size_within_level) {
152 # Default: 20
153 foreach my $metagroup (@metalist) {
154 $self->{$metagroup . ".partition_size_within_level"} = 20;
155 }
156 }
157 else {
158 my @partition_size_within_levellist = split(/\//, $partition_size_within_level);
159
160 # Assign values based on the partition_size_within_level parameter
161 foreach my $metagroup (@metalist) {
162 my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
163 if (defined($partition_size_within_levelelem)) {
164 $self->{$metagroup . ".partition_size_within_level"} = $partition_size_within_levelelem;
165 }
166 else {
167 $self->{$metagroup . ".partition_size_within_level"} = $self->{$metalist[0] . ".partition_size_within_level"};
168 }
169 }
170 }
171
172 # The metadata elements to use to sort the leaf nodes
173 my @sort_leaf_nodes_usingmetalist = ( "Title" );
174 if ($sort_leaf_nodes_using) {
175 @sort_leaf_nodes_usingmetalist = split(/\|/, $sort_leaf_nodes_using);
176 }
177 $self->{'sort_leaf_nodes_usingmetalist'} = \@sort_leaf_nodes_usingmetalist;
178
179 return bless $self, $class;
180}
181
182
183sub init
184{
185 # Nothing to do...
186}
187
188
189sub classify
190{
191 my $self = shift(@_);
192 my $doc_obj = shift(@_);
193
194 my $doc_OID = $doc_obj->get_OID();
195
196 if ($self->{'classify_sections'}) {
197 my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
198 while (defined $section) {
199 $self->classify_section($doc_obj, $doc_OID . ".$section", $section);
200 $section = $doc_obj->get_next_section($section);
201 }
202 }
203 else {
204 $self->classify_section($doc_obj, $doc_OID, $doc_obj->get_top_section());
205 }
206}
207
208
209sub classify_section
210{
211 my $self = shift(@_);
212 my $doc_obj = shift(@_);
213 my $doc_OID = shift(@_);
214 my $section = shift(@_);
215
216 my @metalist = @{$self->{'metalist'}};
217
218 # Only classify the document if it has a value for one of the metadata elements in the first group
219 foreach my $firstmetagroupelem (split(/\;/, $metalist[0])) {
220 my $firstmetagroupelemvalue = $doc_obj->get_metadata_element($section, $firstmetagroupelem);
221 if (defined($firstmetagroupelemvalue) && $firstmetagroupelemvalue ne "") {
222 push(@{$self->{'OIDlist'}}, $doc_OID);
223
224 # Create a hash for the metadata values of each metadata element we're interested in
225 my %metagroupsdone = ();
226 foreach my $metagroup (@metalist, @{$self->{'sort_leaf_nodes_usingmetalist'}}) {
227 # Take care not to do a metadata group more than once
228 unless ($metagroupsdone{$metagroup}) {
229 foreach my $metaelem (split(/\;/, $metagroup)) {
230 my @metavalues = @{$doc_obj->get_metadata($section, $metaelem)};
231 foreach my $metavalue (@metavalues) {
232 # Strip leading and trailing whitespace
233 $metavalue =~ s/^\s*//;
234 $metavalue =~ s/\s*$//;
235 push(@{$self->{$metagroup . ".list"}->{$doc_OID}}, $metavalue);
236 }
237 last if (@metavalues > 0);
238 }
239
240 $metagroupsdone{$metagroup} = 1;
241 }
242 }
243
244 last;
245 }
246 }
247}
248
249
250sub get_classify_info
251{
252 my $self = shift(@_);
253
254 # The metadata elements to classify by
255 my @metalist = @{$self->{'metalist'}};
256 my $firstmetagroup = $metalist[0];
257
258 # The OID values of the documents to include in the classification
259 my @OIDlist = @{$self->{'OIDlist'}};
260
261 # The root node of the classification hierarchy
262 my $childtype = (($self->{$firstmetagroup . ".use_hlist"} eq "t") ? "HList" : "VList");
263 my %classifyinfo = ( 'thistype' => "Invisible",
264 'childtype' => $childtype,
265 'Title' => $self->{'title'},
266 'contains' => [] );
267
268 # Recursively create the classification hierarchy, one level for each metadata element
269 &add_az_list($self, \@metalist, \@OIDlist, \%classifyinfo);
270 return \%classifyinfo;
271}
272
273
274sub add_az_list
275{
276 my $self = shift(@_);
277 my @metalist = @{shift(@_)};
278 my @OIDlist = @{shift(@_)};
279 my $classifyinfo = shift(@_);
280 # print STDERR "\nAdding AZ list for " . $classifyinfo->{'Title'} . "\n";
281
282 my $metagroup = $metalist[0];
283 # print STDERR "Processing metadata group: " . $metagroup . "\n";
284 # print STDERR "Number of OID values: " . @OIDlist . "\n";
285
286 my %OIDtometavaluehash = %{$self->{$metagroup . ".list"}};
287
288 # Create a mapping from metadata value to OID
289 my %metavaluetoOIDhash = ();
290 foreach my $OID (@OIDlist) {
291 if ($OIDtometavaluehash{$OID}) {
292 my @metavalues = @{$OIDtometavaluehash{$OID}};
293 foreach my $metavalue (@metavalues) {
294 push(@{$metavaluetoOIDhash{$metavalue}}, $OID);
295 }
296 }
297 }
298 # print STDERR "Number of distinct values: " . scalar(keys %metavaluetoOIDhash) . "\n";
299
300 # Partition the values (if necessary)
301 my $partition_type_within_level = $self->{"partition_type_within_level"};
302 if ($partition_type_within_level =~ /^per_letter$/i) {
303 # Generate one hlist for each letter
304 my @sortedmetavalues = sort(keys %metavaluetoOIDhash);
305 my %metavaluetoOIDsubhash = ();
306
307 my $lastpartition = &unicode::substr($sortedmetavalues[0], 0, 1);
308 foreach my $metavalue (@sortedmetavalues) {
309 my $metavaluepartition = &unicode::substr($metavalue, 0, 1);
310
311 # Is this the start of a new partition?
312 if ($metavaluepartition ne $lastpartition) {
313 &add_hlist_partition($self, \@metalist, $classifyinfo, $lastpartition, \%metavaluetoOIDsubhash);
314 %metavaluetoOIDsubhash = ();
315 $lastpartition = $metavaluepartition;
316 }
317
318 $metavaluetoOIDsubhash{$metavalue} = $metavaluetoOIDhash{$metavalue};
319 }
320
321 # Don't forget to add the last partition
322 &add_hlist_partition($self, \@metalist, $classifyinfo, $lastpartition, \%metavaluetoOIDsubhash);
323
324 # The partitions are stored in an HList
325 $classifyinfo->{'childtype'} = "HList";
326 }
327
328 else {
329 # Generate hlists of a certain size
330 my $partition_size_within_level = $self->{$metagroup . ".partition_size_within_level"};
331 if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metavaluetoOIDhash) > $partition_size_within_level) {
332 my @sortedmetavalues = sort(keys %metavaluetoOIDhash);
333 my $itemsdone = 0;
334 my %metavaluetoOIDsubhash = ();
335 my $lastpartitionend = "";
336 my $partitionstart;
337 foreach my $metavalue (@sortedmetavalues) {
338 $metavaluetoOIDsubhash{$metavalue} = $metavaluetoOIDhash{$metavalue};
339 $itemsdone++;
340 my $itemsinpartition = scalar(keys %metavaluetoOIDsubhash);
341
342 # Is this the start of a new partition?
343 if ($itemsinpartition == 1) {
344 $partitionstart = &generate_partition_start($metavalue, $lastpartitionend);
345 }
346
347 # Is this the end of the partition?
348 if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetavalues) {
349 my $partitionend = &generate_partition_end($metavalue, $partitionstart);
350 my $partitionname = $partitionstart;
351 if ($partitionend ne $partitionstart) {
352 $partitionname = $partitionname . "-" . $partitionend;
353 }
354
355 &add_hlist_partition($self, \@metalist, $classifyinfo, $partitionname, \%metavaluetoOIDsubhash);
356 %metavaluetoOIDsubhash = ();
357 $lastpartitionend = $partitionend;
358 }
359 }
360
361 # The partitions are stored in an HList
362 $classifyinfo->{'childtype'} = "HList";
363 }
364
365 # Otherwise just add all the values to a VList
366 else {
367 &add_vlist($self, \@metalist, $classifyinfo, \%metavaluetoOIDhash);
368 }
369 }
370}
371
372
373sub unicode_length
374{
375 my $utf8string = shift(@_);
376
377 my @unicodestring = @{&unicode::utf82unicode($utf8string)};
378 return scalar(@unicodestring);
379}
380
381
382sub generate_partition_start
383{
384 my $metavalue = shift(@_);
385 my $lastpartitionend = shift(@_);
386
387 my $partitionstart = &unicode::substr($metavalue, 0, 1);
388 if ($partitionstart le $lastpartitionend) {
389 $partitionstart = &unicode::substr($metavalue, 0, 2);
390 # Give up after three characters
391 if ($partitionstart le $lastpartitionend) {
392 $partitionstart = &unicode::substr($metavalue, 0, 3);
393 }
394 }
395
396 return $partitionstart;
397}
398
399
400sub generate_partition_end
401{
402 my $metavalue = shift(@_);
403 my $partitionstart = shift(@_);
404
405 my $partitionend = &unicode::substr($metavalue, 0, &unicode_length($partitionstart));
406 if ($partitionend gt $partitionstart) {
407 $partitionend = &unicode::substr($metavalue, 0, 1);
408 if ($partitionend le $partitionstart) {
409 $partitionend = &unicode::substr($metavalue, 0, 2);
410 # Give up after three characters
411 if ($partitionend le $partitionstart) {
412 $partitionend = &unicode::substr($metavalue, 0, 3);
413 }
414 }
415 }
416
417 return $partitionend;
418}
419
420
421sub add_hlist_partition
422{
423 my $self = shift(@_);
424 my @metalist = @{shift(@_)};
425 my $classifyinfo = shift(@_);
426 my $partitionname = shift(@_);
427 my %metavaluetoOIDhash = %{shift(@_)};
428
429 # Create an hlist partition
430 my %subclassifyinfo = ( 'Title' => $partitionname,
431 'childtype' => "VList",
432 'contains' => [] );
433
434 # Add the children to the hlist partition
435 &add_vlist($self, \@metalist, \%subclassifyinfo, \%metavaluetoOIDhash);
436 push(@{$classifyinfo->{'contains'}}, \%subclassifyinfo);
437}
438
439
440sub add_vlist
441{
442 my $self = shift(@_);
443 my @metalist = @{shift(@_)};
444 my $classifyinfo = shift(@_);
445 my %metavaluetoOIDhash = %{shift(@_)};
446
447 my $metagroup = shift(@metalist);
448
449 # Create an entry in the vlist for each value
450 foreach my $metavalue (sort(keys %metavaluetoOIDhash)) {
451 my @OIDlist = @{$metavaluetoOIDhash{$metavalue}};
452
453 # If there is only one item and 'always_bookshelf' is false, add the item to the list
454 if (@OIDlist == 1 && $self->{$metagroup . ".always_bookshelf"} eq "f") {
455 push(@{$classifyinfo->{'contains'}}, { 'OID' => $OIDlist[0] });
456 }
457
458 # Otherwise create a sublist (bookshelf) for the metadata value
459 else {
460 my %subclassifyinfo = ( 'Title' => $metavalue,
461 'childtype' => "VList",
462 'contains' => [] );
463
464 # If there are metadata elements remaining, recursively apply the process
465 if (@metalist > 0) {
466 my $nextmetagroup = $metalist[0];
467 my $childtype = (($self->{$nextmetagroup . ".use_hlist"} eq "t") ? "HList" : "VList");
468 $subclassifyinfo{'childtype'} = $childtype;
469 &add_az_list($self, \@metalist, \@OIDlist, \%subclassifyinfo);
470 }
471 # Otherwise just add the documents as children of this list
472 else {
473 # Sort the leaf nodes by the metadata elements specified with -sort_leaf_nodes_using
474 my @sort_leaf_nodes_usingmetalist = @{$self->{'sort_leaf_nodes_usingmetalist'}};
475 foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_usingmetalist) {
476 my %OIDtometavaluehash = %{$self->{$sort_leaf_nodes_usingmetaelem . ".list"}};
477 # Force a stable sort (Perl 5.6's sort isn't stable)
478 # !! The [0] bits aren't ideal (multiple metadata values) !!
479 @OIDlist = @OIDlist[ sort { $OIDtometavaluehash{$OIDlist[$a]}[0] cmp $OIDtometavaluehash{$OIDlist[$b]}[0] || $a <=> $b; } 0..$#OIDlist ];
480 }
481
482 foreach my $OID (@OIDlist) {
483 push(@{$subclassifyinfo{'contains'}}, { 'OID' => $OID });
484 }
485 }
486
487 # Add the sublist to the list
488 push(@{$classifyinfo->{'contains'}}, \%subclassifyinfo);
489 }
490 }
491}
492
493
4941;
Note: See TracBrowser for help on using the repository browser.