source: trunk/gsdl/perllib/classify/AZCompactList.pm@ 1313

Last change on this file since 1313 was 1313, checked in by sjboddie, 24 years ago

Added Davids version of AZCompactList which handles multiple value
metadata

  • Property svn:keywords set to Author Date Id Revision
File size: 16.7 KB
Line 
1###########################################################################
2#
3# AZCompactList.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# classifier plugin for sorting alphabetically
27# options are:
28#
29# metadata=Metaname -- all documents with Metaname metadata
30# will be included in list, list will be sorted
31# by this element.
32# title=Title -- (optional) the title field for this classification.
33# if not included title field will be Metaname.
34# mingroup=Num -- (optional) the smallest value that will cause
35# a group in the hierarchy to form.
36# minnesting=Num -- (optional) the smallest value that will cause a
37# list to converted into nested list
38# mincompact=Num -- (optional) used in compact list
39# maxcompact=Num -- (optional) used in compact list
40# doclevel=top|section -- (optional) level to process document at.
41# onlyfirst -- (optional) control whether all or only first
42# metadata value used from array of metadata
43package AZCompactList;
44
45use sorttools;
46
47sub new {
48 my ($class, @options) = @_;
49
50 my ($metaname, $title);
51 my $mingroup = 2;
52 my $minnesting = 20;
53 my $mincompact = 10;
54 my $maxcompact = 30;
55 my $doclevel = "top";
56 my $onlyfirst = 0;
57 my $recopt = undef;
58
59 foreach $option (@options) {
60 if ($option =~ /^metadata=(.*)$/i) {
61 $metaname = $1;
62 } elsif ($option =~ /^title=(.*)$/i) {
63 $title = $1;
64 } elsif ($option =~ /^mingroup(size)?=(.*)$/i) {
65 $mingroup = $2;
66 } elsif ($option =~ /^minnesting=(.*)$/i) {
67 $minnesting = $1;
68 } elsif ($option =~ /^mincompact=(.*)$/i) {
69 $mincompact = $1;
70 } elsif ($option =~ /^maxcompact=(.*)$/i) {
71 $maxcompact = $1;
72 } elsif ($option =~ /^doclevel=(.*)$/i) {
73 $doclevel = $1;
74 } elsif ($option =~ /^onlyfirst$/i) {
75 $onlyfirst = 1;
76 } elsif ($option =~ /^recopt$/i) {
77 $recopt = "on";
78 }
79 }
80
81 die "AZCompactList used with no metadata name to classify by\n"
82 unless defined $metaname;
83 $title = $metaname unless defined $title;
84
85 return bless {
86 'list'=>{},
87 'listmetavalue'=>{},
88 'reclassify'=>{},
89 'reclassifylist'=>{},
90 'metaname' => $metaname,
91 'title' => $title,
92 'mingroup' => $mingroup,
93 'minnesting' => $minnesting,
94 'mincompact' => $mincompact,
95 'maxcompact' => $maxcompact,
96 'doclevel' => $doclevel,
97 'onlyfirst' => $onlyfirst,
98 'recopt' => $recopt
99 }, $class;
100
101
102}
103
104sub init
105{
106 my $self = shift (@_);
107
108 $self->{'list'} = {};
109 $self->{'listmetavalue'} = {};
110 $self->{'reclassify'} = {};
111 $self->{'reclassifylist'} = {};
112}
113
114$tmp = 0;
115
116sub classify
117{
118 my $self = shift (@_);
119 my ($doc_obj) = @_;
120
121 my $doc_OID = $doc_obj->get_OID();
122
123 my @sectionlist = ();
124 my $topsection = $doc_obj->get_top_section();
125
126 my $metaname = $self->{'metaname'};
127
128 $metaname =~ s/(\/.*)//; # grab first name in n1/n2/n3 list
129## print STDERR "AZCompactList: processing $doc_OID for $metaname\n";
130
131 if ($self->{'doclevel'} =~ /^top(level)?/i)
132 {
133 push(@sectionlist,$topsection);
134 }
135 else
136 {
137 my $thissection = $doc_obj->get_next_section($topsection);
138 while (defined $thissection)
139 {
140 push(@sectionlist,$thissection);
141 $thissection = $doc_obj->get_next_section ($thissection);
142 }
143 }
144
145 my $thissection;
146 foreach $thissection (@sectionlist)
147 {
148 my $full_doc_OID
149 = ($thissection ne "") ? "$doc_OID.$thissection" : $doc_OID;
150
151 if (defined $self->{'list'}->{$full_doc_OID})
152 {
153 print STDERR "WARNING: AZCompactList::classify called multiple times for $full_doc_OID\n";
154 }
155 $self->{'list'}->{$full_doc_OID} = [];
156 $self->{'listmetavalue'}->{$full_doc_OID} = [];
157
158 my $metavalues = $doc_obj->get_metadata($thissection,$metaname);
159 my $metavalue;
160 foreach $metavalue (@$metavalues)
161 {
162### print STDERR "$metaname :\tmetavalue = $metavalue\n";
163 # if this document doesn't contain the metadata element we're
164 # sorting by we won't include it in this classification
165 if (defined $metavalue && $metavalue =~ /\w/)
166 {
167 my $formatted_metavalue = $metavalue;
168 if ($self->{'metaname'} =~ m/^Creator(:.*)?$/)
169 {
170 &sorttools::format_string_name_english (\$formatted_metavalue);
171 }
172 else
173 {
174 &sorttools::format_string_english (\$formatted_metavalue);
175 }
176
177 push(@{$self->{'list'}->{$full_doc_OID}},$formatted_metavalue);
178 push(@{$self->{'listmetavalue'}->{$full_doc_OID}} ,$metavalue);
179
180 last if ($self->{'onlyfirst'});
181 }
182 }
183 my $date = $doc_obj->get_metadata_element($thissection,"Date");
184 $self->{'reclassify'}->{$full_doc_OID} = [$doc_obj,$date];
185 }
186}
187
188sub reinit
189{
190 my ($self,$classlist_ref) = @_;
191
192 my %mtfreq = ();
193 my @single_classlist = ();
194 my @multiple_classlist = ();
195
196 # find out how often each metavalue occurs
197 map
198 {
199 my $mv;
200 foreach $mv (@{$self->{'listmetavalue'}->{$_}} )
201 {
202 $mtfreq{$mv}++;
203 }
204 } @$classlist_ref;
205
206 # use this information to split the list: single metavalue/repeated value
207 map
208 {
209 my $i = 1;
210 my $metavalue;
211 foreach $metavalue (@{$self->{'listmetavalue'}->{$_}})
212 {
213 if ($mtfreq{$metavalue} >= $self->{'mingroup'})
214 {
215 push(@multiple_classlist,[$_,$i,$metavalue]);
216 }
217 else
218 {
219 push(@single_classlist,[$_,$metavalue]);
220 $metavalue =~ tr/[A-Z]/[a-z]/;
221 $self->{'reclassifylist'}->{"Metavalue_$i.$_"} = $metavalue;
222 }
223 $i++;
224 }
225 } @$classlist_ref;
226
227
228 # Setup sub-classifiers for multiple list
229
230 $self->{'classifiers'} = {};
231
232 my $pm;
233 foreach $pm ("List", "SectionList")
234 {
235 my $listname
236 = &util::filename_cat($ENV{'GSDLHOME'},"perllib/classify/$pm.pm");
237 if (-e $listname) { require $listname; }
238 else
239 {
240 die "AZCompactList ERROR - couldn't find classifier \"$listname\"\n";
241 }
242 }
243
244 # Create classifiers objects for each entry >= mingroup
245 my $metavalue;
246 foreach $metavalue (keys %mtfreq)
247 {
248 if ($mtfreq{$metavalue} >= $self->{'mingroup'})
249 {
250 my $listclassobj;
251 my $doclevel = $self->{'doclevel'};
252 my $metaname = $self->{'metaname'};
253 my @metaname_list = split('/',$metaname);
254 $metaname = shift(@metaname_list);
255 if (@metaname_list==0)
256 {
257 if ($doclevel =~ m/^top(level)?/i)
258 {
259 my $args = "\"metadata=$metaname\"";
260 $args .= ", \"title=\$metavalue\"";
261 $args .= ", \"sort=Date\"";
262 eval ("\$listclassobj = new List($args)");
263 }
264 else
265 {
266 my $args = "\"metadata=$metaname\"";
267 $args .= ", \"title=\$metavalue\"";
268 $args .= ", \"sort=Date\"";
269 eval ("\$listclassobj = new SectionList($args)");
270 }
271 }
272 else
273 {
274 $metaname = join('/',@metaname_list);
275
276 my $args = "\"metadata=$metaname\"";
277 $args .= ", \"title=\$metavalue\"";
278 $args .= ", \"doclevel=\$doclevel\"";
279 $args .= ", \"recopt\"";
280
281 eval ("\$listclassobj = new AZCompactList($args)");
282 }
283 die "$@" if $@;
284
285 $listclassobj->init();
286
287 if (defined $metavalue && $metavalue =~ /\w/)
288 {
289 my $formatted_node = $metavalue;
290 if ($self->{'metaname'} =~ m/^Creator(:.*)?$/)
291 {
292 &sorttools::format_string_name_english(\$formatted_node);
293 }
294 else
295 {
296 &sorttools::format_string_english(\$formatted_node);
297 }
298
299 $self->{'classifiers'}->{$metavalue}
300 = { 'classifyobj' => $listclassobj,
301 'formattednode' => $formatted_node };
302 }
303 }
304 }
305
306
307 return (\@single_classlist,\@multiple_classlist);
308}
309
310
311sub reclassify
312{
313 my ($self,$multiple_cl_ref) = @_;
314
315 # Entries in the current classify list that are "book nodes"
316 # should be recursively classified.
317 #--
318 foreach $dm_pair (@$multiple_cl_ref)
319 {
320 my ($doc_OID,$mdoffset,$metavalue) = @$dm_pair;
321 my $listclassobj;
322
323 # find metavalue in list of sub-classifiers
324 my $found = 0;
325 my $node_name;
326 foreach $node_name (keys %{$self->{'classifiers'}})
327 {
328 $resafe_node_name = $node_name;
329 $resafe_node_name =~ s/(\(|\)|\[|\]|\{|\}|\^|\$|\.|\+|\*|\?|\|)/\\$1/g;
330 if ($metavalue =~ m/^$resafe_node_name$/i)
331 {
332 my ($doc_obj,$date) = @{$self->{'reclassify'}->{$doc_OID}};
333
334 ## date appears to not be used in classifier call ####
335
336 if ($doc_OID =~ m/^.*\.(\d+)$/)
337 {
338 $self->{'classifiers'}->{$node_name}->{'classifyobj'}
339 ->classify($doc_obj, "Section=$1");
340 }
341 else
342 {
343 $self->{'classifiers'}->{$node_name}->{'classifyobj'}
344 ->classify($doc_obj);
345 }
346
347 $found = 1;
348 last;
349 }
350 }
351
352 if (!$found)
353 {
354 print STDERR "Warning: AZCompactList::reclassify ";
355 print STDERR "could not find sub-node for $metavalue with doc_OID $doc_OID\n";
356 }
357 }
358}
359
360
361
362sub get_reclassify_info
363{
364 my $self = shift (@_);
365
366 my $node_name;
367 foreach $node_name (keys %{$self->{'classifiers'}})
368 {
369 my $classifyinfo
370 = $self->{'classifiers'}->{$node_name}->{'classifyobj'}
371 ->get_classify_info();
372 $self->{'classifiers'}->{$node_name}->{'classifyinfo'}
373 = $classifyinfo;
374 $self->{'reclassifylist'}->{"CLASSIFY.$node_name"}
375 = $self->{'classifiers'}->{$node_name}->{'formattednode'};
376 }
377}
378
379
380sub alpha_numeric_cmp
381{
382 my ($self,$a,$b) = @_;
383
384 my $title_a = $self->{'reclassifylist'}->{$a};
385 my $title_b = $self->{'reclassifylist'}->{$b};
386
387 if ($title_a =~ m/^(\d+(\.\d+)?)/)
388 {
389 my $val_a = $1;
390 if ($title_b =~ m/^(\d+(\.\d+)?)/)
391 {
392 my $val_b = $1;
393 if ($val_a != $val_b)
394 {
395 return ($val_a <=> $val_b);
396 }
397 }
398 }
399
400 return ($title_a cmp $title_b);
401}
402
403sub get_classify_info {
404 my $self = shift (@_);
405
406 my @classlist =keys %{$self->{'list'}}; # list all doc oids
407
408 my ($single_cl_ref,$multiple_cl_ref) = $self->reinit(\@classlist);
409 $self->reclassify($multiple_cl_ref);
410 $self->get_reclassify_info();
411
412
413# my @reclassified_classlist
414# = sort {$self->{'reclassifylist'}->{$a} cmp $self->{'reclassifylist'}->{$b};} keys %{$self->{'reclassifylist'}};
415
416 # alpha_numeric_cmp is slower but handles numbers better ...
417 my @reclassified_classlist
418 = sort { $self->alpha_numeric_cmp($a,$b) } keys %{$self->{'reclassifylist'}};
419
420
421 return $self->splitlist (\@reclassified_classlist);
422}
423
424sub get_entry {
425 my $self = shift (@_);
426 my ($title, $childtype, $metaname, $thistype) = @_;
427
428 # organise into classification structure
429 my %classifyinfo = ('childtype'=>$childtype,
430 'Title'=>$title,
431 'contains'=>[],
432 'mdtype'=>$metaname);
433
434 $classifyinfo{'thistype'} = $thistype
435 if defined $thistype && $thistype =~ /\w/;
436
437 return \%classifyinfo;
438}
439
440
441
442# splitlist takes an ordered list of classifications (@$classlistref) and
443# splits it up into alphabetical sub-sections.
444sub splitlist {
445 my $self = shift (@_);
446 my ($classlistref) = @_;
447 my $classhash = {};
448
449 # top level
450 my @metanames = split("/",$self->{'metaname'});
451 my $metaname = shift(@metanames);
452
453 my $childtype = "HList";
454 $childtype = "VList" if (scalar (@$classlistref) <= $self->{'minnesting'});
455
456 my $classifyinfo;
457 if (!defined($self->{'recopt'}))
458 {
459 my $title = $metaname;
460 $classifyinfo
461 = $self->get_entry ($metaname, $childtype, $metaname, "Invisible");
462 }
463 else
464 {
465 my $title = $self->{'title'};
466 $classifyinfo
467 = $self->get_entry ($title, $childtype, $metaname, "VList");
468 }
469
470 # don't need to do any splitting if there are less than 'minnesting' classifications
471 if ((scalar @$classlistref) <= $self->{'minnesting'}) {
472 foreach $subOID (@$classlistref) {
473 if ($subOID =~ /^CLASSIFY\.(.*)$/
474 && defined $self->{'classifiers'}->{$1})
475 {
476### print STDERR "*** subOID = $subOID\n";
477
478 push (@{$classifyinfo->{'contains'}},
479 $self->{'classifiers'}->{$1}->{'classifyinfo'});
480 }
481 else
482 {
483 $subOID =~ s/^Metavalue_(\d+)\.//;
484 my $metaname_offset = $1 -1;
485 my $oid_rec = {'OID'=>$subOID, 'offset'=>$metaname_offset};
486 push (@{$classifyinfo->{'contains'}}, $oid_rec);
487 }
488 }
489 return $classifyinfo;
490 }
491
492 # first split up the list into separate A-Z and 0-9 classifications
493 foreach $classification (@$classlistref) {
494 my $title = $self->{'reclassifylist'}->{$classification};
495 $title =~ s/&(.){2,4};//g; # remove any HTML special chars
496### $title =~ s/^\s+//g; # remove a leading spaces
497### $title =~ s/^_+//g; # remove a leading underscores
498 $title =~ s/^\W+//g;
499### $title =~ s/^(\'|\`|\"|\:|\()//g; # remove any opening punctutation
500 $title =~ s/^(.).*$/$1/;
501 $title =~ tr/[a-z]/[A-Z]/;
502
503 if ($title =~ /^[0-9]$/) {$title = '0-9';}
504 elsif ($title !~ /^[A-Z]$/) {
505 print STDERR "AZCompactList: WARNING $classification has badly formatted title ($title)\n";
506 }
507 $classhash->{$title} = [] unless defined $classhash->{$title};
508 push (@{$classhash->{$title}}, $classification);
509 }
510 $classhash = $self->compactlist ($classhash);
511
512 my @tmparr = ();
513 foreach $subsection (sort keys (%$classhash)) {
514 push (@tmparr, $subsection);
515 }
516
517 # if there's a 0-9 section it will have been sorted to the beginning
518 # but we want it at the end
519 if ($tmparr[0] eq '0-9') {
520 shift @tmparr;
521 push (@tmparr, '0-9');
522 }
523
524 foreach $subclass (@tmparr)
525 {
526 my $tempclassify = $self->get_entry($subclass, "VList", $metaname);
527 foreach $subsubOID (@{$classhash->{$subclass}})
528 {
529 if ($subsubOID =~ /^CLASSIFY\.(.*)$/
530 && defined $self->{'classifiers'}->{$1})
531 {
532 push (@{$tempclassify->{'contains'}},
533 $self->{'classifiers'}->{$1}->{'classifyinfo'});
534 }
535 else
536 {
537 $subsubOID =~ s/^Metavalue_(\d+)\.//;
538 my $metaname_offset = $1 -1;
539 my $oid_rec = {'OID'=>$subsubOID, 'offset'=>$metaname_offset};
540 push (@{$tempclassify->{'contains'}}, $oid_rec);
541 }
542 }
543 push (@{$classifyinfo->{'contains'}}, $tempclassify);
544 }
545
546 return $classifyinfo;
547}
548
549sub compactlist {
550 my $self = shift (@_);
551 my ($classhashref) = @_;
552 my $compactedhash = {};
553 my @currentOIDs = ();
554 my $currentfirstletter = "";
555 my $currentlastletter = "";
556 my $lastkey = "";
557
558 # minimum and maximum documents to be displayed per page.
559 # the actual maximum will be max + (min-1).
560 # the smallest sub-section is a single letter at present
561 # so in this case there may be many times max documents
562 # displayed on a page.
563 my $min = $self->{'mincompact'};
564 my $max = $self->{'maxcompact'};
565
566 foreach $subsection (sort keys %$classhashref) {
567 if ($subsection eq '0-9') {
568 @{$compactedhash->{$subsection}} = @{$classhashref->{$subsection}};
569 next;
570 }
571 $currentfirstletter = $subsection if $currentfirstletter eq "";
572 if ((scalar (@currentOIDs) < $min) ||
573 ((scalar (@currentOIDs) + scalar (@{$classhashref->{$subsection}})) <= $max)) {
574 push (@currentOIDs, @{$classhashref->{$subsection}});
575 $currentlastletter = $subsection;
576 } else {
577
578 if ($currentfirstletter eq $currentlastletter) {
579 @{$compactedhash->{$currentfirstletter}} = @currentOIDs;
580 $lastkey = $currentfirstletter;
581 } else {
582 @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} = @currentOIDs;
583 $lastkey = "$currentfirstletter-$currentlastletter";
584 }
585 if (scalar (@{$classhashref->{$subsection}}) >= $max) {
586 $compactedhash->{$subsection} = $classhashref->{$subsection};
587 @currentOIDs = ();
588 $currentfirstletter = "";
589 } else {
590 @currentOIDs = @{$classhashref->{$subsection}};
591 $currentfirstletter = $subsection;
592 $currentlastletter = $subsection;
593 }
594 }
595 }
596
597 # add final OIDs to last sub-classification if there aren't many otherwise
598 # add final sub-classification
599 if (scalar (@currentOIDs) < $min) {
600 my ($newkey) = $lastkey =~ /^(.)/;
601 @currentOIDs = (@{$compactedhash->{$lastkey}}, @currentOIDs);
602 delete $compactedhash->{$lastkey};
603 @{$compactedhash->{"$newkey-$currentlastletter"}} = @currentOIDs;
604 } else {
605 if ($currentfirstletter eq $currentlastletter) {
606 @{$compactedhash->{$currentfirstletter}} = @currentOIDs;
607 } else {
608 @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} = @currentOIDs;
609 }
610 }
611
612 return $compactedhash;
613}
614
6151;
616
617
Note: See TracBrowser for help on using the repository browser.