source: trunk/gsdl/perllib/classify/AZCompactList.pm@ 2173

Last change on this file since 2173 was 2080, checked in by jrm21, 23 years ago

When creating nodes, now need to pass -buttonname instead of -title.

  • Property svn:keywords set to Author Date Id Revision
File size: 18.4 KB
Line 
1###########################################################################
2#
3# AZCompactList.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# classifier plugin for sorting alphabetically
27# options are:
28#
29# metadata=Metaname -- all documents with Metaname metadata
30# will be included in list, list will be sorted
31# by this element.
32# buttonname=Title -- (optional) the title field for this classification.
33# if not included title field will be Metaname.
34# mingroup=Num -- (optional) the smallest value that will cause
35# a group in the hierarchy to form.
36# minnesting=Num -- (optional) the smallest value that will cause a
37# list to converted into nested list
38# mincompact=Num -- (optional) used in compact list
39# maxcompact=Num -- (optional) used in compact list
40# doclevel=top|section -- (optional) level to process document at.
41# onlyfirst -- (optional) control whether all or only first
42# metadata value used from array of metadata
43package AZCompactList;
44
45use BasClas;
46use sorttools;
47
48sub BEGIN {
49 @ISA = ('BasClas');
50}
51
52sub print_usage {
53 print STDERR "
54 usage: classify AZCompactList [options]
55 options:
56 -metadata Metadata field used for classification
57 -buttonname Title to use on web pages
58
59 -mingroup
60 -minnesting
61 -mincompact
62 -maxcompact
63 -doclevel
64 -onlyfirst
65 -recopt
66";
67}
68
69sub new {
70 my $class = shift (@_);
71 my $self = new BasClas($class, @_);
72
73 my ($metaname, $title);
74 my $mingroup = 2;
75 my $minnesting = 20;
76 my $mincompact = 10;
77 my $maxcompact = 30;
78 my $doclevel = "top";
79 my $onlyfirst = 0;
80 my $recopt = undef;
81
82 if (!parsargv::parse(\@_,
83 q^metadata/.*/^, \$metaname,
84 q^buttonname/.*/^, \$title,
85
86 q^mingroup/.*/2^, \$mingroup,
87 q^minnesting/.*/20^, \$minnesting,
88 q^mincompact/.*/10^, \$mincompact,
89 q^maxcompact/.*/30^, \$maxcompact,
90 q^doclevel/.*/top^, \$doclevel,
91 q^onlyfirst/.*/0^, \$onlyfirst,
92 q^recopt/.*/-1^, \$recopt,
93
94 "allow_extra_options")) {
95
96 print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
97 &print_usage();
98 die "\n";
99 }
100
101# These are the old-fashioned optins to AZCompactList.
102# They should be deleted once we're sure the parsearg version is working.
103# foreach $option (@options) {
104# if ($option =~ /^metadata=(.*)$/i) {
105# $metaname = $1;
106# } elsif ($option =~ /^title=(.*)$/i) {
107# $title = $1;
108# } elsif ($option =~ /^mingroup(size)?=(.*)$/i) {
109# $mingroup = $2;
110# } elsif ($option =~ /^minnesting=(.*)$/i) {
111# $minnesting = $1;
112# } elsif ($option =~ /^mincompact=(.*)$/i) {
113# $mincompact = $1;
114# } elsif ($option =~ /^maxcompact=(.*)$/i) {
115# $maxcompact = $1;
116# } elsif ($option =~ /^doclevel=(.*)$/i) {
117# $doclevel = $1;
118# } elsif ($option =~ /^onlyfirst$/i) {
119# $onlyfirst = 1;
120# } elsif ($option =~ /^recopt$/i) {
121# $recopt = "on";
122# }
123# }
124
125 if (!defined $metaname) {
126 my $outhandle = $self->{'outhandle'};
127 print $outhandle "AZCompactList used with no metadata name to classify by\n";
128 die "\n";
129 }
130
131 $title = $metaname unless ($title);
132
133 $self->{'list'} = {};
134 $self->{'listmetavalue'} = {};
135 $self->{'reclassify'} = {};
136 $self->{'reclassifylist'} = {};
137 $self->{'metaname'} = $metaname;
138 $self->{'title'} = "$title"; # title for the titlebar.
139 $self->{'mingroup'} = $mingroup;
140 $self->{'minnesting'} = $minnesting;
141 $self->{'mincompact'} = $mincompact;
142 $self->{'maxcompact'} = $maxcompact;
143 $self->{'doclevel'} = $doclevel;
144
145 if ($onlyfirst != 0) {
146 $onlyfirst = 1;
147 }
148 $self->{'onlyfirst'} = $onlyfirst;
149
150 if ($recopt == -1) {
151 $recopt = undef;
152 } else {
153 $recopt = "on";
154 }
155 $self->{'recopt'} = $recopt;
156
157 return bless $self, $class;
158}
159
160sub init
161{
162 my $self = shift (@_);
163
164 $self->{'list'} = {};
165 $self->{'listmetavalue'} = {};
166 $self->{'reclassify'} = {};
167 $self->{'reclassifylist'} = {};
168}
169
170$tmp = 0;
171
172sub classify
173{
174 my $self = shift (@_);
175 my ($doc_obj) = @_;
176
177 my $doc_OID = $doc_obj->get_OID();
178
179 my @sectionlist = ();
180 my $topsection = $doc_obj->get_top_section();
181
182 my $metaname = $self->{'metaname'};
183 my $outhandle = $self->{'outhandle'};
184
185 $metaname =~ s/(\/.*)//; # grab first name in n1/n2/n3 list
186
187 if ($self->{'doclevel'} =~ /^top(level)?/i)
188 {
189 push(@sectionlist,$topsection);
190 }
191 else
192 {
193 my $thissection = $doc_obj->get_next_section($topsection);
194 while (defined $thissection)
195 {
196 push(@sectionlist,$thissection);
197 $thissection = $doc_obj->get_next_section ($thissection);
198 }
199 }
200
201 my $thissection;
202 foreach $thissection (@sectionlist)
203 {
204 my $full_doc_OID
205 = ($thissection ne "") ? "$doc_OID.$thissection" : $doc_OID;
206
207 if (defined $self->{'list'}->{$full_doc_OID})
208 {
209 print $outhandle "WARNING: AZCompactList::classify called multiple times for $full_doc_OID\n";
210 }
211 $self->{'list'}->{$full_doc_OID} = [];
212 $self->{'listmetavalue'}->{$full_doc_OID} = [];
213
214 my $metavalues = $doc_obj->get_metadata($thissection,$metaname);
215 my $metavalue;
216 foreach $metavalue (@$metavalues)
217 {
218### print STDERR "$metaname :\tmetavalue = $metavalue\n";
219 # if this document doesn't contain the metadata element we're
220 # sorting by we won't include it in this classification
221 if (defined $metavalue && $metavalue =~ /\w/)
222 {
223 my $formatted_metavalue = $metavalue;
224 if ($self->{'metaname'} =~ m/^Creator(:.*)?$/)
225 {
226 &sorttools::format_string_name_english (\$formatted_metavalue);
227 }
228 else
229 {
230 &sorttools::format_string_english (\$formatted_metavalue);
231 }
232
233 push(@{$self->{'list'}->{$full_doc_OID}},$formatted_metavalue);
234 push(@{$self->{'listmetavalue'}->{$full_doc_OID}} ,$metavalue);
235
236 last if ($self->{'onlyfirst'});
237 }
238 }
239 my $date = $doc_obj->get_metadata_element($thissection,"Date");
240 $self->{'reclassify'}->{$full_doc_OID} = [$doc_obj,$date];
241 }
242}
243
244sub reinit
245{
246 my ($self,$classlist_ref) = @_;
247
248 my %mtfreq = ();
249 my @single_classlist = ();
250 my @multiple_classlist = ();
251
252 # find out how often each metavalue occurs
253 map
254 {
255 my $mv;
256 foreach $mv (@{$self->{'listmetavalue'}->{$_}} )
257 {
258 $mtfreq{$mv}++;
259 }
260 } @$classlist_ref;
261
262 # use this information to split the list: single metavalue/repeated value
263 map
264 {
265 my $i = 1;
266 my $metavalue;
267 foreach $metavalue (@{$self->{'listmetavalue'}->{$_}})
268 {
269 if ($mtfreq{$metavalue} >= $self->{'mingroup'})
270 {
271 push(@multiple_classlist,[$_,$i,$metavalue]);
272 }
273 else
274 {
275 push(@single_classlist,[$_,$metavalue]);
276 $metavalue =~ tr/[A-Z]/[a-z]/;
277 $self->{'reclassifylist'}->{"Metavalue_$i.$_"} = $metavalue;
278 }
279 $i++;
280 }
281 } @$classlist_ref;
282
283
284 # Setup sub-classifiers for multiple list
285
286 $self->{'classifiers'} = {};
287
288 my $pm;
289 foreach $pm ("List", "SectionList")
290 {
291 my $listname
292 = &util::filename_cat($ENV{'GSDLHOME'},"perllib/classify/$pm.pm");
293 if (-e $listname) { require $listname; }
294 else
295 {
296 my $outhandle = $self->{'outhandle'};
297 print $outhandle "AZCompactList ERROR - couldn't find classifier \"$listname\"\n";
298 die "\n";
299 }
300 }
301
302 # Create classifiers objects for each entry >= mingroup
303 my $metavalue;
304 foreach $metavalue (keys %mtfreq)
305 {
306 if ($mtfreq{$metavalue} >= $self->{'mingroup'})
307 {
308 my $listclassobj;
309 my $doclevel = $self->{'doclevel'};
310 my $metaname = $self->{'metaname'};
311 my @metaname_list = split('/',$metaname);
312 $metaname = shift(@metaname_list);
313 if (@metaname_list==0)
314 {
315 my @args;
316 push @args, ("-metadata", "$metaname");
317# buttonname is also used for the node's title
318 push @args, ("-buttonname", "$metavalue");
319 push @args, ("-sort", "Date");
320
321 if ($doclevel =~ m/^top(level)?/i)
322 {
323 eval ("\$listclassobj = new List(\@args)"); warn $@ if $@;
324 }
325 else
326 {
327 eval ("\$listclassobj = new SectionList($args)");
328 }
329 }
330 else
331 {
332 $metaname = join('/',@metaname_list);
333
334 my @args;
335 push @args, ("-metadata", "$metaname");
336# buttonname is also used for the node's title
337 push @args, ("-buttonname", "$metavalue");
338 push @args, ("-doclevel", "$doclevel");
339 push @args, "-recopt";
340
341 eval ("\$listclassobj = new AZCompactList($args)");
342 }
343 if ($@) {
344 my $outhandle = $self->{'outhandle'};
345 print $outhandle "$@";
346 die "\n";
347 }
348
349 $listclassobj->init();
350
351 if (defined $metavalue && $metavalue =~ /\w/)
352 {
353 my $formatted_node = $metavalue;
354 if ($self->{'metaname'} =~ m/^Creator(:.*)?$/)
355 {
356 &sorttools::format_string_name_english(\$formatted_node);
357 }
358 else
359 {
360 &sorttools::format_string_english(\$formatted_node);
361 }
362
363 $self->{'classifiers'}->{$metavalue}
364 = { 'classifyobj' => $listclassobj,
365 'formattednode' => $formatted_node };
366 }
367 }
368 }
369
370
371 return (\@single_classlist,\@multiple_classlist);
372}
373
374
375sub reclassify
376{
377 my ($self,$multiple_cl_ref) = @_;
378
379 # Entries in the current classify list that are "book nodes"
380 # should be recursively classified.
381 #--
382 foreach $dm_pair (@$multiple_cl_ref)
383 {
384 my ($doc_OID,$mdoffset,$metavalue) = @$dm_pair;
385 my $listclassobj;
386
387 # find metavalue in list of sub-classifiers
388 my $found = 0;
389 my $node_name;
390 foreach $node_name (keys %{$self->{'classifiers'}})
391 {
392 $resafe_node_name = $node_name;
393 $resafe_node_name =~ s/(\(|\)|\[|\]|\{|\}|\^|\$|\.|\+|\*|\?|\|)/\\$1/g;
394 if ($metavalue =~ m/^$resafe_node_name$/i)
395 {
396 my ($doc_obj,$date) = @{$self->{'reclassify'}->{$doc_OID}};
397
398 ## date appears to not be used in classifier call ####
399
400 if ($doc_OID =~ m/^.*\.(\d+)$/)
401 {
402 $self->{'classifiers'}->{$node_name}->{'classifyobj'}
403 ->classify($doc_obj, "Section=$1");
404 }
405 else
406 {
407 $self->{'classifiers'}->{$node_name}->{'classifyobj'}
408 ->classify($doc_obj);
409 }
410
411 $found = 1;
412 last;
413 }
414 }
415
416 if (!$found)
417 {
418 print $outhandle "Warning: AZCompactList::reclassify ";
419 print $outhandle "could not find sub-node for $metavalue with doc_OID $doc_OID\n";
420 }
421 }
422}
423
424
425
426sub get_reclassify_info
427{
428 my $self = shift (@_);
429
430 my $node_name;
431 foreach $node_name (keys %{$self->{'classifiers'}})
432 {
433 my $classifyinfo
434 = $self->{'classifiers'}->{$node_name}->{'classifyobj'}
435 ->get_classify_info();
436 $self->{'classifiers'}->{$node_name}->{'classifyinfo'}
437 = $classifyinfo;
438 $self->{'reclassifylist'}->{"CLASSIFY.$node_name"}
439 = $self->{'classifiers'}->{$node_name}->{'formattednode'};
440 }
441}
442
443
444sub alpha_numeric_cmp
445{
446 my ($self,$a,$b) = @_;
447
448 my $title_a = $self->{'reclassifylist'}->{$a};
449 my $title_b = $self->{'reclassifylist'}->{$b};
450
451 if ($title_a =~ m/^(\d+(\.\d+)?)/)
452 {
453 my $val_a = $1;
454 if ($title_b =~ m/^(\d+(\.\d+)?)/)
455 {
456 my $val_b = $1;
457 if ($val_a != $val_b)
458 {
459 return ($val_a <=> $val_b);
460 }
461 }
462 }
463
464 return ($title_a cmp $title_b);
465}
466
467sub get_classify_info {
468 my $self = shift (@_);
469
470 my @classlist =keys %{$self->{'list'}}; # list all doc oids
471
472 my ($single_cl_ref,$multiple_cl_ref) = $self->reinit(\@classlist);
473 $self->reclassify($multiple_cl_ref);
474 $self->get_reclassify_info();
475
476
477# my @reclassified_classlist
478# = sort {$self->{'reclassifylist'}->{$a} cmp $self->{'reclassifylist'}->{$b};} keys %{$self->{'reclassifylist'}};
479
480 # alpha_numeric_cmp is slower but handles numbers better ...
481 my @reclassified_classlist
482 = sort { $self->alpha_numeric_cmp($a,$b) } keys %{$self->{'reclassifylist'}};
483
484
485 return $self->splitlist (\@reclassified_classlist);
486}
487
488sub get_entry {
489 my $self = shift (@_);
490 my ($title, $childtype, $metaname, $thistype) = @_;
491
492 # organise into classification structure
493 my %classifyinfo = ('childtype'=>$childtype,
494 'Title'=>$title,
495 'contains'=>[],
496 'mdtype'=>$metaname);
497
498 $classifyinfo{'thistype'} = $thistype
499 if defined $thistype && $thistype =~ /\w/;
500
501 return \%classifyinfo;
502}
503
504
505
506# splitlist takes an ordered list of classifications (@$classlistref) and
507# splits it up into alphabetical sub-sections.
508sub splitlist {
509 my $self = shift (@_);
510 my ($classlistref) = @_;
511 my $classhash = {};
512
513 # top level
514 my @metanames = split("/",$self->{'metaname'});
515 my $metaname = shift(@metanames);
516
517 my $childtype = "HList";
518 $childtype = "VList" if (scalar (@$classlistref) <= $self->{'minnesting'});
519
520 my $classifyinfo;
521 if (!defined($self->{'recopt'}))
522 {
523 my $title = $self->{'title'}; # should always be defined by now....
524 $title = $metaname unless defined $title;
525 $classifyinfo
526 = $self->get_entry ($title, $childtype, $metaname, "Invisible");
527 }
528 else
529 {
530 my $title = $self->{'title'};
531 $classifyinfo
532 = $self->get_entry ($title, $childtype, $metaname, "VList");
533 }
534
535 # don't need to do any splitting if there are less than 'minnesting' classifications
536 if ((scalar @$classlistref) <= $self->{'minnesting'}) {
537 foreach $subOID (@$classlistref) {
538 if ($subOID =~ /^CLASSIFY\.(.*)$/
539 && defined $self->{'classifiers'}->{$1})
540 {
541### print STDERR "*** subOID = $subOID\n";
542
543 push (@{$classifyinfo->{'contains'}},
544 $self->{'classifiers'}->{$1}->{'classifyinfo'});
545 }
546 else
547 {
548 $subOID =~ s/^Metavalue_(\d+)\.//;
549 my $metaname_offset = $1 -1;
550 my $oid_rec = {'OID'=>$subOID, 'offset'=>$metaname_offset};
551 push (@{$classifyinfo->{'contains'}}, $oid_rec);
552 }
553 }
554 return $classifyinfo;
555 }
556
557 # first split up the list into separate A-Z and 0-9 classifications
558 foreach $classification (@$classlistref) {
559 my $title = $self->{'reclassifylist'}->{$classification};
560 $title =~ s/&(.){2,4};//g; # remove any HTML special chars
561### $title =~ s/^\s+//g; # remove a leading spaces
562### $title =~ s/^_+//g; # remove a leading underscores
563 $title =~ s/^\W+//g;
564### $title =~ s/^(\'|\`|\"|\:|\()//g; # remove any opening punctutation
565 $title =~ s/^(.).*$/$1/;
566 $title =~ tr/[a-z]/[A-Z]/;
567
568 if ($title =~ /^[0-9]$/) {$title = '0-9';}
569 elsif ($title !~ /^[A-Z]$/) {
570 my $outhandle = $self->{'outhandle'};
571 print $outhandle "AZCompactList: WARNING $classification has badly formatted title ($title)\n";
572 }
573 $classhash->{$title} = [] unless defined $classhash->{$title};
574 push (@{$classhash->{$title}}, $classification);
575 }
576 $classhash = $self->compactlist ($classhash);
577
578 my @tmparr = ();
579 foreach $subsection (sort keys (%$classhash)) {
580 push (@tmparr, $subsection);
581 }
582
583 # if there's a 0-9 section it will have been sorted to the beginning
584 # but we want it at the end
585 if ($tmparr[0] eq '0-9') {
586 shift @tmparr;
587 push (@tmparr, '0-9');
588 }
589
590 foreach $subclass (@tmparr)
591 {
592 my $tempclassify = $self->get_entry($subclass, "VList", $metaname);
593 foreach $subsubOID (@{$classhash->{$subclass}})
594 {
595 if ($subsubOID =~ /^CLASSIFY\.(.*)$/
596 && defined $self->{'classifiers'}->{$1})
597 {
598 push (@{$tempclassify->{'contains'}},
599 $self->{'classifiers'}->{$1}->{'classifyinfo'});
600 }
601 else
602 {
603 $subsubOID =~ s/^Metavalue_(\d+)\.//;
604 my $metaname_offset = $1 -1;
605 my $oid_rec = {'OID'=>$subsubOID, 'offset'=>$metaname_offset};
606 push (@{$tempclassify->{'contains'}}, $oid_rec);
607 }
608 }
609 push (@{$classifyinfo->{'contains'}}, $tempclassify);
610 }
611
612 return $classifyinfo;
613}
614
615sub compactlist {
616 my $self = shift (@_);
617 my ($classhashref) = @_;
618 my $compactedhash = {};
619 my @currentOIDs = ();
620 my $currentfirstletter = "";
621 my $currentlastletter = "";
622 my $lastkey = "";
623
624 # minimum and maximum documents to be displayed per page.
625 # the actual maximum will be max + (min-1).
626 # the smallest sub-section is a single letter at present
627 # so in this case there may be many times max documents
628 # displayed on a page.
629 my $min = $self->{'mincompact'};
630 my $max = $self->{'maxcompact'};
631
632 foreach $subsection (sort keys %$classhashref) {
633 if ($subsection eq '0-9') {
634 @{$compactedhash->{$subsection}} = @{$classhashref->{$subsection}};
635 next;
636 }
637 $currentfirstletter = $subsection if $currentfirstletter eq "";
638 if ((scalar (@currentOIDs) < $min) ||
639 ((scalar (@currentOIDs) + scalar (@{$classhashref->{$subsection}})) <= $max)) {
640 push (@currentOIDs, @{$classhashref->{$subsection}});
641 $currentlastletter = $subsection;
642 } else {
643
644 if ($currentfirstletter eq $currentlastletter) {
645 @{$compactedhash->{$currentfirstletter}} = @currentOIDs;
646 $lastkey = $currentfirstletter;
647 } else {
648 @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} = @currentOIDs;
649 $lastkey = "$currentfirstletter-$currentlastletter";
650 }
651 if (scalar (@{$classhashref->{$subsection}}) >= $max) {
652 $compactedhash->{$subsection} = $classhashref->{$subsection};
653 @currentOIDs = ();
654 $currentfirstletter = "";
655 } else {
656 @currentOIDs = @{$classhashref->{$subsection}};
657 $currentfirstletter = $subsection;
658 $currentlastletter = $subsection;
659 }
660 }
661 }
662
663 # add final OIDs to last sub-classification if there aren't many otherwise
664 # add final sub-classification
665 if (scalar (@currentOIDs) < $min) {
666 my ($newkey) = $lastkey =~ /^(.)/;
667 @currentOIDs = (@{$compactedhash->{$lastkey}}, @currentOIDs);
668 delete $compactedhash->{$lastkey};
669 @{$compactedhash->{"$newkey-$currentlastletter"}} = @currentOIDs;
670 } else {
671 if ($currentfirstletter eq $currentlastletter) {
672 @{$compactedhash->{$currentfirstletter}} = @currentOIDs;
673 } else {
674 @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} = @currentOIDs;
675 }
676 }
677
678 return $compactedhash;
679}
680
6811;
682
683
Note: See TracBrowser for help on using the repository browser.