source: trunk/gsdl/perllib/classify/AZCompactList.pm@ 2956

Last change on this file since 2956 was 2956, checked in by jrm21, 22 years ago

Added Don Gourley's changes for getting Sections to work properly.

  • Property svn:keywords set to Author Date Id Revision
File size: 19.0 KB
Line 
1###########################################################################
2#
3# AZCompactList.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# classifier plugin for sorting alphabetically
27# options are:
28#
29# metadata=Metaname -- all documents with Metaname metadata
30# will be included in list, list will be sorted
31# by this element.
32# buttonname=Title -- (optional) the title field for this classification.
33# if not included title field will be Metaname.
34# mingroup=Num -- (optional) the smallest value that will cause
35# a group in the hierarchy to form.
36# minnesting=Num -- (optional) the smallest value that will cause a
37# list to converted into nested list
38# mincompact=Num -- (optional) used in compact list
39# maxcompact=Num -- (optional) used in compact list
40# doclevel=top|section -- (optional) level to process document at.
41# onlyfirst -- (optional) control whether all or only first
42# metadata value used from array of metadata
43package AZCompactList;
44
45use BasClas;
46use sorttools;
47
48sub BEGIN {
49 @ISA = ('BasClas');
50}
51
52sub print_usage {
53 print STDERR "
54 usage: classify AZCompactList -metadata X [options]
55 options:
56 -metadata X (required) Metadata field used for classification
57 -buttonname X Title to use on web pages (defaults to metadata)
58 -removeprefix regex pattern to remove from metadata before sorting
59 -doclevel top|section (Defaults to top)
60 -mingroup N Minimum num of documents required to form a new group
61 -minnesting N Minimum list size to become a nested list
62 -mincompact N Used in compact list
63 -maxcompact N Used in compact list
64 -onlyfirst Only use the first value if metadata is repeated.
65 -recopt
66";
67}
68
69sub new {
70 my $class = shift (@_);
71 my $self = new BasClas($class, @_);
72
73 my ($metaname, $title, $removeprefix);
74 my $mingroup = 2;
75 my $minnesting = 20;
76 my $mincompact = 10;
77 my $maxcompact = 30;
78 my $doclevel = "top";
79 my $onlyfirst = 0;
80 my $recopt = undef;
81
82 if (!parsargv::parse(\@_,
83 q^metadata/.*/^, \$metaname,
84 q^buttonname/.*/^, \$title,
85 q^removeprefix/.*/^, \$removeprefix,
86 q^mingroup/.*/2^, \$mingroup,
87 q^minnesting/.*/20^, \$minnesting,
88 q^mincompact/.*/10^, \$mincompact,
89 q^maxcompact/.*/30^, \$maxcompact,
90 q^doclevel/.*/top^, \$doclevel,
91 q^onlyfirst/.*/0^, \$onlyfirst,
92 q^recopt/.*/-1^, \$recopt,
93
94 "allow_extra_options")) {
95
96 print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
97 &print_usage();
98 die "\n";
99 }
100
101 if (!defined $metaname) {
102 my $outhandle = $self->{'outhandle'};
103 print $outhandle "AZCompactList used with no metadata name to classify by\n";
104 die "\n";
105 }
106
107 $title = $metaname unless ($title);
108
109 $self->{'list'} = {};
110 $self->{'listmetavalue'} = {};
111 $self->{'reclassify'} = {};
112 $self->{'reclassifylist'} = {};
113 $self->{'metaname'} = $metaname;
114 $self->{'title'} = "$title"; # title for the titlebar.
115 if (defined($removeprefix) && $removeprefix) {
116 $self->{'removeprefix'} = $removeprefix;
117 }
118 $self->{'mingroup'} = $mingroup;
119 $self->{'minnesting'} = $minnesting;
120 $self->{'mincompact'} = $mincompact;
121 $self->{'maxcompact'} = $maxcompact;
122 $self->{'doclevel'} = $doclevel;
123
124 if ($onlyfirst != 0) {
125 $onlyfirst = 1;
126 }
127 $self->{'onlyfirst'} = $onlyfirst;
128
129 if ($recopt == -1) {
130 $recopt = undef;
131 } else {
132 $recopt = "on";
133 }
134 $self->{'recopt'} = $recopt;
135
136 return bless $self, $class;
137}
138
139sub init
140{
141 my $self = shift (@_);
142
143 $self->{'list'} = {};
144 $self->{'listmetavalue'} = {};
145 $self->{'reclassify'} = {};
146 $self->{'reclassifylist'} = {};
147}
148
149$tmp = 0;
150
151sub classify
152{
153 my $self = shift (@_);
154 my ($doc_obj) = @_;
155
156 my $doc_OID = $doc_obj->get_OID();
157
158 my @sectionlist = ();
159 my $topsection = $doc_obj->get_top_section();
160
161 my $metaname = $self->{'metaname'};
162 my $outhandle = $self->{'outhandle'};
163
164 $metaname =~ s/(\/.*)//; # grab first name in n1/n2/n3 list
165
166 if ($self->{'doclevel'} =~ /^top(level)?/i)
167 {
168 push(@sectionlist,$topsection);
169 }
170 else
171 {
172 my $thissection = $doc_obj->get_next_section($topsection);
173 while (defined $thissection)
174 {
175 push(@sectionlist,$thissection);
176 $thissection = $doc_obj->get_next_section ($thissection);
177 }
178 }
179
180 my $thissection;
181 foreach $thissection (@sectionlist)
182 {
183 my $full_doc_OID
184 = ($thissection ne "") ? "$doc_OID.$thissection" : $doc_OID;
185
186 if (defined $self->{'list'}->{$full_doc_OID})
187 {
188 print $outhandle "WARNING: AZCompactList::classify called multiple times for $full_doc_OID\n";
189 }
190 $self->{'list'}->{$full_doc_OID} = [];
191 $self->{'listmetavalue'}->{$full_doc_OID} = [];
192
193 my $metavalues = $doc_obj->get_metadata($thissection,$metaname);
194 my $metavalue;
195 foreach $metavalue (@$metavalues)
196 {
197 # if this document doesn't contain the metadata element we're
198 # sorting by we won't include it in this classification
199 if (defined $metavalue && $metavalue =~ /\w/)
200 {
201 if ($self->{'removeprefix'}) {
202 $metavalue =~ s/^$self->{'removeprefix'}//;
203 }
204
205 my $formatted_metavalue = $metavalue;
206
207 if ($self->{'metaname'} =~ m/^Creator(:.*)?$/)
208 {
209 &sorttools::format_string_name_english (\$formatted_metavalue);
210 }
211 else
212 {
213 &sorttools::format_string_english (\$formatted_metavalue);
214 }
215
216 #### prefix-str
217 if (! defined($formatted_metavalue)) {
218 print $outhandle "Warning: AZCompactList: metavalue is ";
219 print $outhandle "empty\n";
220 $formatted_metavalue="";
221 }
222
223 push(@{$self->{'list'}->{$full_doc_OID}},$formatted_metavalue);
224 push(@{$self->{'listmetavalue'}->{$full_doc_OID}} ,$metavalue);
225
226 last if ($self->{'onlyfirst'});
227 }
228 }
229 my $date = $doc_obj->get_metadata_element($thissection,"Date");
230 $self->{'reclassify'}->{$full_doc_OID} = [$doc_obj,$date];
231 }
232}
233
234sub reinit
235{
236 my ($self,$classlist_ref) = @_;
237 my $outhandle = $self->{'outhandle'};
238
239 my %mtfreq = ();
240 my @single_classlist = ();
241 my @multiple_classlist = ();
242
243 # find out how often each metavalue occurs
244 map
245 {
246 my $mv;
247 foreach $mv (@{$self->{'listmetavalue'}->{$_}} )
248 {
249 $mtfreq{$mv}++;
250 }
251 } @$classlist_ref;
252
253 # use this information to split the list: single metavalue/repeated value
254 map
255 {
256 my $i = 1;
257 my $metavalue;
258 foreach $metavalue (@{$self->{'listmetavalue'}->{$_}})
259 {
260 if ($mtfreq{$metavalue} >= $self->{'mingroup'})
261 {
262 push(@multiple_classlist,[$_,$i,$metavalue]);
263 }
264 else
265 {
266 push(@single_classlist,[$_,$metavalue]);
267 $metavalue =~ tr/[A-Z]/[a-z]/;
268 $self->{'reclassifylist'}->{"Metavalue_$i.$_"} = $metavalue;
269 }
270 $i++;
271 }
272 } @$classlist_ref;
273
274
275 # Setup sub-classifiers for multiple list
276
277 $self->{'classifiers'} = {};
278
279 my $pm;
280 foreach $pm ("List", "SectionList")
281 {
282 my $listname
283 = &util::filename_cat($ENV{'GSDLHOME'},"perllib/classify/$pm.pm");
284 if (-e $listname) { require $listname; }
285 else
286 {
287 print $outhandle "AZCompactList ERROR - couldn't find classifier \"$listname\"\n";
288 die "\n";
289 }
290 }
291
292 # Create classifiers objects for each entry >= mingroup
293 my $metavalue;
294 foreach $metavalue (keys %mtfreq)
295 {
296 if ($mtfreq{$metavalue} >= $self->{'mingroup'})
297 {
298 my $listclassobj;
299 my $doclevel = $self->{'doclevel'};
300 my $metaname = $self->{'metaname'};
301 my @metaname_list = split('/',$metaname);
302 $metaname = shift(@metaname_list);
303 if (@metaname_list==0)
304 {
305 my @args;
306 push @args, ("-metadata", "$metaname");
307# buttonname is also used for the node's title
308 push @args, ("-buttonname", "$metavalue");
309 push @args, ("-sort", "Date");
310
311 if ($doclevel =~ m/^top(level)?/i)
312 {
313 eval ("\$listclassobj = new List(\@args)"); warn $@ if $@;
314 }
315 else
316 {
317 eval ("\$listclassobj = new SectionList(\@args)");
318 }
319 }
320 else
321 {
322 $metaname = join('/',@metaname_list);
323
324 my @args;
325 push @args, ("-metadata", "$metaname");
326# buttonname is also used for the node's title
327 push @args, ("-buttonname", "$metavalue");
328 push @args, ("-doclevel", "$doclevel");
329 push @args, "-recopt";
330
331 eval ("\$listclassobj = new AZCompactList(\@args)");
332 }
333 if ($@) {
334 print $outhandle "$@";
335 die "\n";
336 }
337
338 $listclassobj->init();
339
340 if (defined $metavalue && $metavalue =~ /\w/)
341 {
342 my $formatted_node = $metavalue;
343
344 if ($self->{'removeprefix'}) {
345 $formatted_node =~ s/^$self->{'removeprefix'}//;
346 }
347
348 if ($self->{'metaname'} =~ m/^Creator(:.*)?$/)
349 {
350 &sorttools::format_string_name_english(\$formatted_node);
351 }
352 else
353 {
354 &sorttools::format_string_english(\$formatted_node);
355 }
356
357 # In case our formatted string is empty...
358 if (! defined($formatted_node)) {
359 print $outhandle "Warning: AZCompactList: metavalue is ";
360 print $outhandle "empty\n";
361 $formatted_node="";
362 }
363
364 $self->{'classifiers'}->{$metavalue}
365 = { 'classifyobj' => $listclassobj,
366 'formattednode' => $formatted_node };
367 }
368 }
369 }
370
371
372 return (\@single_classlist,\@multiple_classlist);
373}
374
375
376sub reclassify
377{
378 my ($self,$multiple_cl_ref) = @_;
379
380 # Entries in the current classify list that are "book nodes"
381 # should be recursively classified.
382 #--
383 foreach $dm_pair (@$multiple_cl_ref)
384 {
385 my ($doc_OID,$mdoffset,$metavalue) = @$dm_pair;
386 my $listclassobj;
387
388 # find metavalue in list of sub-classifiers
389 my $found = 0;
390 my $node_name;
391 foreach $node_name (keys %{$self->{'classifiers'}})
392 {
393 $resafe_node_name = $node_name;
394 $resafe_node_name =~ s/(\(|\)|\[|\]|\{|\}|\^|\$|\.|\+|\*|\?|\|)/\\$1/g;
395 if ($metavalue =~ m/^$resafe_node_name$/i)
396 {
397 my ($doc_obj,$date) = @{$self->{'reclassify'}->{$doc_OID}};
398
399 ## date appears to not be used in classifier call ####
400
401 if ($doc_OID =~ m/^[^\.]*\.([\d\.]+)$/)
402 {
403 my $section=$1;
404 if ($self->{'doclevel'} =~ m/^top/i) { # toplevel
405 $self->{'classifiers'}->{$node_name}->{'classifyobj'}
406 ->classify($doc_obj, "Section=$section");
407 } else { # section level
408 # Thanks to Don Gourley for this...
409 # classify can't handle multi-level section
410 $self->{'classifiers'}->{$node_name}->{'classifyobj'}
411 ->classify_section($section, $doc_obj, $date);
412 }
413 }
414 else
415 {
416 $self->{'classifiers'}->{$node_name}->{'classifyobj'}
417 ->classify($doc_obj);
418 }
419
420 $found = 1;
421 last;
422 }
423 }
424
425 if (!$found)
426 {
427 my $outhandle=$self->{outhandle};
428 print $outhandle "Warning: AZCompactList::reclassify ";
429 print $outhandle "could not find sub-node for metadata=`$metavalue' with doc_OID $doc_OID\n";
430 }
431 }
432}
433
434
435
436sub get_reclassify_info
437{
438 my $self = shift (@_);
439
440 my $node_name;
441 foreach $node_name (keys %{$self->{'classifiers'}})
442 {
443 my $classifyinfo
444 = $self->{'classifiers'}->{$node_name}->{'classifyobj'}
445 ->get_classify_info();
446 $self->{'classifiers'}->{$node_name}->{'classifyinfo'}
447 = $classifyinfo;
448 $self->{'reclassifylist'}->{"CLASSIFY.$node_name"}
449 = $self->{'classifiers'}->{$node_name}->{'formattednode'};
450 }
451}
452
453
454sub alpha_numeric_cmp
455{
456 my ($self,$a,$b) = @_;
457
458 my $title_a = $self->{'reclassifylist'}->{$a};
459 my $title_b = $self->{'reclassifylist'}->{$b};
460
461 if ($title_a =~ m/^(\d+(\.\d+)?)/)
462 {
463 my $val_a = $1;
464 if ($title_b =~ m/^(\d+(\.\d+)?)/)
465 {
466 my $val_b = $1;
467 if ($val_a != $val_b)
468 {
469 return ($val_a <=> $val_b);
470 }
471 }
472 }
473
474 return ($title_a cmp $title_b);
475}
476
477sub get_classify_info {
478 my $self = shift (@_);
479
480 my @classlist =keys %{$self->{'list'}}; # list all doc oids
481
482 my ($single_cl_ref,$multiple_cl_ref) = $self->reinit(\@classlist);
483 $self->reclassify($multiple_cl_ref);
484 $self->get_reclassify_info();
485
486
487# my @reclassified_classlist
488# = sort {$self->{'reclassifylist'}->{$a} cmp $self->{'reclassifylist'}->{$b};} keys %{$self->{'reclassifylist'}};
489
490 # alpha_numeric_cmp is slower but handles numbers better ...
491 my @reclassified_classlist
492 = sort { $self->alpha_numeric_cmp($a,$b) } keys %{$self->{'reclassifylist'}};
493
494
495 return $self->splitlist (\@reclassified_classlist);
496}
497
498sub get_entry {
499 my $self = shift (@_);
500 my ($title, $childtype, $metaname, $thistype) = @_;
501
502 # organise into classification structure
503 my %classifyinfo = ('childtype'=>$childtype,
504 'Title'=>$title,
505 'contains'=>[],
506 'mdtype'=>$metaname);
507
508 $classifyinfo{'thistype'} = $thistype
509 if defined $thistype && $thistype =~ /\w/;
510
511 return \%classifyinfo;
512}
513
514
515
516# splitlist takes an ordered list of classifications (@$classlistref) and
517# splits it up into alphabetical sub-sections.
518sub splitlist {
519 my $self = shift (@_);
520 my ($classlistref) = @_;
521 my $classhash = {};
522
523 # top level
524 my @metanames = split("/",$self->{'metaname'});
525 my $metaname = shift(@metanames);
526
527 my $childtype = "HList";
528 $childtype = "VList" if (scalar (@$classlistref) <= $self->{'minnesting'});
529
530 my $classifyinfo;
531 if (!defined($self->{'recopt'}))
532 {
533 my $title = $self->{'title'}; # should always be defined by now....
534 $title = $metaname unless defined $title;
535 $classifyinfo
536 = $self->get_entry ($title, $childtype, $metaname, "Invisible");
537 }
538 else
539 {
540 my $title = $self->{'title'};
541 $classifyinfo
542 = $self->get_entry ($title, $childtype, $metaname, "VList");
543 }
544
545 # don't need to do any splitting if there are less than 'minnesting' classifications
546 if ((scalar @$classlistref) <= $self->{'minnesting'}) {
547 foreach $subOID (@$classlistref) {
548 if ($subOID =~ /^CLASSIFY\.(.*)$/
549 && defined $self->{'classifiers'}->{$1})
550 {
551### print STDERR "*** subOID = $subOID\n";
552
553 push (@{$classifyinfo->{'contains'}},
554 $self->{'classifiers'}->{$1}->{'classifyinfo'});
555 }
556 else
557 {
558 $subOID =~ s/^Metavalue_(\d+)\.//;
559 my $metaname_offset = $1 -1;
560 my $oid_rec = {'OID'=>$subOID, 'offset'=>$metaname_offset};
561 push (@{$classifyinfo->{'contains'}}, $oid_rec);
562 }
563 }
564 return $classifyinfo;
565 }
566
567 # first split up the list into separate A-Z and 0-9 classifications
568 foreach $classification (@$classlistref) {
569 my $title = $self->{'reclassifylist'}->{$classification};
570 $title =~ s/&(.){2,4};//g; # remove any HTML special chars
571### $title =~ s/^\s+//g; # remove a leading spaces
572### $title =~ s/^_+//g; # remove a leading underscores
573 $title =~ s/^\W+//g;
574### $title =~ s/^(\'|\`|\"|\:|\()//g; # remove any opening punctutation
575 $title =~ s/^(.).*$/$1/;
576 $title =~ tr/[a-z]/[A-Z]/;
577
578 if ($title =~ /^[0-9]$/) {$title = '0-9';}
579 elsif ($title !~ /^[A-Z]$/) {
580 my $outhandle = $self->{'outhandle'};
581 print $outhandle "AZCompactList: WARNING $classification has badly formatted title ($title)\n";
582 }
583 $classhash->{$title} = [] unless defined $classhash->{$title};
584 push (@{$classhash->{$title}}, $classification);
585 }
586 $classhash = $self->compactlist ($classhash);
587
588 my @tmparr = ();
589 foreach $subsection (sort keys (%$classhash)) {
590 push (@tmparr, $subsection);
591 }
592
593 # if there's a 0-9 section it will have been sorted to the beginning
594 # but we want it at the end
595 if ($tmparr[0] eq '0-9') {
596 shift @tmparr;
597 push (@tmparr, '0-9');
598 }
599
600 foreach $subclass (@tmparr)
601 {
602 my $tempclassify = $self->get_entry($subclass, "VList", $metaname);
603 foreach $subsubOID (@{$classhash->{$subclass}})
604 {
605 if ($subsubOID =~ /^CLASSIFY\.(.*)$/
606 && defined $self->{'classifiers'}->{$1})
607 {
608 push (@{$tempclassify->{'contains'}},
609 $self->{'classifiers'}->{$1}->{'classifyinfo'});
610 }
611 else
612 {
613 $subsubOID =~ s/^Metavalue_(\d+)\.//;
614 my $metaname_offset = $1 -1;
615 my $oid_rec = {'OID'=>$subsubOID, 'offset'=>$metaname_offset};
616 push (@{$tempclassify->{'contains'}}, $oid_rec);
617 }
618 }
619 push (@{$classifyinfo->{'contains'}}, $tempclassify);
620 }
621
622 return $classifyinfo;
623}
624
625sub compactlist {
626 my $self = shift (@_);
627 my ($classhashref) = @_;
628 my $compactedhash = {};
629 my @currentOIDs = ();
630 my $currentfirstletter = "";
631 my $currentlastletter = "";
632 my $lastkey = "";
633
634 # minimum and maximum documents to be displayed per page.
635 # the actual maximum will be max + (min-1).
636 # the smallest sub-section is a single letter at present
637 # so in this case there may be many times max documents
638 # displayed on a page.
639 my $min = $self->{'mincompact'};
640 my $max = $self->{'maxcompact'};
641
642 foreach $subsection (sort keys %$classhashref) {
643 if ($subsection eq '0-9') {
644 @{$compactedhash->{$subsection}} = @{$classhashref->{$subsection}};
645 next;
646 }
647 $currentfirstletter = $subsection if $currentfirstletter eq "";
648 if ((scalar (@currentOIDs) < $min) ||
649 ((scalar (@currentOIDs) + scalar (@{$classhashref->{$subsection}})) <= $max)) {
650 push (@currentOIDs, @{$classhashref->{$subsection}});
651 $currentlastletter = $subsection;
652 } else {
653
654 if ($currentfirstletter eq $currentlastletter) {
655 @{$compactedhash->{$currentfirstletter}} = @currentOIDs;
656 $lastkey = $currentfirstletter;
657 } else {
658 @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} = @currentOIDs;
659 $lastkey = "$currentfirstletter-$currentlastletter";
660 }
661 if (scalar (@{$classhashref->{$subsection}}) >= $max) {
662 $compactedhash->{$subsection} = $classhashref->{$subsection};
663 @currentOIDs = ();
664 $currentfirstletter = "";
665 } else {
666 @currentOIDs = @{$classhashref->{$subsection}};
667 $currentfirstletter = $subsection;
668 $currentlastletter = $subsection;
669 }
670 }
671 }
672
673 # add final OIDs to last sub-classification if there aren't many otherwise
674 # add final sub-classification
675 if (scalar (@currentOIDs) < $min) {
676 my ($newkey) = $lastkey =~ /^(.)/;
677 @currentOIDs = (@{$compactedhash->{$lastkey}}, @currentOIDs);
678 delete $compactedhash->{$lastkey};
679 @{$compactedhash->{"$newkey-$currentlastletter"}} = @currentOIDs;
680 } else {
681 if ($currentfirstletter eq $currentlastletter) {
682 @{$compactedhash->{$currentfirstletter}} = @currentOIDs;
683 } else {
684 @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} = @currentOIDs;
685 }
686 }
687
688 return $compactedhash;
689}
690
6911;
692
693
Note: See TracBrowser for help on using the repository browser.