source: trunk/gsdl/perllib/classify/AZCompactList.pm@ 1948

Last change on this file since 1948 was 1948, checked in by jrm21, 23 years ago

Updated to now pass arguments using the new parsargv list format, rather
than the old string format.

  • Property svn:keywords set to Author Date Id Revision
File size: 18.2 KB
Line 
1###########################################################################
2#
3# AZCompactList.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# classifier plugin for sorting alphabetically
27# options are:
28#
29# metadata=Metaname -- all documents with Metaname metadata
30# will be included in list, list will be sorted
31# by this element.
32# title=Title -- (optional) the title field for this classification.
33# if not included title field will be Metaname.
34# mingroup=Num -- (optional) the smallest value that will cause
35# a group in the hierarchy to form.
36# minnesting=Num -- (optional) the smallest value that will cause a
37# list to converted into nested list
38# mincompact=Num -- (optional) used in compact list
39# maxcompact=Num -- (optional) used in compact list
40# doclevel=top|section -- (optional) level to process document at.
41# onlyfirst -- (optional) control whether all or only first
42# metadata value used from array of metadata
43package AZCompactList;
44
45use BasClas;
46use sorttools;
47
48sub BEGIN {
49 @ISA = ('BasClas');
50}
51
52sub print_usage {
53 print STDERR "
54 usage: classify AZCompactList [options]
55 options:
56 -metadata Metadata field used for classification
57 -title Title to use on web pages
58
59 -mingroup
60 -minnesting
61 -mincompact
62 -maxcompact
63 -doclevel
64 -onlyfirst
65 -recopt
66";
67}
68
69sub new {
70 my $class = shift (@_);
71 my $self = new BasClas($class, @_);
72
73 my ($metaname, $title);
74 my $mingroup = 2;
75 my $minnesting = 20;
76 my $mincompact = 10;
77 my $maxcompact = 30;
78 my $doclevel = "top";
79 my $onlyfirst = 0;
80 my $recopt = undef;
81
82 if (!parsargv::parse(\@_,
83 q^metadata/.*/^, \$metaname,
84 q^title/.*/^, \$title,
85
86 q^mingroup/.*/2^, \$mingroup,
87 q^minnesting/.*/20^, \$minnesting,
88 q^mincompact/.*/10^, \$mincompact,
89 q^maxcompact/.*/30^, \$maxcompact,
90 q^doclevel/.*/top^, \$doclevel,
91 q^onlyfirst/.*/0^, \$onlyfirst,
92 q^recopt/.*/-1^, \$recopt,
93
94 "allow_extra_options")) {
95
96 print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
97 &print_usage();
98 die "\n";
99 }
100
101# These are the old-fashioned optins to AZCompactList.
102# They should be deleted once we're sure the parsearg version is working.
103# foreach $option (@options) {
104# if ($option =~ /^metadata=(.*)$/i) {
105# $metaname = $1;
106# } elsif ($option =~ /^title=(.*)$/i) {
107# $title = $1;
108# } elsif ($option =~ /^mingroup(size)?=(.*)$/i) {
109# $mingroup = $2;
110# } elsif ($option =~ /^minnesting=(.*)$/i) {
111# $minnesting = $1;
112# } elsif ($option =~ /^mincompact=(.*)$/i) {
113# $mincompact = $1;
114# } elsif ($option =~ /^maxcompact=(.*)$/i) {
115# $maxcompact = $1;
116# } elsif ($option =~ /^doclevel=(.*)$/i) {
117# $doclevel = $1;
118# } elsif ($option =~ /^onlyfirst$/i) {
119# $onlyfirst = 1;
120# } elsif ($option =~ /^recopt$/i) {
121# $recopt = "on";
122# }
123# }
124
125 if (!defined $metaname) {
126 my $outhandle = $self->{'outhandle'};
127 print $outhandle "AZCompactList used with no metadata name to classify by\n";
128 die "\n";
129 }
130
131 $title = $metaname unless ($title);
132
133 $self->{'list'} = {};
134 $self->{'listmetavalue'} = {};
135 $self->{'reclassify'} = {};
136 $self->{'reclassifylist'} = {};
137 $self->{'metaname'} = $metaname;
138 $self->{'title'} = $title;
139 $self->{'mingroup'} = $mingroup;
140 $self->{'minnesting'} = $minnesting;
141 $self->{'mincompact'} = $mincompact;
142 $self->{'maxcompact'} = $maxcompact;
143 $self->{'doclevel'} = $doclevel;
144
145 if ($onlyfirst != 0) {
146 $onlyfirst = 1;
147 }
148 $self->{'onlyfirst'} = $onlyfirst;
149
150 if ($recopt == -1) {
151 $recopt = undef;
152 } else {
153 $recopt = "on";
154 }
155 $self->{'recopt'} = $recopt;
156
157 return bless $self, $class;
158}
159
160sub init
161{
162 my $self = shift (@_);
163
164 $self->{'list'} = {};
165 $self->{'listmetavalue'} = {};
166 $self->{'reclassify'} = {};
167 $self->{'reclassifylist'} = {};
168}
169
170$tmp = 0;
171
172sub classify
173{
174 my $self = shift (@_);
175 my ($doc_obj) = @_;
176
177 my $doc_OID = $doc_obj->get_OID();
178
179 my @sectionlist = ();
180 my $topsection = $doc_obj->get_top_section();
181
182 my $metaname = $self->{'metaname'};
183 my $outhandle = $self->{'outhandle'};
184
185 $metaname =~ s/(\/.*)//; # grab first name in n1/n2/n3 list
186
187 if ($self->{'doclevel'} =~ /^top(level)?/i)
188 {
189 push(@sectionlist,$topsection);
190 }
191 else
192 {
193 my $thissection = $doc_obj->get_next_section($topsection);
194 while (defined $thissection)
195 {
196 push(@sectionlist,$thissection);
197 $thissection = $doc_obj->get_next_section ($thissection);
198 }
199 }
200
201 my $thissection;
202 foreach $thissection (@sectionlist)
203 {
204 my $full_doc_OID
205 = ($thissection ne "") ? "$doc_OID.$thissection" : $doc_OID;
206
207 if (defined $self->{'list'}->{$full_doc_OID})
208 {
209 print $outhandle "WARNING: AZCompactList::classify called multiple times for $full_doc_OID\n";
210 }
211 $self->{'list'}->{$full_doc_OID} = [];
212 $self->{'listmetavalue'}->{$full_doc_OID} = [];
213
214 my $metavalues = $doc_obj->get_metadata($thissection,$metaname);
215 my $metavalue;
216 foreach $metavalue (@$metavalues)
217 {
218### print STDERR "$metaname :\tmetavalue = $metavalue\n";
219 # if this document doesn't contain the metadata element we're
220 # sorting by we won't include it in this classification
221 if (defined $metavalue && $metavalue =~ /\w/)
222 {
223 my $formatted_metavalue = $metavalue;
224 if ($self->{'metaname'} =~ m/^Creator(:.*)?$/)
225 {
226 &sorttools::format_string_name_english (\$formatted_metavalue);
227 }
228 else
229 {
230 &sorttools::format_string_english (\$formatted_metavalue);
231 }
232
233 push(@{$self->{'list'}->{$full_doc_OID}},$formatted_metavalue);
234 push(@{$self->{'listmetavalue'}->{$full_doc_OID}} ,$metavalue);
235
236 last if ($self->{'onlyfirst'});
237 }
238 }
239 my $date = $doc_obj->get_metadata_element($thissection,"Date");
240 $self->{'reclassify'}->{$full_doc_OID} = [$doc_obj,$date];
241 }
242}
243
244sub reinit
245{
246 my ($self,$classlist_ref) = @_;
247
248 my %mtfreq = ();
249 my @single_classlist = ();
250 my @multiple_classlist = ();
251
252 # find out how often each metavalue occurs
253 map
254 {
255 my $mv;
256 foreach $mv (@{$self->{'listmetavalue'}->{$_}} )
257 {
258 $mtfreq{$mv}++;
259 }
260 } @$classlist_ref;
261
262 # use this information to split the list: single metavalue/repeated value
263 map
264 {
265 my $i = 1;
266 my $metavalue;
267 foreach $metavalue (@{$self->{'listmetavalue'}->{$_}})
268 {
269 if ($mtfreq{$metavalue} >= $self->{'mingroup'})
270 {
271 push(@multiple_classlist,[$_,$i,$metavalue]);
272 }
273 else
274 {
275 push(@single_classlist,[$_,$metavalue]);
276 $metavalue =~ tr/[A-Z]/[a-z]/;
277 $self->{'reclassifylist'}->{"Metavalue_$i.$_"} = $metavalue;
278 }
279 $i++;
280 }
281 } @$classlist_ref;
282
283
284 # Setup sub-classifiers for multiple list
285
286 $self->{'classifiers'} = {};
287
288 my $pm;
289 foreach $pm ("List", "SectionList")
290 {
291 my $listname
292 = &util::filename_cat($ENV{'GSDLHOME'},"perllib/classify/$pm.pm");
293 if (-e $listname) { require $listname; }
294 else
295 {
296 my $outhandle = $self->{'outhandle'};
297 print $outhandle "AZCompactList ERROR - couldn't find classifier \"$listname\"\n";
298 die "\n";
299 }
300 }
301
302 # Create classifiers objects for each entry >= mingroup
303 my $metavalue;
304 foreach $metavalue (keys %mtfreq)
305 {
306 if ($mtfreq{$metavalue} >= $self->{'mingroup'})
307 {
308 my $listclassobj;
309 my $doclevel = $self->{'doclevel'};
310 my $metaname = $self->{'metaname'};
311 my @metaname_list = split('/',$metaname);
312 $metaname = shift(@metaname_list);
313 if (@metaname_list==0)
314 {
315 my @args;
316 push @args, ("-metadata", "$metaname");
317 push @args, ("-title", "$metavalue");
318 push @args, ("-sort", "Date");
319
320 if ($doclevel =~ m/^top(level)?/i)
321 {
322 eval ("\$listclassobj = new List(\@args)"); warn $@ if $@;
323 }
324 else
325 {
326 eval ("\$listclassobj = new SectionList($args)");
327 }
328 }
329 else
330 {
331 $metaname = join('/',@metaname_list);
332
333 my @args;
334 push @args, ("-metadata", "$metaname");
335 push @args, ("-title", "$metavalue");
336 push @args, ("-doclevel", "$doclevel");
337 push @args, "-recopt";
338
339 eval ("\$listclassobj = new AZCompactList($args)");
340 }
341 if ($@) {
342 my $outhandle = $self->{'outhandle'};
343 print $outhandle "$@";
344 die "\n";
345 }
346
347 $listclassobj->init();
348
349 if (defined $metavalue && $metavalue =~ /\w/)
350 {
351 my $formatted_node = $metavalue;
352 if ($self->{'metaname'} =~ m/^Creator(:.*)?$/)
353 {
354 &sorttools::format_string_name_english(\$formatted_node);
355 }
356 else
357 {
358 &sorttools::format_string_english(\$formatted_node);
359 }
360
361 $self->{'classifiers'}->{$metavalue}
362 = { 'classifyobj' => $listclassobj,
363 'formattednode' => $formatted_node };
364 }
365 }
366 }
367
368
369 return (\@single_classlist,\@multiple_classlist);
370}
371
372
373sub reclassify
374{
375 my ($self,$multiple_cl_ref) = @_;
376
377 # Entries in the current classify list that are "book nodes"
378 # should be recursively classified.
379 #--
380 foreach $dm_pair (@$multiple_cl_ref)
381 {
382 my ($doc_OID,$mdoffset,$metavalue) = @$dm_pair;
383 my $listclassobj;
384
385 # find metavalue in list of sub-classifiers
386 my $found = 0;
387 my $node_name;
388 foreach $node_name (keys %{$self->{'classifiers'}})
389 {
390 $resafe_node_name = $node_name;
391 $resafe_node_name =~ s/(\(|\)|\[|\]|\{|\}|\^|\$|\.|\+|\*|\?|\|)/\\$1/g;
392 if ($metavalue =~ m/^$resafe_node_name$/i)
393 {
394 my ($doc_obj,$date) = @{$self->{'reclassify'}->{$doc_OID}};
395
396 ## date appears to not be used in classifier call ####
397
398 if ($doc_OID =~ m/^.*\.(\d+)$/)
399 {
400 $self->{'classifiers'}->{$node_name}->{'classifyobj'}
401 ->classify($doc_obj, "Section=$1");
402 }
403 else
404 {
405 $self->{'classifiers'}->{$node_name}->{'classifyobj'}
406 ->classify($doc_obj);
407 }
408
409 $found = 1;
410 last;
411 }
412 }
413
414 if (!$found)
415 {
416 print $outhandle "Warning: AZCompactList::reclassify ";
417 print $outhandle "could not find sub-node for $metavalue with doc_OID $doc_OID\n";
418 }
419 }
420}
421
422
423
424sub get_reclassify_info
425{
426 my $self = shift (@_);
427
428 my $node_name;
429 foreach $node_name (keys %{$self->{'classifiers'}})
430 {
431 my $classifyinfo
432 = $self->{'classifiers'}->{$node_name}->{'classifyobj'}
433 ->get_classify_info();
434 $self->{'classifiers'}->{$node_name}->{'classifyinfo'}
435 = $classifyinfo;
436 $self->{'reclassifylist'}->{"CLASSIFY.$node_name"}
437 = $self->{'classifiers'}->{$node_name}->{'formattednode'};
438 }
439}
440
441
442sub alpha_numeric_cmp
443{
444 my ($self,$a,$b) = @_;
445
446 my $title_a = $self->{'reclassifylist'}->{$a};
447 my $title_b = $self->{'reclassifylist'}->{$b};
448
449 if ($title_a =~ m/^(\d+(\.\d+)?)/)
450 {
451 my $val_a = $1;
452 if ($title_b =~ m/^(\d+(\.\d+)?)/)
453 {
454 my $val_b = $1;
455 if ($val_a != $val_b)
456 {
457 return ($val_a <=> $val_b);
458 }
459 }
460 }
461
462 return ($title_a cmp $title_b);
463}
464
465sub get_classify_info {
466 my $self = shift (@_);
467
468 my @classlist =keys %{$self->{'list'}}; # list all doc oids
469
470 my ($single_cl_ref,$multiple_cl_ref) = $self->reinit(\@classlist);
471 $self->reclassify($multiple_cl_ref);
472 $self->get_reclassify_info();
473
474
475# my @reclassified_classlist
476# = sort {$self->{'reclassifylist'}->{$a} cmp $self->{'reclassifylist'}->{$b};} keys %{$self->{'reclassifylist'}};
477
478 # alpha_numeric_cmp is slower but handles numbers better ...
479 my @reclassified_classlist
480 = sort { $self->alpha_numeric_cmp($a,$b) } keys %{$self->{'reclassifylist'}};
481
482
483 return $self->splitlist (\@reclassified_classlist);
484}
485
486sub get_entry {
487 my $self = shift (@_);
488 my ($title, $childtype, $metaname, $thistype) = @_;
489
490 # organise into classification structure
491 my %classifyinfo = ('childtype'=>$childtype,
492 'Title'=>$title,
493 'contains'=>[],
494 'mdtype'=>$metaname);
495
496 $classifyinfo{'thistype'} = $thistype
497 if defined $thistype && $thistype =~ /\w/;
498
499 return \%classifyinfo;
500}
501
502
503
504# splitlist takes an ordered list of classifications (@$classlistref) and
505# splits it up into alphabetical sub-sections.
506sub splitlist {
507 my $self = shift (@_);
508 my ($classlistref) = @_;
509 my $classhash = {};
510
511 # top level
512 my @metanames = split("/",$self->{'metaname'});
513 my $metaname = shift(@metanames);
514
515 my $childtype = "HList";
516 $childtype = "VList" if (scalar (@$classlistref) <= $self->{'minnesting'});
517
518 my $classifyinfo;
519 if (!defined($self->{'recopt'}))
520 {
521 my $title = $self->{'title'}; # should always be defined by now....
522 $title = $metaname unless defined $title;
523 $classifyinfo
524 = $self->get_entry ($title, $childtype, $metaname, "Invisible");
525 }
526 else
527 {
528 my $title = $self->{'title'};
529 $classifyinfo
530 = $self->get_entry ($title, $childtype, $metaname, "VList");
531 }
532
533 # don't need to do any splitting if there are less than 'minnesting' classifications
534 if ((scalar @$classlistref) <= $self->{'minnesting'}) {
535 foreach $subOID (@$classlistref) {
536 if ($subOID =~ /^CLASSIFY\.(.*)$/
537 && defined $self->{'classifiers'}->{$1})
538 {
539### print STDERR "*** subOID = $subOID\n";
540
541 push (@{$classifyinfo->{'contains'}},
542 $self->{'classifiers'}->{$1}->{'classifyinfo'});
543 }
544 else
545 {
546 $subOID =~ s/^Metavalue_(\d+)\.//;
547 my $metaname_offset = $1 -1;
548 my $oid_rec = {'OID'=>$subOID, 'offset'=>$metaname_offset};
549 push (@{$classifyinfo->{'contains'}}, $oid_rec);
550 }
551 }
552 return $classifyinfo;
553 }
554
555 # first split up the list into separate A-Z and 0-9 classifications
556 foreach $classification (@$classlistref) {
557 my $title = $self->{'reclassifylist'}->{$classification};
558 $title =~ s/&(.){2,4};//g; # remove any HTML special chars
559### $title =~ s/^\s+//g; # remove a leading spaces
560### $title =~ s/^_+//g; # remove a leading underscores
561 $title =~ s/^\W+//g;
562### $title =~ s/^(\'|\`|\"|\:|\()//g; # remove any opening punctutation
563 $title =~ s/^(.).*$/$1/;
564 $title =~ tr/[a-z]/[A-Z]/;
565
566 if ($title =~ /^[0-9]$/) {$title = '0-9';}
567 elsif ($title !~ /^[A-Z]$/) {
568 my $outhandle = $self->{'outhandle'};
569 print $outhandle "AZCompactList: WARNING $classification has badly formatted title ($title)\n";
570 }
571 $classhash->{$title} = [] unless defined $classhash->{$title};
572 push (@{$classhash->{$title}}, $classification);
573 }
574 $classhash = $self->compactlist ($classhash);
575
576 my @tmparr = ();
577 foreach $subsection (sort keys (%$classhash)) {
578 push (@tmparr, $subsection);
579 }
580
581 # if there's a 0-9 section it will have been sorted to the beginning
582 # but we want it at the end
583 if ($tmparr[0] eq '0-9') {
584 shift @tmparr;
585 push (@tmparr, '0-9');
586 }
587
588 foreach $subclass (@tmparr)
589 {
590 my $tempclassify = $self->get_entry($subclass, "VList", $metaname);
591 foreach $subsubOID (@{$classhash->{$subclass}})
592 {
593 if ($subsubOID =~ /^CLASSIFY\.(.*)$/
594 && defined $self->{'classifiers'}->{$1})
595 {
596 push (@{$tempclassify->{'contains'}},
597 $self->{'classifiers'}->{$1}->{'classifyinfo'});
598 }
599 else
600 {
601 $subsubOID =~ s/^Metavalue_(\d+)\.//;
602 my $metaname_offset = $1 -1;
603 my $oid_rec = {'OID'=>$subsubOID, 'offset'=>$metaname_offset};
604 push (@{$tempclassify->{'contains'}}, $oid_rec);
605 }
606 }
607 push (@{$classifyinfo->{'contains'}}, $tempclassify);
608 }
609
610 return $classifyinfo;
611}
612
613sub compactlist {
614 my $self = shift (@_);
615 my ($classhashref) = @_;
616 my $compactedhash = {};
617 my @currentOIDs = ();
618 my $currentfirstletter = "";
619 my $currentlastletter = "";
620 my $lastkey = "";
621
622 # minimum and maximum documents to be displayed per page.
623 # the actual maximum will be max + (min-1).
624 # the smallest sub-section is a single letter at present
625 # so in this case there may be many times max documents
626 # displayed on a page.
627 my $min = $self->{'mincompact'};
628 my $max = $self->{'maxcompact'};
629
630 foreach $subsection (sort keys %$classhashref) {
631 if ($subsection eq '0-9') {
632 @{$compactedhash->{$subsection}} = @{$classhashref->{$subsection}};
633 next;
634 }
635 $currentfirstletter = $subsection if $currentfirstletter eq "";
636 if ((scalar (@currentOIDs) < $min) ||
637 ((scalar (@currentOIDs) + scalar (@{$classhashref->{$subsection}})) <= $max)) {
638 push (@currentOIDs, @{$classhashref->{$subsection}});
639 $currentlastletter = $subsection;
640 } else {
641
642 if ($currentfirstletter eq $currentlastletter) {
643 @{$compactedhash->{$currentfirstletter}} = @currentOIDs;
644 $lastkey = $currentfirstletter;
645 } else {
646 @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} = @currentOIDs;
647 $lastkey = "$currentfirstletter-$currentlastletter";
648 }
649 if (scalar (@{$classhashref->{$subsection}}) >= $max) {
650 $compactedhash->{$subsection} = $classhashref->{$subsection};
651 @currentOIDs = ();
652 $currentfirstletter = "";
653 } else {
654 @currentOIDs = @{$classhashref->{$subsection}};
655 $currentfirstletter = $subsection;
656 $currentlastletter = $subsection;
657 }
658 }
659 }
660
661 # add final OIDs to last sub-classification if there aren't many otherwise
662 # add final sub-classification
663 if (scalar (@currentOIDs) < $min) {
664 my ($newkey) = $lastkey =~ /^(.)/;
665 @currentOIDs = (@{$compactedhash->{$lastkey}}, @currentOIDs);
666 delete $compactedhash->{$lastkey};
667 @{$compactedhash->{"$newkey-$currentlastletter"}} = @currentOIDs;
668 } else {
669 if ($currentfirstletter eq $currentlastletter) {
670 @{$compactedhash->{$currentfirstletter}} = @currentOIDs;
671 } else {
672 @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} = @currentOIDs;
673 }
674 }
675
676 return $compactedhash;
677}
678
6791;
680
681
Note: See TracBrowser for help on using the repository browser.