source: main/tags/2.30/gsdl/perllib/classify/AZCompactList.pm@ 24168

Last change on this file since 24168 was 1716, checked in by jrm21, 24 years ago

minor change to allow the -title option to display correctly on HTML page.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.2 KB
Line 
1###########################################################################
2#
3# AZCompactList.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# classifier plugin for sorting alphabetically
27# options are:
28#
29# metadata=Metaname -- all documents with Metaname metadata
30# will be included in list, list will be sorted
31# by this element.
32# title=Title -- (optional) the title field for this classification.
33# if not included title field will be Metaname.
34# mingroup=Num -- (optional) the smallest value that will cause
35# a group in the hierarchy to form.
36# minnesting=Num -- (optional) the smallest value that will cause a
37# list to converted into nested list
38# mincompact=Num -- (optional) used in compact list
39# maxcompact=Num -- (optional) used in compact list
40# doclevel=top|section -- (optional) level to process document at.
41# onlyfirst -- (optional) control whether all or only first
42# metadata value used from array of metadata
43package AZCompactList;
44
45use BasClas;
46use sorttools;
47
48sub BEGIN {
49 @ISA = ('BasClas');
50}
51
52sub new {
53 my ($class, @options) = @_;
54 my $self = new BasClas ($class, @_);
55
56 my ($metaname, $title);
57 my $mingroup = 2;
58 my $minnesting = 20;
59 my $mincompact = 10;
60 my $maxcompact = 30;
61 my $doclevel = "top";
62 my $onlyfirst = 0;
63 my $recopt = undef;
64
65 foreach $option (@options) {
66 if ($option =~ /^metadata=(.*)$/i) {
67 $metaname = $1;
68 } elsif ($option =~ /^title=(.*)$/i) {
69 $title = $1;
70 } elsif ($option =~ /^mingroup(size)?=(.*)$/i) {
71 $mingroup = $2;
72 } elsif ($option =~ /^minnesting=(.*)$/i) {
73 $minnesting = $1;
74 } elsif ($option =~ /^mincompact=(.*)$/i) {
75 $mincompact = $1;
76 } elsif ($option =~ /^maxcompact=(.*)$/i) {
77 $maxcompact = $1;
78 } elsif ($option =~ /^doclevel=(.*)$/i) {
79 $doclevel = $1;
80 } elsif ($option =~ /^onlyfirst$/i) {
81 $onlyfirst = 1;
82 } elsif ($option =~ /^recopt$/i) {
83 $recopt = "on";
84 }
85 }
86
87 if (!defined $metaname) {
88 my $outhandle = $self->{'outhandle'};
89 print $outhandle "AZCompactList used with no metadata name to classify by\n";
90 die "\n";
91 }
92
93 $title = $metaname unless defined $title;
94
95 $self->{'list'} = {};
96 $self->{'listmetavalue'} = {};
97 $self->{'reclassify'} = {};
98 $self->{'reclassifylist'} = {};
99 $self->{'metaname'} = $metaname;
100 $self->{'title'} = $title;
101 $self->{'mingroup'} = $mingroup;
102 $self->{'minnesting'} = $minnesting;
103 $self->{'mincompact'} = $mincompact;
104 $self->{'maxcompact'} = $maxcompact;
105 $self->{'doclevel'} = $doclevel;
106 $self->{'onlyfirst'} = $onlyfirst;
107 $self->{'recopt'} = $recopt;
108
109 return bless $self, $class;
110}
111
112sub init
113{
114 my $self = shift (@_);
115
116 $self->{'list'} = {};
117 $self->{'listmetavalue'} = {};
118 $self->{'reclassify'} = {};
119 $self->{'reclassifylist'} = {};
120}
121
122$tmp = 0;
123
124sub classify
125{
126 my $self = shift (@_);
127 my ($doc_obj) = @_;
128
129 my $doc_OID = $doc_obj->get_OID();
130
131 my @sectionlist = ();
132 my $topsection = $doc_obj->get_top_section();
133
134 my $metaname = $self->{'metaname'};
135 my $outhandle = $self->{'outhandle'};
136
137 $metaname =~ s/(\/.*)//; # grab first name in n1/n2/n3 list
138
139 if ($self->{'doclevel'} =~ /^top(level)?/i)
140 {
141 push(@sectionlist,$topsection);
142 }
143 else
144 {
145 my $thissection = $doc_obj->get_next_section($topsection);
146 while (defined $thissection)
147 {
148 push(@sectionlist,$thissection);
149 $thissection = $doc_obj->get_next_section ($thissection);
150 }
151 }
152
153 my $thissection;
154 foreach $thissection (@sectionlist)
155 {
156 my $full_doc_OID
157 = ($thissection ne "") ? "$doc_OID.$thissection" : $doc_OID;
158
159 if (defined $self->{'list'}->{$full_doc_OID})
160 {
161 print $outhandle "WARNING: AZCompactList::classify called multiple times for $full_doc_OID\n";
162 }
163 $self->{'list'}->{$full_doc_OID} = [];
164 $self->{'listmetavalue'}->{$full_doc_OID} = [];
165
166 my $metavalues = $doc_obj->get_metadata($thissection,$metaname);
167 my $metavalue;
168 foreach $metavalue (@$metavalues)
169 {
170### print STDERR "$metaname :\tmetavalue = $metavalue\n";
171 # if this document doesn't contain the metadata element we're
172 # sorting by we won't include it in this classification
173 if (defined $metavalue && $metavalue =~ /\w/)
174 {
175 my $formatted_metavalue = $metavalue;
176 if ($self->{'metaname'} =~ m/^Creator(:.*)?$/)
177 {
178 &sorttools::format_string_name_english (\$formatted_metavalue);
179 }
180 else
181 {
182 &sorttools::format_string_english (\$formatted_metavalue);
183 }
184
185 push(@{$self->{'list'}->{$full_doc_OID}},$formatted_metavalue);
186 push(@{$self->{'listmetavalue'}->{$full_doc_OID}} ,$metavalue);
187
188 last if ($self->{'onlyfirst'});
189 }
190 }
191 my $date = $doc_obj->get_metadata_element($thissection,"Date");
192 $self->{'reclassify'}->{$full_doc_OID} = [$doc_obj,$date];
193 }
194}
195
196sub reinit
197{
198 my ($self,$classlist_ref) = @_;
199
200 my %mtfreq = ();
201 my @single_classlist = ();
202 my @multiple_classlist = ();
203
204 # find out how often each metavalue occurs
205 map
206 {
207 my $mv;
208 foreach $mv (@{$self->{'listmetavalue'}->{$_}} )
209 {
210 $mtfreq{$mv}++;
211 }
212 } @$classlist_ref;
213
214 # use this information to split the list: single metavalue/repeated value
215 map
216 {
217 my $i = 1;
218 my $metavalue;
219 foreach $metavalue (@{$self->{'listmetavalue'}->{$_}})
220 {
221 if ($mtfreq{$metavalue} >= $self->{'mingroup'})
222 {
223 push(@multiple_classlist,[$_,$i,$metavalue]);
224 }
225 else
226 {
227 push(@single_classlist,[$_,$metavalue]);
228 $metavalue =~ tr/[A-Z]/[a-z]/;
229 $self->{'reclassifylist'}->{"Metavalue_$i.$_"} = $metavalue;
230 }
231 $i++;
232 }
233 } @$classlist_ref;
234
235
236 # Setup sub-classifiers for multiple list
237
238 $self->{'classifiers'} = {};
239
240 my $pm;
241 foreach $pm ("List", "SectionList")
242 {
243 my $listname
244 = &util::filename_cat($ENV{'GSDLHOME'},"perllib/classify/$pm.pm");
245 if (-e $listname) { require $listname; }
246 else
247 {
248 my $outhandle = $self->{'outhandle'};
249 print $outhandle "AZCompactList ERROR - couldn't find classifier \"$listname\"\n";
250 die "\n";
251 }
252 }
253
254 # Create classifiers objects for each entry >= mingroup
255 my $metavalue;
256 foreach $metavalue (keys %mtfreq)
257 {
258 if ($mtfreq{$metavalue} >= $self->{'mingroup'})
259 {
260 my $listclassobj;
261 my $doclevel = $self->{'doclevel'};
262 my $metaname = $self->{'metaname'};
263 my @metaname_list = split('/',$metaname);
264 $metaname = shift(@metaname_list);
265 if (@metaname_list==0)
266 {
267 if ($doclevel =~ m/^top(level)?/i)
268 {
269 my $args = "\"metadata=$metaname\"";
270 $args .= ", \"title=\$metavalue\"";
271 $args .= ", \"sort=Date\"";
272 eval ("\$listclassobj = new List($args)");
273 }
274 else
275 {
276 my $args = "\"metadata=$metaname\"";
277 $args .= ", \"title=\$metavalue\"";
278 $args .= ", \"sort=Date\"";
279 eval ("\$listclassobj = new SectionList($args)");
280 }
281 }
282 else
283 {
284 $metaname = join('/',@metaname_list);
285
286 my $args = "\"metadata=$metaname\"";
287 $args .= ", \"title=\$metavalue\"";
288 $args .= ", \"doclevel=\$doclevel\"";
289 $args .= ", \"recopt\"";
290
291 eval ("\$listclassobj = new AZCompactList($args)");
292 }
293 if ($@) {
294 my $outhandle = $self->{'outhandle'};
295 print $outhandle "$@";
296 die "\n";
297 }
298
299 $listclassobj->init();
300
301 if (defined $metavalue && $metavalue =~ /\w/)
302 {
303 my $formatted_node = $metavalue;
304 if ($self->{'metaname'} =~ m/^Creator(:.*)?$/)
305 {
306 &sorttools::format_string_name_english(\$formatted_node);
307 }
308 else
309 {
310 &sorttools::format_string_english(\$formatted_node);
311 }
312
313 $self->{'classifiers'}->{$metavalue}
314 = { 'classifyobj' => $listclassobj,
315 'formattednode' => $formatted_node };
316 }
317 }
318 }
319
320
321 return (\@single_classlist,\@multiple_classlist);
322}
323
324
325sub reclassify
326{
327 my ($self,$multiple_cl_ref) = @_;
328
329 # Entries in the current classify list that are "book nodes"
330 # should be recursively classified.
331 #--
332 foreach $dm_pair (@$multiple_cl_ref)
333 {
334 my ($doc_OID,$mdoffset,$metavalue) = @$dm_pair;
335 my $listclassobj;
336
337 # find metavalue in list of sub-classifiers
338 my $found = 0;
339 my $node_name;
340 foreach $node_name (keys %{$self->{'classifiers'}})
341 {
342 $resafe_node_name = $node_name;
343 $resafe_node_name =~ s/(\(|\)|\[|\]|\{|\}|\^|\$|\.|\+|\*|\?|\|)/\\$1/g;
344 if ($metavalue =~ m/^$resafe_node_name$/i)
345 {
346 my ($doc_obj,$date) = @{$self->{'reclassify'}->{$doc_OID}};
347
348 ## date appears to not be used in classifier call ####
349
350 if ($doc_OID =~ m/^.*\.(\d+)$/)
351 {
352 $self->{'classifiers'}->{$node_name}->{'classifyobj'}
353 ->classify($doc_obj, "Section=$1");
354 }
355 else
356 {
357 $self->{'classifiers'}->{$node_name}->{'classifyobj'}
358 ->classify($doc_obj);
359 }
360
361 $found = 1;
362 last;
363 }
364 }
365
366 if (!$found)
367 {
368 print $outhandle "Warning: AZCompactList::reclassify ";
369 print $outhandle "could not find sub-node for $metavalue with doc_OID $doc_OID\n";
370 }
371 }
372}
373
374
375
376sub get_reclassify_info
377{
378 my $self = shift (@_);
379
380 my $node_name;
381 foreach $node_name (keys %{$self->{'classifiers'}})
382 {
383 my $classifyinfo
384 = $self->{'classifiers'}->{$node_name}->{'classifyobj'}
385 ->get_classify_info();
386 $self->{'classifiers'}->{$node_name}->{'classifyinfo'}
387 = $classifyinfo;
388 $self->{'reclassifylist'}->{"CLASSIFY.$node_name"}
389 = $self->{'classifiers'}->{$node_name}->{'formattednode'};
390 }
391}
392
393
394sub alpha_numeric_cmp
395{
396 my ($self,$a,$b) = @_;
397
398 my $title_a = $self->{'reclassifylist'}->{$a};
399 my $title_b = $self->{'reclassifylist'}->{$b};
400
401 if ($title_a =~ m/^(\d+(\.\d+)?)/)
402 {
403 my $val_a = $1;
404 if ($title_b =~ m/^(\d+(\.\d+)?)/)
405 {
406 my $val_b = $1;
407 if ($val_a != $val_b)
408 {
409 return ($val_a <=> $val_b);
410 }
411 }
412 }
413
414 return ($title_a cmp $title_b);
415}
416
417sub get_classify_info {
418 my $self = shift (@_);
419
420 my @classlist =keys %{$self->{'list'}}; # list all doc oids
421
422 my ($single_cl_ref,$multiple_cl_ref) = $self->reinit(\@classlist);
423 $self->reclassify($multiple_cl_ref);
424 $self->get_reclassify_info();
425
426
427# my @reclassified_classlist
428# = sort {$self->{'reclassifylist'}->{$a} cmp $self->{'reclassifylist'}->{$b};} keys %{$self->{'reclassifylist'}};
429
430 # alpha_numeric_cmp is slower but handles numbers better ...
431 my @reclassified_classlist
432 = sort { $self->alpha_numeric_cmp($a,$b) } keys %{$self->{'reclassifylist'}};
433
434
435 return $self->splitlist (\@reclassified_classlist);
436}
437
438sub get_entry {
439 my $self = shift (@_);
440 my ($title, $childtype, $metaname, $thistype) = @_;
441
442 # organise into classification structure
443 my %classifyinfo = ('childtype'=>$childtype,
444 'Title'=>$title,
445 'contains'=>[],
446 'mdtype'=>$metaname);
447
448 $classifyinfo{'thistype'} = $thistype
449 if defined $thistype && $thistype =~ /\w/;
450
451 return \%classifyinfo;
452}
453
454
455
456# splitlist takes an ordered list of classifications (@$classlistref) and
457# splits it up into alphabetical sub-sections.
458sub splitlist {
459 my $self = shift (@_);
460 my ($classlistref) = @_;
461 my $classhash = {};
462
463 # top level
464 my @metanames = split("/",$self->{'metaname'});
465 my $metaname = shift(@metanames);
466
467 my $childtype = "HList";
468 $childtype = "VList" if (scalar (@$classlistref) <= $self->{'minnesting'});
469
470 my $classifyinfo;
471 if (!defined($self->{'recopt'}))
472 {
473 my $title = $self->{'title'}; # should always be defined by now....
474 $title = $metaname unless defined $title;
475 $classifyinfo
476 = $self->get_entry ($title, $childtype, $metaname, "Invisible");
477 }
478 else
479 {
480 my $title = $self->{'title'};
481 $classifyinfo
482 = $self->get_entry ($title, $childtype, $metaname, "VList");
483 }
484
485 # don't need to do any splitting if there are less than 'minnesting' classifications
486 if ((scalar @$classlistref) <= $self->{'minnesting'}) {
487 foreach $subOID (@$classlistref) {
488 if ($subOID =~ /^CLASSIFY\.(.*)$/
489 && defined $self->{'classifiers'}->{$1})
490 {
491### print STDERR "*** subOID = $subOID\n";
492
493 push (@{$classifyinfo->{'contains'}},
494 $self->{'classifiers'}->{$1}->{'classifyinfo'});
495 }
496 else
497 {
498 $subOID =~ s/^Metavalue_(\d+)\.//;
499 my $metaname_offset = $1 -1;
500 my $oid_rec = {'OID'=>$subOID, 'offset'=>$metaname_offset};
501 push (@{$classifyinfo->{'contains'}}, $oid_rec);
502 }
503 }
504 return $classifyinfo;
505 }
506
507 # first split up the list into separate A-Z and 0-9 classifications
508 foreach $classification (@$classlistref) {
509 my $title = $self->{'reclassifylist'}->{$classification};
510 $title =~ s/&(.){2,4};//g; # remove any HTML special chars
511### $title =~ s/^\s+//g; # remove a leading spaces
512### $title =~ s/^_+//g; # remove a leading underscores
513 $title =~ s/^\W+//g;
514### $title =~ s/^(\'|\`|\"|\:|\()//g; # remove any opening punctutation
515 $title =~ s/^(.).*$/$1/;
516 $title =~ tr/[a-z]/[A-Z]/;
517
518 if ($title =~ /^[0-9]$/) {$title = '0-9';}
519 elsif ($title !~ /^[A-Z]$/) {
520 my $outhandle = $self->{'outhandle'};
521 print $outhandle "AZCompactList: WARNING $classification has badly formatted title ($title)\n";
522 }
523 $classhash->{$title} = [] unless defined $classhash->{$title};
524 push (@{$classhash->{$title}}, $classification);
525 }
526 $classhash = $self->compactlist ($classhash);
527
528 my @tmparr = ();
529 foreach $subsection (sort keys (%$classhash)) {
530 push (@tmparr, $subsection);
531 }
532
533 # if there's a 0-9 section it will have been sorted to the beginning
534 # but we want it at the end
535 if ($tmparr[0] eq '0-9') {
536 shift @tmparr;
537 push (@tmparr, '0-9');
538 }
539
540 foreach $subclass (@tmparr)
541 {
542 my $tempclassify = $self->get_entry($subclass, "VList", $metaname);
543 foreach $subsubOID (@{$classhash->{$subclass}})
544 {
545 if ($subsubOID =~ /^CLASSIFY\.(.*)$/
546 && defined $self->{'classifiers'}->{$1})
547 {
548 push (@{$tempclassify->{'contains'}},
549 $self->{'classifiers'}->{$1}->{'classifyinfo'});
550 }
551 else
552 {
553 $subsubOID =~ s/^Metavalue_(\d+)\.//;
554 my $metaname_offset = $1 -1;
555 my $oid_rec = {'OID'=>$subsubOID, 'offset'=>$metaname_offset};
556 push (@{$tempclassify->{'contains'}}, $oid_rec);
557 }
558 }
559 push (@{$classifyinfo->{'contains'}}, $tempclassify);
560 }
561
562 return $classifyinfo;
563}
564
565sub compactlist {
566 my $self = shift (@_);
567 my ($classhashref) = @_;
568 my $compactedhash = {};
569 my @currentOIDs = ();
570 my $currentfirstletter = "";
571 my $currentlastletter = "";
572 my $lastkey = "";
573
574 # minimum and maximum documents to be displayed per page.
575 # the actual maximum will be max + (min-1).
576 # the smallest sub-section is a single letter at present
577 # so in this case there may be many times max documents
578 # displayed on a page.
579 my $min = $self->{'mincompact'};
580 my $max = $self->{'maxcompact'};
581
582 foreach $subsection (sort keys %$classhashref) {
583 if ($subsection eq '0-9') {
584 @{$compactedhash->{$subsection}} = @{$classhashref->{$subsection}};
585 next;
586 }
587 $currentfirstletter = $subsection if $currentfirstletter eq "";
588 if ((scalar (@currentOIDs) < $min) ||
589 ((scalar (@currentOIDs) + scalar (@{$classhashref->{$subsection}})) <= $max)) {
590 push (@currentOIDs, @{$classhashref->{$subsection}});
591 $currentlastletter = $subsection;
592 } else {
593
594 if ($currentfirstletter eq $currentlastletter) {
595 @{$compactedhash->{$currentfirstletter}} = @currentOIDs;
596 $lastkey = $currentfirstletter;
597 } else {
598 @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} = @currentOIDs;
599 $lastkey = "$currentfirstletter-$currentlastletter";
600 }
601 if (scalar (@{$classhashref->{$subsection}}) >= $max) {
602 $compactedhash->{$subsection} = $classhashref->{$subsection};
603 @currentOIDs = ();
604 $currentfirstletter = "";
605 } else {
606 @currentOIDs = @{$classhashref->{$subsection}};
607 $currentfirstletter = $subsection;
608 $currentlastletter = $subsection;
609 }
610 }
611 }
612
613 # add final OIDs to last sub-classification if there aren't many otherwise
614 # add final sub-classification
615 if (scalar (@currentOIDs) < $min) {
616 my ($newkey) = $lastkey =~ /^(.)/;
617 @currentOIDs = (@{$compactedhash->{$lastkey}}, @currentOIDs);
618 delete $compactedhash->{$lastkey};
619 @{$compactedhash->{"$newkey-$currentlastletter"}} = @currentOIDs;
620 } else {
621 if ($currentfirstletter eq $currentlastletter) {
622 @{$compactedhash->{$currentfirstletter}} = @currentOIDs;
623 } else {
624 @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} = @currentOIDs;
625 }
626 }
627
628 return $compactedhash;
629}
630
6311;
632
633
Note: See TracBrowser for help on using the repository browser.