source: main/trunk/greenstone2/perllib/classify/AZCompactList.pm

Last change on this file was 36794, checked in by anupama, 19 months ago

Dr Bainbridge fixed a diffcol classifier's subsidiary documents ordering issue: for identical titles under an authorr bookshelf for AZCompactList classifier on Creators in Word-PDF-Basic tutorial model collection, a different order of the documents would appear each time. The solution was 2-fold: Besides the PERL_PERTURB_KEYS environment variable, which we set to 0, there is also the PERL_HASH_SEED (see https://www.perlmonks.org/?node_id=1167787 ), and they both need to be set to 0 to get consistent ordering when calling perl's 'keys' command on a hashmap. The other part of the solution is to initialise AZCompactList's sort property to 'nosort' which then uses an array (thus, having a sense of ordering) instead of AZCompactList's default behaviour of using a hashmap (which does not enforce a sense of ordering). Setting the sort property to nosort had the effect of a consistent order of the same identically Titled documents upon a single build, but no consistent ordering between builds which is what PERL_PERTURB_KEYS in conjunction with PERL_HASH_SEED ensure.

  • Property svn:keywords set to Author Date Id Revision
File size: 24.1 KB
Line 
1###########################################################################
2#
3# AZCompactList.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25# classifier plugin for sorting alphabetically
26
27package AZCompactList;
28
29use strict;
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use BaseClassifier;
33use sorttools;
34use FileUtils;
35
36use Unicode::Normalize;
37
38sub BEGIN {
39 @AZCompactList::ISA = ('BaseClassifier');
40}
41
42our $doclevel_list =
43 [ { 'name' => "top",
44 'desc' => "{AZCompactList.doclevel.top}" },
45 { 'name' => "firstlevel",
46 'desc' => "{AZCompactList.doclevel.firstlevel}" },
47 { 'name' => "section",
48 'desc' => "{AZCompactList.doclevel.section}" } ];
49
50my $arguments =
51 [ { 'name' => "metadata",
52 'desc' => "{AZCompactList.metadata}",
53 'type' => "metadata",
54 'reqd' => "yes" },
55 { 'name' => "firstvalueonly",
56 'desc' => "{AZCompactList.firstvalueonly}",
57 'type' => "flag",
58 'reqd' => "no" },
59 { 'name' => "allvalues",
60 'desc' => "{AZCompactList.allvalues}",
61 'type' => "flag",
62 'reqd' => "no" },
63 { 'name' => "sort",
64 'desc' => "{AZCompactList.sort}",
65 'type' => "metadata",
66# 'deft' => "Title",
67 'deft' => "nosort",
68 'reqd' => "no" },
69 { 'name' => "removeprefix",
70 'desc' => "{BasClas.removeprefix}",
71 'type' => "regexp",
72 'deft' => "",
73 'reqd' => "no" },
74 { 'name' => "removesuffix",
75 'desc' => "{BasClas.removesuffix}",
76 'type' => "regexp",
77 'deft' => "",
78 'reqd' => "no" },
79 { 'name' => "mingroup",
80 'desc' => "{AZCompactList.mingroup}",
81 'type' => "int",
82 'deft' => "1",
83 'range' => "1,",
84 'reqd' => "no" },
85 { 'name' => "minnesting",
86 'desc' => "{AZCompactList.minnesting}",
87 'type' => "int",
88 'deft' => "20",
89 'range' => "2,",
90 'reqd' => "no" },
91 { 'name' => "mincompact",
92 'desc' => "{AZCompactList.mincompact}",
93 'type' => "int",
94 'deft' => "10",
95 'range' => "1,",
96 'reqd' => "no" },
97 { 'name' => "maxcompact",
98 'desc' => "{AZCompactList.maxcompact}",
99 'type' => "int",
100 'deft' => "30",
101 'range' => "1,",
102 'reqd' => "no" },
103 { 'name' => "doclevel",
104 'desc' => "{AZCompactList.doclevel}",
105 'type' => "enum",
106 'list' => $doclevel_list,
107 'deft' => "top",
108 'reqd' => "no" },
109 { 'name' => "freqsort",
110 'desc' => "{AZCompactList.freqsort}",
111 'type' => "flag"},
112 { 'name' => "recopt",
113 'desc' => "{AZCompactList.recopt}",
114 'type' => "flag",
115 'reqd' => "no" } ];
116
117my $options =
118{ 'name' => "AZCompactList",
119 'desc' => "{AZCompactList.desc}",
120 'abstract' => "no",
121 'inherits' => "yes",
122 'args' => $arguments };
123
124
125sub new {
126 my ($class) = shift (@_);
127 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
128 push(@$pluginlist, $class);
129
130 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
131 push(@{$hashArgOptLists->{"OptList"}},$options);
132
133 my $self = new BaseClassifier($pluginlist, $inputargs, $hashArgOptLists);
134
135 if ($self->{'info_only'}) {
136 # don't worry about any options etc
137 return bless $self, $class;
138 }
139
140 if (!$self->{"metadata"}) {
141 my $outhandle = $self->{'outhandle'};
142 print $outhandle "AZCompactList Error: required option -metadata not supplied\n";
143 $self->print_txt_usage("");
144 die "AZCompactList Error: required option -metadata not supplied\n";
145 }
146
147 $self->{'metadata'} = $self->strip_ex_from_metadata($self->{'metadata'});
148 # Manually set $self parameters.
149 $self->{'list'} = {};
150 $self->{'listmetavalue'} = {};
151 $self->{'list_mvpair'} = {};
152 $self->{'reclassify'} = {};
153 $self->{'reclassifylist'} = {};
154
155 $self->{'buttonname'} = $self->generate_title_from_metadata($self->{'metadata'}) unless ($self->{'buttonname'});
156
157 if (defined($self->{"removeprefix"}) && $self->{"removeprefix"}) {
158 $self->{"removeprefix"} =~ s/^\^//; # don't need a leading ^
159 }
160 if (defined($self->{"removesuffix"}) && $self->{"removesuffix"}) {
161 $self->{"removesuffix"} =~ s/\$$//; # don't need a trailing $
162 }
163
164 $self->{'recopt'} = ($self->{'recopt'} == 0) ? undef : "on";
165
166 if (defined $self->{'sort'}) {
167 $self->{'sort'} = $self->strip_ex_from_metadata($self->{'sort'});
168 }
169 # Clean out the unused keys
170 if($self->{"removeprefix"} eq "") {delete $self->{"removeprefix"};}
171 if($self->{"removesuffix"} eq "") {delete $self->{"removesuffix"};}
172
173 return bless $self, $class;
174}
175
176sub init
177{
178 my $self = shift (@_);
179
180 $self->{'list'} = {};
181 $self->{'listmetavalue'} = {};
182 $self->{'list_mvpair'} = {};
183 $self->{'reclassify'} = {};
184 $self->{'reclassifylist'} = {};
185}
186
187my $tmp = 0;
188
189sub classify
190{
191 my $self = shift (@_);
192 my ($doc_obj) = @_;
193
194 my $doc_OID = $doc_obj->get_OID();
195 my $outhandle = $self->{'outhandle'};
196
197 my @sectionlist = ();
198 my $topsection = $doc_obj->get_top_section();
199 my $metaname = $self->{'metadata'};
200
201 $metaname =~ s/(\/|\|).*//; # grab first name in n1/n2/n3 or n1|n2|n3 list
202 my @commameta_list = split(/,|;/, $metaname);
203
204 if ($self->{'doclevel'} =~ /^top(level)?/i)
205 {
206 push(@sectionlist,$topsection);
207 }
208 elsif ($self->{'doclevel'} =~ /^first(level)?/i)
209 {
210 my $toplevel_children = $doc_obj->get_children($topsection);
211 push(@sectionlist,@$toplevel_children);
212 }
213 else # (all)?section(s)?
214 {
215 my $thissection = $doc_obj->get_next_section($topsection);
216 while (defined $thissection)
217 {
218 push(@sectionlist,$thissection);
219 $thissection = $doc_obj->get_next_section ($thissection);
220 }
221 }
222
223 my $thissection;
224 foreach $thissection (@sectionlist)
225 {
226 my $full_doc_OID
227 = ($thissection ne "") ? "$doc_OID.$thissection" : $doc_OID;
228
229 if (defined $self->{'list_mvpair'}->{$full_doc_OID})
230 {
231 print $outhandle "WARNING: AZCompactList::classify called multiple times for $full_doc_OID\n";
232 }
233 $self->{'list'}->{$full_doc_OID} = [];
234 $self->{'listmetavalue'}->{$full_doc_OID} = [];
235 $self->{'list_mvpair'}->{$full_doc_OID} = [];
236
237 my $metavalues = [];
238 foreach my $cmn (@commameta_list) {
239 my $cmvalues = $doc_obj->get_metadata($thissection,$cmn);
240 push(@$metavalues,@$cmvalues) if (@{$cmvalues});
241 last if (@{$cmvalues} && !$self->{'allvalues'});
242 }
243
244 my $metavalue;
245 foreach $metavalue (@$metavalues)
246 {
247 # Tidy up use of white space in metavalue for classifying
248 $metavalue =~ s/^\s*//s;
249 $metavalue =~ s/\s*$//s;
250 $metavalue =~ s/\n/ /s;
251 $metavalue =~ s/\s{2,}/ /s;
252
253 # if this document doesn't contain the metadata element we're
254 # sorting by we won't include it in this classification
255 if (defined $metavalue && $metavalue =~ /\w/)
256 {
257 if (defined($self->{'removeprefix'}) &&
258 length($self->{'removeprefix'})) {
259 $metavalue =~ s/^$self->{'removeprefix'}//;
260
261 # check that it's not now empty
262 if (!$metavalue) {next;}
263 }
264
265 if (defined($self->{'removesuffix'}) &&
266 length($self->{'removesuffix'})) {
267 $metavalue =~ s/$self->{'removesuffix'}$//;
268
269 # check that it's not now empty
270 if (!$metavalue) {next;}
271 }
272
273 my $formatted_metavalue;
274 if ($self->{'no_metadata_formatting'}) {
275 $formatted_metavalue = $metavalue;
276 } else {
277 $formatted_metavalue = &sorttools::format_metadata_for_sorting($self->{'metadata'}, $metavalue, $doc_obj, $self->{'casefold_metadata_for_sorting'}, $self->{'accentfold_metadata_for_sorting'});
278 }
279
280 #### prefix-str
281 if (! defined($formatted_metavalue)) {
282 print $outhandle "Warning: AZCompactList: metavalue is ";
283 print $outhandle "empty\n";
284 $formatted_metavalue="";
285 }
286
287 my $mv_pair = { 'mv' => $metavalue, 'fmv' => $formatted_metavalue };
288 push(@{$self->{'list'}->{$full_doc_OID}},$formatted_metavalue);
289 push(@{$self->{'listmetavalue'}->{$full_doc_OID}} ,$metavalue);
290 push(@{$self->{'list_mvpair'}->{$full_doc_OID}},$mv_pair);
291
292
293 last if ($self->{'firstvalueonly'});
294 }
295 }
296
297 # This is used in reclassify below for AZCompactSectionList
298 my $sortmeta = $doc_obj->get_metadata_element($thissection, $self->{'sort'});
299 $self->{'reclassify'}->{$full_doc_OID} = [$doc_obj,$sortmeta];
300 }
301}
302
303sub reinit
304{
305 my ($self,$classlist_ref) = @_;
306 my $outhandle = $self->{'outhandle'};
307
308 my %mtfreq = ();
309 my @single_classlist = ();
310 my @multiple_classlist = ();
311
312 # find out how often each metavalue occurs
313 map
314 {
315 foreach my $mvp (@{$self->{'list_mvpair'}->{$_}} )
316 {
317### print STDERR "*** plain mv = $mvp->{'mv'}\n";
318### print STDERR "*** format mv = $mvp->{'fmv'}\n";
319
320 my $metavalue = $mvp->{'mv'};
321 $metavalue =~ s!^-!\\-!; # in case it starts with "-"
322 $mtfreq{$metavalue}++;
323 }
324 } @$classlist_ref;
325
326 # use this information to split the list: single metavalue/repeated value
327 map
328 {
329 my $i = 1;
330 my $metavalue;
331 foreach my $mvp (@{$self->{'list_mvpair'}->{$_}})
332 {
333 my $metavalue = $mvp->{'mv'};
334 $metavalue =~ s!^-!\\-!; # in case it starts with "-"
335 my $cs_metavalue = $mvp->{'mv'}; # case sensitive
336 if ($mtfreq{$metavalue} >= $self->{'mingroup'})
337 {
338### print STDERR "*** pushing on $cs_metavalue\n";
339 push(@multiple_classlist,[$_,$i,$metavalue]);
340 }
341 else
342 {
343 push(@single_classlist,[$_,$cs_metavalue]);
344 $metavalue =~ tr/[A-Z]/[a-z]/;
345 $self->{'reclassifylist'}->{"Metavalue_$i.$_"} = $metavalue;
346 }
347 $i++;
348 }
349 } @$classlist_ref;
350
351
352 # Setup sub-classifiers for multiple list
353
354 $self->{'classifiers'} = {};
355
356 my $pm;
357 foreach $pm ("SimpleList", "SectionList")
358 {
359 my $listname
360 = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"perllib/classify/$pm.pm");
361 if (-e $listname) { require $listname; }
362 else
363 {
364 print $outhandle "AZCompactList ERROR - couldn't find classifier \"$listname\"\n";
365 die "\n";
366 }
367 }
368
369 # Create classifiers objects for each entry >= mingroup
370 my $metavalue;
371 my $doclevel = $self->{'doclevel'};
372 my $mingroup = $self->{'mingroup'};
373 my @metaname_list = split(/\/|\|/,$self->{'metadata'});
374 my $metaname = shift(@metaname_list);
375 my $hierarchical = 0;
376 if (scalar(@metaname_list) > 1) {
377 $hierarchical = 1;
378 $metaname = join('/',@metaname_list);
379 }
380 foreach $metavalue (sort keys %mtfreq)
381 {
382 if ($mtfreq{$metavalue} >= $mingroup)
383 {
384 # occurs more often than minimum required to compact into a group
385 my $listclassobj;
386
387 if (!$hierarchical)
388 {
389 my @args;
390 push @args, ("-metadata", "$metaname");
391 # buttonname is also used for the node's title
392 push @args, ("-buttonname", "$metavalue");
393 push @args, ("-sort", $self->{'sort'});
394
395 my $ptArgs = \@args;
396 if ($doclevel =~ m/^top(level)?/i)
397 {
398 eval ("\$listclassobj = new SimpleList([],\$ptArgs)");
399 }
400 else
401 {
402 # first(level)? or (all)?section(s)?
403 eval ("\$listclassobj = new SectionList([],\$ptArgs)");
404 }
405 }
406 else
407 {
408
409 my @args;
410 push @args, ("-metadata", "$metaname");
411 # buttonname is also used for the node's title
412 push @args, ("-buttonname", "$metavalue");
413 push @args, ("-sort", $self->{'sort'});
414
415 if (defined $self->{'removeprefix'}) {
416 push @args, ("-removeprefix", $self->{'removeprefix'});
417 }
418 if (defined $self->{'removesuffix'}) {
419 push @args, ("-removesuffix", $self->{'removesuffix'});
420 }
421
422 push @args, ("-doclevel", "$doclevel");
423 push @args, ("-mingroup", $mingroup);
424 push @args, "-recopt ";
425
426 my $ptArgs = \@args;
427 eval ("\$listclassobj = new AZCompactList([],\$ptArgs)");
428 }
429
430 if ($@) {
431 print $outhandle "$@";
432 die "\n";
433 }
434
435 $listclassobj->init();
436
437 if (defined $metavalue && $metavalue =~ /\w/)
438 {
439 my $formatted_node = $metavalue;
440
441 if (defined($self->{'removeprefix'}) &&
442 length($self->{'removeprefix'})) {
443 $formatted_node =~ s/^$self->{'removeprefix'}//;
444 # check that it's not now empty
445 if (!$formatted_node) {next;}
446 }
447 if (defined($self->{'removesuffix'}) &&
448 length($self->{'removesuffix'})) {
449 $formatted_node =~ s/$self->{'removesuffix'}$//;
450 # check that it's not now empty
451 if (!$formatted_node) {next;}
452 }
453
454 $formatted_node = &sorttools::format_metadata_for_sorting($self->{'metadata'}, $formatted_node, undef, $self->{'casefold_metadata_for_sorting'}, $self->{'accentfold_metadata_for_sorting'}) unless $self->{'no_metadata_formatting'};
455
456 # In case our formatted string is empty...
457 if (! defined($formatted_node)) {
458 print $outhandle "Warning: AZCompactList: metavalue is ";
459 print $outhandle "empty\n";
460 $formatted_node="";
461 }
462
463 # use the lower case, for speed of lookup.
464 my $meta_lc=lc($metavalue);
465 $self->{'classifiers'}->{$meta_lc}
466 = { 'classifyobj' => $listclassobj,
467 'formattednode' => $formatted_node };
468 }
469 }
470 }
471
472
473 return (\@single_classlist,\@multiple_classlist);
474}
475
476
477sub reclassify
478{
479 my ($self,$multiple_cl_ref) = @_;
480
481 # Entries in the current classify list that are "book nodes"
482 # should be recursively classified.
483 #--
484 foreach my $dm_pair (@$multiple_cl_ref)
485 {
486 my ($doc_OID,$mdoffset,$metavalue,$cs_metavalue) = @$dm_pair;
487 my $listclassobj;
488
489 # find metavalue in list of sub-classifiers
490 # check if we have a key (lower case) for this metadata value
491 my $node_name=lc($metavalue);
492 if (exists $self->{'classifiers'}->{$node_name})
493 {
494 my ($doc_obj, $sortmeta) = @{$self->{'reclassify'}->{$doc_OID}};
495
496 # record the metadata value offset temporarily, so eg AZList can
497 # get the correct metadata value (for multi-valued metadata fields)
498 $doc_obj->{'mdoffset'}=$mdoffset;
499
500 if ($doc_OID =~ m/^[^\.]*\.([\d\.]+)$/)
501 {
502 my $section=$1;
503 if ($self->{'doclevel'} =~ m/^top(level)?/i) { # toplevel
504 $self->{'classifiers'}->{$node_name}->{'classifyobj'}
505 ->classify($doc_obj,"Section=$section");
506 } else {
507 # first(level)? or (all)?section(s)?
508
509 # classify() can't handle multi-level section, so use
510 # classify_section()
511 # ... thanks to Don Gourley for this...
512
513 $self->{'classifiers'}->{$node_name}->{'classifyobj'}
514 ->classify_section($section, $doc_obj, $sortmeta);
515 }
516 }
517 else
518 {
519 $self->{'classifiers'}->{$node_name}->{'classifyobj'}
520 ->classify($doc_obj);
521 }
522 } else { # this key is not in the hash
523 my $outhandle=$self->{outhandle};
524 print $outhandle "Warning: AZCompactList::reclassify ";
525 print $outhandle "could not find sub-node for metadata=`$metavalue' with doc_OID $doc_OID\n";
526 }
527 }
528}
529
530
531
532sub get_reclassify_info
533{
534 my $self = shift (@_);
535
536 my $node_name;
537 foreach $node_name (keys %{$self->{'classifiers'}})
538 {
539 my $classifyinfo
540 = $self->{'classifiers'}->{$node_name}->{'classifyobj'}
541 ->get_classify_info();
542 $self->{'classifiers'}->{$node_name}->{'classifyinfo'}
543 = $classifyinfo;
544 $self->{'reclassifylist'}->{"CLASSIFY.$node_name"}
545 = $self->{'classifiers'}->{$node_name}->{'formattednode'};
546 }
547}
548
549
550sub alpha_numeric_cmp
551{
552 my ($self,$a,$b) = @_;
553
554 my $title_a = $self->{'reclassifylist'}->{$a};
555 my $title_b = $self->{'reclassifylist'}->{$b};
556
557 if ($title_a =~ m/^(\d+(\.\d+)?)/)
558 {
559 my $val_a = $1;
560 if ($title_b =~ m/^(\d+(\.\d+)?)/)
561 {
562 my $val_b = $1;
563 if ($val_a != $val_b)
564 {
565 return ($val_a <=> $val_b);
566 }
567 }
568 }
569
570 return ($title_a cmp $title_b);
571}
572
573sub frequency_cmp
574{
575 my ($self,$a,$b) = @_;
576
577
578 my $title_a = $self->{'reclassifylist'}->{$a};
579 my $title_b = $self->{'reclassifylist'}->{$b};
580
581 my $a_freq = 1;
582 my $b_freq = 1;
583
584 if ($a =~ m/^CLASSIFY\.(.*)$/)
585 {
586 my $a_node = $1;
587 my $a_nodeinfo = $self->{'classifiers'}->{$a_node}->{'classifyinfo'};
588 $a_freq = scalar(@{$a_nodeinfo->{'contains'}});
589 }
590
591 if ($b =~ m/^CLASSIFY\.(.*)$/)
592 {
593 my $b_node = $1;
594 my $b_nodeinfo = $self->{'classifiers'}->{$b_node}->{'classifyinfo'};
595 $b_freq = scalar(@{$b_nodeinfo->{'contains'}});
596 }
597
598 return $b_freq <=> $a_freq;
599}
600
601sub get_classify_info {
602 my $self = shift (@_);
603
604 my @classlist =keys %{$self->{'list_mvpair'}}; # list all doc oids
605
606 my ($single_cl_ref,$multiple_cl_ref) = $self->reinit(\@classlist);
607 $self->reclassify($multiple_cl_ref);
608 $self->get_reclassify_info();
609
610 my @reclassified_classlist;
611 if ($self->{'freqsort'})
612 {
613 @reclassified_classlist
614 = sort { $self->frequency_cmp($a,$b) } keys %{$self->{'reclassifylist'}};
615 # supress sub-grouping by alphabet
616 map { $self->{'reclassifylist'}->{$_} = "A".$self->{'reclassifylist'}; } keys %{$self->{'reclassifylist'}};
617 }
618 else
619 {
620# @reclassified_classlist
621# = sort {$self->{'reclassifylist'}->{$a} cmp $self->{'reclassifylist'}->{$b};} keys %{$self->{'reclassifylist'}};
622
623 # alpha_numeric_cmp is slower than "cmp" but handles numbers better ...
624
625 @reclassified_classlist
626 = sort { $self->alpha_numeric_cmp($a,$b) } keys %{$self->{'reclassifylist'}};
627
628 }
629
630 return $self->splitlist (\@reclassified_classlist);
631}
632
633sub get_entry {
634 my $self = shift (@_);
635 my ($title, $childtype, $metaname, $thistype) = @_;
636 # organise into classification structure
637 my %classifyinfo = ('childtype'=>$childtype,
638 'Title'=>$title,
639 'contains'=>[],
640 'mdtype'=>$metaname);
641
642 $classifyinfo{'thistype'} = $thistype
643 if defined $thistype && $thistype =~ /\w/;
644
645 return \%classifyinfo;
646}
647
648
649
650# splitlist takes an ordered list of classifications (@$classlistref) and
651# splits it up into alphabetical sub-sections.
652sub splitlist {
653 my $self = shift (@_);
654 my ($classlistref) = @_;
655 my $classhash = {};
656
657 # top level
658 my @metanames = split(/\/|\|/,$self->{'metadata'});
659 my $metaname = shift(@metanames);
660
661 my $childtype = "HList";
662 $childtype = "VList" if (scalar (@$classlistref) <= $self->{'minnesting'});
663
664 my $title = $self->{'buttonname'}; # should always be defined by now.
665 my $classifyinfo;
666 if (!defined($self->{'recopt'}))
667 {
668 $classifyinfo
669 = $self->get_entry ($title, $childtype, $metaname, "Invisible");
670 }
671 else
672 {
673 $classifyinfo
674 = $self->get_entry ($title, $childtype, $metaname, "VList");
675 }
676
677 # don't need to do any splitting if there are less than 'minnesting' classifications
678 if ((scalar @$classlistref) <= $self->{'minnesting'}) {
679 foreach my $subOID (@$classlistref) {
680 if ($subOID =~ /^CLASSIFY\.(.*)$/
681 && defined $self->{'classifiers'}->{$1})
682 {
683 push (@{$classifyinfo->{'contains'}},
684 $self->{'classifiers'}->{$1}->{'classifyinfo'});
685 }
686 else
687 {
688 $subOID =~ s/^Metavalue_(\d+)\.//;
689 my $metaname_offset = $1 -1;
690 my $oid_rec = {'OID'=>$subOID, 'offset'=>$metaname_offset};
691 push (@{$classifyinfo->{'contains'}}, $oid_rec);
692 }
693 }
694 return $classifyinfo;
695 }
696
697 # first split up the list into separate A-Z and 0-9 classifications
698 foreach my $classification (@$classlistref) {
699 my $title = $self->{'reclassifylist'}->{$classification};
700 $title =~ s/&(.){2,4};//g; # remove any HTML special chars
701 $title =~ s/^(\W|_)+//g; # remove leading non-word chars
702
703 # only want first character for classification
704 $title =~ m/^(.)/; # always a char, or first byte of wide char?
705 if (defined($1) && $1 ne "") {
706 $title=$1;
707
708 # remove any accents on initial character by mapping to Unicode's
709 # normalized decomposed form (accents follow initial letter)
710 # and then pick off the initial letter
711 my $title_decomposed = NFD($title);
712 $title = substr($title_decomposed,0,1);
713 } else {
714 print STDERR "no first character found for \"$title\" - \"" .
715 $self->{'reclassifylist'}->{$classification} . "\"\n";
716 }
717 $title =~ tr/[a-z]/[A-Z]/;
718
719 if ($title =~ /^[0-9]$/) {$title = '0-9';}
720 elsif ($title !~ /^[A-Z]$/) {
721 my $outhandle = $self->{'outhandle'};
722 print $outhandle "AZCompactList: WARNING $classification has badly formatted title ($title)\n";
723 }
724 $classhash->{$title} = [] unless defined $classhash->{$title};
725 push (@{$classhash->{$title}}, $classification);
726 }
727 $classhash = $self->compactlist ($classhash);
728
729 my @tmparr = ();
730 foreach my $subsection (sort keys (%$classhash)) {
731 push (@tmparr, $subsection);
732 }
733
734 # if there's a 0-9 section it will have been sorted to the beginning
735 # but we want it at the end
736 if ($tmparr[0] eq '0-9') {
737 shift @tmparr;
738 push (@tmparr, '0-9');
739 }
740 foreach my $subclass (@tmparr)
741 {
742 my $tempclassify
743 = (scalar(@tmparr)==1)
744 ? ($self->get_entry(" ", "VList", $metaname))
745 : ($self->get_entry($subclass, "VList", $metaname));
746
747
748 foreach my $subsubOID (@{$classhash->{$subclass}})
749 {
750 if ($subsubOID =~ /^CLASSIFY\.(.*)$/
751 && defined $self->{'classifiers'}->{$1})
752 {
753 # this is a "bookshelf" node... >1 entry compacted
754 push (@{$tempclassify->{'contains'}},
755 $self->{'classifiers'}->{$1}->{'classifyinfo'});
756 # set the metadata field name, for mdoffset to work
757 $self->{'classifiers'}->{$1}->{'classifyinfo'}->{'mdtype'}=
758 $metaname;
759 }
760 else
761 {
762 $subsubOID =~ s/^Metavalue_(\d+)\.//;
763 # record the offset if this metadata type has multiple values
764 my $metaname_offset = $1 -1;
765 my $oid_rec = {'OID'=>$subsubOID, 'offset'=>$metaname_offset};
766 push (@{$tempclassify->{'contains'}}, $oid_rec);
767 }
768 }
769 push (@{$classifyinfo->{'contains'}}, $tempclassify);
770 }
771
772 return $classifyinfo;
773}
774
775sub compactlist {
776 my $self = shift (@_);
777 my ($classhashref) = @_;
778 my $compactedhash = {};
779 my @currentOIDs = ();
780 my $currentfirstletter = "";
781 my $currentlastletter = "";
782 my $lastkey = "";
783
784 # minimum and maximum documents to be displayed per page.
785 # the actual maximum will be max + (min-1).
786 # the smallest sub-section is a single letter at present
787 # so in this case there may be many times max documents
788 # displayed on a page.
789 my $min = $self->{'mincompact'};
790 my $max = $self->{'maxcompact'};
791
792 foreach my $subsection (sort keys %$classhashref) {
793 if ($subsection eq '0-9') {
794 @{$compactedhash->{$subsection}} = @{$classhashref->{$subsection}};
795 next;
796 }
797 $currentfirstletter = $subsection if $currentfirstletter eq "";
798 if ((scalar (@currentOIDs) < $min) ||
799 ((scalar (@currentOIDs) + scalar (@{$classhashref->{$subsection}})) <= $max)) {
800 push (@currentOIDs, @{$classhashref->{$subsection}});
801 $currentlastletter = $subsection;
802 } else {
803
804 if ($currentfirstletter eq $currentlastletter) {
805 @{$compactedhash->{$currentfirstletter}} = @currentOIDs;
806 $lastkey = $currentfirstletter;
807 } else {
808 @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} = @currentOIDs;
809 $lastkey = "$currentfirstletter-$currentlastletter";
810 }
811 if (scalar (@{$classhashref->{$subsection}}) >= $max) {
812 $compactedhash->{$subsection} = $classhashref->{$subsection};
813 @currentOIDs = ();
814 $currentfirstletter = "";
815 $lastkey=$subsection;
816 } else {
817 @currentOIDs = @{$classhashref->{$subsection}};
818 $currentfirstletter = $subsection;
819 $currentlastletter = $subsection;
820 }
821 }
822 }
823
824 # add final OIDs to last sub-classification if there aren't many otherwise
825 # add final sub-classification
826
827 # don't add if there aren't any oids
828 if (! scalar (@currentOIDs)) {return $compactedhash;}
829
830 if (scalar (@currentOIDs) < $min) {
831 my ($newkey) = $lastkey =~ /^(.)/;
832 @currentOIDs = (@{$compactedhash->{$lastkey}}, @currentOIDs);
833 delete $compactedhash->{$lastkey};
834 @{$compactedhash->{"$newkey-$currentlastletter"}} = @currentOIDs;
835 } else {
836 if ($currentfirstletter eq $currentlastletter) {
837 @{$compactedhash->{$currentfirstletter}} = @currentOIDs;
838 } else {
839 @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} = @currentOIDs;
840 }
841 }
842
843 return $compactedhash;
844}
845
8461;
847
848
Note: See TracBrowser for help on using the repository browser.