source: main/trunk/greenstone2/perllib/classify/AZCompactList.pm@ 29476

Last change on this file since 29476 was 29476, checked in by sjs49, 9 years ago

First of 2 commits to get diffcol on the 64 bit Ubuntu that has perl 5.18 to work again for the Multimedia and MARC-Singlefile collections. This commit does a sort keys once in both AZCompactList and MARCPlugin. Without the sort keys change to MARCPlugin, various entries of dc.Meta fields are stored in different orders each time. Without the sort keys change to AZCompactList, nodes with identical titles/names that differ only in capitalisation are sorted in a different order each time.

  • Property svn:keywords set to Author Date Id Revision
File size: 23.9 KB
Line 
1###########################################################################
2#
3# AZCompactList.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25# classifier plugin for sorting alphabetically
26
27package AZCompactList;
28
29use strict;
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use BaseClassifier;
33use sorttools;
34use FileUtils;
35
36use Unicode::Normalize;
37
38sub BEGIN {
39 @AZCompactList::ISA = ('BaseClassifier');
40}
41
42our $doclevel_list =
43 [ { 'name' => "top",
44 'desc' => "{AZCompactList.doclevel.top}" },
45 { 'name' => "firstlevel",
46 'desc' => "{AZCompactList.doclevel.firstlevel}" },
47 { 'name' => "section",
48 'desc' => "{AZCompactList.doclevel.section}" } ];
49
50my $arguments =
51 [ { 'name' => "metadata",
52 'desc' => "{AZCompactList.metadata}",
53 'type' => "metadata",
54 'reqd' => "yes" },
55 { 'name' => "firstvalueonly",
56 'desc' => "{AZCompactList.firstvalueonly}",
57 'type' => "flag",
58 'reqd' => "no" },
59 { 'name' => "allvalues",
60 'desc' => "{AZCompactList.allvalues}",
61 'type' => "flag",
62 'reqd' => "no" },
63 { 'name' => "sort",
64 'desc' => "{AZCompactList.sort}",
65 'type' => "metadata",
66# 'deft' => "Title",
67 'reqd' => "no" },
68 { 'name' => "removeprefix",
69 'desc' => "{BasClas.removeprefix}",
70 'type' => "regexp",
71 'deft' => "",
72 'reqd' => "no" },
73 { 'name' => "removesuffix",
74 'desc' => "{BasClas.removesuffix}",
75 'type' => "regexp",
76 'deft' => "",
77 'reqd' => "no" },
78 { 'name' => "mingroup",
79 'desc' => "{AZCompactList.mingroup}",
80 'type' => "int",
81 'deft' => "1",
82 'range' => "1,",
83 'reqd' => "no" },
84 { 'name' => "minnesting",
85 'desc' => "{AZCompactList.minnesting}",
86 'type' => "int",
87 'deft' => "20",
88 'range' => "2,",
89 'reqd' => "no" },
90 { 'name' => "mincompact",
91 'desc' => "{AZCompactList.mincompact}",
92 'type' => "int",
93 'deft' => "10",
94 'range' => "1,",
95 'reqd' => "no" },
96 { 'name' => "maxcompact",
97 'desc' => "{AZCompactList.maxcompact}",
98 'type' => "int",
99 'deft' => "30",
100 'range' => "1,",
101 'reqd' => "no" },
102 { 'name' => "doclevel",
103 'desc' => "{AZCompactList.doclevel}",
104 'type' => "enum",
105 'list' => $doclevel_list,
106 'deft' => "top",
107 'reqd' => "no" },
108 { 'name' => "freqsort",
109 'desc' => "{AZCompactList.freqsort}",
110 'type' => "flag"},
111 { 'name' => "recopt",
112 'desc' => "{AZCompactList.recopt}",
113 'type' => "flag",
114 'reqd' => "no" } ];
115
116my $options =
117{ 'name' => "AZCompactList",
118 'desc' => "{AZCompactList.desc}",
119 'abstract' => "no",
120 'inherits' => "yes",
121 'args' => $arguments };
122
123
124sub new {
125 my ($class) = shift (@_);
126 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
127 push(@$pluginlist, $class);
128
129 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
130 push(@{$hashArgOptLists->{"OptList"}},$options);
131
132 my $self = new BaseClassifier($pluginlist, $inputargs, $hashArgOptLists);
133
134 if ($self->{'info_only'}) {
135 # don't worry about any options etc
136 return bless $self, $class;
137 }
138
139 if (!$self->{"metadata"}) {
140 my $outhandle = $self->{'outhandle'};
141 print $outhandle "AZCompactList Error: required option -metadata not supplied\n";
142 $self->print_txt_usage("");
143 die "AZCompactList Error: required option -metadata not supplied\n";
144 }
145
146 $self->{'metadata'} = $self->strip_ex_from_metadata($self->{'metadata'});
147 # Manually set $self parameters.
148 $self->{'list'} = {};
149 $self->{'listmetavalue'} = {};
150 $self->{'list_mvpair'} = {};
151 $self->{'reclassify'} = {};
152 $self->{'reclassifylist'} = {};
153
154 $self->{'buttonname'} = $self->generate_title_from_metadata($self->{'metadata'}) unless ($self->{'buttonname'});
155
156 if (defined($self->{"removeprefix"}) && $self->{"removeprefix"}) {
157 $self->{"removeprefix"} =~ s/^\^//; # don't need a leading ^
158 }
159 if (defined($self->{"removesuffix"}) && $self->{"removesuffix"}) {
160 $self->{"removesuffix"} =~ s/\$$//; # don't need a trailing $
161 }
162
163 $self->{'recopt'} = ($self->{'recopt'} == 0) ? undef : "on";
164
165 if (defined $self->{'sort'}) {
166 $self->{'sort'} = $self->strip_ex_from_metadata($self->{'sort'});
167 }
168 # Clean out the unused keys
169 if($self->{"removeprefix"} eq "") {delete $self->{"removeprefix"};}
170 if($self->{"removesuffix"} eq "") {delete $self->{"removesuffix"};}
171
172 return bless $self, $class;
173}
174
175sub init
176{
177 my $self = shift (@_);
178
179 $self->{'list'} = {};
180 $self->{'listmetavalue'} = {};
181 $self->{'list_mvpair'} = {};
182 $self->{'reclassify'} = {};
183 $self->{'reclassifylist'} = {};
184}
185
186my $tmp = 0;
187
188sub classify
189{
190 my $self = shift (@_);
191 my ($doc_obj) = @_;
192
193 my $doc_OID = $doc_obj->get_OID();
194 my $outhandle = $self->{'outhandle'};
195
196 my @sectionlist = ();
197 my $topsection = $doc_obj->get_top_section();
198 my $metaname = $self->{'metadata'};
199
200 $metaname =~ s/(\/|\|).*//; # grab first name in n1/n2/n3 or n1|n2|n3 list
201 my @commameta_list = split(/,|;/, $metaname);
202
203 if ($self->{'doclevel'} =~ /^top(level)?/i)
204 {
205 push(@sectionlist,$topsection);
206 }
207 elsif ($self->{'doclevel'} =~ /^first(level)?/i)
208 {
209 my $toplevel_children = $doc_obj->get_children($topsection);
210 push(@sectionlist,@$toplevel_children);
211 }
212 else # (all)?section(s)?
213 {
214 my $thissection = $doc_obj->get_next_section($topsection);
215 while (defined $thissection)
216 {
217 push(@sectionlist,$thissection);
218 $thissection = $doc_obj->get_next_section ($thissection);
219 }
220 }
221
222 my $thissection;
223 foreach $thissection (@sectionlist)
224 {
225 my $full_doc_OID
226 = ($thissection ne "") ? "$doc_OID.$thissection" : $doc_OID;
227
228 if (defined $self->{'list_mvpair'}->{$full_doc_OID})
229 {
230 print $outhandle "WARNING: AZCompactList::classify called multiple times for $full_doc_OID\n";
231 }
232 $self->{'list'}->{$full_doc_OID} = [];
233 $self->{'listmetavalue'}->{$full_doc_OID} = [];
234 $self->{'list_mvpair'}->{$full_doc_OID} = [];
235
236 my $metavalues = [];
237 foreach my $cmn (@commameta_list) {
238 my $cmvalues = $doc_obj->get_metadata($thissection,$cmn);
239 push(@$metavalues,@$cmvalues) if (@{$cmvalues});
240 last if (@{$cmvalues} && !$self->{'allvalues'});
241 }
242
243 my $metavalue;
244 foreach $metavalue (@$metavalues)
245 {
246 # Tidy up use of white space in metavalue for classifying
247 $metavalue =~ s/^\s*//s;
248 $metavalue =~ s/\s*$//s;
249 $metavalue =~ s/\n/ /s;
250 $metavalue =~ s/\s{2,}/ /s;
251
252 # if this document doesn't contain the metadata element we're
253 # sorting by we won't include it in this classification
254 if (defined $metavalue && $metavalue =~ /\w/)
255 {
256 if (defined($self->{'removeprefix'}) &&
257 length($self->{'removeprefix'})) {
258 $metavalue =~ s/^$self->{'removeprefix'}//;
259
260 # check that it's not now empty
261 if (!$metavalue) {next;}
262 }
263
264 if (defined($self->{'removesuffix'}) &&
265 length($self->{'removesuffix'})) {
266 $metavalue =~ s/$self->{'removesuffix'}$//;
267
268 # check that it's not now empty
269 if (!$metavalue) {next;}
270 }
271
272 my $formatted_metavalue;
273 if ($self->{'no_metadata_formatting'}) {
274 $formatted_metavalue = $metavalue;
275 } else {
276 $formatted_metavalue = &sorttools::format_metadata_for_sorting($self->{'metadata'}, $metavalue, $doc_obj);
277 }
278
279 #### prefix-str
280 if (! defined($formatted_metavalue)) {
281 print $outhandle "Warning: AZCompactList: metavalue is ";
282 print $outhandle "empty\n";
283 $formatted_metavalue="";
284 }
285
286 my $mv_pair = { 'mv' => $metavalue, 'fmv' => $formatted_metavalue };
287 push(@{$self->{'list'}->{$full_doc_OID}},$formatted_metavalue);
288 push(@{$self->{'listmetavalue'}->{$full_doc_OID}} ,$metavalue);
289 push(@{$self->{'list_mvpair'}->{$full_doc_OID}},$mv_pair);
290
291
292 last if ($self->{'firstvalueonly'});
293 }
294 }
295
296 # This is used in reclassify below for AZCompactSectionList
297 my $sortmeta = $doc_obj->get_metadata_element($thissection, $self->{'sort'});
298 $self->{'reclassify'}->{$full_doc_OID} = [$doc_obj,$sortmeta];
299 }
300}
301
302sub reinit
303{
304 my ($self,$classlist_ref) = @_;
305 my $outhandle = $self->{'outhandle'};
306
307 my %mtfreq = ();
308 my @single_classlist = ();
309 my @multiple_classlist = ();
310
311 # find out how often each metavalue occurs
312 map
313 {
314 foreach my $mvp (@{$self->{'list_mvpair'}->{$_}} )
315 {
316### print STDERR "*** plain mv = $mvp->{'mv'}\n";
317### print STDERR "*** format mv = $mvp->{'fmv'}\n";
318
319 my $metavalue = $mvp->{'mv'};
320 $metavalue =~ s!^-!\\-!; # in case it starts with "-"
321 $mtfreq{$metavalue}++;
322 }
323 } @$classlist_ref;
324
325 # use this information to split the list: single metavalue/repeated value
326 map
327 {
328 my $i = 1;
329 my $metavalue;
330 foreach my $mvp (@{$self->{'list_mvpair'}->{$_}})
331 {
332 my $metavalue = $mvp->{'mv'};
333 $metavalue =~ s!^-!\\-!; # in case it starts with "-"
334 my $cs_metavalue = $mvp->{'mv'}; # case sensitive
335 if ($mtfreq{$metavalue} >= $self->{'mingroup'})
336 {
337### print STDERR "*** pushing on $cs_metavalue\n";
338 push(@multiple_classlist,[$_,$i,$metavalue]);
339 }
340 else
341 {
342 push(@single_classlist,[$_,$cs_metavalue]);
343 $metavalue =~ tr/[A-Z]/[a-z]/;
344 $self->{'reclassifylist'}->{"Metavalue_$i.$_"} = $metavalue;
345 }
346 $i++;
347 }
348 } @$classlist_ref;
349
350
351 # Setup sub-classifiers for multiple list
352
353 $self->{'classifiers'} = {};
354
355 my $pm;
356 foreach $pm ("SimpleList", "SectionList")
357 {
358 my $listname
359 = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"perllib/classify/$pm.pm");
360 if (-e $listname) { require $listname; }
361 else
362 {
363 print $outhandle "AZCompactList ERROR - couldn't find classifier \"$listname\"\n";
364 die "\n";
365 }
366 }
367
368 # Create classifiers objects for each entry >= mingroup
369 my $metavalue;
370 my $doclevel = $self->{'doclevel'};
371 my $mingroup = $self->{'mingroup'};
372 my @metaname_list = split(/\/|\|/,$self->{'metadata'});
373 my $metaname = shift(@metaname_list);
374 my $hierarchical = 0;
375 if (scalar(@metaname_list) > 1) {
376 $hierarchical = 1;
377 $metaname = join('/',@metaname_list);
378 }
379 foreach $metavalue (sort keys %mtfreq)
380 {
381 if ($mtfreq{$metavalue} >= $mingroup)
382 {
383 # occurs more often than minimum required to compact into a group
384 my $listclassobj;
385
386 if (!$hierarchical)
387 {
388 my @args;
389 push @args, ("-metadata", "$metaname");
390 # buttonname is also used for the node's title
391 push @args, ("-buttonname", "$metavalue");
392 push @args, ("-sort", $self->{'sort'});
393
394 my $ptArgs = \@args;
395 if ($doclevel =~ m/^top(level)?/i)
396 {
397 eval ("\$listclassobj = new SimpleList([],\$ptArgs)");
398 }
399 else
400 {
401 # first(level)? or (all)?section(s)?
402 eval ("\$listclassobj = new SectionList([],\$ptArgs)");
403 }
404 }
405 else
406 {
407
408 my @args;
409 push @args, ("-metadata", "$metaname");
410 # buttonname is also used for the node's title
411 push @args, ("-buttonname", "$metavalue");
412 push @args, ("-sort", $self->{'sort'});
413
414 if (defined $self->{'removeprefix'}) {
415 push @args, ("-removeprefix", $self->{'removeprefix'});
416 }
417 if (defined $self->{'removesuffix'}) {
418 push @args, ("-removesuffix", $self->{'removesuffix'});
419 }
420
421 push @args, ("-doclevel", "$doclevel");
422 push @args, ("-mingroup", $mingroup);
423 push @args, "-recopt ";
424
425 my $ptArgs = \@args;
426 eval ("\$listclassobj = new AZCompactList([],\$ptArgs)");
427 }
428
429 if ($@) {
430 print $outhandle "$@";
431 die "\n";
432 }
433
434 $listclassobj->init();
435
436 if (defined $metavalue && $metavalue =~ /\w/)
437 {
438 my $formatted_node = $metavalue;
439
440 if (defined($self->{'removeprefix'}) &&
441 length($self->{'removeprefix'})) {
442 $formatted_node =~ s/^$self->{'removeprefix'}//;
443 # check that it's not now empty
444 if (!$formatted_node) {next;}
445 }
446 if (defined($self->{'removesuffix'}) &&
447 length($self->{'removesuffix'})) {
448 $formatted_node =~ s/$self->{'removesuffix'}$//;
449 # check that it's not now empty
450 if (!$formatted_node) {next;}
451 }
452
453 $formatted_node = &sorttools::format_metadata_for_sorting($self->{'metadata'}, $formatted_node) unless $self->{'no_metadata_formatting'};
454
455 # In case our formatted string is empty...
456 if (! defined($formatted_node)) {
457 print $outhandle "Warning: AZCompactList: metavalue is ";
458 print $outhandle "empty\n";
459 $formatted_node="";
460 }
461
462 # use the lower case, for speed of lookup.
463 my $meta_lc=lc($metavalue);
464 $self->{'classifiers'}->{$meta_lc}
465 = { 'classifyobj' => $listclassobj,
466 'formattednode' => $formatted_node };
467 }
468 }
469 }
470
471
472 return (\@single_classlist,\@multiple_classlist);
473}
474
475
476sub reclassify
477{
478 my ($self,$multiple_cl_ref) = @_;
479
480 # Entries in the current classify list that are "book nodes"
481 # should be recursively classified.
482 #--
483 foreach my $dm_pair (@$multiple_cl_ref)
484 {
485 my ($doc_OID,$mdoffset,$metavalue,$cs_metavalue) = @$dm_pair;
486 my $listclassobj;
487
488 # find metavalue in list of sub-classifiers
489 # check if we have a key (lower case) for this metadata value
490 my $node_name=lc($metavalue);
491 if (exists $self->{'classifiers'}->{$node_name})
492 {
493 my ($doc_obj, $sortmeta) = @{$self->{'reclassify'}->{$doc_OID}};
494
495 # record the metadata value offset temporarily, so eg AZList can
496 # get the correct metadata value (for multi-valued metadata fields)
497 $doc_obj->{'mdoffset'}=$mdoffset;
498
499 if ($doc_OID =~ m/^[^\.]*\.([\d\.]+)$/)
500 {
501 my $section=$1;
502 if ($self->{'doclevel'} =~ m/^top(level)?/i) { # toplevel
503 $self->{'classifiers'}->{$node_name}->{'classifyobj'}
504 ->classify($doc_obj,"Section=$section");
505 } else {
506 # first(level)? or (all)?section(s)?
507
508 # classify() can't handle multi-level section, so use
509 # classify_section()
510 # ... thanks to Don Gourley for this...
511
512 $self->{'classifiers'}->{$node_name}->{'classifyobj'}
513 ->classify_section($section, $doc_obj, $sortmeta);
514 }
515 }
516 else
517 {
518 $self->{'classifiers'}->{$node_name}->{'classifyobj'}
519 ->classify($doc_obj);
520 }
521 } else { # this key is not in the hash
522 my $outhandle=$self->{outhandle};
523 print $outhandle "Warning: AZCompactList::reclassify ";
524 print $outhandle "could not find sub-node for metadata=`$metavalue' with doc_OID $doc_OID\n";
525 }
526 }
527}
528
529
530
531sub get_reclassify_info
532{
533 my $self = shift (@_);
534
535 my $node_name;
536 foreach $node_name (keys %{$self->{'classifiers'}})
537 {
538 my $classifyinfo
539 = $self->{'classifiers'}->{$node_name}->{'classifyobj'}
540 ->get_classify_info();
541 $self->{'classifiers'}->{$node_name}->{'classifyinfo'}
542 = $classifyinfo;
543 $self->{'reclassifylist'}->{"CLASSIFY.$node_name"}
544 = $self->{'classifiers'}->{$node_name}->{'formattednode'};
545 }
546}
547
548
549sub alpha_numeric_cmp
550{
551 my ($self,$a,$b) = @_;
552
553 my $title_a = $self->{'reclassifylist'}->{$a};
554 my $title_b = $self->{'reclassifylist'}->{$b};
555
556 if ($title_a =~ m/^(\d+(\.\d+)?)/)
557 {
558 my $val_a = $1;
559 if ($title_b =~ m/^(\d+(\.\d+)?)/)
560 {
561 my $val_b = $1;
562 if ($val_a != $val_b)
563 {
564 return ($val_a <=> $val_b);
565 }
566 }
567 }
568
569 return ($title_a cmp $title_b);
570}
571
572sub frequency_cmp
573{
574 my ($self,$a,$b) = @_;
575
576
577 my $title_a = $self->{'reclassifylist'}->{$a};
578 my $title_b = $self->{'reclassifylist'}->{$b};
579
580 my $a_freq = 1;
581 my $b_freq = 1;
582
583 if ($a =~ m/^CLASSIFY\.(.*)$/)
584 {
585 my $a_node = $1;
586 my $a_nodeinfo = $self->{'classifiers'}->{$a_node}->{'classifyinfo'};
587 $a_freq = scalar(@{$a_nodeinfo->{'contains'}});
588 }
589
590 if ($b =~ m/^CLASSIFY\.(.*)$/)
591 {
592 my $b_node = $1;
593 my $b_nodeinfo = $self->{'classifiers'}->{$b_node}->{'classifyinfo'};
594 $b_freq = scalar(@{$b_nodeinfo->{'contains'}});
595 }
596
597 return $b_freq <=> $a_freq;
598}
599
600sub get_classify_info {
601 my $self = shift (@_);
602
603 my @classlist =keys %{$self->{'list_mvpair'}}; # list all doc oids
604
605 my ($single_cl_ref,$multiple_cl_ref) = $self->reinit(\@classlist);
606 $self->reclassify($multiple_cl_ref);
607 $self->get_reclassify_info();
608
609 my @reclassified_classlist;
610 if ($self->{'freqsort'})
611 {
612 @reclassified_classlist
613 = sort { $self->frequency_cmp($a,$b) } keys %{$self->{'reclassifylist'}};
614 # supress sub-grouping by alphabet
615 map { $self->{'reclassifylist'}->{$_} = "A".$self->{'reclassifylist'}; } keys %{$self->{'reclassifylist'}};
616 }
617 else
618 {
619# @reclassified_classlist
620# = sort {$self->{'reclassifylist'}->{$a} cmp $self->{'reclassifylist'}->{$b};} keys %{$self->{'reclassifylist'}};
621
622 # alpha_numeric_cmp is slower than "cmp" but handles numbers better ...
623
624 @reclassified_classlist
625 = sort { $self->alpha_numeric_cmp($a,$b) } keys %{$self->{'reclassifylist'}};
626
627 }
628
629 return $self->splitlist (\@reclassified_classlist);
630}
631
632sub get_entry {
633 my $self = shift (@_);
634 my ($title, $childtype, $metaname, $thistype) = @_;
635 # organise into classification structure
636 my %classifyinfo = ('childtype'=>$childtype,
637 'Title'=>$title,
638 'contains'=>[],
639 'mdtype'=>$metaname);
640
641 $classifyinfo{'thistype'} = $thistype
642 if defined $thistype && $thistype =~ /\w/;
643
644 return \%classifyinfo;
645}
646
647
648
649# splitlist takes an ordered list of classifications (@$classlistref) and
650# splits it up into alphabetical sub-sections.
651sub splitlist {
652 my $self = shift (@_);
653 my ($classlistref) = @_;
654 my $classhash = {};
655
656 # top level
657 my @metanames = split(/\/|\|/,$self->{'metadata'});
658 my $metaname = shift(@metanames);
659
660 my $childtype = "HList";
661 $childtype = "VList" if (scalar (@$classlistref) <= $self->{'minnesting'});
662
663 my $title = $self->{'buttonname'}; # should always be defined by now.
664 my $classifyinfo;
665 if (!defined($self->{'recopt'}))
666 {
667 $classifyinfo
668 = $self->get_entry ($title, $childtype, $metaname, "Invisible");
669 }
670 else
671 {
672 $classifyinfo
673 = $self->get_entry ($title, $childtype, $metaname, "VList");
674 }
675
676 # don't need to do any splitting if there are less than 'minnesting' classifications
677 if ((scalar @$classlistref) <= $self->{'minnesting'}) {
678 foreach my $subOID (@$classlistref) {
679 if ($subOID =~ /^CLASSIFY\.(.*)$/
680 && defined $self->{'classifiers'}->{$1})
681 {
682 push (@{$classifyinfo->{'contains'}},
683 $self->{'classifiers'}->{$1}->{'classifyinfo'});
684 }
685 else
686 {
687 $subOID =~ s/^Metavalue_(\d+)\.//;
688 my $metaname_offset = $1 -1;
689 my $oid_rec = {'OID'=>$subOID, 'offset'=>$metaname_offset};
690 push (@{$classifyinfo->{'contains'}}, $oid_rec);
691 }
692 }
693 return $classifyinfo;
694 }
695
696 # first split up the list into separate A-Z and 0-9 classifications
697 foreach my $classification (@$classlistref) {
698 my $title = $self->{'reclassifylist'}->{$classification};
699 $title =~ s/&(.){2,4};//g; # remove any HTML special chars
700 $title =~ s/^(\W|_)+//g; # remove leading non-word chars
701
702 # only want first character for classification
703 $title =~ m/^(.)/; # always a char, or first byte of wide char?
704 if (defined($1) && $1 ne "") {
705 $title=$1;
706
707 # remove any accents on initial character by mapping to Unicode's
708 # normalized decomposed form (accents follow initial letter)
709 # and then pick off the initial letter
710 my $title_decomposed = NFD($title);
711 $title = substr($title_decomposed,0,1);
712 } else {
713 print STDERR "no first character found for \"$title\" - \"" .
714 $self->{'reclassifylist'}->{$classification} . "\"\n";
715 }
716 $title =~ tr/[a-z]/[A-Z]/;
717
718 if ($title =~ /^[0-9]$/) {$title = '0-9';}
719 elsif ($title !~ /^[A-Z]$/) {
720 my $outhandle = $self->{'outhandle'};
721 print $outhandle "AZCompactList: WARNING $classification has badly formatted title ($title)\n";
722 }
723 $classhash->{$title} = [] unless defined $classhash->{$title};
724 push (@{$classhash->{$title}}, $classification);
725 }
726 $classhash = $self->compactlist ($classhash);
727
728 my @tmparr = ();
729 foreach my $subsection (sort keys (%$classhash)) {
730 push (@tmparr, $subsection);
731 }
732
733 # if there's a 0-9 section it will have been sorted to the beginning
734 # but we want it at the end
735 if ($tmparr[0] eq '0-9') {
736 shift @tmparr;
737 push (@tmparr, '0-9');
738 }
739 foreach my $subclass (@tmparr)
740 {
741 my $tempclassify
742 = (scalar(@tmparr)==1)
743 ? ($self->get_entry(" ", "VList", $metaname))
744 : ($self->get_entry($subclass, "VList", $metaname));
745
746
747 foreach my $subsubOID (@{$classhash->{$subclass}})
748 {
749 if ($subsubOID =~ /^CLASSIFY\.(.*)$/
750 && defined $self->{'classifiers'}->{$1})
751 {
752 # this is a "bookshelf" node... >1 entry compacted
753 push (@{$tempclassify->{'contains'}},
754 $self->{'classifiers'}->{$1}->{'classifyinfo'});
755 # set the metadata field name, for mdoffset to work
756 $self->{'classifiers'}->{$1}->{'classifyinfo'}->{'mdtype'}=
757 $metaname;
758 }
759 else
760 {
761 $subsubOID =~ s/^Metavalue_(\d+)\.//;
762 # record the offset if this metadata type has multiple values
763 my $metaname_offset = $1 -1;
764 my $oid_rec = {'OID'=>$subsubOID, 'offset'=>$metaname_offset};
765 push (@{$tempclassify->{'contains'}}, $oid_rec);
766 }
767 }
768 push (@{$classifyinfo->{'contains'}}, $tempclassify);
769 }
770
771 return $classifyinfo;
772}
773
774sub compactlist {
775 my $self = shift (@_);
776 my ($classhashref) = @_;
777 my $compactedhash = {};
778 my @currentOIDs = ();
779 my $currentfirstletter = "";
780 my $currentlastletter = "";
781 my $lastkey = "";
782
783 # minimum and maximum documents to be displayed per page.
784 # the actual maximum will be max + (min-1).
785 # the smallest sub-section is a single letter at present
786 # so in this case there may be many times max documents
787 # displayed on a page.
788 my $min = $self->{'mincompact'};
789 my $max = $self->{'maxcompact'};
790
791 foreach my $subsection (sort keys %$classhashref) {
792 if ($subsection eq '0-9') {
793 @{$compactedhash->{$subsection}} = @{$classhashref->{$subsection}};
794 next;
795 }
796 $currentfirstletter = $subsection if $currentfirstletter eq "";
797 if ((scalar (@currentOIDs) < $min) ||
798 ((scalar (@currentOIDs) + scalar (@{$classhashref->{$subsection}})) <= $max)) {
799 push (@currentOIDs, @{$classhashref->{$subsection}});
800 $currentlastletter = $subsection;
801 } else {
802
803 if ($currentfirstletter eq $currentlastletter) {
804 @{$compactedhash->{$currentfirstletter}} = @currentOIDs;
805 $lastkey = $currentfirstletter;
806 } else {
807 @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} = @currentOIDs;
808 $lastkey = "$currentfirstletter-$currentlastletter";
809 }
810 if (scalar (@{$classhashref->{$subsection}}) >= $max) {
811 $compactedhash->{$subsection} = $classhashref->{$subsection};
812 @currentOIDs = ();
813 $currentfirstletter = "";
814 $lastkey=$subsection;
815 } else {
816 @currentOIDs = @{$classhashref->{$subsection}};
817 $currentfirstletter = $subsection;
818 $currentlastletter = $subsection;
819 }
820 }
821 }
822
823 # add final OIDs to last sub-classification if there aren't many otherwise
824 # add final sub-classification
825
826 # don't add if there aren't any oids
827 if (! scalar (@currentOIDs)) {return $compactedhash;}
828
829 if (scalar (@currentOIDs) < $min) {
830 my ($newkey) = $lastkey =~ /^(.)/;
831 @currentOIDs = (@{$compactedhash->{$lastkey}}, @currentOIDs);
832 delete $compactedhash->{$lastkey};
833 @{$compactedhash->{"$newkey-$currentlastletter"}} = @currentOIDs;
834 } else {
835 if ($currentfirstletter eq $currentlastletter) {
836 @{$compactedhash->{$currentfirstletter}} = @currentOIDs;
837 } else {
838 @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} = @currentOIDs;
839 }
840 }
841
842 return $compactedhash;
843}
844
8451;
846
847
Note: See TracBrowser for help on using the repository browser.