source: main/trunk/greenstone2/perllib/classify/AZList.pm@ 32594

Last change on this file since 32594 was 23116, checked in by kjdon, 14 years ago

for incremental build, classifiers are not really done incrementally. Previously, we reconstructed all the docs from the database, and classified them, then processed any new/edited/deleted docs, updating the classifier as necessary. Now, we process all new/updated docs, then reconstruct the docs from the database, but only classify those not changed/deleted. This means that we are only ever adding docs to a classifier, never updating or deleting. I have removed edit_mode and all code handling deleting stuff from the classifier.

  • Property svn:keywords set to Author Date Id Revision
File size: 10.9 KB
Line 
1###########################################################################
2#
3# AZList.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# classifier plugin for sorting alphabetically
27
28package AZList;
29
30use BaseClassifier;
31use sorttools;
32
33use strict;
34no strict 'refs'; # allow filehandles to be variables and viceversa
35
36sub BEGIN {
37 @AZList::ISA = ('BaseClassifier');
38}
39
40
41my $arguments =
42 [ { 'name' => "metadata",
43 'desc' => "{AZList.metadata}",
44 'type' => "metadata",
45 'reqd' => "yes" } ,
46 { 'name' => "removeprefix",
47 'desc' => "{BasClas.removeprefix}",
48 'type' => "regexp",
49 'deft' => "",
50 'reqd' => "no" } ,
51 { 'name' => "removesuffix",
52 'desc' => "{BasClas.removesuffix}",
53 'type' => "regexp",
54 'deft' => "",
55 'reqd' => "no" }
56 ];
57
58my $options = { 'name' => "AZList",
59 'desc' => "{AZList.desc}",
60 'abstract' => "no",
61 'inherits' => "yes",
62 'args' => $arguments };
63
64
65sub new {
66 my ($class) = shift (@_);
67 my ($classifierslist,$inputargs,$hashArgOptLists) = @_;
68 push(@$classifierslist, $class);
69
70 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
71 push(@{$hashArgOptLists->{"OptList"}},$options);
72
73 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
74
75 if ($self->{'info_only'}) {
76 # don't worry about any options etc
77 return bless $self, $class;
78 }
79
80 if (!$self->{"metadata"}) {
81 print STDERR "AZList Error: required option -metadata not supplied \n";
82 $self->print_txt_usage(""); # Use default resource bundle
83
84 die "AZList Error: required option -metadata not supplied\n";
85 }
86
87 # Manually set $self parameters.
88 $self->{'list'} = {};
89
90 # Transfer value from Auto Parsing to the variable name that used in previous GreenStone.
91 my $metadata = $self->{"metadata"};
92 $metadata = $self->strip_ex_from_metadata($metadata);
93 my @meta_list = split(/,/, $metadata);
94 $self->{'meta_list'} = \@meta_list;
95
96 $self->{'buttonname'} = $self->generate_title_from_metadata($metadata) unless ($self->{'buttonname'});
97
98 # Further setup
99 if (defined($self->{"removeprefix"}) && $self->{"removeprefix"}) {
100 $self->{"removeprefix"} =~ s/^\^//; # don't need a leading ^
101 }
102 if (defined($self->{"removesuffix"}) && $self->{"removesuffix"}) {
103 $self->{"removesuffix"} =~ s/\$$//; # don't need a trailing $
104 }
105
106 # Clean out the unused keys
107 delete $self->{"metadata"}; # Delete this key
108
109 if($self->{"removeprefix"} eq "") {delete $self->{"removeprefix"};}
110 if($self->{"removesuffix"} eq "") {delete $self->{"removesuffix"};}
111
112 return bless $self, $class;
113}
114
115sub init {
116 my $self = shift (@_);
117
118 $self->{'list'} = {};
119}
120
121sub classify {
122 my $self = shift (@_);
123 my ($doc_obj) = @_;
124
125 my $doc_OID = $doc_obj->get_OID();
126 my $outhandle = $self->{'outhandle'};
127
128 my $metavalue;
129 my $metaname;
130 # should we extend this to use all available metadata not just the first?
131 if (!defined $self->{'meta_list'}) {
132 # just in case
133 return;
134 }
135
136 # find the first available metadata
137 foreach my $m (@{$self->{'meta_list'}}) {
138 $metavalue = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $m);
139 $metaname = $m;
140 last if defined $metavalue;
141 }
142
143 #if we haven't found a metavalue here, then the doc shouldn't be included
144 if (!defined $metavalue || $metavalue eq "") {
145 print $outhandle "WARNING: AZList: $doc_OID metadata is empty - not classifying\n";
146 return;
147 }
148
149 if (defined($self->{'removeprefix'}) &&
150 length($self->{'removeprefix'})) {
151 $metavalue =~ s/^$self->{'removeprefix'}//;
152 }
153 if (defined($self->{'removesuffix'}) &&
154 length($self->{'removesuffix'})) {
155 $metavalue =~ s/$self->{'removesuffix'}$//;
156 }
157
158
159 $metavalue = &sorttools::format_metadata_for_sorting($metaname, $metavalue, $doc_obj) unless $self->{'no_metadata_formatting'};
160
161 if (defined $self->{'list'}->{$doc_OID}) {
162 print $outhandle "WARNING: AZList::classify called multiple times for $doc_OID\n";
163 }
164 if ($metavalue) {
165 $self->{'list'}->{$doc_OID} = $metavalue;
166 } else {
167 # the formatting has made it empty
168 my $outhandle = $self->{'outhandle'};
169 print $outhandle "WARNING: AZList: $doc_OID metadata has become empty - not classifying\n";
170 }
171
172}
173
174sub alpha_numeric_cmp
175{
176 my ($self,$a,$b) = @_;
177
178 my $title_a = $self->{'list'}->{$a};
179 my $title_b = $self->{'list'}->{$b};
180
181 if ($title_a =~ m/^(\d+(\.\d+)?)/)
182 {
183 my $val_a = $1;
184 if ($title_b =~ m/^(\d+(\.\d+)?)/)
185 {
186 my $val_b = $1;
187 if ($val_a != $val_b)
188 {
189 return ($val_a <=> $val_b);
190 }
191 }
192 }
193
194 return ($title_a cmp $title_b);
195}
196
197sub get_classify_info {
198 my $self = shift (@_);
199
200 my @classlist
201 = sort { $self->alpha_numeric_cmp($a,$b) } keys %{$self->{'list'}};
202
203 return $self->splitlist (\@classlist);
204}
205
206sub get_entry {
207 my $self = shift (@_);
208 my ($title, $childtype, $thistype) = @_;
209
210 # organise into classification structure
211 my %classifyinfo = ('childtype'=>$childtype,
212 'Title'=>$title,
213 'contains'=>[]);
214 $classifyinfo{'thistype'} = $thistype
215 if defined $thistype && $thistype =~ /\w/;
216
217 return \%classifyinfo;
218}
219
220# splitlist takes an ordered list of classifications (@$classlistref) and splits it
221# up into alphabetical sub-sections.
222sub splitlist {
223 my $self = shift (@_);
224 my ($classlistref) = @_;
225 my $classhash = {};
226
227 # top level
228 my $childtype = "HList";
229 if (scalar (@$classlistref) <= 39) {$childtype = "VList";}
230 my $classifyinfo = $self->get_entry ($self->{'buttonname'}, $childtype, "Invisible");
231
232 # don't need to do any splitting if there are less than 39 (max + min -1) classifications
233 if ((scalar @$classlistref) <= 39) {
234 foreach my $subOID (@$classlistref) {
235 push (@{$classifyinfo->{'contains'}}, {'OID'=>$subOID});
236 }
237 return $classifyinfo;
238 }
239
240 # first split up the list into separate A-Z and 0-9 classifications
241 foreach my $classification (@$classlistref) {
242 my $title = $self->{'list'}->{$classification};
243
244 $title =~ s/^(&.{1,6};|<[^>]>|[^a-zA-Z0-9])//g; # remove any unwanted stuff
245 # only need first char for classification
246 $title =~ m/^(.)/; $title=$1;
247 $title =~ tr/[a-z]/[A-Z]/;
248 if ($title =~ /^[0-9]$/) {$title = '0-9';}
249 elsif ($title !~ /^[A-Z]$/) {
250 my $outhandle = $self->{'outhandle'};
251 print $outhandle "AZList: WARNING $classification has badly formatted title ($title)\n";
252 }
253 $classhash->{$title} = [] unless defined $classhash->{$title};
254 push (@{$classhash->{$title}}, $classification);
255 }
256 $classhash = $self->compactlist ($classhash);
257
258 my @tmparr = ();
259 foreach my $subsection (sort keys (%$classhash)) {
260 push (@tmparr, $subsection);
261 }
262 #if there is only one entry here, we suppress the buckets
263 if ((scalar @tmparr) == 1) {
264 $classifyinfo->{'childtype'} = "VList";
265 foreach my $OID (@{$classhash->{$tmparr[0]}}) {
266 push (@{$classifyinfo->{'contains'}}, {'OID'=>$OID});
267 }
268 return $classifyinfo;
269 }
270
271 # if there's a 0-9 section it will have been sorted to the beginning
272 # but we want it at the end
273 if ($tmparr[0] eq '0-9') {
274 shift @tmparr;
275 push (@tmparr, '0-9');
276 }
277
278 foreach my $subclass (@tmparr) {
279 my $tempclassify = $self->get_entry($subclass, "VList");
280 foreach my $subsubOID (@{$classhash->{$subclass}}) {
281 push (@{$tempclassify->{'contains'}}, {'OID'=>$subsubOID});
282 }
283 push (@{$classifyinfo->{'contains'}}, $tempclassify);
284 }
285
286 return $classifyinfo;
287}
288
289sub compactlist {
290 my $self = shift (@_);
291 my ($classhashref) = @_;
292 my $compactedhash = {};
293 my @currentOIDs = ();
294 my $currentfirstletter = ""; # start of working bin
295 my $currentlastletter = ""; # end of working bin
296 my $lastkey = ""; # the name of the last completed key
297
298 # minimum and maximum documents to be displayed per page.
299 # the actual maximum will be max + (min-1).
300 # the smallest sub-section is a single letter at present
301 # so in this case there may be many times max documents
302 # displayed on a page.
303 my $min = 10;
304 my $max = 30;
305
306 foreach my $subsection (sort keys %$classhashref) {
307 if ($subsection eq '0-9') {
308 # leave this bin as-is... copy it straight across
309 @{$compactedhash->{$subsection}} = @{$classhashref->{$subsection}};
310 next;
311 }
312 $currentfirstletter = $subsection if $currentfirstletter eq "";
313 if ((scalar (@currentOIDs) < $min) ||
314 ((scalar (@currentOIDs) + scalar (@{$classhashref->{$subsection}})) <= $max)) {
315 # add this letter to the bin and continue
316 push (@currentOIDs, @{$classhashref->{$subsection}});
317 $currentlastletter = $subsection;
318 } else {
319 # too many or too few for a separate bin
320 if ($currentfirstletter eq $currentlastletter) {
321 @{$compactedhash->{$currentfirstletter}} = @currentOIDs;
322 $lastkey = $currentfirstletter;
323 } else {
324 @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} = @currentOIDs;
325 $lastkey = "$currentfirstletter-$currentlastletter";
326 }
327 if (scalar (@{$classhashref->{$subsection}}) >= $max) {
328 # this key is now complete. Start a new one
329 $compactedhash->{$subsection} = $classhashref->{$subsection};
330 @currentOIDs = ();
331 $currentfirstletter = "";
332 $lastkey = $subsection;
333 } else {
334 @currentOIDs = @{$classhashref->{$subsection}};
335 $currentfirstletter = $subsection;
336 $currentlastletter = $subsection;
337 }
338 }
339 }
340
341 # add final OIDs to last sub-classification if there aren't many otherwise
342 # add final sub-classification
343 # BUG FIX: don't add anything if there are no currentOIDs (thanks to Don Gourley)
344 if (! scalar(@currentOIDs)) {return $compactedhash;}
345
346 if (scalar (@currentOIDs) < $min) {
347 my ($newkey) = $lastkey =~ /^(.)/;
348 @currentOIDs = (@{$compactedhash->{$lastkey}}, @currentOIDs);
349 delete $compactedhash->{$lastkey};
350 @{$compactedhash->{"$newkey-$currentlastletter"}} = @currentOIDs;
351 } else {
352 if ($currentfirstletter eq $currentlastletter) {
353 @{$compactedhash->{$currentfirstletter}} = @currentOIDs;
354 }
355 else {
356 @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} =
357 @currentOIDs;
358 }
359 }
360
361 return $compactedhash;
362}
363
3641;
Note: See TracBrowser for help on using the repository browser.