source: trunk/gsdl/perllib/classify/AZList.pm@ 256

Last change on this file since 256 was 256, checked in by sjboddie, 25 years ago

renamed AZClassify AZList

  • Property svn:keywords set to Author Date Id Revision
File size: 7.1 KB
Line 
1# classifier plugin for sorting alphabetically
2
3package AZList;
4
5sub new {
6 my ($class, @options) = @_;
7
8 if (!defined @options || !scalar @options) {
9 die "AZList used with no metadata name to classify by\n";
10 }
11
12 return bless {
13 'OID'=>"NULL",
14 'list'=>{},
15 'metaname' => $options[0]
16 }, $class;
17}
18
19sub init {
20 my $self = shift (@_);
21
22 $self->{'list'} = {};
23}
24
25sub set_OID {
26 my $self = shift (@_);
27 my ($OID) = @_;
28
29 $self->{'OID'} = $OID;
30}
31
32sub get_OID {
33 my $self = shift (@_);
34 return $self->{'OID'};
35}
36
37sub classify {
38 my $self = shift (@_);
39 my ($doc_obj) = @_;
40
41 my $doc_OID = $doc_obj->get_OID();
42 my $metavalue = $doc_obj->get_metadata_element ($doc_obj->get_top_section(),
43 $self->{'metaname'});
44
45 # if this document doesn't contain the metadata element we're
46 # sorting by we won't include it in this classification
47
48 if (defined $metavalue) {
49 if ($self->{'metaname'} eq 'Creator') {
50 $self->format_string_name_english (\$metavalue);
51 } else {
52 $self->format_string_english (\$metavalue);
53 }
54 if (defined $self->{'list'}->{$doc_OID}) {
55 print STDERR "WARNING: AZList::classify called multiple times for $doc_OID\n";
56 }
57 $self->{'list'}->{$doc_OID} = $metavalue;
58 }
59}
60
61sub output_classify_info {
62 my $self = shift (@_);
63 my ($handle) = @_;
64
65 my @classlist = sort {$self->{'list'}->{$a} cmp $self->{'list'}->{$b};} keys %{$self->{'list'}};
66 my $subclasses = $self->splitlist (\@classlist);
67
68 if (scalar @$subclasses) {
69
70 my @contents = ();
71 map { push (@contents, "\".$_->[1]"); } @$subclasses;
72 # write out top level entry
73 $self->output_entry ($handle, $self->{'OID'}, $self->{'metaname'}, \@contents, "AZList");
74
75 # write out the entries for each sub-classification
76 map {$self->output_entry ($handle, "$self->{'OID'}.$_->[1]", $_->[0], $_->[2]);} @$subclasses;
77
78 } else {
79
80 # write out top level entry
81 $self->output_entry ($handle, $self->{'OID'}, $self->{'metaname'}, \@classlist, "AZList");
82 }
83}
84
85# splitlist takes an ordered list of classifications (@$classlistref) and splits it
86# up into alphabetical sub-sections. returns a list reference each entry of which is
87# in the form ["sub-section title", "sub-section number", [contents list]]
88# note that the title of each classification in @$classlistref is obtained from
89# $self->{'list'}->{$classlistref->[n]}
90sub splitlist {
91 my $self = shift (@_);
92 my ($classlistref) = @_;
93 my $classhash = {};
94 my $rarr = [];
95
96 # don't need to do any splitting if there are less than 20 classifications
97 return [] if ((scalar @$classlistref) <= 20);
98
99 # first split up the list into separate A-Z and 0-9 classifications
100 foreach $classification (@$classlistref) {
101 my $title = $self->{'list'}->{$classification};
102 $title =~ s/^(.).*$/$1/;
103 $title =~ tr/[a-z]/[A-Z]/;
104 if ($title =~ /^[0-9]$/) {$title = '0-9';}
105 elsif ($title !~ /^[A-Z]$/) {
106 print STDERR "AZList: WARNING $classification has badly formatted title ($title)\n";
107 }
108 $classhash->{$title} = [] unless defined $classhash->{$title};
109 push (@{$classhash->{$title}}, $classification);
110 }
111
112 $classhash = $self->compactlist ($classhash);
113
114 my @tmparr = ();
115 foreach $subsection (sort keys (%$classhash)) {
116 push (@tmparr, $subsection);
117 }
118
119 # if there's a 0-9 section it will have been sorted to the beginning
120 # but we want it at the end
121 if ($tmparr[0] eq '0-9') {
122 shift @tmparr;
123 push (@tmparr, '0-9');
124 }
125
126 my $count = 1;
127 foreach $subsection (@tmparr) {
128 push (@$rarr, [$subsection, $count, $classhash->{$subsection}]);
129 $count ++;
130 }
131
132 return $rarr;
133}
134
135sub compactlist {
136 my $self = shift (@_);
137 my ($classhashref) = @_;
138 my $compactedhash = {};
139 my @currentOIDs = ();
140 my $currentfirstletter = "";
141 my $currentlastletter = "";
142 my $lastkey = "";
143
144 # minimum and maximum documents to be displayed per page.
145 # the actual maximum will be max + (min-1).
146 # the smallest sub-section is a single letter at present
147 # so in this case there may be many times max documents
148 # displayed on a page.
149 my $min = 10;
150 my $max = 30;
151
152 foreach $subsection (sort keys %$classhashref) {
153 if ($subsection eq '0-9') {
154 @{$compactedhash->{$subsection}} = @{$classhashref->{$subsection}};
155 next;
156 }
157 $currentfirstletter = $subsection if $currentfirstletter eq "";
158 if ((scalar (@currentOIDs) < $min) ||
159 ((scalar (@currentOIDs) + scalar (@{$classhashref->{$subsection}})) <= $max)) {
160 push (@currentOIDs, @{$classhashref->{$subsection}});
161 $currentlastletter = $subsection;
162 } else {
163
164 if ($currentfirstletter eq $currentlastletter) {
165 @{$compactedhash->{$currentfirstletter}} = @currentOIDs;
166 $lastkey = $currentfirstletter;
167 } else {
168 @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} = @currentOIDs;
169 $lastkey = "$currentfirstletter-$currentlastletter";
170 }
171 if (scalar (@{$classhashref->{$subsection}}) >= $max) {
172 $compactedhash->{$subsection} = $classhashref->{$subsection};
173 @currentOIDs = ();
174 $currentfirstletter = "";
175 } else {
176 @currentOIDs = @{$classhashref->{$subsection}};
177 $currentfirstletter = $subsection;
178 $currentlastletter = $subsection;
179 }
180 }
181 }
182
183 # add final OIDs to last sub-classification if there aren't many otherwise
184 # add final sub-classification
185 if (scalar (@currentOIDs) < $min) {
186 my ($newkey) = $lastkey =~ /^(.)/;
187 @currentOIDs = (@{$compactedhash->{$lastkey}}, @currentOIDs);
188 delete $compactedhash->{$lastkey};
189 @{$compactedhash->{"$newkey-$currentlastletter"}} = @currentOIDs;
190 } else {
191 if ($currentfirstletter eq $currentlastletter) {
192 @{$compactedhash->{$currentfirstletter}} = @currentOIDs;
193 } else {
194 @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} = @currentOIDs;
195 }
196 }
197
198 return $compactedhash;
199}
200
201sub output_entry {
202 my $self = shift (@_);
203 my ($handle, $OID, $title, $contentsref, $classifytype) = @_;
204
205 print $handle "[$OID]\n";
206 print $handle "<doctype>classify\n";
207 print $handle "<hastxt>0\n";
208 print $handle "<Title>$title\n";
209 print $handle "<classifytype>$classifytype\n" if defined $classifytype;
210 if (scalar @$contentsref) {
211 print $handle "<contains>", join (";", @$contentsref), "\n";
212 }
213 print $handle '-' x 70, "\n";
214}
215
216# format an english name for sorting
217# i.e. convert to lowercase, put surname before
218# first names etc.
219sub format_string_name_english {
220 my $self = shift (@_);
221 my ($stringref) = @_;
222
223 $$stringref =~ tr/A-Z/a-z/;
224 $$stringref =~ s/[^a-z0-9 ]//g;
225 $$stringref =~ s/\s+/ /g;
226 $$stringref =~ s/^\s+//;
227 my @names = split / /, $$stringref;
228 my $surname = pop @names;
229 while (scalar @names && $surname =~ /^(jnr|snr)$/i) {
230 $surname = pop @names;
231 }
232 $$stringref = $surname . " " . $$stringref;
233}
234
235# format an english string for sorting
236# i.e. convert to lowercase, remove the, a or an
237# from beginning of string etc.
238sub format_string_english {
239 my $self = shift (@_);
240 my ($stringref) = @_;
241
242 $$stringref =~ tr/A-Z/a-z/;
243 $$stringref =~ s/[^a-z0-9 ]//g;
244 $$stringref =~ s/^\s*(the|a|an)\b//;
245 $$stringref =~ s/^\s+//;
246}
247
2481;
Note: See TracBrowser for help on using the repository browser.