source: trunk/gsdl/perllib/classify/AZClassify.pm@ 244

Last change on this file since 244 was 244, checked in by sjboddie, 25 years ago

fixed a couple of bugs

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.8 KB
Line 
1# classifier plugin for sorting alphabetically
2
3package AZClassify;
4
5sub new {
6 my ($class, @options) = @_;
7
8 if (!defined @options || !scalar @options) {
9 die "AZClassify used with no metadata name to classify by\n";
10 }
11
12 return bless {
13 'OID'=>"NULL",
14 'list'=>{},
15 'metaname' => $options[0]
16 }, $class;
17}
18
19sub init {
20 my $self = shift (@_);
21
22 $self->{'list'} = {};
23}
24
25sub set_OID {
26 my $self = shift (@_);
27 my ($OID) = @_;
28
29 $self->{'OID'} = $OID;
30}
31
32sub get_OID {
33 my $self = shift (@_);
34 return $self->{'OID'};
35}
36
37sub classify {
38 my $self = shift (@_);
39 my ($doc_obj) = @_;
40
41 my $doc_OID = $doc_obj->get_OID();
42 my $metavalue = $doc_obj->get_metadata_element ($doc_obj->get_top_section(),
43 $self->{'metaname'});
44
45 # if this document doesn't contain the metadata element we're
46 # sorting by we won't include it in this classification
47 if (defined $metavalue) {
48 if ($self->{'metaname'} eq 'Creator') {
49 $self->format_string_name_english (\$metavalue);
50 } else {
51 $self->format_string_english (\$metavalue);
52 }
53 if (defined $self->{'list'}->{$doc_OID}) {
54 print STDERR "WARNING: AZClassify::classify called multiple times for $doc_OID\n";
55 }
56 $self->{'list'}->{$doc_OID} = $metavalue;
57 }
58}
59
60sub output_classify_info {
61 my $self = shift (@_);
62 my ($handle) = @_;
63
64 my @classlist = sort {$self->{'list'}->{$a} cmp $self->{'list'}->{$b};} keys %{$self->{'list'}};
65 my $subclasses = $self->splitlist (\@classlist);
66
67 if (scalar @$subclasses) {
68
69 my @contents = ();
70 map { push (@contents, "\".$_->[1]"); } @$subclasses;
71 # write out top level entry
72 $self->output_entry ($handle, $self->{'OID'}, $self->{'metaname'}, \@contents, "List");
73
74 # write out the entries for each sub-classification
75 map {$self->output_entry ($handle, "$self->{'OID'}.$_->[1]", $_->[0], $_->[2]);} @$subclasses;
76
77 } else {
78
79 # write out top level entry
80 $self->output_entry ($handle, $self->{'OID'}, $self->{'metaname'}, \@classlist, "List");
81 }
82}
83
84# splitlist takes an ordered list of classifications (@$classlistref) and splits it
85# up into alphabetical sub-sections. returns a list reference each entry of which is
86# in the form ["sub-section title", "sub-section number", [contents list]]
87# note that the title of each classification in @$classlistref is obtained from
88# $self->{'list'}->{$classlistref->[n]}
89sub splitlist {
90 my $self = shift (@_);
91 my ($classlistref) = @_;
92 my $classhash = {};
93 my $rarr = [];
94
95 # don't need to do any splitting if there are less than 20 classifications
96 return [] if ((scalar @$classlistref) <= 20);
97
98 # first split up the list into separate A-Z and 0-9 classifications
99 foreach $classification (@$classlistref) {
100 my $title = $self->{'list'}->{$classification};
101 $title =~ s/^(.).*$/$1/;
102 $title =~ tr/[a-z]/[A-Z]/;
103 if ($title =~ /^[0-9]$/) {$title = '0-9';}
104 elsif ($title !~ /^[A-Z]$/) {
105 print STDERR "AZClassify: WARNING $classification has badly formatted title ($title)\n";
106 }
107 $classhash->{$title} = [] unless defined $classhash->{$title};
108 push (@{$classhash->{$title}}, $classification);
109 }
110
111 $classhash = $self->compactlist ($classhash);
112
113 my $count = 1;
114 foreach $subsection (sort keys (%$classhash)) {
115 push (@$rarr, [$subsection, $count, $classhash->{$subsection}]);
116 $count ++;
117 }
118 return $rarr;
119}
120
121sub compactlist {
122 my $self = shift (@_);
123 my ($classhashref) = @_;
124 my $compactedhash = {};
125 my @currentOIDs = ();
126 my $currentfirstletter = "";
127 my $currentlastletter = "";
128 my $lastkey = "";
129
130 # minimum and maximum documents to be displayed per page.
131 # the actual maximum will be max + (min-1).
132 # the smallest sub-section is a single letter at present
133 # so in this case there may be many times max documents
134 # displayed on a page.
135 my $min = 10;
136 my $max = 30;
137
138 foreach $subsection (sort keys %$classhashref) {
139 if ($subsection eq '0-9') {
140 @{$compactedhash->{$subsection}} = @{$classhashref->{$subsection}};
141 next;
142 }
143 $currentfirstletter = $subsection if $currentfirstletter eq "";
144 if ((scalar (@currentOIDs) < $min) ||
145 ((scalar (@currentOIDs) + scalar (@{$classhashref->{$subsection}})) <= $max)) {
146 push (@currentOIDs, @{$classhashref->{$subsection}});
147 $currentlastletter = $subsection;
148 } else {
149
150 if ($currentfirstletter eq $currentlastletter) {
151 @{$compactedhash->{$currentfirstletter}} = @currentOIDs;
152 $lastkey = $currentfirstletter;
153 } else {
154 @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} = @currentOIDs;
155 $lastkey = "$currentfirstletter-$currentlastletter";
156 }
157 if (scalar (@{$classhashref->{$subsection}}) >= $max) {
158 $compactedhash->{$subsection} = $classhashref->{$subsection};
159 @currentOIDs = ();
160 $currentfirstletter = "";
161 } else {
162 @currentOIDs = @{$classhashref->{$subsection}};
163 $currentfirstletter = $subsection;
164 $currentlastletter = $subsection;
165 }
166 }
167 }
168
169 # add final OIDs to last sub-classification if there aren't many otherwise
170 # add final sub-classification
171 if (scalar (@currentOIDs) < $min) {
172 my ($newkey) = $lastkey =~ /^(.)/;
173 @currentOIDs = (@{$compactedhash->{$lastkey}}, @currentOIDs);
174 delete $compactedhash->{$lastkey};
175 @{$compactedhash->{"$newkey-$currentlastletter"}} = @currentOIDs;
176 } else {
177 if ($currentfirstletter eq $currentlastletter) {
178 @{$compactedhash->{$currentfirstletter}} = @currentOIDs;
179 } else {
180 @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} = @currentOIDs;
181 }
182 }
183
184 return $compactedhash;
185}
186
187sub output_entry {
188 my $self = shift (@_);
189 my ($handle, $OID, $title, $contentsref, $classifytype) = @_;
190
191 print $handle "[$OID]\n";
192 print $handle "<doctype>classify\n";
193 print $handle "<hastxt>0\n";
194 print $handle "<Title>$title\n";
195 print $handle "<classifytype>$classifytype\n" if defined $classifytype;
196 if (scalar @$contentsref) {
197 print $handle "<contains>", join (";", @$contentsref), "\n";
198 }
199 print $handle '-' x 70, "\n";
200}
201
202# format an english name for sorting
203# i.e. convert to lowercase, put surname before
204# first names etc.
205sub format_string_name_english {
206 my $self = shift (@_);
207 my ($stringref) = @_;
208
209 $$stringref =~ tr/A-Z/a-z/;
210 $$stringref =~ s/[^a-z0-9 ]//g;
211 $$stringref =~ s/\s+/ /g;
212 $$stringref =~ s/^\s+//;
213 my @names = split / /, $$stringref;
214 my $surname = pop @names;
215 while (scalar @names && $surname =~ /^(jnr|snr)$/i) {
216 $surname = pop @names;
217 }
218 $$stringref = $surname . " " . $$stringref;
219}
220
221# format an english string for sorting
222# i.e. convert to lowercase, remove the, a or an
223# from beginning of string etc.
224sub format_string_english {
225 my $self = shift (@_);
226 my ($stringref) = @_;
227
228 $$stringref =~ tr/A-Z/a-z/;
229 $$stringref =~ s/[^a-z0-9 ]//g;
230 $$stringref =~ s/^\s*(the|a|an)\b//;
231 $$stringref =~ s/^\s+//;
232}
233
2341;
Note: See TracBrowser for help on using the repository browser.