source: trunk/gsdl/perllib/classify/BasClas.pm@ 7405

Last change on this file since 7405 was 7175, checked in by kjdon, 20 years ago

fixed the bug where AZCompactList wasn't sorting Creators properly (by surname)

  • Property svn:keywords set to Author Date Id Revision
File size: 8.4 KB
Line 
1###########################################################################
2#
3# BasClas.pm -- base class for all classifiers
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2000 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package BasClas;
28
29# How a classifier works.
30#
31# For each classifier requested in the collect.cfg file, buildcol.pl creates
32# a new classifier object (a subclass of BasClas). Later, it passes each
33# document object to each classifier in turn for classification.
34#
35# Four primary functions are used:
36#
37# 1. "new" is called before the documents are processed to set up the
38# classifier.
39#
40# 2. "init" is called after buildcol.pl has created the indexes etc but
41# before the documents are classified in order that the classifier might
42# set any variables it requires, etc.
43#
44# 3. "classify" is called once for each document object. The classifier
45# "classifies" each document and updates its local data accordingly.
46#
47# 4. "get_classify_info" is called after every document has been
48# classified. It collates the information about the documents and
49# stores a reference to the classifier so that Greenstone can later
50# display it.
51
52# 09/05/02 Added usage datastructure - John Thompson
53# 28/11/03 Commented out verbosity argument - John Thompson
54
55use parsargv;
56use gsprintf;
57use printusage;
58
59
60my $arguments =
61 [ { 'name' => "builddir",
62 'desc' => "{BasClas.builddir}",
63 'type' => "string",
64 'deft' => "" },
65 { 'name' => "outhandle",
66 'desc' => "{BasClas.outhandle}",
67 'type' => "string",
68 'deft' => "STDERR" },
69# { 'name' => "verbosity",
70# 'desc' => "{BasClas.verbosity}",
71# 'type' => "enum",
72# 'deft' => "2",
73# 'reqd' => "no" } ];
74# { 'name' => "ignore_namespace",
75# 'desc' => "{BasClas.ignore_namespace}",
76# 'type' => "flag"}
77 ];
78
79my $options = { 'name' => "BasClas",
80 'desc' => "{BasClas.desc}",
81 'abstract' => "yes",
82 'inherits' => "no",
83 'args' => $arguments };
84
85
86sub gsprintf
87{
88 return &gsprintf::gsprintf(@_);
89}
90
91
92sub print_xml_usage
93{
94 local $self = shift(@_);
95
96 # XML output is always in UTF-8
97 &gsprintf::output_strings_in_UTF8;
98
99 &PrintUsage::print_xml_header();
100 $self->print_xml();
101}
102
103
104sub print_xml
105{
106 local $self = shift(@_);
107
108 local $optionlistref = $self->{'option_list'};
109 local @optionlist = @$optionlistref;
110 local $classifieroptions = pop(@$optionlistref);
111 return if (!defined($classifieroptions));
112
113 &gsprintf(STDERR, "<ClassInfo>\n");
114 &gsprintf(STDERR, " <Name>$classifieroptions->{'name'}</Name>\n");
115 my $desc = &gsprintf::lookup_string($classifieroptions->{'desc'});
116 $desc =~ s/</&amp;lt;/g; # doubly escaped
117 $desc =~ s/>/&amp;gt;/g;
118 &gsprintf(STDERR, " <Desc>$desc</Desc>\n");
119 &gsprintf(STDERR, " <Abstract>$classifieroptions->{'abstract'}</Abstract>\n");
120 &gsprintf(STDERR, " <Inherits>$classifieroptions->{'inherits'}</Inherits>\n");
121 &gsprintf(STDERR, " <Arguments>\n");
122 if (defined($classifieroptions->{'args'})) {
123 &PrintUsage::print_options_xml($classifieroptions->{'args'});
124 }
125
126 # Recurse up the classifier hierarchy
127 $self->print_xml();
128
129 &gsprintf(STDERR, " </Arguments>\n");
130 &gsprintf(STDERR, "</ClassInfo>\n");
131}
132
133
134sub print_txt_usage
135{
136 local $self = shift(@_);
137
138 # Print the usage message for a classifier (recursively)
139 local $descoffset = $self->determine_description_offset(0);
140 $self->print_classifier_usage($descoffset, 1);
141}
142
143
144sub determine_description_offset
145{
146 local $self = shift(@_);
147 local $maxoffset = shift(@_);
148
149 local $optionlistref = $self->{'option_list'};
150 local @optionlist = @$optionlistref;
151 local $classifieroptions = pop(@$optionlistref);
152 return $maxoffset if (!defined($classifieroptions));
153
154 # Find the length of the longest option string of this classifier
155 local $classifierargs = $classifieroptions->{'args'};
156 if (defined($classifierargs)) {
157 local $longest = &PrintUsage::find_longest_option_string($classifierargs);
158 if ($longest > $maxoffset) {
159 $maxoffset = $longest;
160 }
161 }
162
163 # Recurse up the classifier hierarchy
164 $maxoffset = $self->determine_description_offset($maxoffset);
165 $self->{'option_list'} = \@optionlist;
166 return $maxoffset;
167}
168
169
170sub print_classifier_usage
171{
172 local $self = shift(@_);
173 local $descoffset = shift(@_);
174 local $isleafclass = shift(@_);
175
176 local $optionlistref = $self->{'option_list'};
177 local @optionlist = @$optionlistref;
178 local $classifieroptions = pop(@$optionlistref);
179 return if (!defined($classifieroptions));
180
181 local $classifiername = $classifieroptions->{'name'};
182 local $classifierargs = $classifieroptions->{'args'};
183 local $classifierdesc = $classifieroptions->{'desc'};
184 # Produce the usage information using the data structure above
185 if ($isleafclass) {
186 if (defined($classifierdesc)) {
187 &gsprintf(STDERR, "$classifierdesc\n\n");
188 }
189 &gsprintf(STDERR, " {common.usage}: classify $classifiername [{common.options}]\n\n");
190
191 }
192
193 # Display the classifier options, if there are some
194 if (defined($classifierargs)) {
195 # Calculate the column offset of the option descriptions
196 local $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
197
198 if ($isleafclass) {
199 &gsprintf(STDERR, " {common.specific_options}:\n");
200 }
201 else {
202 &gsprintf(STDERR, " {common.general_options}:\n", $classifiername);
203 }
204
205 # Display the classifier options
206 &PrintUsage::print_options_txt($classifierargs, $optiondescoffset);
207 }
208
209 # Recurse up the classifier hierarchy
210 $self->print_classifier_usage($descoffset, 0);
211 $self->{'option_list'} = \@optionlist;
212}
213
214
215sub new {
216 my $class = shift (@_);
217 my $name = shift (@_);
218
219 my $self = {};
220
221 $self->{'outhandle'} = STDERR;
222
223 $self->{'option_list'} = [ $options ];
224 my $gsdlinfo;
225
226 # general options available to all classifiers
227 if (!parsargv::parse(\@_,
228 q^builddir/.*/^, \$self->{'builddir'},
229 q^outhandle/.*/STDERR^, \$self->{'outhandle'},
230 q^verbosity/\d/2^, \$self->{'verbosity'},
231 q^ignore_namespace^, \$self->{'ignore_namespace'},
232 q^gsdlinfo^, \$gsdlinfo,
233 "allow_extra_options")) {
234
235 &gsprintf(STDERR, "\n{BasClas.bad_general_option}\n", $name);
236 $self->print_txt_usage(""); # Use default resource bundle
237 die "\n";
238 }
239
240 if ($gsdlinfo) {
241 # created from classinfo.pl - set this so subclasses don't need to
242 # parse the arguments
243 $self->{'info_only'} = 1;
244 }
245 return bless $self, $class;
246}
247
248sub init {
249 my $self = shift (@_);
250}
251
252sub classify {
253 my $self = shift (@_);
254 my ($doc_obj) = @_;
255
256 my $outhandle = $self->{'outhandle'};
257 &gsprintf($outhandle, "BasClass::classify {common.must_be_implemented}\n");
258}
259
260sub get_classify_info {
261 my $self = shift (@_);
262
263 my $outhandle = $self->{'outhandle'};
264 &gsprintf($outhandle, "BasClass::get_classify_info {common.must_be_implemented}\n");
265}
266
267# all classifiers do something like this, but all slightly different
268# so put it here so they can all share it.
269sub format_metadata_for_sorting {
270
271 my $self = shift (@_);
272
273 my ($metaname, $metavalue, $doc_obj) = @_;
274
275 if ($metaname eq "Language") {
276 $metavalue = $iso639::fromiso639{$metavalue};
277 return $metavalue;
278 }
279
280 my $lang;
281 if (defined $doc_obj) {
282 $lang = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), 'Language');
283 }
284 $lang = 'en' unless defined $lang;
285 if ($lang eq 'en') {
286 if ($metaname =~ /^(\w+\.)?Creator(:.*)?/) {
287 &sorttools::format_string_name_english (\$metavalue);
288 } else {
289 &sorttools::format_string_english (\$metavalue);
290 }
291 }
292 return $metavalue;
293}
294
295
2961;
Note: See TracBrowser for help on using the repository browser.