source: trunk/gsdl/perllib/classify/BasClas.pm@ 8852

Last change on this file since 8852 was 8852, checked in by kjdon, 19 years ago

shifted format_metadata_for_sorting from BasCLas to sorttools, so other things can use it

  • Property svn:keywords set to Author Date Id Revision
File size: 8.4 KB
Line 
1###########################################################################
2#
3# BasClas.pm -- base class for all classifiers
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2000 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package BasClas;
28
29# How a classifier works.
30#
31# For each classifier requested in the collect.cfg file, buildcol.pl creates
32# a new classifier object (a subclass of BasClas). Later, it passes each
33# document object to each classifier in turn for classification.
34#
35# Four primary functions are used:
36#
37# 1. "new" is called before the documents are processed to set up the
38# classifier.
39#
40# 2. "init" is called after buildcol.pl has created the indexes etc but
41# before the documents are classified in order that the classifier might
42# set any variables it requires, etc.
43#
44# 3. "classify" is called once for each document object. The classifier
45# "classifies" each document and updates its local data accordingly.
46#
47# 4. "get_classify_info" is called after every document has been
48# classified. It collates the information about the documents and
49# stores a reference to the classifier so that Greenstone can later
50# display it.
51
52# 09/05/02 Added usage datastructure - John Thompson
53# 28/11/03 Commented out verbosity argument - John Thompson
54
55use parsargv;
56use gsprintf;
57use printusage;
58
59
60my $arguments =
61 [ { 'name' => "builddir",
62 'desc' => "{BasClas.builddir}",
63 'type' => "string",
64 'deft' => "" },
65 { 'name' => "outhandle",
66 'desc' => "{BasClas.outhandle}",
67 'type' => "string",
68 'deft' => "STDERR" },
69# { 'name' => "verbosity",
70# 'desc' => "{BasClas.verbosity}",
71# 'type' => "enum",
72# 'deft' => "2",
73# 'reqd' => "no" } ];
74# { 'name' => "ignore_namespace",
75# 'desc' => "{BasClas.ignore_namespace}",
76# 'type' => "flag"}
77 ];
78
79my $options = { 'name' => "BasClas",
80 'desc' => "{BasClas.desc}",
81 'abstract' => "yes",
82 'inherits' => "no",
83 'args' => $arguments };
84
85
86sub gsprintf
87{
88 return &gsprintf::gsprintf(@_);
89}
90
91
92sub print_xml_usage
93{
94 my $self = shift(@_);
95
96 # XML output is always in UTF-8
97 &gsprintf::output_strings_in_UTF8;
98
99 &PrintUsage::print_xml_header();
100 $self->print_xml();
101}
102
103
104sub print_xml
105{
106 my $self = shift(@_);
107
108 my $optionlistref = $self->{'option_list'};
109 my @optionlist = @$optionlistref;
110 my $classifieroptions = pop(@$optionlistref);
111 return if (!defined($classifieroptions));
112
113 &gsprintf(STDERR, "<ClassInfo>\n");
114 &gsprintf(STDERR, " <Name>$classifieroptions->{'name'}</Name>\n");
115 my $desc = &gsprintf::lookup_string($classifieroptions->{'desc'});
116 $desc =~ s/</&amp;lt;/g; # doubly escaped
117 $desc =~ s/>/&amp;gt;/g;
118 &gsprintf(STDERR, " <Desc>$desc</Desc>\n");
119 &gsprintf(STDERR, " <Abstract>$classifieroptions->{'abstract'}</Abstract>\n");
120 &gsprintf(STDERR, " <Inherits>$classifieroptions->{'inherits'}</Inherits>\n");
121 &gsprintf(STDERR, " <Arguments>\n");
122 if (defined($classifieroptions->{'args'})) {
123 &PrintUsage::print_options_xml($classifieroptions->{'args'});
124 }
125
126 # Recurse up the classifier hierarchy
127 $self->print_xml();
128
129 &gsprintf(STDERR, " </Arguments>\n");
130 &gsprintf(STDERR, "</ClassInfo>\n");
131}
132
133
134sub print_txt_usage
135{
136 my $self = shift(@_);
137
138 # Print the usage message for a classifier (recursively)
139 my $descoffset = $self->determine_description_offset(0);
140 $self->print_classifier_usage($descoffset, 1);
141}
142
143
144sub determine_description_offset
145{
146 my $self = shift(@_);
147 my $maxoffset = shift(@_);
148
149 my $optionlistref = $self->{'option_list'};
150 my @optionlist = @$optionlistref;
151 my $classifieroptions = pop(@$optionlistref);
152 return $maxoffset if (!defined($classifieroptions));
153
154 # Find the length of the longest option string of this classifier
155 my $classifierargs = $classifieroptions->{'args'};
156 if (defined($classifierargs)) {
157 my $longest = &PrintUsage::find_longest_option_string($classifierargs);
158 if ($longest > $maxoffset) {
159 $maxoffset = $longest;
160 }
161 }
162
163 # Recurse up the classifier hierarchy
164 $maxoffset = $self->determine_description_offset($maxoffset);
165 $self->{'option_list'} = \@optionlist;
166 return $maxoffset;
167}
168
169
170sub print_classifier_usage
171{
172 my $self = shift(@_);
173 my $descoffset = shift(@_);
174 my $isleafclass = shift(@_);
175
176 my $optionlistref = $self->{'option_list'};
177 my @optionlist = @$optionlistref;
178 my $classifieroptions = pop(@$optionlistref);
179 return if (!defined($classifieroptions));
180
181 my $classifiername = $classifieroptions->{'name'};
182 my $classifierargs = $classifieroptions->{'args'};
183 my $classifierdesc = $classifieroptions->{'desc'};
184 # Produce the usage information using the data structure above
185 if ($isleafclass) {
186 if (defined($classifierdesc)) {
187 &gsprintf(STDERR, "$classifierdesc\n\n");
188 }
189 &gsprintf(STDERR, " {common.usage}: classify $classifiername [{common.options}]\n\n");
190
191 }
192
193 # Display the classifier options, if there are some
194 if (defined($classifierargs)) {
195 # Calculate the column offset of the option descriptions
196 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
197
198 if ($isleafclass) {
199 &gsprintf(STDERR, " {common.specific_options}:\n");
200 }
201 else {
202 &gsprintf(STDERR, " {common.general_options}:\n", $classifiername);
203 }
204
205 # Display the classifier options
206 &PrintUsage::print_options_txt($classifierargs, $optiondescoffset);
207 }
208
209 # Recurse up the classifier hierarchy
210 $self->print_classifier_usage($descoffset, 0);
211 $self->{'option_list'} = \@optionlist;
212}
213
214
215sub new {
216 my $class = shift (@_);
217 my $name = shift (@_);
218
219 my $self = {};
220
221 $self->{'outhandle'} = STDERR;
222 $self->{'idnum'} = -1;
223
224 $self->{'option_list'} = [ $options ];
225 my $gsdlinfo;
226
227 # general options available to all classifiers
228 if (!parsargv::parse(\@_,
229 q^builddir/.*/^, \$self->{'builddir'},
230 q^outhandle/.*/STDERR^, \$self->{'outhandle'},
231 q^verbosity/\d/2^, \$self->{'verbosity'},
232 q^ignore_namespace^, \$self->{'ignore_namespace'},
233 q^gsdlinfo^, \$gsdlinfo,
234 "allow_extra_options")) {
235
236 &gsprintf(STDERR, "\n{BasClas.bad_general_option}\n", $name);
237 $self->print_txt_usage(""); # Use default resource bundle
238 die "\n";
239 }
240
241 if ($gsdlinfo) {
242 # created from classinfo.pl - set this so subclasses don't need to
243 # parse the arguments
244 $self->{'info_only'} = 1;
245 }
246 return bless $self, $class;
247}
248
249sub init {
250 my $self = shift (@_);
251
252 $self->{'supportsmemberof'} = &supports_memberof();
253}
254
255sub set_number {
256 my $self = shift (@_);
257 my ($id) = @_;
258 $self->{'idnum'} = $id;
259}
260
261sub get_number {
262 my $self = shift (@_);
263 return $self->{'idnum'};
264}
265
266sub classify {
267 my $self = shift (@_);
268 my ($doc_obj) = @_;
269
270 my $outhandle = $self->{'outhandle'};
271 &gsprintf($outhandle, "BasClass::classify {common.must_be_implemented}\n");
272}
273
274sub get_classify_info {
275 my $self = shift (@_);
276
277 my $outhandle = $self->{'outhandle'};
278 &gsprintf($outhandle, "BasClass::get_classify_info {common.must_be_implemented}\n");
279}
280
281sub supports_memberof {
282 my $self = shift(@_);
283
284 return "false";
285}
286
287# previously, if a buttonname wasn't specified, we just use the metadata value,
288# but with a list of metadata, we want to do something a bit nicer so that
289# eg -metadata dc.Title,Title will end up with Title as the buttonname
290
291# current algorithm - use the first element, but strip its namespace
292sub generate_title_from_metadata {
293
294 my $self = shift (@_);
295 my $metadata = shift (@_);
296
297 my @metalist = split(/,/, $metadata);
298 my $firstmeta = $metalist[0];
299 if ($firstmeta =~ /\./) {
300 $firstmeta =~ s/^\w+\.//;
301 }
302 return $firstmeta;
303}
304
3051;
Note: See TracBrowser for help on using the repository browser.