source: main/trunk/greenstone2/perllib/classify/BaseClassifier.pm@ 24193

Last change on this file since 24193 was 24193, checked in by ak19, 13 years ago

Non-English characters in the user-entered buttonname of all classifiers need to be preserved. BaseClassifier now does this, for which it has to assume that the user-entered buttonname is in UTF-8

  • Property svn:keywords set to Author Date Id Revision
File size: 11.1 KB
Line 
1###########################################################################
2#
3# BaseClassifier.pm -- base class for all classifiers
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2000 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package BaseClassifier;
28
29# How a classifier works.
30#
31# For each classifier requested in the collect.cfg file, buildcol.pl creates
32# a new classifier object (a subclass of BaseClassifier). Later, it passes each
33# document object to each classifier in turn for classification.
34#
35# Four primary functions are used:
36#
37# 1. "new" is called before the documents are processed to set up the
38# classifier.
39#
40# 2. "init" is called after buildcol.pl has created the indexes etc but
41# before the documents are classified in order that the classifier might
42# set any variables it requires, etc.
43#
44# 3. "classify" is called once for each document object. The classifier
45# "classifies" each document and updates its local data accordingly.
46#
47# 4. "get_classify_info" is called after every document has been
48# classified. It collates the information about the documents and
49# stores a reference to the classifier so that Greenstone can later
50# display it.
51
52# 09/05/02 Added usage datastructure - John Thompson
53# 28/11/03 Commented out verbosity argument - John Thompson
54
55use gsprintf;
56use printusage;
57use parse2;
58use unicode;
59
60# suppress the annoying "subroutine redefined" warning that various
61# classifiers cause under perl 5.6
62$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
63
64use strict;
65no strict 'subs'; # allow barewords (eg STDERR) as function arguments
66no strict 'refs'; # allow filehandles to be variables and viceversa
67
68my $arguments =
69 [
70 { 'name' => "buttonname",
71 'desc' => "{BasClas.buttonname}",
72 'type' => "string",
73 'deft' => "",
74 'reqd' => "no" },
75 { 'name' => "no_metadata_formatting",
76 'desc' => "{BasClas.no_metadata_formatting}",
77 'type' => "flag" },
78 { 'name' => "builddir",
79 'desc' => "{BasClas.builddir}",
80 'type' => "string",
81 'deft' => "" },
82 { 'name' => "outhandle",
83 'desc' => "{BasClas.outhandle}",
84 'type' => "string",
85 'deft' => "STDERR" },
86 { 'name' => "verbosity",
87 'desc' => "{BasClas.verbosity}",
88# 'type' => "enum",
89 'type' => "int",
90 'deft' => "2",
91 'reqd' => "no" }
92
93# { 'name' => "ignore_namespace",
94# 'desc' => "{BasClas.ignore_namespace}",
95# 'type' => "flag"}
96 ];
97
98my $options = { 'name' => "BaseClassifier",
99 'desc' => "{BasClas.desc}",
100 'abstract' => "yes",
101 'inherits' => "no",
102 'args' => $arguments };
103
104
105sub gsprintf
106{
107 return &gsprintf::gsprintf(@_);
108}
109
110
111sub print_xml_usage
112{
113 my $self = shift(@_);
114 my $header = shift(@_);
115 my $high_level_information_only = shift(@_);
116
117 # XML output is always in UTF-8
118 &gsprintf::output_strings_in_UTF8;
119
120 if ($header) {
121 &PrintUsage::print_xml_header("classify");
122 }
123 $self->print_xml($high_level_information_only);
124}
125
126
127sub print_xml
128{
129 my $self = shift(@_);
130 my $high_level_information_only = shift(@_);
131
132 my $optionlistref = $self->{'option_list'};
133 my @optionlist = @$optionlistref;
134 my $classifieroptions = shift(@$optionlistref);
135 return if (!defined($classifieroptions));
136
137 &gsprintf(STDERR, "<ClassInfo>\n");
138 &gsprintf(STDERR, " <Name>$classifieroptions->{'name'}</Name>\n");
139 my $desc = &gsprintf::lookup_string($classifieroptions->{'desc'});
140 $desc =~ s/</&amp;lt;/g; # doubly escaped
141 $desc =~ s/>/&amp;gt;/g;
142 &gsprintf(STDERR, " <Desc>$desc</Desc>\n");
143 &gsprintf(STDERR, " <Abstract>$classifieroptions->{'abstract'}</Abstract>\n");
144 &gsprintf(STDERR, " <Inherits>$classifieroptions->{'inherits'}</Inherits>\n");
145 unless (defined($high_level_information_only)) {
146 &gsprintf(STDERR, " <Arguments>\n");
147 if (defined($classifieroptions->{'args'})) {
148 &PrintUsage::print_options_xml($classifieroptions->{'args'});
149 }
150 &gsprintf(STDERR, " </Arguments>\n");
151
152 # Recurse up the classifier hierarchy
153 $self->print_xml();
154 }
155 &gsprintf(STDERR, "</ClassInfo>\n");
156}
157
158
159sub print_txt_usage
160{
161 my $self = shift(@_);
162
163 # Print the usage message for a classifier (recursively)
164 my $descoffset = $self->determine_description_offset(0);
165 $self->print_classifier_usage($descoffset, 1);
166}
167
168
169sub determine_description_offset
170{
171 my $self = shift(@_);
172 my $maxoffset = shift(@_);
173
174 my $optionlistref = $self->{'option_list'};
175 my @optionlist = @$optionlistref;
176 my $classifieroptions = pop(@$optionlistref);
177 return $maxoffset if (!defined($classifieroptions));
178
179 # Find the length of the longest option string of this classifier
180 my $classifierargs = $classifieroptions->{'args'};
181 if (defined($classifierargs)) {
182 my $longest = &PrintUsage::find_longest_option_string($classifierargs);
183 if ($longest > $maxoffset) {
184 $maxoffset = $longest;
185 }
186 }
187
188 # Recurse up the classifier hierarchy
189 $maxoffset = $self->determine_description_offset($maxoffset);
190 $self->{'option_list'} = \@optionlist;
191 return $maxoffset;
192}
193
194
195sub print_classifier_usage
196{
197 my $self = shift(@_);
198 my $descoffset = shift(@_);
199 my $isleafclass = shift(@_);
200
201 my $optionlistref = $self->{'option_list'};
202 my @optionlist = @$optionlistref;
203 my $classifieroptions = shift(@$optionlistref);
204 return if (!defined($classifieroptions));
205
206 my $classifiername = $classifieroptions->{'name'};
207 my $classifierargs = $classifieroptions->{'args'};
208 my $classifierdesc = $classifieroptions->{'desc'};
209 # Produce the usage information using the data structure above
210 if ($isleafclass) {
211 if (defined($classifierdesc)) {
212 &gsprintf(STDERR, "$classifierdesc\n\n");
213 }
214 &gsprintf(STDERR, " {common.usage}: classify $classifiername [{common.options}]\n\n");
215
216 }
217
218 # Display the classifier options, if there are some
219 if (defined($classifierargs)) {
220 # Calculate the column offset of the option descriptions
221 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
222
223 if ($isleafclass) {
224 &gsprintf(STDERR, " {common.specific_options}:\n");
225 }
226 else {
227 &gsprintf(STDERR, " {common.general_options}:\n", $classifiername);
228 }
229
230 # Display the classifier options
231 &PrintUsage::print_options_txt($classifierargs, $optiondescoffset);
232 }
233
234 # Recurse up the classifier hierarchy
235 $self->print_classifier_usage($descoffset, 0);
236 $self->{'option_list'} = \@optionlist;
237}
238
239
240sub new {
241 my ($class) = shift (@_);
242 my ($classifierslist,$args,$hashArgOptLists) = @_;
243 push(@$classifierslist, $class);
244 my $classifier_name = (defined $classifierslist->[0]) ? $classifierslist->[0] : $class;
245
246 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
247 push(@{$hashArgOptLists->{"OptList"}},$options);
248
249
250 # Manually set $self parameters.
251 my $self = {};
252 $self->{'outhandle'} = STDERR;
253 $self->{'idnum'} = -1;
254 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
255 $self->{"info_only"} = 0;
256
257 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
258 # the args, just return the object.
259 foreach my $strArg (@{$args})
260 {
261 if($strArg eq "-gsdlinfo")
262 {
263 $self->{"info_only"} = 1;
264 return bless $self, $class;
265 }
266 }
267
268 # general options available to all classifiers
269 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
270 {
271 #print out the text usage of this classifier.
272 my $classTempClass = bless $self, $class;
273 print STDERR "<BadClassifier c=$classifier_name>\n";
274
275 &gsprintf(STDERR, "\n{BasClas.bad_general_option}\n", $classifier_name);
276 $classTempClass->print_txt_usage(""); # Use default resource bundle
277 die "\n";
278 }
279
280 delete $self->{"info_only"};
281 # user-entered UTF-8 display meta for buttonname. Needs to be internally turned into unicode
282 # (will later be turned back to UTF-8 along with the rest of the metadata)
283 if ($self->{'buttonname'}) {
284 $self->{'buttonname'} = &unicode::convert_utf8_string_to_unicode_string($self->{'buttonname'});
285 }
286 return bless $self, $class;
287}
288
289sub init {
290 my $self = shift (@_);
291
292 $self->{'supportsmemberof'} = &supports_memberof();
293}
294
295sub set_number {
296 my $self = shift (@_);
297 my ($id) = @_;
298 $self->{'idnum'} = $id;
299}
300
301sub get_number {
302 my $self = shift (@_);
303 return $self->{'idnum'};
304}
305
306sub oid_array_delete
307{
308 my $self = shift (@_);
309 my ($delete_oid,$field) = @_;
310
311 my $outhandle = $self->{'outhandle'};
312
313 my @filtered_list = ();
314 foreach my $existing_oid (@{$self->{$field}}) {
315 if ($existing_oid eq $delete_oid) {
316 print $outhandle " Deleting old $delete_oid for ", ref $self, "\n";
317 }
318 else {
319 push(@filtered_list,$existing_oid);
320 }
321 }
322 $self->{$field} = \@filtered_list;
323}
324
325sub oid_hash_delete
326{
327 my $self = shift (@_);
328 my ($delete_oid,$field) = @_;
329
330 my $outhandle = $self->{'outhandle'};
331
332 print $outhandle " Deleting old $delete_oid for ", ref $self, "\n";
333 delete $self->{$field}->{$delete_oid};
334}
335
336sub classify {
337 my $self = shift (@_);
338 my ($doc_obj) = @_;
339
340 my $outhandle = $self->{'outhandle'};
341 &gsprintf($outhandle, "BaseClassifier::classify {common.must_be_implemented}\n");
342}
343
344sub get_classify_info {
345 my $self = shift (@_);
346
347 my $outhandle = $self->{'outhandle'};
348 &gsprintf($outhandle, "BaseClassifier::get_classify_info {common.must_be_implemented}\n");
349}
350
351sub supports_memberof {
352 my $self = shift(@_);
353
354 return "false";
355}
356
357# previously, if a buttonname wasn't specified, we just use the metadata value,
358# but with a list of metadata, we want to do something a bit nicer so that
359# eg -metadata dc.Title,Title will end up with Title as the buttonname
360
361# current algorithm - use the first element, but strip its namespace
362sub generate_title_from_metadata {
363
364 my $self = shift (@_);
365 my $metadata = shift (@_);
366
367 return "" unless defined $metadata && $metadata =~ /\S/;
368
369 my @metalist = split(/,|;/, $metadata);
370 my $firstmeta = $metalist[0];
371 if ($firstmeta =~ /\./) {
372 $firstmeta =~ s/^\w+\.//;
373 }
374 return $firstmeta;
375}
376
377
378# ex. can be at front, or it may be a list of metadata, separated by ,/;
379sub strip_ex_from_metadata {
380 my $self = shift (@_);
381 my $metadata = shift (@_);
382
383 return $metadata unless defined $metadata && $metadata =~ /\S/;
384
385 $metadata =~ s/^ex\.//;
386 $metadata =~ s/([,;:\/])ex\./$1/g;
387 return $metadata;
388}
389
390
3911;
Note: See TracBrowser for help on using the repository browser.