source: main/trunk/greenstone2/perllib/classify/BaseClassifier.pm@ 24745

Last change on this file since 24745 was 24745, checked in by ak19, 13 years ago

Removed code that was added in a recent commit and is no longer used: to do with getting accents in classifier buttonnames to be preserved. The previous fix was to treat buttonname as raw bytedata and call the old method to convert to unicode. Dr Bainbridge then fixed it much better: colcfg, which stores the buttonname, is now read in as unicode aware strings, so no conversion to unicode is necessary anymore.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.1 KB
Line 
1###########################################################################
2#
3# BaseClassifier.pm -- base class for all classifiers
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2000 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package BaseClassifier;
28
29# How a classifier works.
30#
31# For each classifier requested in the collect.cfg file, buildcol.pl creates
32# a new classifier object (a subclass of BaseClassifier). Later, it passes each
33# document object to each classifier in turn for classification.
34#
35# Four primary functions are used:
36#
37# 1. "new" is called before the documents are processed to set up the
38# classifier.
39#
40# 2. "init" is called after buildcol.pl has created the indexes etc but
41# before the documents are classified in order that the classifier might
42# set any variables it requires, etc.
43#
44# 3. "classify" is called once for each document object. The classifier
45# "classifies" each document and updates its local data accordingly.
46#
47# 4. "get_classify_info" is called after every document has been
48# classified. It collates the information about the documents and
49# stores a reference to the classifier so that Greenstone can later
50# display it.
51
52# 09/05/02 Added usage datastructure - John Thompson
53# 28/11/03 Commented out verbosity argument - John Thompson
54
55use gsprintf;
56use printusage;
57use parse2;
58
59# suppress the annoying "subroutine redefined" warning that various
60# classifiers cause under perl 5.6
61$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
62
63use strict;
64no strict 'subs'; # allow barewords (eg STDERR) as function arguments
65no strict 'refs'; # allow filehandles to be variables and viceversa
66
67my $arguments =
68 [
69 { 'name' => "buttonname",
70 'desc' => "{BasClas.buttonname}",
71 'type' => "string",
72 'deft' => "",
73 'reqd' => "no" },
74 { 'name' => "no_metadata_formatting",
75 'desc' => "{BasClas.no_metadata_formatting}",
76 'type' => "flag" },
77 { 'name' => "builddir",
78 'desc' => "{BasClas.builddir}",
79 'type' => "string",
80 'deft' => "" },
81 { 'name' => "outhandle",
82 'desc' => "{BasClas.outhandle}",
83 'type' => "string",
84 'deft' => "STDERR" },
85 { 'name' => "verbosity",
86 'desc' => "{BasClas.verbosity}",
87# 'type' => "enum",
88 'type' => "int",
89 'deft' => "2",
90 'reqd' => "no" }
91
92# { 'name' => "ignore_namespace",
93# 'desc' => "{BasClas.ignore_namespace}",
94# 'type' => "flag"}
95 ];
96
97my $options = { 'name' => "BaseClassifier",
98 'desc' => "{BasClas.desc}",
99 'abstract' => "yes",
100 'inherits' => "no",
101 'args' => $arguments };
102
103
104sub gsprintf
105{
106 return &gsprintf::gsprintf(@_);
107}
108
109
110sub print_xml_usage
111{
112 my $self = shift(@_);
113 my $header = shift(@_);
114 my $high_level_information_only = shift(@_);
115
116 # XML output is always in UTF-8
117 &gsprintf::output_strings_in_UTF8;
118
119 if ($header) {
120 &PrintUsage::print_xml_header("classify");
121 }
122 $self->print_xml($high_level_information_only);
123}
124
125
126sub print_xml
127{
128 my $self = shift(@_);
129 my $high_level_information_only = shift(@_);
130
131 my $optionlistref = $self->{'option_list'};
132 my @optionlist = @$optionlistref;
133 my $classifieroptions = shift(@$optionlistref);
134 return if (!defined($classifieroptions));
135
136 &gsprintf(STDERR, "<ClassInfo>\n");
137 &gsprintf(STDERR, " <Name>$classifieroptions->{'name'}</Name>\n");
138 my $desc = &gsprintf::lookup_string($classifieroptions->{'desc'});
139 $desc =~ s/</&amp;lt;/g; # doubly escaped
140 $desc =~ s/>/&amp;gt;/g;
141 &gsprintf(STDERR, " <Desc>$desc</Desc>\n");
142 &gsprintf(STDERR, " <Abstract>$classifieroptions->{'abstract'}</Abstract>\n");
143 &gsprintf(STDERR, " <Inherits>$classifieroptions->{'inherits'}</Inherits>\n");
144 unless (defined($high_level_information_only)) {
145 &gsprintf(STDERR, " <Arguments>\n");
146 if (defined($classifieroptions->{'args'})) {
147 &PrintUsage::print_options_xml($classifieroptions->{'args'});
148 }
149 &gsprintf(STDERR, " </Arguments>\n");
150
151 # Recurse up the classifier hierarchy
152 $self->print_xml();
153 }
154 &gsprintf(STDERR, "</ClassInfo>\n");
155}
156
157
158sub print_txt_usage
159{
160 my $self = shift(@_);
161
162 # Print the usage message for a classifier (recursively)
163 my $descoffset = $self->determine_description_offset(0);
164 $self->print_classifier_usage($descoffset, 1);
165}
166
167
168sub determine_description_offset
169{
170 my $self = shift(@_);
171 my $maxoffset = shift(@_);
172
173 my $optionlistref = $self->{'option_list'};
174 my @optionlist = @$optionlistref;
175 my $classifieroptions = pop(@$optionlistref);
176 return $maxoffset if (!defined($classifieroptions));
177
178 # Find the length of the longest option string of this classifier
179 my $classifierargs = $classifieroptions->{'args'};
180 if (defined($classifierargs)) {
181 my $longest = &PrintUsage::find_longest_option_string($classifierargs);
182 if ($longest > $maxoffset) {
183 $maxoffset = $longest;
184 }
185 }
186
187 # Recurse up the classifier hierarchy
188 $maxoffset = $self->determine_description_offset($maxoffset);
189 $self->{'option_list'} = \@optionlist;
190 return $maxoffset;
191}
192
193
194sub print_classifier_usage
195{
196 my $self = shift(@_);
197 my $descoffset = shift(@_);
198 my $isleafclass = shift(@_);
199
200 my $optionlistref = $self->{'option_list'};
201 my @optionlist = @$optionlistref;
202 my $classifieroptions = shift(@$optionlistref);
203 return if (!defined($classifieroptions));
204
205 my $classifiername = $classifieroptions->{'name'};
206 my $classifierargs = $classifieroptions->{'args'};
207 my $classifierdesc = $classifieroptions->{'desc'};
208 # Produce the usage information using the data structure above
209 if ($isleafclass) {
210 if (defined($classifierdesc)) {
211 &gsprintf(STDERR, "$classifierdesc\n\n");
212 }
213 &gsprintf(STDERR, " {common.usage}: classify $classifiername [{common.options}]\n\n");
214
215 }
216
217 # Display the classifier options, if there are some
218 if (defined($classifierargs)) {
219 # Calculate the column offset of the option descriptions
220 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
221
222 if ($isleafclass) {
223 &gsprintf(STDERR, " {common.specific_options}:\n");
224 }
225 else {
226 &gsprintf(STDERR, " {common.general_options}:\n", $classifiername);
227 }
228
229 # Display the classifier options
230 &PrintUsage::print_options_txt($classifierargs, $optiondescoffset);
231 }
232
233 # Recurse up the classifier hierarchy
234 $self->print_classifier_usage($descoffset, 0);
235 $self->{'option_list'} = \@optionlist;
236}
237
238
239sub new {
240 my ($class) = shift (@_);
241 my ($classifierslist,$args,$hashArgOptLists) = @_;
242 push(@$classifierslist, $class);
243 my $classifier_name = (defined $classifierslist->[0]) ? $classifierslist->[0] : $class;
244
245 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
246 push(@{$hashArgOptLists->{"OptList"}},$options);
247
248
249 # Manually set $self parameters.
250 my $self = {};
251 $self->{'outhandle'} = STDERR;
252 $self->{'idnum'} = -1;
253 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
254 $self->{"info_only"} = 0;
255
256 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
257 # the args, just return the object.
258 foreach my $strArg (@{$args})
259 {
260 if($strArg eq "-gsdlinfo")
261 {
262 $self->{"info_only"} = 1;
263 return bless $self, $class;
264 }
265 }
266
267 # general options available to all classifiers
268 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
269 {
270 #print out the text usage of this classifier.
271 my $classTempClass = bless $self, $class;
272 print STDERR "<BadClassifier c=$classifier_name>\n";
273
274 &gsprintf(STDERR, "\n{BasClas.bad_general_option}\n", $classifier_name);
275 $classTempClass->print_txt_usage(""); # Use default resource bundle
276 die "\n";
277 }
278
279 delete $self->{"info_only"};
280
281# We now ensure that when text files (and even colcfg) are read in,
282# they are straightaway made to be Unicode aware strings in Perl
283
284 return bless $self, $class;
285}
286
287sub init {
288 my $self = shift (@_);
289
290 $self->{'supportsmemberof'} = &supports_memberof();
291}
292
293sub set_number {
294 my $self = shift (@_);
295 my ($id) = @_;
296 $self->{'idnum'} = $id;
297}
298
299sub get_number {
300 my $self = shift (@_);
301 return $self->{'idnum'};
302}
303
304sub oid_array_delete
305{
306 my $self = shift (@_);
307 my ($delete_oid,$field) = @_;
308
309 my $outhandle = $self->{'outhandle'};
310
311 my @filtered_list = ();
312 foreach my $existing_oid (@{$self->{$field}}) {
313 if ($existing_oid eq $delete_oid) {
314 print $outhandle " Deleting old $delete_oid for ", ref $self, "\n";
315 }
316 else {
317 push(@filtered_list,$existing_oid);
318 }
319 }
320 $self->{$field} = \@filtered_list;
321}
322
323sub oid_hash_delete
324{
325 my $self = shift (@_);
326 my ($delete_oid,$field) = @_;
327
328 my $outhandle = $self->{'outhandle'};
329
330 print $outhandle " Deleting old $delete_oid for ", ref $self, "\n";
331 delete $self->{$field}->{$delete_oid};
332}
333
334sub classify {
335 my $self = shift (@_);
336 my ($doc_obj) = @_;
337
338 my $outhandle = $self->{'outhandle'};
339 &gsprintf($outhandle, "BaseClassifier::classify {common.must_be_implemented}\n");
340}
341
342sub get_classify_info {
343 my $self = shift (@_);
344
345 my $outhandle = $self->{'outhandle'};
346 &gsprintf($outhandle, "BaseClassifier::get_classify_info {common.must_be_implemented}\n");
347}
348
349sub supports_memberof {
350 my $self = shift(@_);
351
352 return "false";
353}
354
355# previously, if a buttonname wasn't specified, we just use the metadata value,
356# but with a list of metadata, we want to do something a bit nicer so that
357# eg -metadata dc.Title,Title will end up with Title as the buttonname
358
359# current algorithm - use the first element, but strip its namespace
360sub generate_title_from_metadata {
361
362 my $self = shift (@_);
363 my $metadata = shift (@_);
364
365 return "" unless defined $metadata && $metadata =~ /\S/;
366
367 my @metalist = split(/,|;/, $metadata);
368 my $firstmeta = $metalist[0];
369 if ($firstmeta =~ /\./) {
370 $firstmeta =~ s/^\w+\.//;
371 }
372 return $firstmeta;
373}
374
375
376# ex. can be at front, or it may be a list of metadata, separated by ,/;
377sub strip_ex_from_metadata {
378 my $self = shift (@_);
379 my $metadata = shift (@_);
380
381 return $metadata unless defined $metadata && $metadata =~ /\S/;
382
383 # only remove ex. metadata prefix if there are no other prefixes after it
384 $metadata =~ s/(,|;|:|\/)/$1 /g; # insert a space separator so meta names like flex.Image don't become fl.Image
385 $metadata =~ s/(^| )ex\.([^.,;:\/]+)(,|;|:|\/|$)/$1$2$3/g;
386 $metadata =~ s/(,|;|:|\/) /$1/g;
387
388 return $metadata;
389}
390
391
3921;
Note: See TracBrowser for help on using the repository browser.