source: main/trunk/greenstone2/perllib/classify/BaseClassifier.pm@ 24737

Last change on this file since 24737 was 24737, checked in by ak19, 13 years ago

Further adjustment of code, reflecting the fact that text strings read in from files are Unicode aware straight away

  • Property svn:keywords set to Author Date Id Revision
File size: 11.5 KB
Line 
1###########################################################################
2#
3# BaseClassifier.pm -- base class for all classifiers
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2000 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package BaseClassifier;
28
29# How a classifier works.
30#
31# For each classifier requested in the collect.cfg file, buildcol.pl creates
32# a new classifier object (a subclass of BaseClassifier). Later, it passes each
33# document object to each classifier in turn for classification.
34#
35# Four primary functions are used:
36#
37# 1. "new" is called before the documents are processed to set up the
38# classifier.
39#
40# 2. "init" is called after buildcol.pl has created the indexes etc but
41# before the documents are classified in order that the classifier might
42# set any variables it requires, etc.
43#
44# 3. "classify" is called once for each document object. The classifier
45# "classifies" each document and updates its local data accordingly.
46#
47# 4. "get_classify_info" is called after every document has been
48# classified. It collates the information about the documents and
49# stores a reference to the classifier so that Greenstone can later
50# display it.
51
52# 09/05/02 Added usage datastructure - John Thompson
53# 28/11/03 Commented out verbosity argument - John Thompson
54
55use gsprintf;
56use printusage;
57use parse2;
58use unicode;
59
60# suppress the annoying "subroutine redefined" warning that various
61# classifiers cause under perl 5.6
62$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
63
64use strict;
65no strict 'subs'; # allow barewords (eg STDERR) as function arguments
66no strict 'refs'; # allow filehandles to be variables and viceversa
67
68my $arguments =
69 [
70 { 'name' => "buttonname",
71 'desc' => "{BasClas.buttonname}",
72 'type' => "string",
73 'deft' => "",
74 'reqd' => "no" },
75 { 'name' => "no_metadata_formatting",
76 'desc' => "{BasClas.no_metadata_formatting}",
77 'type' => "flag" },
78 { 'name' => "builddir",
79 'desc' => "{BasClas.builddir}",
80 'type' => "string",
81 'deft' => "" },
82 { 'name' => "outhandle",
83 'desc' => "{BasClas.outhandle}",
84 'type' => "string",
85 'deft' => "STDERR" },
86 { 'name' => "verbosity",
87 'desc' => "{BasClas.verbosity}",
88# 'type' => "enum",
89 'type' => "int",
90 'deft' => "2",
91 'reqd' => "no" }
92
93# { 'name' => "ignore_namespace",
94# 'desc' => "{BasClas.ignore_namespace}",
95# 'type' => "flag"}
96 ];
97
98my $options = { 'name' => "BaseClassifier",
99 'desc' => "{BasClas.desc}",
100 'abstract' => "yes",
101 'inherits' => "no",
102 'args' => $arguments };
103
104
105sub gsprintf
106{
107 return &gsprintf::gsprintf(@_);
108}
109
110
111sub print_xml_usage
112{
113 my $self = shift(@_);
114 my $header = shift(@_);
115 my $high_level_information_only = shift(@_);
116
117 # XML output is always in UTF-8
118 &gsprintf::output_strings_in_UTF8;
119
120 if ($header) {
121 &PrintUsage::print_xml_header("classify");
122 }
123 $self->print_xml($high_level_information_only);
124}
125
126
127sub print_xml
128{
129 my $self = shift(@_);
130 my $high_level_information_only = shift(@_);
131
132 my $optionlistref = $self->{'option_list'};
133 my @optionlist = @$optionlistref;
134 my $classifieroptions = shift(@$optionlistref);
135 return if (!defined($classifieroptions));
136
137 &gsprintf(STDERR, "<ClassInfo>\n");
138 &gsprintf(STDERR, " <Name>$classifieroptions->{'name'}</Name>\n");
139 my $desc = &gsprintf::lookup_string($classifieroptions->{'desc'});
140 $desc =~ s/</&amp;lt;/g; # doubly escaped
141 $desc =~ s/>/&amp;gt;/g;
142 &gsprintf(STDERR, " <Desc>$desc</Desc>\n");
143 &gsprintf(STDERR, " <Abstract>$classifieroptions->{'abstract'}</Abstract>\n");
144 &gsprintf(STDERR, " <Inherits>$classifieroptions->{'inherits'}</Inherits>\n");
145 unless (defined($high_level_information_only)) {
146 &gsprintf(STDERR, " <Arguments>\n");
147 if (defined($classifieroptions->{'args'})) {
148 &PrintUsage::print_options_xml($classifieroptions->{'args'});
149 }
150 &gsprintf(STDERR, " </Arguments>\n");
151
152 # Recurse up the classifier hierarchy
153 $self->print_xml();
154 }
155 &gsprintf(STDERR, "</ClassInfo>\n");
156}
157
158
159sub print_txt_usage
160{
161 my $self = shift(@_);
162
163 # Print the usage message for a classifier (recursively)
164 my $descoffset = $self->determine_description_offset(0);
165 $self->print_classifier_usage($descoffset, 1);
166}
167
168
169sub determine_description_offset
170{
171 my $self = shift(@_);
172 my $maxoffset = shift(@_);
173
174 my $optionlistref = $self->{'option_list'};
175 my @optionlist = @$optionlistref;
176 my $classifieroptions = pop(@$optionlistref);
177 return $maxoffset if (!defined($classifieroptions));
178
179 # Find the length of the longest option string of this classifier
180 my $classifierargs = $classifieroptions->{'args'};
181 if (defined($classifierargs)) {
182 my $longest = &PrintUsage::find_longest_option_string($classifierargs);
183 if ($longest > $maxoffset) {
184 $maxoffset = $longest;
185 }
186 }
187
188 # Recurse up the classifier hierarchy
189 $maxoffset = $self->determine_description_offset($maxoffset);
190 $self->{'option_list'} = \@optionlist;
191 return $maxoffset;
192}
193
194
195sub print_classifier_usage
196{
197 my $self = shift(@_);
198 my $descoffset = shift(@_);
199 my $isleafclass = shift(@_);
200
201 my $optionlistref = $self->{'option_list'};
202 my @optionlist = @$optionlistref;
203 my $classifieroptions = shift(@$optionlistref);
204 return if (!defined($classifieroptions));
205
206 my $classifiername = $classifieroptions->{'name'};
207 my $classifierargs = $classifieroptions->{'args'};
208 my $classifierdesc = $classifieroptions->{'desc'};
209 # Produce the usage information using the data structure above
210 if ($isleafclass) {
211 if (defined($classifierdesc)) {
212 &gsprintf(STDERR, "$classifierdesc\n\n");
213 }
214 &gsprintf(STDERR, " {common.usage}: classify $classifiername [{common.options}]\n\n");
215
216 }
217
218 # Display the classifier options, if there are some
219 if (defined($classifierargs)) {
220 # Calculate the column offset of the option descriptions
221 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
222
223 if ($isleafclass) {
224 &gsprintf(STDERR, " {common.specific_options}:\n");
225 }
226 else {
227 &gsprintf(STDERR, " {common.general_options}:\n", $classifiername);
228 }
229
230 # Display the classifier options
231 &PrintUsage::print_options_txt($classifierargs, $optiondescoffset);
232 }
233
234 # Recurse up the classifier hierarchy
235 $self->print_classifier_usage($descoffset, 0);
236 $self->{'option_list'} = \@optionlist;
237}
238
239
240sub new {
241 my ($class) = shift (@_);
242 my ($classifierslist,$args,$hashArgOptLists) = @_;
243 push(@$classifierslist, $class);
244 my $classifier_name = (defined $classifierslist->[0]) ? $classifierslist->[0] : $class;
245
246 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
247 push(@{$hashArgOptLists->{"OptList"}},$options);
248
249
250 # Manually set $self parameters.
251 my $self = {};
252 $self->{'outhandle'} = STDERR;
253 $self->{'idnum'} = -1;
254 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
255 $self->{"info_only"} = 0;
256
257 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
258 # the args, just return the object.
259 foreach my $strArg (@{$args})
260 {
261 if($strArg eq "-gsdlinfo")
262 {
263 $self->{"info_only"} = 1;
264 return bless $self, $class;
265 }
266 }
267
268 # general options available to all classifiers
269 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
270 {
271 #print out the text usage of this classifier.
272 my $classTempClass = bless $self, $class;
273 print STDERR "<BadClassifier c=$classifier_name>\n";
274
275 &gsprintf(STDERR, "\n{BasClas.bad_general_option}\n", $classifier_name);
276 $classTempClass->print_txt_usage(""); # Use default resource bundle
277 die "\n";
278 }
279
280 delete $self->{"info_only"};
281
282# The following is no longer needed as we now ensure that when text files are
283# read in, there are straightaway made to be Unicode aware strings in Perl
284
285# # user-entered UTF-8 display meta for buttonname. Needs to be internally turned into unicode
286# # (will later be turned back to UTF-8 along with the rest of the metadata)
287# if ($self->{'buttonname'}) {
288# $self->{'buttonname'} = &unicode::convert_utf8_string_to_unicode_string($self->{'buttonname'});
289# }
290 return bless $self, $class;
291}
292
293sub init {
294 my $self = shift (@_);
295
296 $self->{'supportsmemberof'} = &supports_memberof();
297}
298
299sub set_number {
300 my $self = shift (@_);
301 my ($id) = @_;
302 $self->{'idnum'} = $id;
303}
304
305sub get_number {
306 my $self = shift (@_);
307 return $self->{'idnum'};
308}
309
310sub oid_array_delete
311{
312 my $self = shift (@_);
313 my ($delete_oid,$field) = @_;
314
315 my $outhandle = $self->{'outhandle'};
316
317 my @filtered_list = ();
318 foreach my $existing_oid (@{$self->{$field}}) {
319 if ($existing_oid eq $delete_oid) {
320 print $outhandle " Deleting old $delete_oid for ", ref $self, "\n";
321 }
322 else {
323 push(@filtered_list,$existing_oid);
324 }
325 }
326 $self->{$field} = \@filtered_list;
327}
328
329sub oid_hash_delete
330{
331 my $self = shift (@_);
332 my ($delete_oid,$field) = @_;
333
334 my $outhandle = $self->{'outhandle'};
335
336 print $outhandle " Deleting old $delete_oid for ", ref $self, "\n";
337 delete $self->{$field}->{$delete_oid};
338}
339
340sub classify {
341 my $self = shift (@_);
342 my ($doc_obj) = @_;
343
344 my $outhandle = $self->{'outhandle'};
345 &gsprintf($outhandle, "BaseClassifier::classify {common.must_be_implemented}\n");
346}
347
348sub get_classify_info {
349 my $self = shift (@_);
350
351 my $outhandle = $self->{'outhandle'};
352 &gsprintf($outhandle, "BaseClassifier::get_classify_info {common.must_be_implemented}\n");
353}
354
355sub supports_memberof {
356 my $self = shift(@_);
357
358 return "false";
359}
360
361# previously, if a buttonname wasn't specified, we just use the metadata value,
362# but with a list of metadata, we want to do something a bit nicer so that
363# eg -metadata dc.Title,Title will end up with Title as the buttonname
364
365# current algorithm - use the first element, but strip its namespace
366sub generate_title_from_metadata {
367
368 my $self = shift (@_);
369 my $metadata = shift (@_);
370
371 return "" unless defined $metadata && $metadata =~ /\S/;
372
373 my @metalist = split(/,|;/, $metadata);
374 my $firstmeta = $metalist[0];
375 if ($firstmeta =~ /\./) {
376 $firstmeta =~ s/^\w+\.//;
377 }
378 return $firstmeta;
379}
380
381
382# ex. can be at front, or it may be a list of metadata, separated by ,/;
383sub strip_ex_from_metadata {
384 my $self = shift (@_);
385 my $metadata = shift (@_);
386
387 return $metadata unless defined $metadata && $metadata =~ /\S/;
388
389 # only remove ex. metadata prefix if there are no other prefixes after it
390 $metadata =~ s/(,|;|:|\/)/$1 /g; # insert a space separator so meta names like flex.Image don't become fl.Image
391 $metadata =~ s/(^| )ex\.([^.,;:\/]+)(,|;|:|\/|$)/$1$2$3/g;
392 $metadata =~ s/(,|;|:|\/) /$1/g;
393
394 return $metadata;
395}
396
397
3981;
Note: See TracBrowser for help on using the repository browser.