source: main/trunk/greenstone2/perllib/classify/BaseClassifier.pm@ 30358

Last change on this file since 30358 was 24745, checked in by ak19, 13 years ago

Removed code that was added in a recent commit and is no longer used: to do with getting accents in classifier buttonnames to be preserved. The previous fix was to treat buttonname as raw bytedata and call the old method to convert to unicode. Dr Bainbridge then fixed it much better: colcfg, which stores the buttonname, is now read in as unicode aware strings, so no conversion to unicode is necessary anymore.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.1 KB
RevLine 
[1483]1###########################################################################
2#
[17209]3# BaseClassifier.pm -- base class for all classifiers
[1884]4#
[1483]5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2000 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
[17209]27package BaseClassifier;
[1483]28
[1884]29# How a classifier works.
30#
31# For each classifier requested in the collect.cfg file, buildcol.pl creates
[17209]32# a new classifier object (a subclass of BaseClassifier). Later, it passes each
[1884]33# document object to each classifier in turn for classification.
34#
35# Four primary functions are used:
36#
37# 1. "new" is called before the documents are processed to set up the
38# classifier.
39#
40# 2. "init" is called after buildcol.pl has created the indexes etc but
41# before the documents are classified in order that the classifier might
42# set any variables it requires, etc.
43#
44# 3. "classify" is called once for each document object. The classifier
45# "classifies" each document and updates its local data accordingly.
46#
47# 4. "get_classify_info" is called after every document has been
48# classified. It collates the information about the documents and
49# stores a reference to the classifier so that Greenstone can later
50# display it.
51
[3540]52# 09/05/02 Added usage datastructure - John Thompson
[6111]53# 28/11/03 Commented out verbosity argument - John Thompson
[3540]54
[5645]55use gsprintf;
[4778]56use printusage;
[10218]57use parse2;
[1483]58
[10229]59# suppress the annoying "subroutine redefined" warning that various
60# classifiers cause under perl 5.6
61$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
62
63use strict;
64no strict 'subs'; # allow barewords (eg STDERR) as function arguments
[10253]65no strict 'refs'; # allow filehandles to be variables and viceversa
[10229]66
[3540]67my $arguments =
[10630]68 [
[11541]69 { 'name' => "buttonname",
70 'desc' => "{BasClas.buttonname}",
71 'type' => "string",
72 'deft' => "",
73 'reqd' => "no" },
[10630]74 { 'name' => "no_metadata_formatting",
75 'desc' => "{BasClas.no_metadata_formatting}",
76 'type' => "flag" },
77 { 'name' => "builddir",
[4873]78 'desc' => "{BasClas.builddir}",
[4759]79 'type' => "string",
80 'deft' => "" },
81 { 'name' => "outhandle",
[4873]82 'desc' => "{BasClas.outhandle}",
[4759]83 'type' => "string",
84 'deft' => "STDERR" },
[10218]85 { 'name' => "verbosity",
86 'desc' => "{BasClas.verbosity}",
[6111]87# 'type' => "enum",
[10218]88 'type' => "int",
89 'deft' => "2",
[10630]90 'reqd' => "no" }
91
[6408]92# { 'name' => "ignore_namespace",
93# 'desc' => "{BasClas.ignore_namespace}",
94# 'type' => "flag"}
95 ];
[3540]96
[17209]97my $options = { 'name' => "BaseClassifier",
[5645]98 'desc' => "{BasClas.desc}",
[6408]99 'abstract' => "yes",
100 'inherits' => "no",
[4759]101 'args' => $arguments };
[3540]102
[4778]103
[5645]104sub gsprintf
105{
106 return &gsprintf::gsprintf(@_);
107}
108
109
[4778]110sub print_xml_usage
111{
[8716]112 my $self = shift(@_);
[11681]113 my $header = shift(@_);
[12624]114 my $high_level_information_only = shift(@_);
[11681]115
[6945]116 # XML output is always in UTF-8
117 &gsprintf::output_strings_in_UTF8;
118
[11681]119 if ($header) {
120 &PrintUsage::print_xml_header("classify");
121 }
[12624]122 $self->print_xml($high_level_information_only);
[3540]123}
124
[4759]125
[4778]126sub print_xml
127{
[8716]128 my $self = shift(@_);
[12624]129 my $high_level_information_only = shift(@_);
[4778]130
[8716]131 my $optionlistref = $self->{'option_list'};
132 my @optionlist = @$optionlistref;
[10229]133 my $classifieroptions = shift(@$optionlistref);
[4778]134 return if (!defined($classifieroptions));
135
[6987]136 &gsprintf(STDERR, "<ClassInfo>\n");
137 &gsprintf(STDERR, " <Name>$classifieroptions->{'name'}</Name>\n");
[7023]138 my $desc = &gsprintf::lookup_string($classifieroptions->{'desc'});
139 $desc =~ s/</&amp;lt;/g; # doubly escaped
140 $desc =~ s/>/&amp;gt;/g;
141 &gsprintf(STDERR, " <Desc>$desc</Desc>\n");
[6987]142 &gsprintf(STDERR, " <Abstract>$classifieroptions->{'abstract'}</Abstract>\n");
143 &gsprintf(STDERR, " <Inherits>$classifieroptions->{'inherits'}</Inherits>\n");
[12624]144 unless (defined($high_level_information_only)) {
145 &gsprintf(STDERR, " <Arguments>\n");
146 if (defined($classifieroptions->{'args'})) {
147 &PrintUsage::print_options_xml($classifieroptions->{'args'});
148 }
149 &gsprintf(STDERR, " </Arguments>\n");
150
151 # Recurse up the classifier hierarchy
152 $self->print_xml();
[4759]153 }
[6987]154 &gsprintf(STDERR, "</ClassInfo>\n");
[3540]155}
156
[4759]157
[4778]158sub print_txt_usage
[4759]159{
[8716]160 my $self = shift(@_);
[4759]161
162 # Print the usage message for a classifier (recursively)
[8716]163 my $descoffset = $self->determine_description_offset(0);
[6925]164 $self->print_classifier_usage($descoffset, 1);
[4759]165}
166
167
168sub determine_description_offset
169{
[8716]170 my $self = shift(@_);
171 my $maxoffset = shift(@_);
[4759]172
[8716]173 my $optionlistref = $self->{'option_list'};
174 my @optionlist = @$optionlistref;
175 my $classifieroptions = pop(@$optionlistref);
[4759]176 return $maxoffset if (!defined($classifieroptions));
177
178 # Find the length of the longest option string of this classifier
[8716]179 my $classifierargs = $classifieroptions->{'args'};
[4759]180 if (defined($classifierargs)) {
[8716]181 my $longest = &PrintUsage::find_longest_option_string($classifierargs);
[4778]182 if ($longest > $maxoffset) {
183 $maxoffset = $longest;
[4759]184 }
185 }
186
187 # Recurse up the classifier hierarchy
188 $maxoffset = $self->determine_description_offset($maxoffset);
189 $self->{'option_list'} = \@optionlist;
190 return $maxoffset;
191}
192
193
194sub print_classifier_usage
195{
[8716]196 my $self = shift(@_);
197 my $descoffset = shift(@_);
198 my $isleafclass = shift(@_);
[4759]199
[8716]200 my $optionlistref = $self->{'option_list'};
201 my @optionlist = @$optionlistref;
[10229]202 my $classifieroptions = shift(@$optionlistref);
[4759]203 return if (!defined($classifieroptions));
204
[8716]205 my $classifiername = $classifieroptions->{'name'};
206 my $classifierargs = $classifieroptions->{'args'};
207 my $classifierdesc = $classifieroptions->{'desc'};
[4759]208 # Produce the usage information using the data structure above
209 if ($isleafclass) {
[6932]210 if (defined($classifierdesc)) {
211 &gsprintf(STDERR, "$classifierdesc\n\n");
212 }
[5645]213 &gsprintf(STDERR, " {common.usage}: classify $classifiername [{common.options}]\n\n");
[6932]214
[4759]215 }
216
217 # Display the classifier options, if there are some
218 if (defined($classifierargs)) {
219 # Calculate the column offset of the option descriptions
[8716]220 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
[4759]221
222 if ($isleafclass) {
[5645]223 &gsprintf(STDERR, " {common.specific_options}:\n");
[4759]224 }
225 else {
[5645]226 &gsprintf(STDERR, " {common.general_options}:\n", $classifiername);
[4759]227 }
228
229 # Display the classifier options
[6925]230 &PrintUsage::print_options_txt($classifierargs, $optiondescoffset);
[4759]231 }
232
233 # Recurse up the classifier hierarchy
[6925]234 $self->print_classifier_usage($descoffset, 0);
[4759]235 $self->{'option_list'} = \@optionlist;
236}
237
238
[1483]239sub new {
[10218]240 my ($class) = shift (@_);
[10229]241 my ($classifierslist,$args,$hashArgOptLists) = @_;
[10218]242 push(@$classifierslist, $class);
243 my $classifier_name = (defined $classifierslist->[0]) ? $classifierslist->[0] : $class;
[10223]244
[17209]245 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
246 push(@{$hashArgOptLists->{"OptList"}},$options);
[1483]247
[10218]248
249 # Manually set $self parameters.
[1483]250 my $self = {};
[1839]251 $self->{'outhandle'} = STDERR;
[8221]252 $self->{'idnum'} = -1;
[10218]253 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
254 $self->{"info_only"} = 0;
[3540]255
[10229]256 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
257 # the args, just return the object.
258 foreach my $strArg (@{$args})
[10218]259 {
[10229]260 if($strArg eq "-gsdlinfo")
[10218]261 {
262 $self->{"info_only"} = 1;
[10229]263 return bless $self, $class;
[10218]264 }
265 }
[5645]266
[10229]267 # general options available to all classifiers
[12546]268 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
[10218]269 {
[10229]270 #print out the text usage of this classifier.
[10218]271 my $classTempClass = bless $self, $class;
[10620]272 print STDERR "<BadClassifier c=$classifier_name>\n";
273
[10229]274 &gsprintf(STDERR, "\n{BasClas.bad_general_option}\n", $classifier_name);
275 $classTempClass->print_txt_usage(""); # Use default resource bundle
276 die "\n";
[1483]277 }
[10229]278
279 delete $self->{"info_only"};
[24737]280
[24745]281# We now ensure that when text files (and even colcfg) are read in,
282# they are straightaway made to be Unicode aware strings in Perl
[24737]283
[10229]284 return bless $self, $class;
[1483]285}
286
287sub init {
288 my $self = shift (@_);
[8221]289
290 $self->{'supportsmemberof'} = &supports_memberof();
[1483]291}
292
[8221]293sub set_number {
294 my $self = shift (@_);
295 my ($id) = @_;
296 $self->{'idnum'} = $id;
297}
298
299sub get_number {
300 my $self = shift (@_);
301 return $self->{'idnum'};
302}
303
[18455]304sub oid_array_delete
305{
306 my $self = shift (@_);
307 my ($delete_oid,$field) = @_;
308
309 my $outhandle = $self->{'outhandle'};
310
311 my @filtered_list = ();
312 foreach my $existing_oid (@{$self->{$field}}) {
313 if ($existing_oid eq $delete_oid) {
[18555]314 print $outhandle " Deleting old $delete_oid for ", ref $self, "\n";
[18455]315 }
316 else {
317 push(@filtered_list,$existing_oid);
318 }
319 }
320 $self->{$field} = \@filtered_list;
321}
322
323sub oid_hash_delete
324{
325 my $self = shift (@_);
326 my ($delete_oid,$field) = @_;
327
328 my $outhandle = $self->{'outhandle'};
329
[18555]330 print $outhandle " Deleting old $delete_oid for ", ref $self, "\n";
[18455]331 delete $self->{$field}->{$delete_oid};
332}
333
[1483]334sub classify {
335 my $self = shift (@_);
[23116]336 my ($doc_obj) = @_;
[1483]337
338 my $outhandle = $self->{'outhandle'};
[17209]339 &gsprintf($outhandle, "BaseClassifier::classify {common.must_be_implemented}\n");
[1483]340}
341
342sub get_classify_info {
343 my $self = shift (@_);
344
345 my $outhandle = $self->{'outhandle'};
[17209]346 &gsprintf($outhandle, "BaseClassifier::get_classify_info {common.must_be_implemented}\n");
[1483]347}
348
[8221]349sub supports_memberof {
350 my $self = shift(@_);
351
352 return "false";
353}
354
[7580]355# previously, if a buttonname wasn't specified, we just use the metadata value,
356# but with a list of metadata, we want to do something a bit nicer so that
357# eg -metadata dc.Title,Title will end up with Title as the buttonname
[6956]358
[7580]359# current algorithm - use the first element, but strip its namespace
360sub generate_title_from_metadata {
361
362 my $self = shift (@_);
363 my $metadata = shift (@_);
[10253]364
365 return "" unless defined $metadata && $metadata =~ /\S/;
[7580]366
[20008]367 my @metalist = split(/,|;/, $metadata);
[7580]368 my $firstmeta = $metalist[0];
369 if ($firstmeta =~ /\./) {
370 $firstmeta =~ s/^\w+\.//;
371 }
372 return $firstmeta;
373}
374
[24193]375
[20426]376# ex. can be at front, or it may be a list of metadata, separated by ,/;
377sub strip_ex_from_metadata {
378 my $self = shift (@_);
379 my $metadata = shift (@_);
380
381 return $metadata unless defined $metadata && $metadata =~ /\S/;
382
[24404]383 # only remove ex. metadata prefix if there are no other prefixes after it
384 $metadata =~ s/(,|;|:|\/)/$1 /g; # insert a space separator so meta names like flex.Image don't become fl.Image
385 $metadata =~ s/(^| )ex\.([^.,;:\/]+)(,|;|:|\/|$)/$1$2$3/g;
386 $metadata =~ s/(,|;|:|\/) /$1/g;
387
[20426]388 return $metadata;
389}
390
391
[7580]3921;
Note: See TracBrowser for help on using the repository browser.