source: main/trunk/greenstone2/perllib/classify/BaseClassifier.pm@ 24193

Last change on this file since 24193 was 24193, checked in by ak19, 13 years ago

Non-English characters in the user-entered buttonname of all classifiers need to be preserved. BaseClassifier now does this, for which it has to assume that the user-entered buttonname is in UTF-8

  • Property svn:keywords set to Author Date Id Revision
File size: 11.1 KB
RevLine 
[1483]1###########################################################################
2#
[17209]3# BaseClassifier.pm -- base class for all classifiers
[1884]4#
[1483]5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2000 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
[17209]27package BaseClassifier;
[1483]28
[1884]29# How a classifier works.
30#
31# For each classifier requested in the collect.cfg file, buildcol.pl creates
[17209]32# a new classifier object (a subclass of BaseClassifier). Later, it passes each
[1884]33# document object to each classifier in turn for classification.
34#
35# Four primary functions are used:
36#
37# 1. "new" is called before the documents are processed to set up the
38# classifier.
39#
40# 2. "init" is called after buildcol.pl has created the indexes etc but
41# before the documents are classified in order that the classifier might
42# set any variables it requires, etc.
43#
44# 3. "classify" is called once for each document object. The classifier
45# "classifies" each document and updates its local data accordingly.
46#
47# 4. "get_classify_info" is called after every document has been
48# classified. It collates the information about the documents and
49# stores a reference to the classifier so that Greenstone can later
50# display it.
51
[3540]52# 09/05/02 Added usage datastructure - John Thompson
[6111]53# 28/11/03 Commented out verbosity argument - John Thompson
[3540]54
[5645]55use gsprintf;
[4778]56use printusage;
[10218]57use parse2;
[24193]58use unicode;
[1483]59
[10229]60# suppress the annoying "subroutine redefined" warning that various
61# classifiers cause under perl 5.6
62$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
63
64use strict;
65no strict 'subs'; # allow barewords (eg STDERR) as function arguments
[10253]66no strict 'refs'; # allow filehandles to be variables and viceversa
[10229]67
[3540]68my $arguments =
[10630]69 [
[11541]70 { 'name' => "buttonname",
71 'desc' => "{BasClas.buttonname}",
72 'type' => "string",
73 'deft' => "",
74 'reqd' => "no" },
[10630]75 { 'name' => "no_metadata_formatting",
76 'desc' => "{BasClas.no_metadata_formatting}",
77 'type' => "flag" },
78 { 'name' => "builddir",
[4873]79 'desc' => "{BasClas.builddir}",
[4759]80 'type' => "string",
81 'deft' => "" },
82 { 'name' => "outhandle",
[4873]83 'desc' => "{BasClas.outhandle}",
[4759]84 'type' => "string",
85 'deft' => "STDERR" },
[10218]86 { 'name' => "verbosity",
87 'desc' => "{BasClas.verbosity}",
[6111]88# 'type' => "enum",
[10218]89 'type' => "int",
90 'deft' => "2",
[10630]91 'reqd' => "no" }
92
[6408]93# { 'name' => "ignore_namespace",
94# 'desc' => "{BasClas.ignore_namespace}",
95# 'type' => "flag"}
96 ];
[3540]97
[17209]98my $options = { 'name' => "BaseClassifier",
[5645]99 'desc' => "{BasClas.desc}",
[6408]100 'abstract' => "yes",
101 'inherits' => "no",
[4759]102 'args' => $arguments };
[3540]103
[4778]104
[5645]105sub gsprintf
106{
107 return &gsprintf::gsprintf(@_);
108}
109
110
[4778]111sub print_xml_usage
112{
[8716]113 my $self = shift(@_);
[11681]114 my $header = shift(@_);
[12624]115 my $high_level_information_only = shift(@_);
[11681]116
[6945]117 # XML output is always in UTF-8
118 &gsprintf::output_strings_in_UTF8;
119
[11681]120 if ($header) {
121 &PrintUsage::print_xml_header("classify");
122 }
[12624]123 $self->print_xml($high_level_information_only);
[3540]124}
125
[4759]126
[4778]127sub print_xml
128{
[8716]129 my $self = shift(@_);
[12624]130 my $high_level_information_only = shift(@_);
[4778]131
[8716]132 my $optionlistref = $self->{'option_list'};
133 my @optionlist = @$optionlistref;
[10229]134 my $classifieroptions = shift(@$optionlistref);
[4778]135 return if (!defined($classifieroptions));
136
[6987]137 &gsprintf(STDERR, "<ClassInfo>\n");
138 &gsprintf(STDERR, " <Name>$classifieroptions->{'name'}</Name>\n");
[7023]139 my $desc = &gsprintf::lookup_string($classifieroptions->{'desc'});
140 $desc =~ s/</&amp;lt;/g; # doubly escaped
141 $desc =~ s/>/&amp;gt;/g;
142 &gsprintf(STDERR, " <Desc>$desc</Desc>\n");
[6987]143 &gsprintf(STDERR, " <Abstract>$classifieroptions->{'abstract'}</Abstract>\n");
144 &gsprintf(STDERR, " <Inherits>$classifieroptions->{'inherits'}</Inherits>\n");
[12624]145 unless (defined($high_level_information_only)) {
146 &gsprintf(STDERR, " <Arguments>\n");
147 if (defined($classifieroptions->{'args'})) {
148 &PrintUsage::print_options_xml($classifieroptions->{'args'});
149 }
150 &gsprintf(STDERR, " </Arguments>\n");
151
152 # Recurse up the classifier hierarchy
153 $self->print_xml();
[4759]154 }
[6987]155 &gsprintf(STDERR, "</ClassInfo>\n");
[3540]156}
157
[4759]158
[4778]159sub print_txt_usage
[4759]160{
[8716]161 my $self = shift(@_);
[4759]162
163 # Print the usage message for a classifier (recursively)
[8716]164 my $descoffset = $self->determine_description_offset(0);
[6925]165 $self->print_classifier_usage($descoffset, 1);
[4759]166}
167
168
169sub determine_description_offset
170{
[8716]171 my $self = shift(@_);
172 my $maxoffset = shift(@_);
[4759]173
[8716]174 my $optionlistref = $self->{'option_list'};
175 my @optionlist = @$optionlistref;
176 my $classifieroptions = pop(@$optionlistref);
[4759]177 return $maxoffset if (!defined($classifieroptions));
178
179 # Find the length of the longest option string of this classifier
[8716]180 my $classifierargs = $classifieroptions->{'args'};
[4759]181 if (defined($classifierargs)) {
[8716]182 my $longest = &PrintUsage::find_longest_option_string($classifierargs);
[4778]183 if ($longest > $maxoffset) {
184 $maxoffset = $longest;
[4759]185 }
186 }
187
188 # Recurse up the classifier hierarchy
189 $maxoffset = $self->determine_description_offset($maxoffset);
190 $self->{'option_list'} = \@optionlist;
191 return $maxoffset;
192}
193
194
195sub print_classifier_usage
196{
[8716]197 my $self = shift(@_);
198 my $descoffset = shift(@_);
199 my $isleafclass = shift(@_);
[4759]200
[8716]201 my $optionlistref = $self->{'option_list'};
202 my @optionlist = @$optionlistref;
[10229]203 my $classifieroptions = shift(@$optionlistref);
[4759]204 return if (!defined($classifieroptions));
205
[8716]206 my $classifiername = $classifieroptions->{'name'};
207 my $classifierargs = $classifieroptions->{'args'};
208 my $classifierdesc = $classifieroptions->{'desc'};
[4759]209 # Produce the usage information using the data structure above
210 if ($isleafclass) {
[6932]211 if (defined($classifierdesc)) {
212 &gsprintf(STDERR, "$classifierdesc\n\n");
213 }
[5645]214 &gsprintf(STDERR, " {common.usage}: classify $classifiername [{common.options}]\n\n");
[6932]215
[4759]216 }
217
218 # Display the classifier options, if there are some
219 if (defined($classifierargs)) {
220 # Calculate the column offset of the option descriptions
[8716]221 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
[4759]222
223 if ($isleafclass) {
[5645]224 &gsprintf(STDERR, " {common.specific_options}:\n");
[4759]225 }
226 else {
[5645]227 &gsprintf(STDERR, " {common.general_options}:\n", $classifiername);
[4759]228 }
229
230 # Display the classifier options
[6925]231 &PrintUsage::print_options_txt($classifierargs, $optiondescoffset);
[4759]232 }
233
234 # Recurse up the classifier hierarchy
[6925]235 $self->print_classifier_usage($descoffset, 0);
[4759]236 $self->{'option_list'} = \@optionlist;
237}
238
239
[1483]240sub new {
[10218]241 my ($class) = shift (@_);
[10229]242 my ($classifierslist,$args,$hashArgOptLists) = @_;
[10218]243 push(@$classifierslist, $class);
244 my $classifier_name = (defined $classifierslist->[0]) ? $classifierslist->[0] : $class;
[10223]245
[17209]246 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
247 push(@{$hashArgOptLists->{"OptList"}},$options);
[1483]248
[10218]249
250 # Manually set $self parameters.
[1483]251 my $self = {};
[1839]252 $self->{'outhandle'} = STDERR;
[8221]253 $self->{'idnum'} = -1;
[10218]254 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
255 $self->{"info_only"} = 0;
[3540]256
[10229]257 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
258 # the args, just return the object.
259 foreach my $strArg (@{$args})
[10218]260 {
[10229]261 if($strArg eq "-gsdlinfo")
[10218]262 {
263 $self->{"info_only"} = 1;
[10229]264 return bless $self, $class;
[10218]265 }
266 }
[5645]267
[10229]268 # general options available to all classifiers
[12546]269 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
[10218]270 {
[10229]271 #print out the text usage of this classifier.
[10218]272 my $classTempClass = bless $self, $class;
[10620]273 print STDERR "<BadClassifier c=$classifier_name>\n";
274
[10229]275 &gsprintf(STDERR, "\n{BasClas.bad_general_option}\n", $classifier_name);
276 $classTempClass->print_txt_usage(""); # Use default resource bundle
277 die "\n";
[1483]278 }
[10229]279
280 delete $self->{"info_only"};
[24193]281 # user-entered UTF-8 display meta for buttonname. Needs to be internally turned into unicode
282 # (will later be turned back to UTF-8 along with the rest of the metadata)
283 if ($self->{'buttonname'}) {
284 $self->{'buttonname'} = &unicode::convert_utf8_string_to_unicode_string($self->{'buttonname'});
285 }
[10229]286 return bless $self, $class;
[1483]287}
288
289sub init {
290 my $self = shift (@_);
[8221]291
292 $self->{'supportsmemberof'} = &supports_memberof();
[1483]293}
294
[8221]295sub set_number {
296 my $self = shift (@_);
297 my ($id) = @_;
298 $self->{'idnum'} = $id;
299}
300
301sub get_number {
302 my $self = shift (@_);
303 return $self->{'idnum'};
304}
305
[18455]306sub oid_array_delete
307{
308 my $self = shift (@_);
309 my ($delete_oid,$field) = @_;
310
311 my $outhandle = $self->{'outhandle'};
312
313 my @filtered_list = ();
314 foreach my $existing_oid (@{$self->{$field}}) {
315 if ($existing_oid eq $delete_oid) {
[18555]316 print $outhandle " Deleting old $delete_oid for ", ref $self, "\n";
[18455]317 }
318 else {
319 push(@filtered_list,$existing_oid);
320 }
321 }
322 $self->{$field} = \@filtered_list;
323}
324
325sub oid_hash_delete
326{
327 my $self = shift (@_);
328 my ($delete_oid,$field) = @_;
329
330 my $outhandle = $self->{'outhandle'};
331
[18555]332 print $outhandle " Deleting old $delete_oid for ", ref $self, "\n";
[18455]333 delete $self->{$field}->{$delete_oid};
334}
335
[1483]336sub classify {
337 my $self = shift (@_);
[23116]338 my ($doc_obj) = @_;
[1483]339
340 my $outhandle = $self->{'outhandle'};
[17209]341 &gsprintf($outhandle, "BaseClassifier::classify {common.must_be_implemented}\n");
[1483]342}
343
344sub get_classify_info {
345 my $self = shift (@_);
346
347 my $outhandle = $self->{'outhandle'};
[17209]348 &gsprintf($outhandle, "BaseClassifier::get_classify_info {common.must_be_implemented}\n");
[1483]349}
350
[8221]351sub supports_memberof {
352 my $self = shift(@_);
353
354 return "false";
355}
356
[7580]357# previously, if a buttonname wasn't specified, we just use the metadata value,
358# but with a list of metadata, we want to do something a bit nicer so that
359# eg -metadata dc.Title,Title will end up with Title as the buttonname
[6956]360
[7580]361# current algorithm - use the first element, but strip its namespace
362sub generate_title_from_metadata {
363
364 my $self = shift (@_);
365 my $metadata = shift (@_);
[10253]366
367 return "" unless defined $metadata && $metadata =~ /\S/;
[7580]368
[20008]369 my @metalist = split(/,|;/, $metadata);
[7580]370 my $firstmeta = $metalist[0];
371 if ($firstmeta =~ /\./) {
372 $firstmeta =~ s/^\w+\.//;
373 }
374 return $firstmeta;
375}
376
[24193]377
[20426]378# ex. can be at front, or it may be a list of metadata, separated by ,/;
379sub strip_ex_from_metadata {
380 my $self = shift (@_);
381 my $metadata = shift (@_);
382
383 return $metadata unless defined $metadata && $metadata =~ /\S/;
384
385 $metadata =~ s/^ex\.//;
[20453]386 $metadata =~ s/([,;:\/])ex\./$1/g;
[20426]387 return $metadata;
388}
389
390
[7580]3911;
Note: See TracBrowser for help on using the repository browser.