source: main/trunk/greenstone2/perllib/classify/BaseClassifier.pm@ 23116

Last change on this file since 23116 was 23116, checked in by kjdon, 11 years ago

for incremental build, classifiers are not really done incrementally. Previously, we reconstructed all the docs from the database, and classified them, then processed any new/edited/deleted docs, updating the classifier as necessary. Now, we process all new/updated docs, then reconstruct the docs from the database, but only classify those not changed/deleted. This means that we are only ever adding docs to a classifier, never updating or deleting. I have removed edit_mode and all code handling deleting stuff from the classifier.

  • Property svn:keywords set to Author Date Id Revision
File size: 10.8 KB
Line 
1###########################################################################
2#
3# BaseClassifier.pm -- base class for all classifiers
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2000 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package BaseClassifier;
28
29# How a classifier works.
30#
31# For each classifier requested in the collect.cfg file, buildcol.pl creates
32# a new classifier object (a subclass of BaseClassifier). Later, it passes each
33# document object to each classifier in turn for classification.
34#
35# Four primary functions are used:
36#
37# 1. "new" is called before the documents are processed to set up the
38# classifier.
39#
40# 2. "init" is called after buildcol.pl has created the indexes etc but
41# before the documents are classified in order that the classifier might
42# set any variables it requires, etc.
43#
44# 3. "classify" is called once for each document object. The classifier
45# "classifies" each document and updates its local data accordingly.
46#
47# 4. "get_classify_info" is called after every document has been
48# classified. It collates the information about the documents and
49# stores a reference to the classifier so that Greenstone can later
50# display it.
51
52# 09/05/02 Added usage datastructure - John Thompson
53# 28/11/03 Commented out verbosity argument - John Thompson
54
55use gsprintf;
56use printusage;
57use parse2;
58
59# suppress the annoying "subroutine redefined" warning that various
60# classifiers cause under perl 5.6
61$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
62
63use strict;
64no strict 'subs'; # allow barewords (eg STDERR) as function arguments
65no strict 'refs'; # allow filehandles to be variables and viceversa
66
67my $arguments =
68 [
69 { 'name' => "buttonname",
70 'desc' => "{BasClas.buttonname}",
71 'type' => "string",
72 'deft' => "",
73 'reqd' => "no" },
74 { 'name' => "no_metadata_formatting",
75 'desc' => "{BasClas.no_metadata_formatting}",
76 'type' => "flag" },
77 { 'name' => "builddir",
78 'desc' => "{BasClas.builddir}",
79 'type' => "string",
80 'deft' => "" },
81 { 'name' => "outhandle",
82 'desc' => "{BasClas.outhandle}",
83 'type' => "string",
84 'deft' => "STDERR" },
85 { 'name' => "verbosity",
86 'desc' => "{BasClas.verbosity}",
87# 'type' => "enum",
88 'type' => "int",
89 'deft' => "2",
90 'reqd' => "no" }
91
92# { 'name' => "ignore_namespace",
93# 'desc' => "{BasClas.ignore_namespace}",
94# 'type' => "flag"}
95 ];
96
97my $options = { 'name' => "BaseClassifier",
98 'desc' => "{BasClas.desc}",
99 'abstract' => "yes",
100 'inherits' => "no",
101 'args' => $arguments };
102
103
104sub gsprintf
105{
106 return &gsprintf::gsprintf(@_);
107}
108
109
110sub print_xml_usage
111{
112 my $self = shift(@_);
113 my $header = shift(@_);
114 my $high_level_information_only = shift(@_);
115
116 # XML output is always in UTF-8
117 &gsprintf::output_strings_in_UTF8;
118
119 if ($header) {
120 &PrintUsage::print_xml_header("classify");
121 }
122 $self->print_xml($high_level_information_only);
123}
124
125
126sub print_xml
127{
128 my $self = shift(@_);
129 my $high_level_information_only = shift(@_);
130
131 my $optionlistref = $self->{'option_list'};
132 my @optionlist = @$optionlistref;
133 my $classifieroptions = shift(@$optionlistref);
134 return if (!defined($classifieroptions));
135
136 &gsprintf(STDERR, "<ClassInfo>\n");
137 &gsprintf(STDERR, " <Name>$classifieroptions->{'name'}</Name>\n");
138 my $desc = &gsprintf::lookup_string($classifieroptions->{'desc'});
139 $desc =~ s/</&amp;lt;/g; # doubly escaped
140 $desc =~ s/>/&amp;gt;/g;
141 &gsprintf(STDERR, " <Desc>$desc</Desc>\n");
142 &gsprintf(STDERR, " <Abstract>$classifieroptions->{'abstract'}</Abstract>\n");
143 &gsprintf(STDERR, " <Inherits>$classifieroptions->{'inherits'}</Inherits>\n");
144 unless (defined($high_level_information_only)) {
145 &gsprintf(STDERR, " <Arguments>\n");
146 if (defined($classifieroptions->{'args'})) {
147 &PrintUsage::print_options_xml($classifieroptions->{'args'});
148 }
149 &gsprintf(STDERR, " </Arguments>\n");
150
151 # Recurse up the classifier hierarchy
152 $self->print_xml();
153 }
154 &gsprintf(STDERR, "</ClassInfo>\n");
155}
156
157
158sub print_txt_usage
159{
160 my $self = shift(@_);
161
162 # Print the usage message for a classifier (recursively)
163 my $descoffset = $self->determine_description_offset(0);
164 $self->print_classifier_usage($descoffset, 1);
165}
166
167
168sub determine_description_offset
169{
170 my $self = shift(@_);
171 my $maxoffset = shift(@_);
172
173 my $optionlistref = $self->{'option_list'};
174 my @optionlist = @$optionlistref;
175 my $classifieroptions = pop(@$optionlistref);
176 return $maxoffset if (!defined($classifieroptions));
177
178 # Find the length of the longest option string of this classifier
179 my $classifierargs = $classifieroptions->{'args'};
180 if (defined($classifierargs)) {
181 my $longest = &PrintUsage::find_longest_option_string($classifierargs);
182 if ($longest > $maxoffset) {
183 $maxoffset = $longest;
184 }
185 }
186
187 # Recurse up the classifier hierarchy
188 $maxoffset = $self->determine_description_offset($maxoffset);
189 $self->{'option_list'} = \@optionlist;
190 return $maxoffset;
191}
192
193
194sub print_classifier_usage
195{
196 my $self = shift(@_);
197 my $descoffset = shift(@_);
198 my $isleafclass = shift(@_);
199
200 my $optionlistref = $self->{'option_list'};
201 my @optionlist = @$optionlistref;
202 my $classifieroptions = shift(@$optionlistref);
203 return if (!defined($classifieroptions));
204
205 my $classifiername = $classifieroptions->{'name'};
206 my $classifierargs = $classifieroptions->{'args'};
207 my $classifierdesc = $classifieroptions->{'desc'};
208 # Produce the usage information using the data structure above
209 if ($isleafclass) {
210 if (defined($classifierdesc)) {
211 &gsprintf(STDERR, "$classifierdesc\n\n");
212 }
213 &gsprintf(STDERR, " {common.usage}: classify $classifiername [{common.options}]\n\n");
214
215 }
216
217 # Display the classifier options, if there are some
218 if (defined($classifierargs)) {
219 # Calculate the column offset of the option descriptions
220 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
221
222 if ($isleafclass) {
223 &gsprintf(STDERR, " {common.specific_options}:\n");
224 }
225 else {
226 &gsprintf(STDERR, " {common.general_options}:\n", $classifiername);
227 }
228
229 # Display the classifier options
230 &PrintUsage::print_options_txt($classifierargs, $optiondescoffset);
231 }
232
233 # Recurse up the classifier hierarchy
234 $self->print_classifier_usage($descoffset, 0);
235 $self->{'option_list'} = \@optionlist;
236}
237
238
239sub new {
240 my ($class) = shift (@_);
241 my ($classifierslist,$args,$hashArgOptLists) = @_;
242 push(@$classifierslist, $class);
243 my $classifier_name = (defined $classifierslist->[0]) ? $classifierslist->[0] : $class;
244
245 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
246 push(@{$hashArgOptLists->{"OptList"}},$options);
247
248
249 # Manually set $self parameters.
250 my $self = {};
251 $self->{'outhandle'} = STDERR;
252 $self->{'idnum'} = -1;
253 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
254 $self->{"info_only"} = 0;
255
256 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
257 # the args, just return the object.
258 foreach my $strArg (@{$args})
259 {
260 if($strArg eq "-gsdlinfo")
261 {
262 $self->{"info_only"} = 1;
263 return bless $self, $class;
264 }
265 }
266
267 # general options available to all classifiers
268 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
269 {
270 #print out the text usage of this classifier.
271 my $classTempClass = bless $self, $class;
272 print STDERR "<BadClassifier c=$classifier_name>\n";
273
274 &gsprintf(STDERR, "\n{BasClas.bad_general_option}\n", $classifier_name);
275 $classTempClass->print_txt_usage(""); # Use default resource bundle
276 die "\n";
277 }
278
279 delete $self->{"info_only"};
280 return bless $self, $class;
281}
282
283sub init {
284 my $self = shift (@_);
285
286 $self->{'supportsmemberof'} = &supports_memberof();
287}
288
289sub set_number {
290 my $self = shift (@_);
291 my ($id) = @_;
292 $self->{'idnum'} = $id;
293}
294
295sub get_number {
296 my $self = shift (@_);
297 return $self->{'idnum'};
298}
299
300sub oid_array_delete
301{
302 my $self = shift (@_);
303 my ($delete_oid,$field) = @_;
304
305 my $outhandle = $self->{'outhandle'};
306
307 my @filtered_list = ();
308 foreach my $existing_oid (@{$self->{$field}}) {
309 if ($existing_oid eq $delete_oid) {
310 print $outhandle " Deleting old $delete_oid for ", ref $self, "\n";
311 }
312 else {
313 push(@filtered_list,$existing_oid);
314 }
315 }
316 $self->{$field} = \@filtered_list;
317}
318
319sub oid_hash_delete
320{
321 my $self = shift (@_);
322 my ($delete_oid,$field) = @_;
323
324 my $outhandle = $self->{'outhandle'};
325
326 print $outhandle " Deleting old $delete_oid for ", ref $self, "\n";
327 delete $self->{$field}->{$delete_oid};
328}
329
330sub classify {
331 my $self = shift (@_);
332 my ($doc_obj) = @_;
333
334 my $outhandle = $self->{'outhandle'};
335 &gsprintf($outhandle, "BaseClassifier::classify {common.must_be_implemented}\n");
336}
337
338sub get_classify_info {
339 my $self = shift (@_);
340
341 my $outhandle = $self->{'outhandle'};
342 &gsprintf($outhandle, "BaseClassifier::get_classify_info {common.must_be_implemented}\n");
343}
344
345sub supports_memberof {
346 my $self = shift(@_);
347
348 return "false";
349}
350
351# previously, if a buttonname wasn't specified, we just use the metadata value,
352# but with a list of metadata, we want to do something a bit nicer so that
353# eg -metadata dc.Title,Title will end up with Title as the buttonname
354
355# current algorithm - use the first element, but strip its namespace
356sub generate_title_from_metadata {
357
358 my $self = shift (@_);
359 my $metadata = shift (@_);
360
361 return "" unless defined $metadata && $metadata =~ /\S/;
362
363 my @metalist = split(/,|;/, $metadata);
364 my $firstmeta = $metalist[0];
365 if ($firstmeta =~ /\./) {
366 $firstmeta =~ s/^\w+\.//;
367 }
368 return $firstmeta;
369}
370
371# ex. can be at front, or it may be a list of metadata, separated by ,/;
372sub strip_ex_from_metadata {
373 my $self = shift (@_);
374 my $metadata = shift (@_);
375
376 return $metadata unless defined $metadata && $metadata =~ /\S/;
377
378 $metadata =~ s/^ex\.//;
379 $metadata =~ s/([,;:\/])ex\./$1/g;
380 return $metadata;
381}
382
383
3841;
Note: See TracBrowser for help on using the repository browser.