source: main/trunk/greenstone2/perllib/classify.pm@ 27303

Last change on this file since 27303 was 27303, checked in by jmt12, 11 years ago

Replacing hardcoded additions to INC and PATH environment variables with conditional ones - this allows us to use the order of values in these variables for precedence, thus allows better support for extensions that override classifiers, plugins etc. ENV and PATH functions already exists in util, but augmentINC() is a new function

  • Property svn:keywords set to Author Date Id Revision
File size: 14.5 KB
Line 
1###########################################################################
2#
3# classify.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# functions to handle classifiers
27
28package classify;
29
30require util;
31use FileUtils;
32require AllList;
33
34use dbutil;
35use gsprintf;
36use strict; no strict 'subs';
37
38
39sub gsprintf
40{
41 return &gsprintf::gsprintf(@_);
42}
43
44
45sub load_classifier_for_info {
46 my ($classifier) = shift @_;
47
48 # find the classifier
49 # - used to have hardcoded list of places to load classifier from. We
50 # should, instead, try loading from all of the perllib places on the
51 # library path, as that improves support for extensions. Special cases
52 # needed for collection specific and custom classifier. [jmt12]
53 my @possible_class_paths;
54 if (defined($ENV{'GSDLCOLLECTION'}))
55 {
56 push(@possible_class_paths, &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'custom', $ENV{'GSDLCOLLECTION'}, 'perllib', 'classify', $classifier . '.pm'));
57 }
58 # (why does GSDLCOLLECTDIR get set to GSDLHOME for classinfo calls?)
59 if ($ENV{'GSDLCOLLECTDIR'} ne $ENV{'GSDLHOME'})
60 {
61 push(@possible_class_paths, &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib', 'classify', $classifier . '.pm'));
62 }
63 foreach my $library_path (@INC)
64 {
65 # only interested in classify paths found in the library paths
66 if ($library_path =~ /classify$/)
67 {
68 push(@possible_class_paths, &FileUtils::filenameConcatenate($library_path, $classifier . '.pm'));
69 }
70 }
71 my $found_class = 0;
72 foreach my $possible_class_path (@possible_class_paths)
73 {
74 if (-e $possible_class_path)
75 {
76 require $possible_class_path;
77 $found_class = 1;
78 last;
79 }
80 }
81 if (!$found_class)
82 {
83 &gsprintf(STDERR, "{classify.could_not_find_classifier}\n", $classifier) && die "\n";
84 }
85
86 my ($classobj);
87 my $options = "-gsdlinfo";
88 eval ("\$classobj = new \$classifier([],[$options])");
89 die "$@" if $@;
90
91 return $classobj;
92}
93
94sub load_classifiers {
95 my ($classify_list, $build_dir, $outhandle) = @_;
96 my @classify_objects = ();
97 my $classify_number = 1;
98
99 my $colclassdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},"perllib/classify");
100 # - ensure colclassdir doesn't already exist in INC before adding, other-
101 # wise we risk clobbering classifier inheritence implied by order of paths
102 # in INC [jmt12]
103 my $inc_paths = join(':',@INC);
104 if ($inc_paths !~ /$colclassdir/)
105 {
106 unshift (@INC, $colclassdir);
107 }
108
109 foreach my $classifyoption (@$classify_list) {
110
111 # get the classifier name
112 my $classname = shift @$classifyoption;
113 next unless defined $classname;
114
115 # find the classifier
116 # - replaced as explained in load_classifier_for_info() [jmt12]
117 my @possible_class_paths;
118 if (defined($ENV{'GSDLCOLLECTION'}))
119 {
120 push(@possible_class_paths, &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'custom', $ENV{'GSDLCOLLECTION'}, 'perllib', 'classify', $classname . '.pm'));
121 }
122 # (why does GSDLCOLLECTDIR get set to GSDLHOME for classinfo calls?)
123 if ($ENV{'GSDLCOLLECTDIR'} ne $ENV{'GSDLHOME'})
124 {
125 push(@possible_class_paths,&FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib', 'classify', $classname . '.pm'));
126 }
127 foreach my $library_path (@INC)
128 {
129 # only interested in classify paths found in the library paths
130 if ($library_path =~ /classify$/)
131 {
132 push(@possible_class_paths, &FileUtils::filenameConcatenate($library_path, $classname . '.pm'));
133 }
134 }
135 my $found_class = 0;
136 foreach my $possible_class_path (@possible_class_paths)
137 {
138 if (-e $possible_class_path)
139 {
140 require $possible_class_path;
141 $found_class = 1;
142 last;
143 }
144 }
145 if (!$found_class)
146 {
147 &gsprintf(STDERR, "{classify.could_not_find_classifier}\n", $classname) && die "\n";
148 }
149
150 # create the classify object
151 my ($classobj);
152
153 my @newoptions;
154
155 # do these first so they can be overriden by user supplied options
156 push @newoptions, "-builddir", "$build_dir" if ($build_dir);
157 push @newoptions, "-outhandle", "$outhandle" if ($outhandle);
158 push @newoptions, "-verbosity", "2";
159
160 # backwards compatability hack: if the classifier options are
161 # in "x=y" format, convert them to parsearg ("-x y") format.
162 my ($opt, $key, $value);
163 foreach $opt (@$classifyoption) {
164 # if ($opt =~ /^(\w+)=(.*)$/) {
165 # push @newoptions, "-$1", $2;
166 # } else {
167 push @newoptions, $opt;
168 #}
169 }
170
171 map { $_ = "\"$_\""; } @newoptions;
172 my $options .= join (",", @newoptions);
173
174
175 eval ("\$classobj = new \$classname([],[$options])");
176 die "$@" if $@;
177
178 $classobj->set_number($classify_number);
179 $classify_number ++;
180
181 # add this object to the list
182 push (@classify_objects, $classobj);
183 }
184
185 my ($classobj);
186 eval ("\$classobj = new AllList()");
187 die "$@" if $@;
188 push (@classify_objects, $classobj);
189
190 return \@classify_objects;
191}
192
193# init_classifiers resets all the classifiers and readys them to process
194# the documents.
195sub init_classifiers {
196 my ($classifiers) = @_;
197
198 foreach my $classobj (@$classifiers) {
199 $classobj->init();
200 }
201}
202
203
204
205# takes a hashref containing the metadata for an infodb entry, and extracts
206# the childrens numbers (from the 'contains' entry).
207# assumes format is ".1;".2;".3
208sub get_children {
209 my ($doc_db_hash) = @_;
210
211 my $children = undef;
212
213 my $contains = $doc_db_hash->{'contains'};
214 if (defined ($contains)) {
215 $contains =~ s/\@$//; #remove trailing @
216 $contains =~ s/^\"\.//; #remove initial ".
217 @$children = split /\;\"\./, $contains;
218 }
219
220 return $children;
221}
222
223
224sub recurse_sections {
225 my ($doc_obj, $children, $parentoid, $parentsection, $database_recs) = @_;
226
227 return if (!defined $children);
228
229 foreach my $child (sort { $a <=> $b} @$children) {
230 $doc_obj->create_named_section("$parentsection.$child");
231 my $doc_db_rec = $database_recs->{"$parentoid.$child"};
232 my $doc_db_hash = db_rec_to_hash($doc_db_rec);
233
234 # get child's children
235 my $newchildren = &get_children($doc_db_hash);
236
237 # add content for current section
238 add_section_content($doc_obj, "$parentsection.$child", $doc_db_hash);
239
240 # process all the children if there are any
241 if (defined ($newchildren))
242 {
243 recurse_sections($doc_obj, $newchildren, "$parentoid.$child",
244 "$parentsection.$child", $database_recs);
245 }
246 }
247}
248
249
250sub add_section_content {
251 my ($doc_obj, $cursection, $doc_db_hash) = @_;
252
253 foreach my $key (keys %$doc_db_hash) {
254 #don't need to store these metadata
255 next if $key =~ /(thistype|childtype|contains|docnum|doctype|classifytype)/i;
256 # but do want things like hastxt and archivedir
257 my @items = split /@/, $doc_db_hash->{$key};
258 # metadata is all from the info database so should already be in utf8
259 map {$doc_obj->add_utf8_metadata ($cursection, $key, $_); } @items;
260
261 }
262}
263
264
265# gets all the metadata from an infodb entry, and puts it into a hashref
266sub db_rec_to_hash {
267
268 my ($infodb_str_ref) = @_;
269
270 my $hashref = {};
271
272 my @entries = split(/\n/, $infodb_str_ref);
273 foreach my $entry (@entries) {
274 my($key, $value) = ($entry =~ /^<([^>]*)>(.*?)$/ );
275 $hashref->{$key} .= '@' if defined $hashref->{$key};
276 $hashref->{$key} .= $value;
277
278 }
279
280 return $hashref;
281}
282
283
284sub reconstruct_doc_objs_metadata
285{
286 my $infodb_type = shift(@_);
287 my $infodb_file_path = shift(@_);
288 my $database_recs = shift(@_);
289
290 # dig out top level doc sections
291 my %top_sections = ();
292 my %top_docnums = ();
293 foreach my $key ( keys %$database_recs )
294 {
295 my $md_rec = $database_recs->{$key};
296 my $md_hash = db_rec_to_hash($md_rec);
297
298 if ((defined $md_hash->{'doctype'}) && ($md_hash->{'doctype'} eq "doc")) {
299 next if ($key =~ m/\./);
300 $top_sections{$key} = $md_hash;
301 $top_docnums{$key} = $md_hash->{'docnum'};
302 }
303 }
304
305 # for greenstone document objects based on metadata in database file
306 my @all_docs = ();
307 # we need to make sure the documents were processed in the same order as
308 # before, so sort based on their docnums
309 foreach my $oid ( sort { $top_docnums{$a} <=> $top_docnums{$b} } keys %top_sections )
310 {
311 my $doc_db_hash = $top_sections{$oid};
312
313 my $doc_obj = new doc();
314 $doc_obj->set_OID($oid);
315 my $top = $doc_obj->get_top_section();
316 add_section_content ($doc_obj, $top, $doc_db_hash);
317 my $children = &get_children($doc_db_hash);
318 recurse_sections($doc_obj, $children, $oid, $top, $database_recs);
319
320 push(@all_docs,$doc_obj);
321 }
322
323 return \@all_docs;
324}
325
326
327
328
329
330# classify_doc lets each of the classifiers classify a document
331sub classify_doc {
332 my ($classifiers, $doc_obj) = @_;
333
334 foreach my $classobj (@$classifiers) {
335 my $title = $classobj->{'title'};
336
337 $classobj->classify($doc_obj);
338 }
339}
340
341
342our $next_classify_num = 1;
343
344# output_classify_info outputs all the info needed for the classification
345# to the database
346sub output_classify_info
347{
348 my ($classifiers, $infodb_type, $infodb_handle, $remove_empty_classifications, $gli) = @_;
349
350 $gli = 0 unless defined $gli;
351
352 # create a classification containing all the info
353 my $classifyinfo = { 'classifyOID'=> 'browse',
354 'contains' => [] };
355
356 # get each of the classifications
357 foreach my $classifier (@$classifiers)
358 {
359 my $classifier_info = $classifier->get_classify_info($gli);
360 $classifier_info->{'classifyOID'} = "CL$next_classify_num" unless defined($classifier_info->{'classifyOID'});
361 print STDERR "*** outputting information for classifier: $classifier_info->{'classifyOID'}\n";
362
363 push(@{$classifyinfo->{'contains'}}, $classifier_info);
364 $next_classify_num++;
365 }
366
367 &print_classify_info($infodb_type, $infodb_handle, $classifyinfo, "", $remove_empty_classifications);
368}
369
370
371sub print_classify_info
372{
373 my ($infodb_type, $infodb_handle, $classifyinfo, $OID, $remove_empty_classifications) = @_;
374
375 $OID =~ s/^\.+//; # just for good luck
376
377 # book information is printed elsewhere
378 return if (defined ($classifyinfo->{'OID'}));
379
380 # don't want empty classifications
381 return if (&check_contents ($classifyinfo, $remove_empty_classifications) == 0 && $remove_empty_classifications);
382
383 $OID = $classifyinfo->{'classifyOID'} if defined ($classifyinfo->{'classifyOID'});
384
385 my %classify_infodb = ();
386 $classify_infodb{"doctype"} = [ "classify" ];
387 $classify_infodb{"hastxt"} = [ "0" ];
388 $classify_infodb{"childtype"} = [ $classifyinfo->{'childtype'} ]
389 if defined $classifyinfo->{'childtype'};
390 $classify_infodb{"Title"} = [ $classifyinfo->{'Title'} ]
391 if defined $classifyinfo->{'Title'};
392 $classify_infodb{"numleafdocs"} = [ $classifyinfo->{'numleafdocs'} ]
393 if defined $classifyinfo->{'numleafdocs'};
394 $classify_infodb{"thistype"} = [ $classifyinfo->{'thistype'} ]
395 if defined $classifyinfo->{'thistype'};
396 $classify_infodb{"parameters"} = [ $classifyinfo->{'parameters'} ]
397 if defined $classifyinfo->{'parameters'};
398 $classify_infodb{"supportsmemberof"} = [ $classifyinfo->{'supportsmemberof'} ]
399 if defined $classifyinfo->{'supportsmemberof'};
400
401 my $contains_text = "";
402 my $mdoffset_text = "";
403
404 my $next_subOID = 1;
405 my $first = 1;
406 foreach my $tempinfo (@{$classifyinfo->{'contains'}}) {
407 # empty contents were made undefined by clean_contents()
408 next unless defined $tempinfo;
409 if (!defined ($tempinfo->{'classifyOID'}) ||
410 $tempinfo->{'classifyOID'} ne "oai") {
411 $contains_text .= ";" unless $first;
412 }
413 $mdoffset_text .= ";" unless $first;
414 $first = 0;
415
416 if (defined ($tempinfo->{'classifyOID'}))
417 {
418 if ($tempinfo->{'classifyOID'} ne "oai")
419 {
420 $contains_text .= $tempinfo->{'classifyOID'};
421 }
422
423 &print_classify_info ($infodb_type, $infodb_handle, $tempinfo, $tempinfo->{'classifyOID'},
424 $remove_empty_classifications);
425 }
426 elsif (defined ($tempinfo->{'OID'}))
427 {
428 $contains_text .= $tempinfo->{'OID'};
429 $mdoffset_text .= $tempinfo->{'offset'} if (defined ($tempinfo->{'offset'}));
430 }
431 else
432 {
433 # Supress having top-level node in Collage classifier
434 # so no bookshelf icon appears, top-level, along with the
435 # applet
436 if (!defined ($tempinfo->{'Title'}) || $tempinfo->{'Title'} ne "Collage")
437 {
438 $contains_text .= "\".$next_subOID";
439 }
440
441 &print_classify_info ($infodb_type, $infodb_handle, $tempinfo, "$OID.$next_subOID",
442 $remove_empty_classifications);
443 $next_subOID++;
444 }
445 }
446
447 $classify_infodb{"contains"} = [ $contains_text ];
448 $classify_infodb{"mdtype"} = [ $classifyinfo->{'mdtype'} ]
449 if defined $classifyinfo->{'mdtype'};
450 $classify_infodb{"mdoffset"} = [ $mdoffset_text ]
451 if ($mdoffset_text !~ m/^;+$/);
452
453 &dbutil::write_infodb_entry($infodb_type, $infodb_handle, $OID, \%classify_infodb);
454}
455
456
457sub check_contents {
458 my ($classifyinfo,$remove_empty_classifications) = @_;
459 $remove_empty_classifications = 0 unless ($remove_empty_classifications);
460 my $num_leaf_docs = 0;
461 my $sub_num_leaf_docs = 0;
462
463 return $classifyinfo->{'numleafdocs'} if (defined $classifyinfo->{'numleafdocs'});
464
465 foreach my $content (@{$classifyinfo->{'contains'}}) {
466 if (defined $content->{'OID'}) {
467 # found a book
468 $num_leaf_docs ++;
469 } elsif (($sub_num_leaf_docs = &check_contents ($content,$remove_empty_classifications)) > 0) {
470 # there's a book somewhere below
471 $num_leaf_docs += $sub_num_leaf_docs;
472 } else {
473 if ($remove_empty_classifications){
474 # section contains no books so we want to remove
475 # it from its parents contents
476 $content = undef;
477 }
478 }
479 }
480
481 $classifyinfo->{'numleafdocs'} = $num_leaf_docs;
482 return $num_leaf_docs;
483}
484
4851;
Note: See TracBrowser for help on using the repository browser.