source: main/trunk/greenstone2/perllib/classify.pm@ 27329

Last change on this file since 27329 was 27303, checked in by jmt12, 11 years ago

Replacing hardcoded additions to INC and PATH environment variables with conditional ones - this allows us to use the order of values in these variables for precedence, thus allows better support for extensions that override classifiers, plugins etc. ENV and PATH functions already exists in util, but augmentINC() is a new function

  • Property svn:keywords set to Author Date Id Revision
File size: 14.5 KB
RevLine 
[537]1###########################################################################
2#
3# classify.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[214]26# functions to handle classifiers
27
28package classify;
29
30require util;
[27303]31use FileUtils;
[8220]32require AllList;
[15703]33
[15705]34use dbutil;
[5682]35use gsprintf;
[15703]36use strict; no strict 'subs';
[214]37
[11994]38
[5682]39sub gsprintf
40{
41 return &gsprintf::gsprintf(@_);
42}
43
44
[6967]45sub load_classifier_for_info {
46 my ($classifier) = shift @_;
[214]47
[6967]48 # find the classifier
[27303]49 # - used to have hardcoded list of places to load classifier from. We
50 # should, instead, try loading from all of the perllib places on the
51 # library path, as that improves support for extensions. Special cases
52 # needed for collection specific and custom classifier. [jmt12]
53 my @possible_class_paths;
[14239]54 if (defined($ENV{'GSDLCOLLECTION'}))
55 {
[27303]56 push(@possible_class_paths, &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'custom', $ENV{'GSDLCOLLECTION'}, 'perllib', 'classify', $classifier . '.pm'));
[14239]57 }
[27303]58 # (why does GSDLCOLLECTDIR get set to GSDLHOME for classinfo calls?)
59 if ($ENV{'GSDLCOLLECTDIR'} ne $ENV{'GSDLHOME'})
60 {
61 push(@possible_class_paths, &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib', 'classify', $classifier . '.pm'));
62 }
63 foreach my $library_path (@INC)
64 {
65 # only interested in classify paths found in the library paths
66 if ($library_path =~ /classify$/)
67 {
68 push(@possible_class_paths, &FileUtils::filenameConcatenate($library_path, $classifier . '.pm'));
69 }
70 }
71 my $found_class = 0;
72 foreach my $possible_class_path (@possible_class_paths)
73 {
74 if (-e $possible_class_path)
75 {
76 require $possible_class_path;
77 $found_class = 1;
78 last;
79 }
80 }
81 if (!$found_class)
82 {
83 &gsprintf(STDERR, "{classify.could_not_find_classifier}\n", $classifier) && die "\n";
84 }
[6967]85
86 my ($classobj);
87 my $options = "-gsdlinfo";
[10218]88 eval ("\$classobj = new \$classifier([],[$options])");
[6967]89 die "$@" if $@;
90
91 return $classobj;
92}
93
[811]94sub load_classifiers {
[1839]95 my ($classify_list, $build_dir, $outhandle) = @_;
[811]96 my @classify_objects = ();
[8220]97 my $classify_number = 1;
[13933]98
[27303]99 my $colclassdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},"perllib/classify");
100 # - ensure colclassdir doesn't already exist in INC before adding, other-
101 # wise we risk clobbering classifier inheritence implied by order of paths
102 # in INC [jmt12]
103 my $inc_paths = join(':',@INC);
104 if ($inc_paths !~ /$colclassdir/)
105 {
106 unshift (@INC, $colclassdir);
107 }
108
[15703]109 foreach my $classifyoption (@$classify_list) {
[214]110
[811]111 # get the classifier name
112 my $classname = shift @$classifyoption;
113 next unless defined $classname;
[1839]114
[811]115 # find the classifier
[27303]116 # - replaced as explained in load_classifier_for_info() [jmt12]
117 my @possible_class_paths;
118 if (defined($ENV{'GSDLCOLLECTION'}))
119 {
120 push(@possible_class_paths, &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'custom', $ENV{'GSDLCOLLECTION'}, 'perllib', 'classify', $classname . '.pm'));
[14112]121 }
[27303]122 # (why does GSDLCOLLECTDIR get set to GSDLHOME for classinfo calls?)
123 if ($ENV{'GSDLCOLLECTDIR'} ne $ENV{'GSDLHOME'})
124 {
125 push(@possible_class_paths,&FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib', 'classify', $classname . '.pm'));
126 }
127 foreach my $library_path (@INC)
128 {
129 # only interested in classify paths found in the library paths
130 if ($library_path =~ /classify$/)
131 {
132 push(@possible_class_paths, &FileUtils::filenameConcatenate($library_path, $classname . '.pm'));
133 }
134 }
135 my $found_class = 0;
136 foreach my $possible_class_path (@possible_class_paths)
137 {
138 if (-e $possible_class_path)
139 {
140 require $possible_class_path;
141 $found_class = 1;
142 last;
143 }
144 }
145 if (!$found_class)
146 {
147 &gsprintf(STDERR, "{classify.could_not_find_classifier}\n", $classname) && die "\n";
148 }
[214]149
[811]150 # create the classify object
151 my ($classobj);
[1839]152
153 my @newoptions;
[6964]154
155 # do these first so they can be overriden by user supplied options
156 push @newoptions, "-builddir", "$build_dir" if ($build_dir);
157 push @newoptions, "-outhandle", "$outhandle" if ($outhandle);
158 push @newoptions, "-verbosity", "2";
159
[6967]160 # backwards compatability hack: if the classifier options are
161 # in "x=y" format, convert them to parsearg ("-x y") format.
162 my ($opt, $key, $value);
[1839]163 foreach $opt (@$classifyoption) {
[11644]164 # if ($opt =~ /^(\w+)=(.*)$/) {
165 # push @newoptions, "-$1", $2;
166 # } else {
[1839]167 push @newoptions, $opt;
[11644]168 #}
[1839]169 }
170
171 map { $_ = "\"$_\""; } @newoptions;
172 my $options .= join (",", @newoptions);
173
[10218]174
175 eval ("\$classobj = new \$classname([],[$options])");
[811]176 die "$@" if $@;
[1839]177
[8220]178 $classobj->set_number($classify_number);
179 $classify_number ++;
180
[1839]181 # add this object to the list
[811]182 push (@classify_objects, $classobj);
183 }
184
[8220]185 my ($classobj);
186 eval ("\$classobj = new AllList()");
187 die "$@" if $@;
188 push (@classify_objects, $classobj);
189
[811]190 return \@classify_objects;
[214]191}
192
193# init_classifiers resets all the classifiers and readys them to process
[315]194# the documents.
[214]195sub init_classifiers {
196 my ($classifiers) = @_;
197
[15703]198 foreach my $classobj (@$classifiers) {
[214]199 $classobj->init();
200 }
201}
202
[11994]203
204
[15704]205# takes a hashref containing the metadata for an infodb entry, and extracts
[11994]206# the childrens numbers (from the 'contains' entry).
207# assumes format is ".1;".2;".3
208sub get_children {
209 my ($doc_db_hash) = @_;
210
211 my $children = undef;
212
[15703]213 my $contains = $doc_db_hash->{'contains'};
214 if (defined ($contains)) {
215 $contains =~ s/\@$//; #remove trailing @
216 $contains =~ s/^\"\.//; #remove initial ".
217 @$children = split /\;\"\./, $contains;
[11994]218 }
219
220 return $children;
221}
222
223
224sub recurse_sections {
[15704]225 my ($doc_obj, $children, $parentoid, $parentsection, $database_recs) = @_;
[11994]226
227 return if (!defined $children);
228
229 foreach my $child (sort { $a <=> $b} @$children) {
230 $doc_obj->create_named_section("$parentsection.$child");
[15704]231 my $doc_db_rec = $database_recs->{"$parentoid.$child"};
[11994]232 my $doc_db_hash = db_rec_to_hash($doc_db_rec);
233
234 # get child's children
235 my $newchildren = &get_children($doc_db_hash);
236
237 # add content for current section
238 add_section_content($doc_obj, "$parentsection.$child", $doc_db_hash);
239
240 # process all the children if there are any
241 if (defined ($newchildren))
242 {
243 recurse_sections($doc_obj, $newchildren, "$parentoid.$child",
[15704]244 "$parentsection.$child", $database_recs);
[11994]245 }
246 }
247}
248
249
250sub add_section_content {
251 my ($doc_obj, $cursection, $doc_db_hash) = @_;
252
[15703]253 foreach my $key (keys %$doc_db_hash) {
[11994]254 #don't need to store these metadata
255 next if $key =~ /(thistype|childtype|contains|docnum|doctype|classifytype)/i;
256 # but do want things like hastxt and archivedir
257 my @items = split /@/, $doc_db_hash->{$key};
[21564]258 # metadata is all from the info database so should already be in utf8
[17288]259 map {$doc_obj->add_utf8_metadata ($cursection, $key, $_); } @items;
[11994]260
261 }
262}
263
264
[15704]265# gets all the metadata from an infodb entry, and puts it into a hashref
[11994]266sub db_rec_to_hash {
267
[21564]268 my ($infodb_str_ref) = @_;
[11994]269
270 my $hashref = {};
271
[21564]272 my @entries = split(/\n/, $infodb_str_ref);
[15703]273 foreach my $entry (@entries) {
[11994]274 my($key, $value) = ($entry =~ /^<([^>]*)>(.*?)$/ );
275 $hashref->{$key} .= '@' if defined $hashref->{$key};
276 $hashref->{$key} .= $value;
277
278 }
279
280 return $hashref;
281}
282
283
284sub reconstruct_doc_objs_metadata
285{
[15725]286 my $infodb_type = shift(@_);
287 my $infodb_file_path = shift(@_);
[20575]288 my $database_recs = shift(@_);
[11994]289
290 # dig out top level doc sections
291 my %top_sections = ();
[13068]292 my %top_docnums = ();
[20575]293 foreach my $key ( keys %$database_recs )
[11994]294 {
[20575]295 my $md_rec = $database_recs->{$key};
[11994]296 my $md_hash = db_rec_to_hash($md_rec);
297
298 if ((defined $md_hash->{'doctype'}) && ($md_hash->{'doctype'} eq "doc")) {
299 next if ($key =~ m/\./);
300 $top_sections{$key} = $md_hash;
[13068]301 $top_docnums{$key} = $md_hash->{'docnum'};
[11994]302 }
303 }
304
[15704]305 # for greenstone document objects based on metadata in database file
[11994]306 my @all_docs = ();
[13068]307 # we need to make sure the documents were processed in the same order as
308 # before, so sort based on their docnums
309 foreach my $oid ( sort { $top_docnums{$a} <=> $top_docnums{$b} } keys %top_sections )
[11994]310 {
311 my $doc_db_hash = $top_sections{$oid};
312
313 my $doc_obj = new doc();
314 $doc_obj->set_OID($oid);
315 my $top = $doc_obj->get_top_section();
316 add_section_content ($doc_obj, $top, $doc_db_hash);
317 my $children = &get_children($doc_db_hash);
[20575]318 recurse_sections($doc_obj, $children, $oid, $top, $database_recs);
[11994]319
320 push(@all_docs,$doc_obj);
321 }
322
323 return \@all_docs;
324}
325
326
327
328
329
[214]330# classify_doc lets each of the classifiers classify a document
331sub classify_doc {
[23118]332 my ($classifiers, $doc_obj) = @_;
[19772]333
[15703]334 foreach my $classobj (@$classifiers) {
[8220]335 my $title = $classobj->{'title'};
[19772]336
[23118]337 $classobj->classify($doc_obj);
[214]338 }
339}
340
[15702]341
[18520]342our $next_classify_num = 1;
343
[214]344# output_classify_info outputs all the info needed for the classification
[15702]345# to the database
346sub output_classify_info
347{
[15725]348 my ($classifiers, $infodb_type, $infodb_handle, $remove_empty_classifications, $gli) = @_;
[214]349
[6332]350 $gli = 0 unless defined $gli;
351
[315]352 # create a classification containing all the info
[15702]353 my $classifyinfo = { 'classifyOID'=> 'browse',
354 'contains' => [] };
[315]355
356 # get each of the classifications
[15703]357 foreach my $classifier (@$classifiers)
[15702]358 {
359 my $classifier_info = $classifier->get_classify_info($gli);
360 $classifier_info->{'classifyOID'} = "CL$next_classify_num" unless defined($classifier_info->{'classifyOID'});
361 print STDERR "*** outputting information for classifier: $classifier_info->{'classifyOID'}\n";
[8220]362
[15702]363 push(@{$classifyinfo->{'contains'}}, $classifier_info);
[315]364 $next_classify_num++;
[214]365 }
366
[15725]367 &print_classify_info($infodb_type, $infodb_handle, $classifyinfo, "", $remove_empty_classifications);
[214]368}
369
[831]370
[15702]371sub print_classify_info
372{
[15725]373 my ($infodb_type, $infodb_handle, $classifyinfo, $OID, $remove_empty_classifications) = @_;
[15702]374
[315]375 $OID =~ s/^\.+//; # just for good luck
[214]376
[315]377 # book information is printed elsewhere
378 return if (defined ($classifyinfo->{'OID'}));
379
380 # don't want empty classifications
[8445]381 return if (&check_contents ($classifyinfo, $remove_empty_classifications) == 0 && $remove_empty_classifications);
[315]382
[8361]383 $OID = $classifyinfo->{'classifyOID'} if defined ($classifyinfo->{'classifyOID'});
[15702]384
385 my %classify_infodb = ();
386 $classify_infodb{"doctype"} = [ "classify" ];
387 $classify_infodb{"hastxt"} = [ "0" ];
388 $classify_infodb{"childtype"} = [ $classifyinfo->{'childtype'} ]
[8361]389 if defined $classifyinfo->{'childtype'};
[15702]390 $classify_infodb{"Title"} = [ $classifyinfo->{'Title'} ]
[8361]391 if defined $classifyinfo->{'Title'};
[15702]392 $classify_infodb{"numleafdocs"} = [ $classifyinfo->{'numleafdocs'} ]
[8361]393 if defined $classifyinfo->{'numleafdocs'};
[15702]394 $classify_infodb{"thistype"} = [ $classifyinfo->{'thistype'} ]
[8361]395 if defined $classifyinfo->{'thistype'};
[15702]396 $classify_infodb{"parameters"} = [ $classifyinfo->{'parameters'} ]
[8361]397 if defined $classifyinfo->{'parameters'};
[15702]398 $classify_infodb{"supportsmemberof"} = [ $classifyinfo->{'supportsmemberof'} ]
[8361]399 if defined $classifyinfo->{'supportsmemberof'};
400
[15702]401 my $contains_text = "";
402 my $mdoffset_text = "";
[8361]403
404 my $next_subOID = 1;
405 my $first = 1;
[15702]406 foreach my $tempinfo (@{$classifyinfo->{'contains'}}) {
[8361]407 # empty contents were made undefined by clean_contents()
408 next unless defined $tempinfo;
409 if (!defined ($tempinfo->{'classifyOID'}) ||
410 $tempinfo->{'classifyOID'} ne "oai") {
411 $contains_text .= ";" unless $first;
412 }
413 $mdoffset_text .= ";" unless $first;
414 $first = 0;
[315]415
[16959]416 if (defined ($tempinfo->{'classifyOID'}))
417 {
418 if ($tempinfo->{'classifyOID'} ne "oai")
419 {
[8361]420 $contains_text .= $tempinfo->{'classifyOID'};
[8275]421 }
[12844]422
[15725]423 &print_classify_info ($infodb_type, $infodb_handle, $tempinfo, $tempinfo->{'classifyOID'},
[8361]424 $remove_empty_classifications);
[16959]425 }
426 elsif (defined ($tempinfo->{'OID'}))
427 {
[8361]428 $contains_text .= $tempinfo->{'OID'};
[12844]429 $mdoffset_text .= $tempinfo->{'offset'} if (defined ($tempinfo->{'offset'}));
[16959]430 }
431 else
432 {
433 # Supress having top-level node in Collage classifier
434 # so no bookshelf icon appears, top-level, along with the
435 # applet
436 if (!defined ($tempinfo->{'Title'}) || $tempinfo->{'Title'} ne "Collage")
437 {
438 $contains_text .= "\".$next_subOID";
439 }
[12844]440
[16959]441 &print_classify_info ($infodb_type, $infodb_handle, $tempinfo, "$OID.$next_subOID",
442 $remove_empty_classifications);
443 $next_subOID++;
444 }
[315]445 }
[8361]446
[15702]447 $classify_infodb{"contains"} = [ $contains_text ];
448 $classify_infodb{"mdtype"} = [ $classifyinfo->{'mdtype'} ]
[8361]449 if defined $classifyinfo->{'mdtype'};
[15702]450 $classify_infodb{"mdoffset"} = [ $mdoffset_text ]
451 if ($mdoffset_text !~ m/^;+$/);
[8361]452
[15725]453 &dbutil::write_infodb_entry($infodb_type, $infodb_handle, $OID, \%classify_infodb);
[315]454}
455
[15702]456
[637]457sub check_contents {
[8445]458 my ($classifyinfo,$remove_empty_classifications) = @_;
459 $remove_empty_classifications = 0 unless ($remove_empty_classifications);
[637]460 my $num_leaf_docs = 0;
461 my $sub_num_leaf_docs = 0;
[315]462
[637]463 return $classifyinfo->{'numleafdocs'} if (defined $classifyinfo->{'numleafdocs'});
464
[15703]465 foreach my $content (@{$classifyinfo->{'contains'}}) {
[315]466 if (defined $content->{'OID'}) {
467 # found a book
[637]468 $num_leaf_docs ++;
[9790]469 } elsif (($sub_num_leaf_docs = &check_contents ($content,$remove_empty_classifications)) > 0) {
[315]470 # there's a book somewhere below
[637]471 $num_leaf_docs += $sub_num_leaf_docs;
[315]472 } else {
[8445]473 if ($remove_empty_classifications){
474 # section contains no books so we want to remove
475 # it from its parents contents
476 $content = undef;
477 }
[315]478 }
479 }
[637]480
481 $classifyinfo->{'numleafdocs'} = $num_leaf_docs;
482 return $num_leaf_docs;
[315]483}
484
[214]4851;
Note: See TracBrowser for help on using the repository browser.