source: main/trunk/greenstone2/perllib/classify.pm@ 38935

Last change on this file since 38935 was 38854, checked in by anupama, 3 months ago

Having been able to look at the GS2.88 Collage applet at least trying to run through the appletviewer somewhat (it doesn't really do much than say it's downloading and print out some file information to the terminal), I wanted at least that much to work with the Collage Applet in GS3. Just to get the appletviewer to display a somewhat working GS3 Collage applet (previously it would fail before applet initialisation, with a hard to fathom magic number Java error), I had to create a very skeletal GsdlCollageBrowse.java service, the way Phind has a (proper, non-skeletal) PhindPhraseBrowse service. I'm not sure if this service need or should do more for Collage, as I think the Collage applet rather than the service should take care of all the inner workings. I also needed the perl code to write out the new collage service to the buildconfig.xml file for this to actually appear. So far, the applet params are all hardcoded. I still need to investigate what the Collage applet is trying to do, such as how it works out which files to download and tries to download them, as for GS3 it's not even listing the files to download.

  • Property svn:keywords set to Author Date Id Revision
File size: 14.6 KB
RevLine 
[537]1###########################################################################
2#
3# classify.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[214]26# functions to handle classifiers
27
28package classify;
29
30require util;
[27303]31use FileUtils;
[8220]32require AllList;
[15703]33
[15705]34use dbutil;
[5682]35use gsprintf;
[15703]36use strict; no strict 'subs';
[214]37
[11994]38
[5682]39sub gsprintf
40{
41 return &gsprintf::gsprintf(@_);
42}
43
44
[6967]45sub load_classifier_for_info {
46 my ($classifier) = shift @_;
[214]47
[6967]48 # find the classifier
[27303]49 # - used to have hardcoded list of places to load classifier from. We
50 # should, instead, try loading from all of the perllib places on the
51 # library path, as that improves support for extensions. Special cases
52 # needed for collection specific and custom classifier. [jmt12]
53 my @possible_class_paths;
[14239]54 if (defined($ENV{'GSDLCOLLECTION'}))
55 {
[27303]56 push(@possible_class_paths, &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'custom', $ENV{'GSDLCOLLECTION'}, 'perllib', 'classify', $classifier . '.pm'));
[14239]57 }
[27303]58 # (why does GSDLCOLLECTDIR get set to GSDLHOME for classinfo calls?)
59 if ($ENV{'GSDLCOLLECTDIR'} ne $ENV{'GSDLHOME'})
60 {
61 push(@possible_class_paths, &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib', 'classify', $classifier . '.pm'));
62 }
63 foreach my $library_path (@INC)
64 {
65 # only interested in classify paths found in the library paths
66 if ($library_path =~ /classify$/)
67 {
68 push(@possible_class_paths, &FileUtils::filenameConcatenate($library_path, $classifier . '.pm'));
69 }
70 }
71 my $found_class = 0;
72 foreach my $possible_class_path (@possible_class_paths)
73 {
74 if (-e $possible_class_path)
75 {
76 require $possible_class_path;
77 $found_class = 1;
78 last;
79 }
80 }
81 if (!$found_class)
82 {
83 &gsprintf(STDERR, "{classify.could_not_find_classifier}\n", $classifier) && die "\n";
84 }
[6967]85
86 my ($classobj);
87 my $options = "-gsdlinfo";
[10218]88 eval ("\$classobj = new \$classifier([],[$options])");
[6967]89 die "$@" if $@;
90
91 return $classobj;
92}
93
[811]94sub load_classifiers {
[1839]95 my ($classify_list, $build_dir, $outhandle) = @_;
[811]96 my @classify_objects = ();
[8220]97 my $classify_number = 1;
[13933]98
[27303]99 # - ensure colclassdir doesn't already exist in INC before adding, other-
100 # wise we risk clobbering classifier inheritence implied by order of paths
101 # in INC [jmt12]
[27375]102 my $colclassdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},"perllib/classify");
103 &util::augmentINC($colclassdir);
[27303]104
[15703]105 foreach my $classifyoption (@$classify_list) {
[214]106
[811]107 # get the classifier name
108 my $classname = shift @$classifyoption;
109 next unless defined $classname;
[1839]110
[811]111 # find the classifier
[27303]112 # - replaced as explained in load_classifier_for_info() [jmt12]
113 my @possible_class_paths;
114 if (defined($ENV{'GSDLCOLLECTION'}))
115 {
116 push(@possible_class_paths, &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'custom', $ENV{'GSDLCOLLECTION'}, 'perllib', 'classify', $classname . '.pm'));
[14112]117 }
[27303]118 # (why does GSDLCOLLECTDIR get set to GSDLHOME for classinfo calls?)
119 if ($ENV{'GSDLCOLLECTDIR'} ne $ENV{'GSDLHOME'})
120 {
121 push(@possible_class_paths,&FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib', 'classify', $classname . '.pm'));
122 }
123 foreach my $library_path (@INC)
124 {
125 # only interested in classify paths found in the library paths
126 if ($library_path =~ /classify$/)
127 {
128 push(@possible_class_paths, &FileUtils::filenameConcatenate($library_path, $classname . '.pm'));
129 }
130 }
131 my $found_class = 0;
132 foreach my $possible_class_path (@possible_class_paths)
133 {
134 if (-e $possible_class_path)
135 {
136 require $possible_class_path;
137 $found_class = 1;
138 last;
139 }
140 }
141 if (!$found_class)
142 {
143 &gsprintf(STDERR, "{classify.could_not_find_classifier}\n", $classname) && die "\n";
144 }
[214]145
[811]146 # create the classify object
147 my ($classobj);
[1839]148
149 my @newoptions;
[6964]150
151 # do these first so they can be overriden by user supplied options
[27675]152 if ($build_dir) {
153 (my $build_dir_re = $build_dir) =~ s@\\@\\\\@g; # copy build_dir into build_dir_re and modify build_dir_re
154 push @newoptions, "-builddir", "$build_dir_re";
155 }
[6964]156 push @newoptions, "-outhandle", "$outhandle" if ($outhandle);
157 push @newoptions, "-verbosity", "2";
158
[6967]159 # backwards compatability hack: if the classifier options are
160 # in "x=y" format, convert them to parsearg ("-x y") format.
161 my ($opt, $key, $value);
[1839]162 foreach $opt (@$classifyoption) {
[11644]163 # if ($opt =~ /^(\w+)=(.*)$/) {
164 # push @newoptions, "-$1", $2;
165 # } else {
[1839]166 push @newoptions, $opt;
[11644]167 #}
[1839]168 }
169
[29574]170 eval ("\$classobj = new \$classname([],[\@newoptions])");
[811]171 die "$@" if $@;
[1839]172
[8220]173 $classobj->set_number($classify_number);
174 $classify_number ++;
175
[1839]176 # add this object to the list
[811]177 push (@classify_objects, $classobj);
178 }
179
180 return \@classify_objects;
[214]181}
182
183# init_classifiers resets all the classifiers and readys them to process
[315]184# the documents.
[214]185sub init_classifiers {
186 my ($classifiers) = @_;
187
[15703]188 foreach my $classobj (@$classifiers) {
[214]189 $classobj->init();
190 }
191}
192
[11994]193
194
[15704]195# takes a hashref containing the metadata for an infodb entry, and extracts
[11994]196# the childrens numbers (from the 'contains' entry).
197# assumes format is ".1;".2;".3
198sub get_children {
199 my ($doc_db_hash) = @_;
200
201 my $children = undef;
202
[15703]203 my $contains = $doc_db_hash->{'contains'};
204 if (defined ($contains)) {
205 $contains =~ s/\@$//; #remove trailing @
206 $contains =~ s/^\"\.//; #remove initial ".
207 @$children = split /\;\"\./, $contains;
[11994]208 }
209
210 return $children;
211}
212
213
214sub recurse_sections {
[15704]215 my ($doc_obj, $children, $parentoid, $parentsection, $database_recs) = @_;
[11994]216
217 return if (!defined $children);
218
219 foreach my $child (sort { $a <=> $b} @$children) {
220 $doc_obj->create_named_section("$parentsection.$child");
[15704]221 my $doc_db_rec = $database_recs->{"$parentoid.$child"};
[11994]222 my $doc_db_hash = db_rec_to_hash($doc_db_rec);
223
224 # get child's children
225 my $newchildren = &get_children($doc_db_hash);
226
227 # add content for current section
228 add_section_content($doc_obj, "$parentsection.$child", $doc_db_hash);
229
230 # process all the children if there are any
231 if (defined ($newchildren))
232 {
233 recurse_sections($doc_obj, $newchildren, "$parentoid.$child",
[15704]234 "$parentsection.$child", $database_recs);
[11994]235 }
236 }
237}
238
239
240sub add_section_content {
241 my ($doc_obj, $cursection, $doc_db_hash) = @_;
242
[15703]243 foreach my $key (keys %$doc_db_hash) {
[11994]244 #don't need to store these metadata
245 next if $key =~ /(thistype|childtype|contains|docnum|doctype|classifytype)/i;
246 # but do want things like hastxt and archivedir
247 my @items = split /@/, $doc_db_hash->{$key};
[21564]248 # metadata is all from the info database so should already be in utf8
[17288]249 map {$doc_obj->add_utf8_metadata ($cursection, $key, $_); } @items;
[11994]250
251 }
252}
253
254
[15704]255# gets all the metadata from an infodb entry, and puts it into a hashref
[11994]256sub db_rec_to_hash {
257
[21564]258 my ($infodb_str_ref) = @_;
[11994]259
260 my $hashref = {};
261
[21564]262 my @entries = split(/\n/, $infodb_str_ref);
[15703]263 foreach my $entry (@entries) {
[11994]264 my($key, $value) = ($entry =~ /^<([^>]*)>(.*?)$/ );
265 $hashref->{$key} .= '@' if defined $hashref->{$key};
266 $hashref->{$key} .= $value;
267
268 }
269
270 return $hashref;
271}
272
273
274sub reconstruct_doc_objs_metadata
275{
[15725]276 my $infodb_type = shift(@_);
277 my $infodb_file_path = shift(@_);
[20575]278 my $database_recs = shift(@_);
[11994]279
280 # dig out top level doc sections
281 my %top_sections = ();
[13068]282 my %top_docnums = ();
[20575]283 foreach my $key ( keys %$database_recs )
[11994]284 {
[20575]285 my $md_rec = $database_recs->{$key};
[11994]286 my $md_hash = db_rec_to_hash($md_rec);
287
288 if ((defined $md_hash->{'doctype'}) && ($md_hash->{'doctype'} eq "doc")) {
289 next if ($key =~ m/\./);
290 $top_sections{$key} = $md_hash;
[13068]291 $top_docnums{$key} = $md_hash->{'docnum'};
[11994]292 }
293 }
294
[15704]295 # for greenstone document objects based on metadata in database file
[11994]296 my @all_docs = ();
[13068]297 # we need to make sure the documents were processed in the same order as
298 # before, so sort based on their docnums
299 foreach my $oid ( sort { $top_docnums{$a} <=> $top_docnums{$b} } keys %top_sections )
[11994]300 {
301 my $doc_db_hash = $top_sections{$oid};
302
303 my $doc_obj = new doc();
304 $doc_obj->set_OID($oid);
305 my $top = $doc_obj->get_top_section();
306 add_section_content ($doc_obj, $top, $doc_db_hash);
307 my $children = &get_children($doc_db_hash);
[20575]308 recurse_sections($doc_obj, $children, $oid, $top, $database_recs);
[11994]309
310 push(@all_docs,$doc_obj);
311 }
312
313 return \@all_docs;
314}
315
316
317
318
319
[214]320# classify_doc lets each of the classifiers classify a document
321sub classify_doc {
[23118]322 my ($classifiers, $doc_obj) = @_;
[19772]323
[15703]324 foreach my $classobj (@$classifiers) {
[8220]325 my $title = $classobj->{'title'};
[19772]326
[23118]327 $classobj->classify($doc_obj);
[214]328 }
329}
330
[15702]331
[18520]332our $next_classify_num = 1;
333
[37128]334sub reset_next_classify_num
335{
336 $next_classify_num = 1;
337}
338
339
340
[214]341# output_classify_info outputs all the info needed for the classification
[15702]342# to the database
343sub output_classify_info
344{
[15725]345 my ($classifiers, $infodb_type, $infodb_handle, $remove_empty_classifications, $gli) = @_;
[214]346
[6332]347 $gli = 0 unless defined $gli;
348
[315]349 # create a classification containing all the info
[15702]350 my $classifyinfo = { 'classifyOID'=> 'browse',
351 'contains' => [] };
[315]352
353 # get each of the classifications
[35647]354 print STDERR "\n"; # force line-break to separate infodb printed info from earlier phase of classifyinfo
[15703]355 foreach my $classifier (@$classifiers)
[15702]356 {
357 my $classifier_info = $classifier->get_classify_info($gli);
[29821]358 if (defined $classifier_info) {
359 $classifier_info->{'classifyOID'} = "CL$next_classify_num" unless defined($classifier_info->{'classifyOID'});
[35647]360 print STDERR "Outputting information for classifier: $classifier_info->{'classifyOID'}\n";
[8220]361
[29821]362 push(@{$classifyinfo->{'contains'}}, $classifier_info);
363 } else {
364 print STDERR "*** error with classifier CL$next_classify_num, not outputing it\n";
365 }
[315]366 $next_classify_num++;
[29821]367
[214]368 }
369
[15725]370 &print_classify_info($infodb_type, $infodb_handle, $classifyinfo, "", $remove_empty_classifications);
[214]371}
372
[831]373
[15702]374sub print_classify_info
375{
[15725]376 my ($infodb_type, $infodb_handle, $classifyinfo, $OID, $remove_empty_classifications) = @_;
[15702]377
[315]378 $OID =~ s/^\.+//; # just for good luck
[214]379
[315]380 # book information is printed elsewhere
381 return if (defined ($classifyinfo->{'OID'}));
382
383 # don't want empty classifications
[8445]384 return if (&check_contents ($classifyinfo, $remove_empty_classifications) == 0 && $remove_empty_classifications);
[315]385
[8361]386 $OID = $classifyinfo->{'classifyOID'} if defined ($classifyinfo->{'classifyOID'});
[15702]387
388 my %classify_infodb = ();
389 $classify_infodb{"doctype"} = [ "classify" ];
390 $classify_infodb{"hastxt"} = [ "0" ];
391 $classify_infodb{"childtype"} = [ $classifyinfo->{'childtype'} ]
[8361]392 if defined $classifyinfo->{'childtype'};
[15702]393 $classify_infodb{"Title"} = [ $classifyinfo->{'Title'} ]
[8361]394 if defined $classifyinfo->{'Title'};
[15702]395 $classify_infodb{"numleafdocs"} = [ $classifyinfo->{'numleafdocs'} ]
[8361]396 if defined $classifyinfo->{'numleafdocs'};
[15702]397 $classify_infodb{"thistype"} = [ $classifyinfo->{'thistype'} ]
[8361]398 if defined $classifyinfo->{'thistype'};
[15702]399 $classify_infodb{"parameters"} = [ $classifyinfo->{'parameters'} ]
[8361]400 if defined $classifyinfo->{'parameters'};
[15702]401 $classify_infodb{"supportsmemberof"} = [ $classifyinfo->{'supportsmemberof'} ]
[8361]402 if defined $classifyinfo->{'supportsmemberof'};
403
[15702]404 my $contains_text = "";
405 my $mdoffset_text = "";
[8361]406
407 my $next_subOID = 1;
408 my $first = 1;
[15702]409 foreach my $tempinfo (@{$classifyinfo->{'contains'}}) {
[8361]410 # empty contents were made undefined by clean_contents()
411 next unless defined $tempinfo;
412 if (!defined ($tempinfo->{'classifyOID'}) ||
413 $tempinfo->{'classifyOID'} ne "oai") {
414 $contains_text .= ";" unless $first;
415 }
416 $mdoffset_text .= ";" unless $first;
417 $first = 0;
[315]418
[16959]419 if (defined ($tempinfo->{'classifyOID'}))
420 {
421 if ($tempinfo->{'classifyOID'} ne "oai")
422 {
[8361]423 $contains_text .= $tempinfo->{'classifyOID'};
[8275]424 }
[12844]425
[15725]426 &print_classify_info ($infodb_type, $infodb_handle, $tempinfo, $tempinfo->{'classifyOID'},
[8361]427 $remove_empty_classifications);
[16959]428 }
429 elsif (defined ($tempinfo->{'OID'}))
430 {
[8361]431 $contains_text .= $tempinfo->{'OID'};
[12844]432 $mdoffset_text .= $tempinfo->{'offset'} if (defined ($tempinfo->{'offset'}));
[16959]433 }
434 else
435 {
[38854]436 # Suppress having top-level node in Collage classifier
[16959]437 # so no bookshelf icon appears, top-level, along with the
438 # applet
439 if (!defined ($tempinfo->{'Title'}) || $tempinfo->{'Title'} ne "Collage")
440 {
441 $contains_text .= "\".$next_subOID";
442 }
[12844]443
[16959]444 &print_classify_info ($infodb_type, $infodb_handle, $tempinfo, "$OID.$next_subOID",
445 $remove_empty_classifications);
446 $next_subOID++;
447 }
[315]448 }
[8361]449
[15702]450 $classify_infodb{"contains"} = [ $contains_text ];
451 $classify_infodb{"mdtype"} = [ $classifyinfo->{'mdtype'} ]
[8361]452 if defined $classifyinfo->{'mdtype'};
[15702]453 $classify_infodb{"mdoffset"} = [ $mdoffset_text ]
454 if ($mdoffset_text !~ m/^;+$/);
[8361]455
[15725]456 &dbutil::write_infodb_entry($infodb_type, $infodb_handle, $OID, \%classify_infodb);
[315]457}
458
[15702]459
[637]460sub check_contents {
[8445]461 my ($classifyinfo,$remove_empty_classifications) = @_;
462 $remove_empty_classifications = 0 unless ($remove_empty_classifications);
[637]463 my $num_leaf_docs = 0;
464 my $sub_num_leaf_docs = 0;
[315]465
[637]466 return $classifyinfo->{'numleafdocs'} if (defined $classifyinfo->{'numleafdocs'});
467
[15703]468 foreach my $content (@{$classifyinfo->{'contains'}}) {
[315]469 if (defined $content->{'OID'}) {
470 # found a book
[637]471 $num_leaf_docs ++;
[9790]472 } elsif (($sub_num_leaf_docs = &check_contents ($content,$remove_empty_classifications)) > 0) {
[315]473 # there's a book somewhere below
[637]474 $num_leaf_docs += $sub_num_leaf_docs;
[315]475 } else {
[8445]476 if ($remove_empty_classifications){
477 # section contains no books so we want to remove
478 # it from its parents contents
479 $content = undef;
480 }
[315]481 }
482 }
[637]483
484 $classifyinfo->{'numleafdocs'} = $num_leaf_docs;
485 return $num_leaf_docs;
[315]486}
487
[214]4881;
Note: See TracBrowser for help on using the repository browser.