source: main/trunk/greenstone2/perllib/classify.pm

Last change on this file was 38854, checked in by anupama, 5 weeks ago

Having been able to look at the GS2.88 Collage applet at least trying to run through the appletviewer somewhat (it doesn't really do much than say it's downloading and print out some file information to the terminal), I wanted at least that much to work with the Collage Applet in GS3. Just to get the appletviewer to display a somewhat working GS3 Collage applet (previously it would fail before applet initialisation, with a hard to fathom magic number Java error), I had to create a very skeletal GsdlCollageBrowse.java service, the way Phind has a (proper, non-skeletal) PhindPhraseBrowse service. I'm not sure if this service need or should do more for Collage, as I think the Collage applet rather than the service should take care of all the inner workings. I also needed the perl code to write out the new collage service to the buildconfig.xml file for this to actually appear. So far, the applet params are all hardcoded. I still need to investigate what the Collage applet is trying to do, such as how it works out which files to download and tries to download them, as for GS3 it's not even listing the files to download.

  • Property svn:keywords set to Author Date Id Revision
File size: 14.6 KB
Line 
1###########################################################################
2#
3# classify.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# functions to handle classifiers
27
28package classify;
29
30require util;
31use FileUtils;
32require AllList;
33
34use dbutil;
35use gsprintf;
36use strict; no strict 'subs';
37
38
39sub gsprintf
40{
41 return &gsprintf::gsprintf(@_);
42}
43
44
45sub load_classifier_for_info {
46 my ($classifier) = shift @_;
47
48 # find the classifier
49 # - used to have hardcoded list of places to load classifier from. We
50 # should, instead, try loading from all of the perllib places on the
51 # library path, as that improves support for extensions. Special cases
52 # needed for collection specific and custom classifier. [jmt12]
53 my @possible_class_paths;
54 if (defined($ENV{'GSDLCOLLECTION'}))
55 {
56 push(@possible_class_paths, &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'custom', $ENV{'GSDLCOLLECTION'}, 'perllib', 'classify', $classifier . '.pm'));
57 }
58 # (why does GSDLCOLLECTDIR get set to GSDLHOME for classinfo calls?)
59 if ($ENV{'GSDLCOLLECTDIR'} ne $ENV{'GSDLHOME'})
60 {
61 push(@possible_class_paths, &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib', 'classify', $classifier . '.pm'));
62 }
63 foreach my $library_path (@INC)
64 {
65 # only interested in classify paths found in the library paths
66 if ($library_path =~ /classify$/)
67 {
68 push(@possible_class_paths, &FileUtils::filenameConcatenate($library_path, $classifier . '.pm'));
69 }
70 }
71 my $found_class = 0;
72 foreach my $possible_class_path (@possible_class_paths)
73 {
74 if (-e $possible_class_path)
75 {
76 require $possible_class_path;
77 $found_class = 1;
78 last;
79 }
80 }
81 if (!$found_class)
82 {
83 &gsprintf(STDERR, "{classify.could_not_find_classifier}\n", $classifier) && die "\n";
84 }
85
86 my ($classobj);
87 my $options = "-gsdlinfo";
88 eval ("\$classobj = new \$classifier([],[$options])");
89 die "$@" if $@;
90
91 return $classobj;
92}
93
94sub load_classifiers {
95 my ($classify_list, $build_dir, $outhandle) = @_;
96 my @classify_objects = ();
97 my $classify_number = 1;
98
99 # - ensure colclassdir doesn't already exist in INC before adding, other-
100 # wise we risk clobbering classifier inheritence implied by order of paths
101 # in INC [jmt12]
102 my $colclassdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},"perllib/classify");
103 &util::augmentINC($colclassdir);
104
105 foreach my $classifyoption (@$classify_list) {
106
107 # get the classifier name
108 my $classname = shift @$classifyoption;
109 next unless defined $classname;
110
111 # find the classifier
112 # - replaced as explained in load_classifier_for_info() [jmt12]
113 my @possible_class_paths;
114 if (defined($ENV{'GSDLCOLLECTION'}))
115 {
116 push(@possible_class_paths, &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'custom', $ENV{'GSDLCOLLECTION'}, 'perllib', 'classify', $classname . '.pm'));
117 }
118 # (why does GSDLCOLLECTDIR get set to GSDLHOME for classinfo calls?)
119 if ($ENV{'GSDLCOLLECTDIR'} ne $ENV{'GSDLHOME'})
120 {
121 push(@possible_class_paths,&FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib', 'classify', $classname . '.pm'));
122 }
123 foreach my $library_path (@INC)
124 {
125 # only interested in classify paths found in the library paths
126 if ($library_path =~ /classify$/)
127 {
128 push(@possible_class_paths, &FileUtils::filenameConcatenate($library_path, $classname . '.pm'));
129 }
130 }
131 my $found_class = 0;
132 foreach my $possible_class_path (@possible_class_paths)
133 {
134 if (-e $possible_class_path)
135 {
136 require $possible_class_path;
137 $found_class = 1;
138 last;
139 }
140 }
141 if (!$found_class)
142 {
143 &gsprintf(STDERR, "{classify.could_not_find_classifier}\n", $classname) && die "\n";
144 }
145
146 # create the classify object
147 my ($classobj);
148
149 my @newoptions;
150
151 # do these first so they can be overriden by user supplied options
152 if ($build_dir) {
153 (my $build_dir_re = $build_dir) =~ s@\\@\\\\@g; # copy build_dir into build_dir_re and modify build_dir_re
154 push @newoptions, "-builddir", "$build_dir_re";
155 }
156 push @newoptions, "-outhandle", "$outhandle" if ($outhandle);
157 push @newoptions, "-verbosity", "2";
158
159 # backwards compatability hack: if the classifier options are
160 # in "x=y" format, convert them to parsearg ("-x y") format.
161 my ($opt, $key, $value);
162 foreach $opt (@$classifyoption) {
163 # if ($opt =~ /^(\w+)=(.*)$/) {
164 # push @newoptions, "-$1", $2;
165 # } else {
166 push @newoptions, $opt;
167 #}
168 }
169
170 eval ("\$classobj = new \$classname([],[\@newoptions])");
171 die "$@" if $@;
172
173 $classobj->set_number($classify_number);
174 $classify_number ++;
175
176 # add this object to the list
177 push (@classify_objects, $classobj);
178 }
179
180 return \@classify_objects;
181}
182
183# init_classifiers resets all the classifiers and readys them to process
184# the documents.
185sub init_classifiers {
186 my ($classifiers) = @_;
187
188 foreach my $classobj (@$classifiers) {
189 $classobj->init();
190 }
191}
192
193
194
195# takes a hashref containing the metadata for an infodb entry, and extracts
196# the childrens numbers (from the 'contains' entry).
197# assumes format is ".1;".2;".3
198sub get_children {
199 my ($doc_db_hash) = @_;
200
201 my $children = undef;
202
203 my $contains = $doc_db_hash->{'contains'};
204 if (defined ($contains)) {
205 $contains =~ s/\@$//; #remove trailing @
206 $contains =~ s/^\"\.//; #remove initial ".
207 @$children = split /\;\"\./, $contains;
208 }
209
210 return $children;
211}
212
213
214sub recurse_sections {
215 my ($doc_obj, $children, $parentoid, $parentsection, $database_recs) = @_;
216
217 return if (!defined $children);
218
219 foreach my $child (sort { $a <=> $b} @$children) {
220 $doc_obj->create_named_section("$parentsection.$child");
221 my $doc_db_rec = $database_recs->{"$parentoid.$child"};
222 my $doc_db_hash = db_rec_to_hash($doc_db_rec);
223
224 # get child's children
225 my $newchildren = &get_children($doc_db_hash);
226
227 # add content for current section
228 add_section_content($doc_obj, "$parentsection.$child", $doc_db_hash);
229
230 # process all the children if there are any
231 if (defined ($newchildren))
232 {
233 recurse_sections($doc_obj, $newchildren, "$parentoid.$child",
234 "$parentsection.$child", $database_recs);
235 }
236 }
237}
238
239
240sub add_section_content {
241 my ($doc_obj, $cursection, $doc_db_hash) = @_;
242
243 foreach my $key (keys %$doc_db_hash) {
244 #don't need to store these metadata
245 next if $key =~ /(thistype|childtype|contains|docnum|doctype|classifytype)/i;
246 # but do want things like hastxt and archivedir
247 my @items = split /@/, $doc_db_hash->{$key};
248 # metadata is all from the info database so should already be in utf8
249 map {$doc_obj->add_utf8_metadata ($cursection, $key, $_); } @items;
250
251 }
252}
253
254
255# gets all the metadata from an infodb entry, and puts it into a hashref
256sub db_rec_to_hash {
257
258 my ($infodb_str_ref) = @_;
259
260 my $hashref = {};
261
262 my @entries = split(/\n/, $infodb_str_ref);
263 foreach my $entry (@entries) {
264 my($key, $value) = ($entry =~ /^<([^>]*)>(.*?)$/ );
265 $hashref->{$key} .= '@' if defined $hashref->{$key};
266 $hashref->{$key} .= $value;
267
268 }
269
270 return $hashref;
271}
272
273
274sub reconstruct_doc_objs_metadata
275{
276 my $infodb_type = shift(@_);
277 my $infodb_file_path = shift(@_);
278 my $database_recs = shift(@_);
279
280 # dig out top level doc sections
281 my %top_sections = ();
282 my %top_docnums = ();
283 foreach my $key ( keys %$database_recs )
284 {
285 my $md_rec = $database_recs->{$key};
286 my $md_hash = db_rec_to_hash($md_rec);
287
288 if ((defined $md_hash->{'doctype'}) && ($md_hash->{'doctype'} eq "doc")) {
289 next if ($key =~ m/\./);
290 $top_sections{$key} = $md_hash;
291 $top_docnums{$key} = $md_hash->{'docnum'};
292 }
293 }
294
295 # for greenstone document objects based on metadata in database file
296 my @all_docs = ();
297 # we need to make sure the documents were processed in the same order as
298 # before, so sort based on their docnums
299 foreach my $oid ( sort { $top_docnums{$a} <=> $top_docnums{$b} } keys %top_sections )
300 {
301 my $doc_db_hash = $top_sections{$oid};
302
303 my $doc_obj = new doc();
304 $doc_obj->set_OID($oid);
305 my $top = $doc_obj->get_top_section();
306 add_section_content ($doc_obj, $top, $doc_db_hash);
307 my $children = &get_children($doc_db_hash);
308 recurse_sections($doc_obj, $children, $oid, $top, $database_recs);
309
310 push(@all_docs,$doc_obj);
311 }
312
313 return \@all_docs;
314}
315
316
317
318
319
320# classify_doc lets each of the classifiers classify a document
321sub classify_doc {
322 my ($classifiers, $doc_obj) = @_;
323
324 foreach my $classobj (@$classifiers) {
325 my $title = $classobj->{'title'};
326
327 $classobj->classify($doc_obj);
328 }
329}
330
331
332our $next_classify_num = 1;
333
334sub reset_next_classify_num
335{
336 $next_classify_num = 1;
337}
338
339
340
341# output_classify_info outputs all the info needed for the classification
342# to the database
343sub output_classify_info
344{
345 my ($classifiers, $infodb_type, $infodb_handle, $remove_empty_classifications, $gli) = @_;
346
347 $gli = 0 unless defined $gli;
348
349 # create a classification containing all the info
350 my $classifyinfo = { 'classifyOID'=> 'browse',
351 'contains' => [] };
352
353 # get each of the classifications
354 print STDERR "\n"; # force line-break to separate infodb printed info from earlier phase of classifyinfo
355 foreach my $classifier (@$classifiers)
356 {
357 my $classifier_info = $classifier->get_classify_info($gli);
358 if (defined $classifier_info) {
359 $classifier_info->{'classifyOID'} = "CL$next_classify_num" unless defined($classifier_info->{'classifyOID'});
360 print STDERR "Outputting information for classifier: $classifier_info->{'classifyOID'}\n";
361
362 push(@{$classifyinfo->{'contains'}}, $classifier_info);
363 } else {
364 print STDERR "*** error with classifier CL$next_classify_num, not outputing it\n";
365 }
366 $next_classify_num++;
367
368 }
369
370 &print_classify_info($infodb_type, $infodb_handle, $classifyinfo, "", $remove_empty_classifications);
371}
372
373
374sub print_classify_info
375{
376 my ($infodb_type, $infodb_handle, $classifyinfo, $OID, $remove_empty_classifications) = @_;
377
378 $OID =~ s/^\.+//; # just for good luck
379
380 # book information is printed elsewhere
381 return if (defined ($classifyinfo->{'OID'}));
382
383 # don't want empty classifications
384 return if (&check_contents ($classifyinfo, $remove_empty_classifications) == 0 && $remove_empty_classifications);
385
386 $OID = $classifyinfo->{'classifyOID'} if defined ($classifyinfo->{'classifyOID'});
387
388 my %classify_infodb = ();
389 $classify_infodb{"doctype"} = [ "classify" ];
390 $classify_infodb{"hastxt"} = [ "0" ];
391 $classify_infodb{"childtype"} = [ $classifyinfo->{'childtype'} ]
392 if defined $classifyinfo->{'childtype'};
393 $classify_infodb{"Title"} = [ $classifyinfo->{'Title'} ]
394 if defined $classifyinfo->{'Title'};
395 $classify_infodb{"numleafdocs"} = [ $classifyinfo->{'numleafdocs'} ]
396 if defined $classifyinfo->{'numleafdocs'};
397 $classify_infodb{"thistype"} = [ $classifyinfo->{'thistype'} ]
398 if defined $classifyinfo->{'thistype'};
399 $classify_infodb{"parameters"} = [ $classifyinfo->{'parameters'} ]
400 if defined $classifyinfo->{'parameters'};
401 $classify_infodb{"supportsmemberof"} = [ $classifyinfo->{'supportsmemberof'} ]
402 if defined $classifyinfo->{'supportsmemberof'};
403
404 my $contains_text = "";
405 my $mdoffset_text = "";
406
407 my $next_subOID = 1;
408 my $first = 1;
409 foreach my $tempinfo (@{$classifyinfo->{'contains'}}) {
410 # empty contents were made undefined by clean_contents()
411 next unless defined $tempinfo;
412 if (!defined ($tempinfo->{'classifyOID'}) ||
413 $tempinfo->{'classifyOID'} ne "oai") {
414 $contains_text .= ";" unless $first;
415 }
416 $mdoffset_text .= ";" unless $first;
417 $first = 0;
418
419 if (defined ($tempinfo->{'classifyOID'}))
420 {
421 if ($tempinfo->{'classifyOID'} ne "oai")
422 {
423 $contains_text .= $tempinfo->{'classifyOID'};
424 }
425
426 &print_classify_info ($infodb_type, $infodb_handle, $tempinfo, $tempinfo->{'classifyOID'},
427 $remove_empty_classifications);
428 }
429 elsif (defined ($tempinfo->{'OID'}))
430 {
431 $contains_text .= $tempinfo->{'OID'};
432 $mdoffset_text .= $tempinfo->{'offset'} if (defined ($tempinfo->{'offset'}));
433 }
434 else
435 {
436 # Suppress having top-level node in Collage classifier
437 # so no bookshelf icon appears, top-level, along with the
438 # applet
439 if (!defined ($tempinfo->{'Title'}) || $tempinfo->{'Title'} ne "Collage")
440 {
441 $contains_text .= "\".$next_subOID";
442 }
443
444 &print_classify_info ($infodb_type, $infodb_handle, $tempinfo, "$OID.$next_subOID",
445 $remove_empty_classifications);
446 $next_subOID++;
447 }
448 }
449
450 $classify_infodb{"contains"} = [ $contains_text ];
451 $classify_infodb{"mdtype"} = [ $classifyinfo->{'mdtype'} ]
452 if defined $classifyinfo->{'mdtype'};
453 $classify_infodb{"mdoffset"} = [ $mdoffset_text ]
454 if ($mdoffset_text !~ m/^;+$/);
455
456 &dbutil::write_infodb_entry($infodb_type, $infodb_handle, $OID, \%classify_infodb);
457}
458
459
460sub check_contents {
461 my ($classifyinfo,$remove_empty_classifications) = @_;
462 $remove_empty_classifications = 0 unless ($remove_empty_classifications);
463 my $num_leaf_docs = 0;
464 my $sub_num_leaf_docs = 0;
465
466 return $classifyinfo->{'numleafdocs'} if (defined $classifyinfo->{'numleafdocs'});
467
468 foreach my $content (@{$classifyinfo->{'contains'}}) {
469 if (defined $content->{'OID'}) {
470 # found a book
471 $num_leaf_docs ++;
472 } elsif (($sub_num_leaf_docs = &check_contents ($content,$remove_empty_classifications)) > 0) {
473 # there's a book somewhere below
474 $num_leaf_docs += $sub_num_leaf_docs;
475 } else {
476 if ($remove_empty_classifications){
477 # section contains no books so we want to remove
478 # it from its parents contents
479 $content = undef;
480 }
481 }
482 }
483
484 $classifyinfo->{'numleafdocs'} = $num_leaf_docs;
485 return $num_leaf_docs;
486}
487
4881;
Note: See TracBrowser for help on using the repository browser.