source: trunk/gsdl/perllib/classify.pm@ 13418

Last change on this file since 13418 was 13068, checked in by kjdon, 18 years ago

when unbuilding and rebuilding the gdbm database for incremental building, we need to make sure the old documents are processed in the same order as they were originally added, otherwise the docnums don't match up with the index or text

  • Property svn:keywords set to Author Date Id Revision
File size: 14.3 KB
Line 
1###########################################################################
2#
3# classify.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# functions to handle classifiers
27
28package classify;
29
30require util;
31require AllList;
32use gsprintf;
33
34#use GDBM_File;
35use unbuildutil;
36
37
38sub gsprintf
39{
40 return &gsprintf::gsprintf(@_);
41}
42
43
44$next_classify_num = 1;
45$oid_to_clids = {};
46
47sub load_classifier_for_info {
48 my ($classifier) = shift @_;
49
50 # find the classifier
51 my $colclassname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
52 "perllib/classify",
53 "${classifier}.pm");
54 my $mainclassname = &util::filename_cat($ENV{'GSDLHOME'},
55 "perllib/classify",
56 "${classifier}.pm");
57
58 if (-e $colclassname) { require $colclassname; }
59 elsif (-e $mainclassname) { require $mainclassname; }
60 else {
61 &gsprintf(STDERR, "{classify.could_not_find_classifier}\n", $classifier) && die "\n";
62 }
63 my ($classobj);
64 my $options = "-gsdlinfo";
65 eval ("\$classobj = new \$classifier([],[$options])");
66 die "$@" if $@;
67
68 return $classobj;
69}
70
71sub load_classifiers {
72 my ($classify_list, $build_dir, $outhandle) = @_;
73 my @classify_objects = ();
74 my $classify_number = 1;
75
76 foreach $classifyoption (@$classify_list) {
77
78 # get the classifier name
79 my $classname = shift @$classifyoption;
80 next unless defined $classname;
81
82 # find the classifier
83 my $colclassname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"perllib/classify",
84 "${classname}.pm");
85 my $mainclassname = &util::filename_cat($ENV{'GSDLHOME'},"perllib/classify",
86 "${classname}.pm");
87
88 if (-e $colclassname) { require $colclassname; }
89 elsif (-e $mainclassname) { require $mainclassname; }
90 else { &gsprintf(STDERR, "{classify.could_not_find_classifier}\n", $classname) && die "\n";
91 # die "ERROR - couldn't find classifier \"$classname\"\n";
92 }
93
94 # create the classify object
95 my ($classobj);
96
97 my @newoptions;
98
99 # do these first so they can be overriden by user supplied options
100 push @newoptions, "-builddir", "$build_dir" if ($build_dir);
101 push @newoptions, "-outhandle", "$outhandle" if ($outhandle);
102 push @newoptions, "-verbosity", "2";
103
104 # backwards compatability hack: if the classifier options are
105 # in "x=y" format, convert them to parsearg ("-x y") format.
106 my ($opt, $key, $value);
107 foreach $opt (@$classifyoption) {
108 # if ($opt =~ /^(\w+)=(.*)$/) {
109 # push @newoptions, "-$1", $2;
110 # } else {
111 push @newoptions, $opt;
112 #}
113 }
114
115 map { $_ = "\"$_\""; } @newoptions;
116 my $options .= join (",", @newoptions);
117
118
119 eval ("\$classobj = new \$classname([],[$options])");
120 die "$@" if $@;
121
122 $classobj->set_number($classify_number);
123 $classify_number ++;
124
125 # add this object to the list
126 push (@classify_objects, $classobj);
127 }
128
129 my ($classobj);
130 eval ("\$classobj = new AllList()");
131 die "$@" if $@;
132 push (@classify_objects, $classobj);
133
134 return \@classify_objects;
135}
136
137# init_classifiers resets all the classifiers and readys them to process
138# the documents.
139sub init_classifiers {
140 my ($classifiers) = @_;
141
142 foreach $classobj (@$classifiers) {
143 $classobj->init();
144 }
145}
146
147
148
149# takes a hashref containing the metadata for a gdbmfile entry, and extracts
150# the childrens numbers (from the 'contains' entry).
151# assumes format is ".1;".2;".3
152sub get_children {
153 my ($doc_db_hash) = @_;
154
155 my $children = undef;
156
157 $childs = $doc_db_hash->{'contains'};
158 if (defined ($childs)) {
159 $childs =~ s/\@$//; #remove trailing @
160 $childs =~ s/^\"\.//; #remove initial ".
161 @$children = split /\;\"\./, $childs;
162
163 }
164
165 return $children;
166}
167
168
169sub recurse_sections {
170 my ($doc_obj, $children, $parentoid, $parentsection, $gdbm_recs) = @_;
171
172 return if (!defined $children);
173
174 foreach my $child (sort { $a <=> $b} @$children) {
175 $doc_obj->create_named_section("$parentsection.$child");
176 my $doc_db_rec = $gdbm_recs->{"$parentoid.$child"};
177 my $doc_db_hash = db_rec_to_hash($doc_db_rec);
178
179 # get child's children
180 my $newchildren = &get_children($doc_db_hash);
181
182 # add content for current section
183 add_section_content($doc_obj, "$parentsection.$child", $doc_db_hash);
184
185 # process all the children if there are any
186 if (defined ($newchildren))
187 {
188 recurse_sections($doc_obj, $newchildren, "$parentoid.$child",
189 "$parentsection.$child", $gdbm_recs);
190 }
191 }
192}
193
194
195sub add_section_content {
196 my ($doc_obj, $cursection, $doc_db_hash) = @_;
197
198 foreach $key (keys %$doc_db_hash) {
199 #don't need to store these metadata
200 next if $key =~ /(thistype|childtype|contains|docnum|doctype|classifytype)/i;
201 # but do want things like hastxt and archivedir
202 my @items = split /@/, $doc_db_hash->{$key};
203 map {$doc_obj->add_metadata ($cursection, $key, $_); } @items;
204
205 }
206}
207
208
209# gets all the metadata from a gdbm file entry, and puts it into a hashref
210sub db_rec_to_hash {
211
212 my ($gdb_str_ref) = @_;
213
214 my $hashref = {};
215
216 my @entries = split(/\n/, $gdb_str_ref);
217 foreach $entry (@entries) {
218 my($key, $value) = ($entry =~ /^<([^>]*)>(.*?)$/ );
219 $hashref->{$key} .= '@' if defined $hashref->{$key};
220 $hashref->{$key} .= $value;
221
222 }
223
224 return $hashref;
225}
226
227
228sub reconstruct_doc_objs_metadata
229{
230 my ($fulldbname) = @_;
231
232# tie %gdbm_recs, 'GDBM_File', $fulldbname, &GDBM_WRCREAT, 0640;
233
234 my %gdbm_recs;
235 &unbuildutil::read_gdbm($fulldbname,\%gdbm_recs);
236
237
238 # dig out top level doc sections
239 my %top_sections = ();
240 my %top_docnums = ();
241 foreach my $key ( keys %gdbm_recs )
242 {
243 my $md_rec = $gdbm_recs{$key};
244 my $md_hash = db_rec_to_hash($md_rec);
245
246 if ((defined $md_hash->{'doctype'}) && ($md_hash->{'doctype'} eq "doc")) {
247 next if ($key =~ m/\./);
248 $top_sections{$key} = $md_hash;
249 $top_docnums{$key} = $md_hash->{'docnum'};
250 }
251 }
252
253 # for greenstone document objects based on metadata in gdbm file
254 my @all_docs = ();
255 # we need to make sure the documents were processed in the same order as
256 # before, so sort based on their docnums
257 foreach my $oid ( sort { $top_docnums{$a} <=> $top_docnums{$b} } keys %top_sections )
258 {
259 my $doc_db_hash = $top_sections{$oid};
260
261 my $doc_obj = new doc();
262 $doc_obj->set_OID($oid);
263 my $top = $doc_obj->get_top_section();
264 add_section_content ($doc_obj, $top, $doc_db_hash);
265 my $children = &get_children($doc_db_hash);
266 recurse_sections($doc_obj, $children, $oid, $top, \%gdbm_recs);
267
268 push(@all_docs,$doc_obj);
269 }
270
271# untie %gdbm_recs;
272
273 return \@all_docs;
274}
275
276
277
278
279
280# classify_doc lets each of the classifiers classify a document
281sub classify_doc {
282 my ($classifiers, $doc_obj) = @_;
283
284 foreach $classobj (@$classifiers) {
285 my $title = $classobj->{'title'};
286 $classobj->classify($doc_obj);
287 }
288}
289
290# output_classify_info outputs all the info needed for the classification
291# to the gdbm
292sub output_classify_info {
293 my ($classifiers, $handle, $remove_empty_classifications, $gli) = @_;
294# $handle = "main::STDOUT";
295
296 $gli = 0 unless defined $gli;
297
298 # create a classification containing all the info
299 my $classifyinfo = {'classifyOID'=>'browse',
300 'contains'=>[]};
301
302 # get each of the classifications
303 foreach $classobj (@$classifiers) {
304 my $tempinfo = $classobj->get_classify_info($gli);
305 my $classID = $tempinfo->{'classifyOID'};
306
307 $tempinfo->{'classifyOID'} = "CL$next_classify_num" unless defined($tempinfo->{'classifyOID'});
308 $next_classify_num++;
309
310 print STDERR "*** outputting information for classifier: $tempinfo->{'classifyOID'}\n";
311
312 push (@{$classifyinfo->{'contains'}}, $tempinfo);
313 }
314
315 &print_classify_info ($handle, $classifyinfo, "", $remove_empty_classifications);
316}
317
318sub print_classify_info {
319 my ($handle, $classifyinfo, $OID, $remove_empty_classifications) = @_;
320
321 $OID =~ s/^\.+//; # just for good luck
322
323 # book information is printed elsewhere
324 return if (defined ($classifyinfo->{'OID'}));
325
326 # don't want empty classifications
327 return if (&check_contents ($classifyinfo, $remove_empty_classifications) == 0 && $remove_empty_classifications);
328
329 $OID = $classifyinfo->{'classifyOID'} if defined ($classifyinfo->{'classifyOID'});
330
331 my $outputtext = "[$OID]\n";
332 $outputtext .= "<doctype>classify\n";
333 $outputtext .= "<hastxt>0\n";
334 $outputtext .= "<childtype>$classifyinfo->{'childtype'}\n"
335 if defined $classifyinfo->{'childtype'};
336 $outputtext .= "<Title>$classifyinfo->{'Title'}\n"
337 if defined $classifyinfo->{'Title'};
338 $outputtext .= "<numleafdocs>$classifyinfo->{'numleafdocs'}\n"
339 if defined $classifyinfo->{'numleafdocs'};
340 $outputtext .= "<thistype>$classifyinfo->{'thistype'}\n"
341 if defined $classifyinfo->{'thistype'};
342 $outputtext .= "<parameters>$classifyinfo->{'parameters'}\n"
343 if defined $classifyinfo->{'parameters'};
344 $outputtext .= "<supportsmemberof>$classifyinfo->{'supportsmemberof'}\n"
345 if defined $classifyinfo->{'supportsmemberof'};
346
347 my $contains_text = "<contains>";
348 my $mdoffset_text = "<mdoffset>";
349
350 my $next_subOID = 1;
351 my $first = 1;
352 foreach $tempinfo (@{$classifyinfo->{'contains'}}) {
353 # empty contents were made undefined by clean_contents()
354 next unless defined $tempinfo;
355
356 if (!defined ($tempinfo->{'classifyOID'}) ||
357 $tempinfo->{'classifyOID'} ne "oai") {
358 $contains_text .= ";" unless $first;
359 }
360 $mdoffset_text .= ";" unless $first;
361 $first = 0;
362
363 if (defined ($tempinfo->{'classifyOID'})) {
364 if ($tempinfo->{'classifyOID'} ne "oai") {
365 $contains_text .= $tempinfo->{'classifyOID'};
366 }
367
368 # Extra code for incremental building.
369 # We need to store a listing of the classifiers each DOI is in
370 my $clids = [];
371 #rint STDERR "==1. Recording reverse lookup for " . $tempinfo->{'classifyOID'} . "==\n";
372 if(defined($oid_to_clids->{$tempinfo->{'classifyOID'}})) {
373 #rint STDERR "Found existing array!\n";
374 $clids = $oid_to_clids->{$tempinfo->{'classifyOID'}};
375 }
376 #rint STDERR "Appended $OID to \"" . join(";", @{$clids}) . "\"\n";
377 push(@{$clids}, $OID);
378 $oid_to_clids->{$tempinfo->{'classifyOID'}} = $clids;
379 #rint STDERR "Result: \"" . join(";", @{$clids}) . "\"\n";
380
381 &print_classify_info ($handle, $tempinfo, $tempinfo->{'classifyOID'},
382 $remove_empty_classifications);
383 } elsif (defined ($tempinfo->{'OID'})) {
384 $contains_text .= $tempinfo->{'OID'};
385 $mdoffset_text .= $tempinfo->{'offset'} if (defined ($tempinfo->{'offset'}));
386
387
388 # note: we don't want to print the contents of the books
389 # Extra code for incremental building.
390 # We need to store a listing of the classifiers each DOI is in
391 my $clids = [];
392 #rint STDERR "==2. Recording reverse lookup for " . $tempinfo->{'OID'} . "==\n";
393 if(defined($oid_to_clids->{$tempinfo->{'OID'}})) {
394 #rint STDERR "Found existing array!\n";
395 $clids = $oid_to_clids->{$tempinfo->{'OID'}};
396 }
397 #rint STDERR "Appended $OID to \"" . join(";", @{$clids}) . "\"\n";
398 push(@{$clids}, $OID);
399 $oid_to_clids->{$tempinfo->{'OID'}} = $clids;
400 #rint STDERR "Result: \"" . join(";", @{$clids}) . "\"\n";
401
402
403 } else {
404
405 # Supress having top-level node in Collage classifier
406 # so no bookshelf icon appears, top-level, along with the
407 # applet
408
409 if (!defined ($tempinfo->{'Title'}) || $tempinfo->{'Title'} ne "Collage") {
410 $contains_text .= "\".$next_subOID";
411 }
412
413 # Extra code for incremental building.
414 # We need to store a listing of the classifiers each DOI is in
415 my $clids = [];
416 #rint STDERR "==3. Recording reverse lookup for $OID.$next_subOID==\n";
417 if(defined($oid_to_clids->{$OID . "." . $next_subOID})) {
418 #rint STDERR "Found existing array!\n";
419 $clids = $oid_to_clids->{$OID . "." . $next_subOID};
420 }
421 #rint STDERR "Appended $OID to \"" . join(";", @{$clids}) . "\"\n";
422 push(@{$clids}, $OID);
423 $oid_to_clids->{$OID . "." . $next_subOID} = $clids;
424 #rint STDERR "Result: \"" . join(";", @{$clids}) . "\"\n";
425
426 &print_classify_info ($handle, $tempinfo, "$OID.$next_subOID",
427 $remove_empty_classifications);
428 $next_subOID++;
429 }
430 }
431
432 $outputtext .= "$contains_text\n";
433 $outputtext .= "<mdtype>$classifyinfo->{'mdtype'}\n"
434 if defined $classifyinfo->{'mdtype'};
435 $outputtext .= "$mdoffset_text\n"
436 if ($mdoffset_text !~ m/^<mdoffset>;+$/);
437
438 $outputtext .= '-' x 70 . "\n";
439
440 print $handle $outputtext;
441
442}
443
444sub check_contents {
445 my ($classifyinfo,$remove_empty_classifications) = @_;
446 $remove_empty_classifications = 0 unless ($remove_empty_classifications);
447 my $num_leaf_docs = 0;
448 my $sub_num_leaf_docs = 0;
449
450 return $classifyinfo->{'numleafdocs'} if (defined $classifyinfo->{'numleafdocs'});
451
452 foreach $content (@{$classifyinfo->{'contains'}}) {
453 if (defined $content->{'OID'}) {
454 # found a book
455 $num_leaf_docs ++;
456 } elsif (($sub_num_leaf_docs = &check_contents ($content,$remove_empty_classifications)) > 0) {
457 # there's a book somewhere below
458 $num_leaf_docs += $sub_num_leaf_docs;
459 } else {
460 if ($remove_empty_classifications){
461 # section contains no books so we want to remove
462 # it from its parents contents
463 $content = undef;
464 }
465 }
466 }
467
468 $classifyinfo->{'numleafdocs'} = $num_leaf_docs;
469 return $num_leaf_docs;
470}
471
4721;
Note: See TracBrowser for help on using the repository browser.