source: trunk/gsdl/perllib/classify.pm@ 12883

Last change on this file since 12883 was 12844, checked in by mdewsnip, 18 years ago

Incremental building and dynamic GDBM updating code, many thanks to John Rowe and John Thompson at DL Consulting Ltd.

  • Property svn:keywords set to Author Date Id Revision
File size: 14.1 KB
Line 
1###########################################################################
2#
3# classify.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# functions to handle classifiers
27
28package classify;
29
30require util;
31require AllList;
32use gsprintf;
33
34#use GDBM_File;
35use unbuildutil;
36
37
38sub gsprintf
39{
40 return &gsprintf::gsprintf(@_);
41}
42
43
44$next_classify_num = 1;
45$oid_to_clids = {};
46
47sub load_classifier_for_info {
48 my ($classifier) = shift @_;
49
50 # find the classifier
51 my $colclassname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
52 "perllib/classify",
53 "${classifier}.pm");
54 my $mainclassname = &util::filename_cat($ENV{'GSDLHOME'},
55 "perllib/classify",
56 "${classifier}.pm");
57
58 if (-e $colclassname) { require $colclassname; }
59 elsif (-e $mainclassname) { require $mainclassname; }
60 else {
61 &gsprintf(STDERR, "{classify.could_not_find_classifier}\n", $classifier) && die "\n";
62 }
63 my ($classobj);
64 my $options = "-gsdlinfo";
65 eval ("\$classobj = new \$classifier([],[$options])");
66 die "$@" if $@;
67
68 return $classobj;
69}
70
71sub load_classifiers {
72 my ($classify_list, $build_dir, $outhandle) = @_;
73 my @classify_objects = ();
74 my $classify_number = 1;
75
76 foreach $classifyoption (@$classify_list) {
77
78 # get the classifier name
79 my $classname = shift @$classifyoption;
80 next unless defined $classname;
81
82 # find the classifier
83 my $colclassname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"perllib/classify",
84 "${classname}.pm");
85 my $mainclassname = &util::filename_cat($ENV{'GSDLHOME'},"perllib/classify",
86 "${classname}.pm");
87
88 if (-e $colclassname) { require $colclassname; }
89 elsif (-e $mainclassname) { require $mainclassname; }
90 else { &gsprintf(STDERR, "{classify.could_not_find_classifier}\n", $classname) && die "\n";
91 # die "ERROR - couldn't find classifier \"$classname\"\n";
92 }
93
94 # create the classify object
95 my ($classobj);
96
97 my @newoptions;
98
99 # do these first so they can be overriden by user supplied options
100 push @newoptions, "-builddir", "$build_dir" if ($build_dir);
101 push @newoptions, "-outhandle", "$outhandle" if ($outhandle);
102 push @newoptions, "-verbosity", "2";
103
104 # backwards compatability hack: if the classifier options are
105 # in "x=y" format, convert them to parsearg ("-x y") format.
106 my ($opt, $key, $value);
107 foreach $opt (@$classifyoption) {
108 # if ($opt =~ /^(\w+)=(.*)$/) {
109 # push @newoptions, "-$1", $2;
110 # } else {
111 push @newoptions, $opt;
112 #}
113 }
114
115 map { $_ = "\"$_\""; } @newoptions;
116 my $options .= join (",", @newoptions);
117
118
119 eval ("\$classobj = new \$classname([],[$options])");
120 die "$@" if $@;
121
122 $classobj->set_number($classify_number);
123 $classify_number ++;
124
125 # add this object to the list
126 push (@classify_objects, $classobj);
127 }
128
129 my ($classobj);
130 eval ("\$classobj = new AllList()");
131 die "$@" if $@;
132 push (@classify_objects, $classobj);
133
134 return \@classify_objects;
135}
136
137# init_classifiers resets all the classifiers and readys them to process
138# the documents.
139sub init_classifiers {
140 my ($classifiers) = @_;
141
142 foreach $classobj (@$classifiers) {
143 $classobj->init();
144 }
145}
146
147
148
149# takes a hashref containing the metadata for a gdbmfile entry, and extracts
150# the childrens numbers (from the 'contains' entry).
151# assumes format is ".1;".2;".3
152sub get_children {
153 my ($doc_db_hash) = @_;
154
155 my $children = undef;
156
157 $childs = $doc_db_hash->{'contains'};
158 if (defined ($childs)) {
159 $childs =~ s/\@$//; #remove trailing @
160 $childs =~ s/^\"\.//; #remove initial ".
161 @$children = split /\;\"\./, $childs;
162
163 }
164
165 return $children;
166}
167
168
169sub recurse_sections {
170 my ($doc_obj, $children, $parentoid, $parentsection, $gdbm_recs) = @_;
171
172 return if (!defined $children);
173
174 foreach my $child (sort { $a <=> $b} @$children) {
175 $doc_obj->create_named_section("$parentsection.$child");
176 my $doc_db_rec = $gdbm_recs->{"$parentoid.$child"};
177 my $doc_db_hash = db_rec_to_hash($doc_db_rec);
178
179 # get child's children
180 my $newchildren = &get_children($doc_db_hash);
181
182 # add content for current section
183 add_section_content($doc_obj, "$parentsection.$child", $doc_db_hash);
184
185 # process all the children if there are any
186 if (defined ($newchildren))
187 {
188 recurse_sections($doc_obj, $newchildren, "$parentoid.$child",
189 "$parentsection.$child", $gdbm_recs);
190 }
191 }
192}
193
194
195sub add_section_content {
196 my ($doc_obj, $cursection, $doc_db_hash) = @_;
197
198 foreach $key (keys %$doc_db_hash) {
199 #don't need to store these metadata
200 next if $key =~ /(thistype|childtype|contains|docnum|doctype|classifytype)/i;
201 # but do want things like hastxt and archivedir
202 my @items = split /@/, $doc_db_hash->{$key};
203 map {$doc_obj->add_metadata ($cursection, $key, $_); } @items;
204
205 }
206}
207
208
209# gets all the metadata from a gdbm file entry, and puts it into a hashref
210sub db_rec_to_hash {
211
212 my ($gdb_str_ref) = @_;
213
214 my $hashref = {};
215
216 my @entries = split(/\n/, $gdb_str_ref);
217 foreach $entry (@entries) {
218 my($key, $value) = ($entry =~ /^<([^>]*)>(.*?)$/ );
219 $hashref->{$key} .= '@' if defined $hashref->{$key};
220 $hashref->{$key} .= $value;
221
222 }
223
224 return $hashref;
225}
226
227
228sub reconstruct_doc_objs_metadata
229{
230 my ($fulldbname) = @_;
231
232# tie %gdbm_recs, 'GDBM_File', $fulldbname, &GDBM_WRCREAT, 0640;
233
234 my %gdbm_recs;
235 &unbuildutil::read_gdbm($fulldbname,\%gdbm_recs);
236
237
238 # dig out top level doc sections
239 my %top_sections = ();
240 foreach my $key ( keys %gdbm_recs )
241 {
242 my $md_rec = $gdbm_recs{$key};
243 my $md_hash = db_rec_to_hash($md_rec);
244
245 if ((defined $md_hash->{'doctype'}) && ($md_hash->{'doctype'} eq "doc")) {
246 next if ($key =~ m/\./);
247 $top_sections{$key} = $md_hash;
248 }
249 }
250
251 # for greenstone document objects based on metadata in gdbm file
252 my @all_docs = ();
253 foreach my $oid ( keys %top_sections )
254 {
255 my $doc_db_hash = $top_sections{$oid};
256
257 my $doc_obj = new doc();
258 $doc_obj->set_OID($oid);
259
260 my $top = $doc_obj->get_top_section();
261 add_section_content ($doc_obj, $top, $doc_db_hash);
262 my $children = &get_children($doc_db_hash);
263 recurse_sections($doc_obj, $children, $oid, $top, \%gdbm_recs);
264
265 push(@all_docs,$doc_obj);
266 }
267
268# untie %gdbm_recs;
269
270 return \@all_docs;
271}
272
273
274
275
276
277# classify_doc lets each of the classifiers classify a document
278sub classify_doc {
279 my ($classifiers, $doc_obj) = @_;
280
281 foreach $classobj (@$classifiers) {
282 my $title = $classobj->{'title'};
283 $classobj->classify($doc_obj);
284 }
285}
286
287# output_classify_info outputs all the info needed for the classification
288# to the gdbm
289sub output_classify_info {
290 my ($classifiers, $handle, $remove_empty_classifications, $gli) = @_;
291# $handle = "main::STDOUT";
292
293 $gli = 0 unless defined $gli;
294
295 # create a classification containing all the info
296 my $classifyinfo = {'classifyOID'=>'browse',
297 'contains'=>[]};
298
299 # get each of the classifications
300 foreach $classobj (@$classifiers) {
301 my $tempinfo = $classobj->get_classify_info($gli);
302 my $classID = $tempinfo->{'classifyOID'};
303
304 $tempinfo->{'classifyOID'} = "CL$next_classify_num" unless defined($tempinfo->{'classifyOID'});
305 $next_classify_num++;
306
307 print STDERR "*** outputting information for classifier: $tempinfo->{'classifyOID'}\n";
308
309 push (@{$classifyinfo->{'contains'}}, $tempinfo);
310 }
311
312 &print_classify_info ($handle, $classifyinfo, "", $remove_empty_classifications);
313}
314
315sub print_classify_info {
316 my ($handle, $classifyinfo, $OID, $remove_empty_classifications) = @_;
317
318 $OID =~ s/^\.+//; # just for good luck
319
320 # book information is printed elsewhere
321 return if (defined ($classifyinfo->{'OID'}));
322
323 # don't want empty classifications
324 return if (&check_contents ($classifyinfo, $remove_empty_classifications) == 0 && $remove_empty_classifications);
325
326 $OID = $classifyinfo->{'classifyOID'} if defined ($classifyinfo->{'classifyOID'});
327
328 my $outputtext = "[$OID]\n";
329 $outputtext .= "<doctype>classify\n";
330 $outputtext .= "<hastxt>0\n";
331 $outputtext .= "<childtype>$classifyinfo->{'childtype'}\n"
332 if defined $classifyinfo->{'childtype'};
333 $outputtext .= "<Title>$classifyinfo->{'Title'}\n"
334 if defined $classifyinfo->{'Title'};
335 $outputtext .= "<numleafdocs>$classifyinfo->{'numleafdocs'}\n"
336 if defined $classifyinfo->{'numleafdocs'};
337 $outputtext .= "<thistype>$classifyinfo->{'thistype'}\n"
338 if defined $classifyinfo->{'thistype'};
339 $outputtext .= "<parameters>$classifyinfo->{'parameters'}\n"
340 if defined $classifyinfo->{'parameters'};
341 $outputtext .= "<supportsmemberof>$classifyinfo->{'supportsmemberof'}\n"
342 if defined $classifyinfo->{'supportsmemberof'};
343
344 my $contains_text = "<contains>";
345 my $mdoffset_text = "<mdoffset>";
346
347 my $next_subOID = 1;
348 my $first = 1;
349 foreach $tempinfo (@{$classifyinfo->{'contains'}}) {
350 # empty contents were made undefined by clean_contents()
351 next unless defined $tempinfo;
352
353 if (!defined ($tempinfo->{'classifyOID'}) ||
354 $tempinfo->{'classifyOID'} ne "oai") {
355 $contains_text .= ";" unless $first;
356 }
357 $mdoffset_text .= ";" unless $first;
358 $first = 0;
359
360 if (defined ($tempinfo->{'classifyOID'})) {
361 if ($tempinfo->{'classifyOID'} ne "oai") {
362 $contains_text .= $tempinfo->{'classifyOID'};
363 }
364
365 # Extra code for incremental building.
366 # We need to store a listing of the classifiers each DOI is in
367 my $clids = [];
368 #rint STDERR "==1. Recording reverse lookup for " . $tempinfo->{'classifyOID'} . "==\n";
369 if(defined($oid_to_clids->{$tempinfo->{'classifyOID'}})) {
370 #rint STDERR "Found existing array!\n";
371 $clids = $oid_to_clids->{$tempinfo->{'classifyOID'}};
372 }
373 #rint STDERR "Appended $OID to \"" . join(";", @{$clids}) . "\"\n";
374 push(@{$clids}, $OID);
375 $oid_to_clids->{$tempinfo->{'classifyOID'}} = $clids;
376 #rint STDERR "Result: \"" . join(";", @{$clids}) . "\"\n";
377
378 &print_classify_info ($handle, $tempinfo, $tempinfo->{'classifyOID'},
379 $remove_empty_classifications);
380 } elsif (defined ($tempinfo->{'OID'})) {
381 $contains_text .= $tempinfo->{'OID'};
382 $mdoffset_text .= $tempinfo->{'offset'} if (defined ($tempinfo->{'offset'}));
383
384
385 # note: we don't want to print the contents of the books
386 # Extra code for incremental building.
387 # We need to store a listing of the classifiers each DOI is in
388 my $clids = [];
389 #rint STDERR "==2. Recording reverse lookup for " . $tempinfo->{'OID'} . "==\n";
390 if(defined($oid_to_clids->{$tempinfo->{'OID'}})) {
391 #rint STDERR "Found existing array!\n";
392 $clids = $oid_to_clids->{$tempinfo->{'OID'}};
393 }
394 #rint STDERR "Appended $OID to \"" . join(";", @{$clids}) . "\"\n";
395 push(@{$clids}, $OID);
396 $oid_to_clids->{$tempinfo->{'OID'}} = $clids;
397 #rint STDERR "Result: \"" . join(";", @{$clids}) . "\"\n";
398
399
400 } else {
401
402 # Supress having top-level node in Collage classifier
403 # so no bookshelf icon appears, top-level, along with the
404 # applet
405
406 if (!defined ($tempinfo->{'Title'}) || $tempinfo->{'Title'} ne "Collage") {
407 $contains_text .= "\".$next_subOID";
408 }
409
410 # Extra code for incremental building.
411 # We need to store a listing of the classifiers each DOI is in
412 my $clids = [];
413 #rint STDERR "==3. Recording reverse lookup for $OID.$next_subOID==\n";
414 if(defined($oid_to_clids->{$OID . "." . $next_subOID})) {
415 #rint STDERR "Found existing array!\n";
416 $clids = $oid_to_clids->{$OID . "." . $next_subOID};
417 }
418 #rint STDERR "Appended $OID to \"" . join(";", @{$clids}) . "\"\n";
419 push(@{$clids}, $OID);
420 $oid_to_clids->{$OID . "." . $next_subOID} = $clids;
421 #rint STDERR "Result: \"" . join(";", @{$clids}) . "\"\n";
422
423 &print_classify_info ($handle, $tempinfo, "$OID.$next_subOID",
424 $remove_empty_classifications);
425 $next_subOID++;
426 }
427 }
428
429 $outputtext .= "$contains_text\n";
430 $outputtext .= "<mdtype>$classifyinfo->{'mdtype'}\n"
431 if defined $classifyinfo->{'mdtype'};
432 $outputtext .= "$mdoffset_text\n"
433 if ($mdoffset_text !~ m/^<mdoffset>;+$/);
434
435 $outputtext .= '-' x 70 . "\n";
436
437 print $handle $outputtext;
438
439}
440
441sub check_contents {
442 my ($classifyinfo,$remove_empty_classifications) = @_;
443 $remove_empty_classifications = 0 unless ($remove_empty_classifications);
444 my $num_leaf_docs = 0;
445 my $sub_num_leaf_docs = 0;
446
447 return $classifyinfo->{'numleafdocs'} if (defined $classifyinfo->{'numleafdocs'});
448
449 foreach $content (@{$classifyinfo->{'contains'}}) {
450 if (defined $content->{'OID'}) {
451 # found a book
452 $num_leaf_docs ++;
453 } elsif (($sub_num_leaf_docs = &check_contents ($content,$remove_empty_classifications)) > 0) {
454 # there's a book somewhere below
455 $num_leaf_docs += $sub_num_leaf_docs;
456 } else {
457 if ($remove_empty_classifications){
458 # section contains no books so we want to remove
459 # it from its parents contents
460 $content = undef;
461 }
462 }
463 }
464
465 $classifyinfo->{'numleafdocs'} = $num_leaf_docs;
466 return $num_leaf_docs;
467}
468
4691;
Note: See TracBrowser for help on using the repository browser.