source: trunk/gsdl/perllib/classify.pm@ 12270

Last change on this file since 12270 was 11994, checked in by davidb, 18 years ago

Improved support for incremental addition: instead of having to run the
classifier pass of buildcol.pl from scratch (i.e. read in all documents
from the archives folder) so correct browse structures are formed -- a
simple to implement strategy, but not very efficient -- the first layer
of a classifier structure is now reconstructed from the GDBM file. Then
the new files in the archives directory are added, and then finally the
completed browser structure is formed.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.7 KB
Line 
1###########################################################################
2#
3# classify.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# functions to handle classifiers
27
28package classify;
29
30require util;
31require AllList;
32use gsprintf;
33
34use GDBM_File;
35
36
37sub gsprintf
38{
39 return &gsprintf::gsprintf(@_);
40}
41
42
43$next_classify_num = 1;
44sub load_classifier_for_info {
45 my ($classifier) = shift @_;
46
47 # find the classifier
48 my $colclassname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
49 "perllib/classify",
50 "${classifier}.pm");
51 my $mainclassname = &util::filename_cat($ENV{'GSDLHOME'},
52 "perllib/classify",
53 "${classifier}.pm");
54
55 if (-e $colclassname) { require $colclassname; }
56 elsif (-e $mainclassname) { require $mainclassname; }
57 else {
58 &gsprintf(STDERR, "{classify.could_not_find_classifier}\n", $classifier) && die "\n";
59 }
60 my ($classobj);
61 my $options = "-gsdlinfo";
62 eval ("\$classobj = new \$classifier([],[$options])");
63 die "$@" if $@;
64
65 return $classobj;
66}
67
68sub load_classifiers {
69 my ($classify_list, $build_dir, $outhandle) = @_;
70 my @classify_objects = ();
71 my $classify_number = 1;
72
73 foreach $classifyoption (@$classify_list) {
74
75 # get the classifier name
76 my $classname = shift @$classifyoption;
77 next unless defined $classname;
78
79 # find the classifier
80 my $colclassname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"perllib/classify",
81 "${classname}.pm");
82 my $mainclassname = &util::filename_cat($ENV{'GSDLHOME'},"perllib/classify",
83 "${classname}.pm");
84
85 if (-e $colclassname) { require $colclassname; }
86 elsif (-e $mainclassname) { require $mainclassname; }
87 else { &gsprintf(STDERR, "{classify.could_not_find_classifier}\n", $classname) && die "\n";
88 # die "ERROR - couldn't find classifier \"$classname\"\n";
89 }
90
91 # create the classify object
92 my ($classobj);
93
94 my @newoptions;
95
96 # do these first so they can be overriden by user supplied options
97 push @newoptions, "-builddir", "$build_dir" if ($build_dir);
98 push @newoptions, "-outhandle", "$outhandle" if ($outhandle);
99 push @newoptions, "-verbosity", "2";
100
101 # backwards compatability hack: if the classifier options are
102 # in "x=y" format, convert them to parsearg ("-x y") format.
103 my ($opt, $key, $value);
104 foreach $opt (@$classifyoption) {
105 # if ($opt =~ /^(\w+)=(.*)$/) {
106 # push @newoptions, "-$1", $2;
107 # } else {
108 push @newoptions, $opt;
109 #}
110 }
111
112 map { $_ = "\"$_\""; } @newoptions;
113 my $options .= join (",", @newoptions);
114
115
116 eval ("\$classobj = new \$classname([],[$options])");
117 die "$@" if $@;
118
119 $classobj->set_number($classify_number);
120 $classify_number ++;
121
122 # add this object to the list
123 push (@classify_objects, $classobj);
124 }
125
126 my ($classobj);
127 eval ("\$classobj = new AllList()");
128 die "$@" if $@;
129 push (@classify_objects, $classobj);
130
131 return \@classify_objects;
132}
133
134# init_classifiers resets all the classifiers and readys them to process
135# the documents.
136sub init_classifiers {
137 my ($classifiers) = @_;
138
139 foreach $classobj (@$classifiers) {
140 $classobj->init();
141 }
142}
143
144
145
146# takes a hashref containing the metadata for a gdbmfile entry, and extracts
147# the childrens numbers (from the 'contains' entry).
148# assumes format is ".1;".2;".3
149sub get_children {
150 my ($doc_db_hash) = @_;
151
152 my $children = undef;
153
154 $childs = $doc_db_hash->{'contains'};
155 if (defined ($childs)) {
156 $childs =~ s/\@$//; #remove trailing @
157 $childs =~ s/^\"\.//; #remove initial ".
158 @$children = split /\;\"\./, $childs;
159
160 }
161
162 return $children;
163}
164
165
166sub recurse_sections {
167 my ($doc_obj, $children, $parentoid, $parentsection, $gdbm_recs) = @_;
168
169 return if (!defined $children);
170
171 foreach my $child (sort { $a <=> $b} @$children) {
172 $doc_obj->create_named_section("$parentsection.$child");
173 my $doc_db_rec = $gdbm_recs->{"$parentoid.$child"};
174 my $doc_db_hash = db_rec_to_hash($doc_db_rec);
175
176 # get child's children
177 my $newchildren = &get_children($doc_db_hash);
178
179 # add content for current section
180 add_section_content($doc_obj, "$parentsection.$child", $doc_db_hash);
181
182 # process all the children if there are any
183 if (defined ($newchildren))
184 {
185 recurse_sections($doc_obj, $newchildren, "$parentoid.$child",
186 "$parentsection.$child", $gdbm_recs);
187 }
188 }
189}
190
191
192sub add_section_content {
193 my ($doc_obj, $cursection, $doc_db_hash) = @_;
194
195 foreach $key (keys %$doc_db_hash) {
196 #don't need to store these metadata
197 next if $key =~ /(thistype|childtype|contains|docnum|doctype|classifytype)/i;
198 # but do want things like hastxt and archivedir
199 my @items = split /@/, $doc_db_hash->{$key};
200 map {$doc_obj->add_metadata ($cursection, $key, $_); } @items;
201
202 }
203}
204
205
206# gets all the metadata from a gdbm file entry, and puts it into a hashref
207sub db_rec_to_hash {
208
209 my ($gdb_str_ref) = @_;
210
211 my $hashref = {};
212
213 my @entries = split(/\n/, $gdb_str_ref);
214 foreach $entry (@entries) {
215 my($key, $value) = ($entry =~ /^<([^>]*)>(.*?)$/ );
216 $hashref->{$key} .= '@' if defined $hashref->{$key};
217 $hashref->{$key} .= $value;
218
219 }
220
221 return $hashref;
222}
223
224
225sub reconstruct_doc_objs_metadata
226{
227 my ($fulldbname) = @_;
228
229 tie %gdbm_recs, 'GDBM_File', $fulldbname, &GDBM_WRCREAT, 0640;
230
231 # dig out top level doc sections
232 my %top_sections = ();
233 foreach my $key ( keys %gdbm_recs )
234 {
235 my $md_rec = $gdbm_recs{$key};
236 my $md_hash = db_rec_to_hash($md_rec);
237
238 if ((defined $md_hash->{'doctype'}) && ($md_hash->{'doctype'} eq "doc")) {
239 next if ($key =~ m/\./);
240 $top_sections{$key} = $md_hash;
241 }
242 }
243
244 # for greenstone document objects based on metadata in gdbm file
245 my @all_docs = ();
246 foreach my $oid ( keys %top_sections )
247 {
248 my $doc_db_hash = $top_sections{$oid};
249
250 my $doc_obj = new doc();
251 $doc_obj->set_OID($oid);
252
253 my $top = $doc_obj->get_top_section();
254 add_section_content ($doc_obj, $top, $doc_db_hash);
255 my $children = &get_children($doc_db_hash);
256 recurse_sections($doc_obj, $children, $oid, $top, \%gdbm_recs);
257
258 push(@all_docs,$doc_obj);
259 }
260
261 untie %gdbm_recs;
262
263 return \@all_docs;
264}
265
266
267
268
269
270# classify_doc lets each of the classifiers classify a document
271sub classify_doc {
272 my ($classifiers, $doc_obj) = @_;
273
274 foreach $classobj (@$classifiers) {
275 my $title = $classobj->{'title'};
276 $classobj->classify($doc_obj);
277 }
278}
279
280# output_classify_info outputs all the info needed for the classification
281# to the gdbm
282sub output_classify_info {
283 my ($classifiers, $handle, $remove_empty_classifications, $gli) = @_;
284# $handle = "main::STDOUT";
285
286 $gli = 0 unless defined $gli;
287
288 # create a classification containing all the info
289 my $classifyinfo = {'classifyOID'=>'browse',
290 'contains'=>[]};
291
292 # get each of the classifications
293 foreach $classobj (@$classifiers) {
294 my $tempinfo = $classobj->get_classify_info($gli);
295 my $classID = $tempinfo->{'classifyOID'};
296
297 $tempinfo->{'classifyOID'} = "CL$next_classify_num" unless defined($tempinfo->{'classifyOID'});
298 $next_classify_num++;
299 push (@{$classifyinfo->{'contains'}}, $tempinfo);
300 }
301
302 &print_classify_info ($handle, $classifyinfo, "", $remove_empty_classifications);
303}
304
305sub print_classify_info {
306 my ($handle, $classifyinfo, $OID, $remove_empty_classifications) = @_;
307
308 $OID =~ s/^\.+//; # just for good luck
309
310 # book information is printed elsewhere
311 return if (defined ($classifyinfo->{'OID'}));
312
313 # don't want empty classifications
314 return if (&check_contents ($classifyinfo, $remove_empty_classifications) == 0 && $remove_empty_classifications);
315
316 $OID = $classifyinfo->{'classifyOID'} if defined ($classifyinfo->{'classifyOID'});
317
318 my $outputtext = "[$OID]\n";
319 $outputtext .= "<doctype>classify\n";
320 $outputtext .= "<hastxt>0\n";
321 $outputtext .= "<childtype>$classifyinfo->{'childtype'}\n"
322 if defined $classifyinfo->{'childtype'};
323 $outputtext .= "<Title>$classifyinfo->{'Title'}\n"
324 if defined $classifyinfo->{'Title'};
325 $outputtext .= "<numleafdocs>$classifyinfo->{'numleafdocs'}\n"
326 if defined $classifyinfo->{'numleafdocs'};
327 $outputtext .= "<thistype>$classifyinfo->{'thistype'}\n"
328 if defined $classifyinfo->{'thistype'};
329 $outputtext .= "<parameters>$classifyinfo->{'parameters'}\n"
330 if defined $classifyinfo->{'parameters'};
331 $outputtext .= "<supportsmemberof>$classifyinfo->{'supportsmemberof'}\n"
332 if defined $classifyinfo->{'supportsmemberof'};
333
334 my $contains_text = "<contains>";
335 my $mdoffset_text = "<mdoffset>";
336
337 my $next_subOID = 1;
338 my $first = 1;
339 foreach $tempinfo (@{$classifyinfo->{'contains'}}) {
340 # empty contents were made undefined by clean_contents()
341 next unless defined $tempinfo;
342
343 if (!defined ($tempinfo->{'classifyOID'}) ||
344 $tempinfo->{'classifyOID'} ne "oai") {
345 $contains_text .= ";" unless $first;
346 }
347 $mdoffset_text .= ";" unless $first;
348 $first = 0;
349
350 if (defined ($tempinfo->{'classifyOID'})) {
351 if ($tempinfo->{'classifyOID'} ne "oai") {
352 $contains_text .= $tempinfo->{'classifyOID'};
353 }
354 &print_classify_info ($handle, $tempinfo, $tempinfo->{'classifyOID'},
355 $remove_empty_classifications);
356 } elsif (defined ($tempinfo->{'OID'})) {
357 $contains_text .= $tempinfo->{'OID'};
358 $mdoffset_text .= $tempinfo->{'offset'}
359 if (defined ($tempinfo->{'offset'}))
360 # note: we don't want to print the contents of the books
361 } else {
362
363 # Supress having top-level node in Collage classifier
364 # so no bookshelf icon appears, top-level, along with the
365 # applet
366
367 if (!defined ($tempinfo->{'Title'}) || $tempinfo->{'Title'} ne "Collage") {
368 $contains_text .= "\".$next_subOID";
369 }
370
371 &print_classify_info ($handle, $tempinfo, "$OID.$next_subOID",
372 $remove_empty_classifications);
373 $next_subOID++;
374 }
375 }
376
377 $outputtext .= "$contains_text\n";
378 $outputtext .= "<mdtype>$classifyinfo->{'mdtype'}\n"
379 if defined $classifyinfo->{'mdtype'};
380 $outputtext .= "$mdoffset_text\n"
381 if ($mdoffset_text !~ m/^<mdoffset>;+$/);
382
383 $outputtext .= '-' x 70 . "\n";
384
385 print $handle $outputtext;
386
387}
388
389sub check_contents {
390 my ($classifyinfo,$remove_empty_classifications) = @_;
391 $remove_empty_classifications = 0 unless ($remove_empty_classifications);
392 my $num_leaf_docs = 0;
393 my $sub_num_leaf_docs = 0;
394
395 return $classifyinfo->{'numleafdocs'} if (defined $classifyinfo->{'numleafdocs'});
396
397 foreach $content (@{$classifyinfo->{'contains'}}) {
398 if (defined $content->{'OID'}) {
399 # found a book
400 $num_leaf_docs ++;
401 } elsif (($sub_num_leaf_docs = &check_contents ($content,$remove_empty_classifications)) > 0) {
402 # there's a book somewhere below
403 $num_leaf_docs += $sub_num_leaf_docs;
404 } else {
405 if ($remove_empty_classifications){
406 # section contains no books so we want to remove
407 # it from its parents contents
408 $content = undef;
409 }
410 }
411 }
412
413 $classifyinfo->{'numleafdocs'} = $num_leaf_docs;
414 return $num_leaf_docs;
415}
416
4171;
Note: See TracBrowser for help on using the repository browser.