Context Navigation

source: trunk/gsdl/perllib/classify.pm@ 14112

Last change on this file since 14112 was 14112, checked in by sjboddie, 17 years ago
More modifications to support additional collection-level customisations to be put in gsdl/collect/COLLECTION/custom/COLLECTION. basebuilder.pm, classify.pm, colcfg.pm, and plugin.pm were modified to allow collection-specific plugins, classifiers, builders, and buildprocs to be located in the new locations. These changes should not have any effect on existing collections.
Property svn:keywords set to `Author Date Id Revision`
File size: 14.9 KB

Line
1	###########################################################################
2	#
3	# classify.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# functions to handle classifiers
27
28	package classify;
29
30	require util;
31	require AllList;
32	use gsprintf;
33
34	#use GDBM_File;
35	use unbuildutil;
36
37
# Package-local shorthand for gsprintf::gsprintf.
# All arguments are forwarded untouched and the callee's return value is
# handed straight back to the caller.
sub gsprintf {
    return &gsprintf::gsprintf(@_);
}
42
43
# Number used to build the default "CL<n>" identifier for the next
# top-level classifier handled by output_classify_info.
our $next_classify_num = 1;

# Reverse lookup filled in by print_classify_info: maps an OID (document
# or classifier node) to an array ref of the classifier node OIDs that
# contain it.
our $oid_to_clids = {};
46
# load_classifier_for_info locates and loads a single classifier module and
# returns an instance of it constructed with the "-gsdlinfo" option.
#
# $classifier - name of the classifier (the module file is "<name>.pm")
#
# The module is looked for, in order, in
#   $GSDLCOLLECTDIR/custom/$GSDLCOLLECTION/perllib/classify
#   $GSDLCOLLECTDIR/perllib/classify
#   $GSDLHOME/perllib/classify
# and the first file that exists is loaded. Dies if no file is found or if
# the constructor fails.
sub load_classifier_for_info {
    my ($classifier) = shift @_;

    # candidate locations, most specific first
    my @candidates = (
        &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "custom", $ENV{'GSDLCOLLECTION'},
                            "perllib", "classify", "${classifier}.pm"),
        &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "perllib", "classify", "${classifier}.pm"),
        &util::filename_cat($ENV{'GSDLHOME'}, "perllib", "classify", "${classifier}.pm"),
    );

    my ($classfile) = grep { -e $_ } @candidates;
    if (defined $classfile) {
        require $classfile;
    }
    else {
        # Always abort here; the die must not depend on what the message
        # printer happens to return.
        gsprintf('STDERR', "{classify.could_not_find_classifier}\n", $classifier);
        die "\n";
    }

    # Call the constructor directly rather than through a string eval, so
    # the classifier name is never interpolated into executable code.
    my $classobj = eval { $classifier->new([], ["-gsdlinfo"]) };
    die "$@" if $@;

    return $classobj;
}
69
# load_classifiers loads and constructs every classifier named in
# $classify_list and returns a reference to the list of objects.
#
# $classify_list - array ref; each element is an array ref whose first
#                  item is the classifier name, followed by its options.
#                  The name is shifted off the element, so the caller's
#                  list is modified.
# $build_dir     - if true, passed to each classifier as "-builddir"
# $outhandle     - if true, passed (stringified) as "-outhandle"
#
# Each classifier is looked for, in order, in
#   $GSDLCOLLECTDIR/custom/$GSDLCOLLECTION/perllib/classify
#   $GSDLCOLLECTDIR/perllib/classify
#   $GSDLHOME/perllib/classify
# Objects are numbered from 1 with set_number(). An AllList object is
# always appended as the final entry. Dies if a classifier cannot be found
# or a constructor fails.
sub load_classifiers {
    my ($classify_list, $build_dir, $outhandle) = @_;
    my @classify_objects = ();
    my $classify_number  = 1;

    # make collection-level classifiers visible to require/use, without
    # adding the same directory again on repeated calls
    my $colclassdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "perllib/classify");
    unshift(@INC, $colclassdir) unless grep { $_ eq $colclassdir } @INC;

    foreach my $classifyoption (@$classify_list) {

        # get the classifier name
        my $classname = shift @$classifyoption;
        next unless defined $classname;

        # find the classifier, most specific location first
        my @candidates = (
            &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "custom", $ENV{'GSDLCOLLECTION'},
                                "perllib", "classify", "${classname}.pm"),
            &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "perllib", "classify", "${classname}.pm"),
            &util::filename_cat($ENV{'GSDLHOME'}, "perllib", "classify", "${classname}.pm"),
        );

        my ($classfile) = grep { -e $_ } @candidates;
        if (defined $classfile) {
            require $classfile;
        }
        else {
            # Always abort here; the die must not depend on what the
            # message printer happens to return.
            gsprintf('STDERR', "{classify.could_not_find_classifier}\n", $classname);
            die "\n";
        }

        # Built-in options go first so they can be overridden by the user
        # supplied options that follow.
        my @newoptions;
        push @newoptions, "-builddir",  "$build_dir" if ($build_dir);
        push @newoptions, "-outhandle", "$outhandle" if ($outhandle);
        push @newoptions, "-verbosity", "2";
        push @newoptions, @$classifyoption;

        # Create the classify object. The options are handed over as data;
        # they are never spliced into evaluated source text, so characters
        # such as \ $ @ and " reach the classifier exactly as configured.
        my $classobj = eval { $classname->new([], \@newoptions) };
        die "$@" if $@;

        $classobj->set_number($classify_number);
        $classify_number++;

        # add this object to the list
        push(@classify_objects, $classobj);
    }

    # the AllList classifier is always present, after the configured ones
    my $alllist = eval { AllList->new() };
    die "$@" if $@;
    push(@classify_objects, $alllist);

    return \@classify_objects;
}
139
# init_classifiers resets all the classifiers and readies them to process
# the documents, by calling init() on each one in list order.
#
# $classifiers - array ref of classifier objects
sub init_classifiers {
    my ($classifiers) = @_;

    for my $classifier (@$classifiers) {
        $classifier->init();
    }
}
149
150
151
# get_children takes a hash ref containing the metadata for a gdbm file
# entry and extracts the children's numbers from its 'contains' entry.
# The expected format is  ".1;".2;".3  optionally followed by a trailing @.
#
# Returns an array ref of the child numbers (e.g. [1, 2, 3]), or undef when
# the entry has no 'contains' value. The passed hash is not modified.
sub get_children {
    my ($doc_db_hash) = @_;

    my $children = undef;

    # work on a lexical copy: nothing leaks into package state between calls
    my $contains = $doc_db_hash->{'contains'};
    if (defined($contains)) {
        $contains =~ s/\@$//;     # remove trailing @
        $contains =~ s/^"\.//;    # remove initial ".
        $children = [ split /;"\./, $contains ];
    }

    return $children;
}
170
171
# recurse_sections rebuilds the subsections of a document, depth first.
#
# $doc_obj       - document object being rebuilt
# $children      - array ref of child numbers of the current section;
#                  nothing is done when it is undef
# $parentoid     - OID of the current section (key prefix in $gdbm_recs)
# $parentsection - name of the current section inside $doc_obj
# $gdbm_recs     - hash ref of raw database records keyed by OID
#
# Children are visited in ascending numeric order. For each one a named
# section is created, its record is parsed and its metadata added, and then
# its own children are processed the same way.
sub recurse_sections {
    my ($doc_obj, $children, $parentoid, $parentsection, $gdbm_recs) = @_;

    return unless defined $children;

    my @ordered = sort { $a <=> $b } @$children;
    foreach my $number (@ordered) {
        my $section_name = "$parentsection.$number";
        my $section_oid  = "$parentoid.$number";

        $doc_obj->create_named_section($section_name);
        my $section_hash = db_rec_to_hash($gdbm_recs->{$section_oid});

        # look up the grandchildren before adding this section's content
        my $grandchildren = get_children($section_hash);

        add_section_content($doc_obj, $section_name, $section_hash);

        # a section without children makes the recursive call return at once
        recurse_sections($doc_obj, $grandchildren, $section_oid,
                         $section_name, $gdbm_recs);
    }
}
196
197
# add_section_content copies the metadata of one database record into a
# section of a document object.
#
# $doc_obj     - document object; add_metadata() is called on it
# $cursection  - section the metadata is added to
# $doc_db_hash - hash ref of metadata; multiple values for one key are
#                joined with '@' and are added one at a time
#
# Keys whose name contains thistype, childtype, contains, docnum, doctype
# or classifytype (case-insensitive) are structural and are not copied.
# NOTE(review): the match is a substring match, so a key such as
# "mydoctype" is skipped as well - confirm that this is intended.
sub add_section_content {
    my ($doc_obj, $cursection, $doc_db_hash) = @_;

    foreach my $key (keys %$doc_db_hash) {
        # don't need to store these metadata
        next if $key =~ /(?:thistype|childtype|contains|docnum|doctype|classifytype)/i;

        # but do want things like hastxt and archivedir
        foreach my $item (split /@/, $doc_db_hash->{$key}) {
            $doc_obj->add_metadata($cursection, $key, $item);
        }
    }
}
210
211
# db_rec_to_hash gets all the metadata from a gdbm file entry and puts it
# into a hash ref.
#
# $gdb_str - the raw record text: one "<key>value" pair per line
#
# Returns a hash ref mapping each key to its value. When a key occurs more
# than once its values are joined with '@'. Lines that are not of the form
# "<key>value" are ignored, and an undefined record gives an empty hash.
sub db_rec_to_hash {
    my ($gdb_str) = @_;

    my $hashref = {};
    return $hashref unless defined $gdb_str;

    foreach my $entry (split(/\n/, $gdb_str)) {
        # key is everything between the angle brackets, value is the rest
        my ($key, $value) = ($entry =~ /^<([^>]*)>(.*)$/);
        next unless defined $key;

        $hashref->{$key} .= '@' if defined $hashref->{$key};
        $hashref->{$key} .= $value;
    }

    return $hashref;
}
229
230
# reconstruct_doc_objs_metadata rebuilds document objects from the metadata
# stored in a database file.
#
# $fulldbname - name of the database, as passed to unbuildutil::read_gdbm
#
# Every record whose key has no '.' and whose doctype is "doc" is treated
# as a top-level document. For each of those a doc object is created, its
# OID set, its top section filled with the record's metadata, and its
# subsections rebuilt with recurse_sections.
#
# Returns an array ref of the doc objects, ordered by ascending docnum so
# that they come out in the order in which they were originally processed.
sub reconstruct_doc_objs_metadata
{
    my ($fulldbname) = @_;

    my %gdbm_recs;
    &unbuildutil::read_gdbm($fulldbname, \%gdbm_recs);

    # dig out top level doc sections
    my %top_sections = ();
    my %top_docnums  = ();
    foreach my $key (keys %gdbm_recs) {
        # keys containing a '.' are subsections, never top-level documents
        next if ($key =~ m/\./);

        my $md_hash = db_rec_to_hash($gdbm_recs{$key});
        my $doctype = $md_hash->{'doctype'};
        next unless (defined $doctype) && ($doctype eq "doc");

        $top_sections{$key} = $md_hash;
        $top_docnums{$key}  = $md_hash->{'docnum'};
    }

    # we need to make sure the documents were processed in the same order
    # as before, so sort based on their docnums
    my @ordered_oids = sort { $top_docnums{$a} <=> $top_docnums{$b} } keys %top_sections;

    # form greenstone document objects based on metadata in the gdbm file
    my @all_docs = ();
    foreach my $oid (@ordered_oids) {
        my $doc_db_hash = $top_sections{$oid};

        my $doc_obj = 'doc'->new();
        $doc_obj->set_OID($oid);

        my $top = $doc_obj->get_top_section();
        add_section_content($doc_obj, $top, $doc_db_hash);
        recurse_sections($doc_obj, get_children($doc_db_hash), $oid, $top, \%gdbm_recs);

        push(@all_docs, $doc_obj);
    }

    return \@all_docs;
}
278
279
280
281
282
# classify_doc lets each of the classifiers classify a document.
#
# $classifiers - array ref of classifier objects
# $doc_obj     - the document, passed to classify() on every classifier in
#                list order
sub classify_doc {
    my ($classifiers, $doc_obj) = @_;

    foreach my $classobj (@$classifiers) {
        $classobj->classify($doc_obj);
    }
}
292
# output_classify_info outputs all the info needed for the classification
# to the gdbm.
#
# $classifiers                  - array ref of classifier objects
# $handle                       - handle the records are printed to
# $remove_empty_classifications - passed through to print_classify_info
# $gli                          - passed to get_classify_info(); defaults
#                                 to 0 when undefined
#
# Each classifier's info becomes a child of a top-level "browse" node. An
# info without a classifyOID is given "CL<n>", where <n> is the package
# counter $next_classify_num; the counter advances for every classifier,
# whether or not its value was used.
sub output_classify_info {
    my ($classifiers, $handle, $remove_empty_classifications, $gli) = @_;

    $gli = 0 unless defined $gli;

    # get each of the classifications
    my @top_level = ();
    for my $classifier (@$classifiers) {
        my $info = $classifier->get_classify_info($gli);

        if (!defined($info->{'classifyOID'})) {
            $info->{'classifyOID'} = "CL$next_classify_num";
        }
        $next_classify_num++;

        print STDERR "*** outputting information for classifier: $info->{'classifyOID'}\n";

        push(@top_level, $info);
    }

    # create a classification containing all the info
    my $classifyinfo = {'classifyOID' => 'browse',
                        'contains'    => \@top_level};

    print_classify_info($handle, $classifyinfo, "", $remove_empty_classifications);
}
320
# print_classify_info prints the database record for one classification
# node, after first printing the records of the nodes below it.
#
# $handle                       - handle the records are printed to
# $classifyinfo                 - hash ref describing the node; its
#                                 'contains' list holds the children
# $OID                          - OID to use for the node when it has no
#                                 'classifyOID' of its own
# $remove_empty_classifications - if true, a node for which check_contents
#                                 reports no leaf documents is not printed
#
# Nothing is printed for a node that has an 'OID' entry (a document).
# As a side effect, every child is recorded in the package-level
# $oid_to_clids reverse lookup (child OID -> list of containing node OIDs).
sub print_classify_info {
    my ($handle, $classifyinfo, $OID, $remove_empty_classifications) = @_;

    $OID =~ s/^\.+//; # just for good luck

    # book information is printed elsewhere
    return if (defined($classifyinfo->{'OID'}));

    # don't want empty classifications (check_contents is always called:
    # it also fills in 'numleafdocs', which is printed below)
    return if (check_contents($classifyinfo, $remove_empty_classifications) == 0
               && $remove_empty_classifications);

    $OID = $classifyinfo->{'classifyOID'} if defined($classifyinfo->{'classifyOID'});

    my $outputtext = "[$OID]\n";
    $outputtext .= "<doctype>classify\n";
    $outputtext .= "<hastxt>0\n";
    # optional fields, printed in this fixed order when present
    foreach my $field ('childtype', 'Title', 'numleafdocs', 'thistype',
                       'parameters', 'supportsmemberof') {
        $outputtext .= "<$field>$classifyinfo->{$field}\n"
            if defined $classifyinfo->{$field};
    }

    my $contains_text = "<contains>";
    my $mdoffset_text = "<mdoffset>";

    my $next_subOID = 1;
    my $first = 1;
    foreach my $tempinfo (@{$classifyinfo->{'contains'}}) {
        # empty contents were made undefined by check_contents()
        next unless defined $tempinfo;

        my $child_clid = $tempinfo->{'classifyOID'};
        my $is_oai     = (defined($child_clid) && $child_clid eq "oai");

        # the "oai" classifier is left out of the contains list, so it
        # gets no separator there either
        $contains_text .= ";" unless ($first || $is_oai);
        $mdoffset_text .= ";" unless $first;
        $first = 0;

        if (defined($child_clid)) {
            $contains_text .= $child_clid unless $is_oai;

            # Extra code for incremental building.
            # We need to store a listing of the classifiers each OID is in
            push(@{ $oid_to_clids->{$child_clid} }, $OID);

            print_classify_info($handle, $tempinfo, $child_clid,
                                $remove_empty_classifications);
        }
        elsif (defined($tempinfo->{'OID'})) {
            $contains_text .= $tempinfo->{'OID'};
            $mdoffset_text .= $tempinfo->{'offset'} if (defined($tempinfo->{'offset'}));

            # note: we don't want to print the contents of the books,
            # only record which classifier node they sit in
            push(@{ $oid_to_clids->{$tempinfo->{'OID'}} }, $OID);
        }
        else {
            # Suppress having top-level node in Collage classifier
            # so no bookshelf icon appears, top-level, along with the
            # applet
            if (!defined($tempinfo->{'Title'}) || $tempinfo->{'Title'} ne "Collage") {
                $contains_text .= "\".$next_subOID";
            }

            my $child_oid = "$OID.$next_subOID";
            push(@{ $oid_to_clids->{$child_oid} }, $OID);

            print_classify_info($handle, $tempinfo, $child_oid,
                                $remove_empty_classifications);
            $next_subOID++;
        }
    }

    $outputtext .= "$contains_text\n";
    $outputtext .= "<mdtype>$classifyinfo->{'mdtype'}\n"
        if defined $classifyinfo->{'mdtype'};
    # leave the offsets out when they consist of separators only
    $outputtext .= "$mdoffset_text\n"
        if ($mdoffset_text !~ m/^<mdoffset>;+$/);

    $outputtext .= '-' x 70 . "\n";

    print $handle $outputtext;
}
446
# check_contents counts the leaf documents below a classification node.
#
# $classifyinfo                 - hash ref for the node; children are read
#                                 from its 'contains' list
# $remove_empty_classifications - if true, children with no documents
#                                 below them are replaced by undef in the
#                                 parent's 'contains' list
#
# An entry with an 'OID' counts as one document; any other entry is counted
# recursively. The total is stored in the node's 'numleafdocs' and
# returned. A node that already has 'numleafdocs' returns that value
# without looking at its children.
sub check_contents {
    my ($classifyinfo, $remove_empty_classifications) = @_;
    $remove_empty_classifications = 0 unless ($remove_empty_classifications);

    return $classifyinfo->{'numleafdocs'} if (defined $classifyinfo->{'numleafdocs'});

    my $leaf_total = 0;
    for my $entry (@{$classifyinfo->{'contains'}}) {
        if (defined $entry->{'OID'}) {
            # found a book
            $leaf_total++;
            next;
        }

        my $below = check_contents($entry, $remove_empty_classifications);
        if ($below > 0) {
            # there's a book somewhere below
            $leaf_total += $below;
        }
        elsif ($remove_empty_classifications) {
            # section contains no books so we want to remove
            # it from its parent's contents (assigning through the loop
            # alias changes the list element itself)
            $entry = undef;
        }
    }

    $classifyinfo->{'numleafdocs'} = $leaf_total;
    return $leaf_total;
}
474
475	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: