Context Navigation

source: main/tags/2.74/gsdl/perllib/classify.pm@ 28436

Last change on this file since 28436 was 14270, checked in by oranfry, 17 years ago
merged selected changes to the gsdl trunk since r14217 into the 2.74 branch
Property svn:keywords set to `Author Date Id Revision`
File size: 15.0 KB

Line
1	###########################################################################
2	#
3	# classify.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# functions to handle classifiers
27
28	package classify;
29
30	require util;
31	require AllList;
32	use gsprintf;
33
34	#use GDBM_File;
35	use unbuildutil;
36
37
# gsprintf(...)
#
# Package-local shorthand for gsprintf::gsprintf: the complete argument list
# is handed over unchanged and whatever that call returns is returned.
sub gsprintf {
    # Explicit argument list with the '&' sigil, exactly as the rest of this
    # file calls helpers in other packages.
    return &gsprintf::gsprintf(@_);
}
42
43
# Package-level state shared by the subs below.
#
# $next_classify_num - number used by output_classify_info when it has to
#                      make up a "CL<n>" classifyOID; starts at 1.
# $oid_to_clids      - hash ref used by print_classify_info as a reverse
#                      lookup: each key is an OID, each value an array ref of
#                      the classifier node OIDs that were printed as
#                      containing it.
#
# Declared with 'our' so they remain the same package variables as before
# while also being legal under 'use strict'.
our $next_classify_num = 1;
our $oid_to_clids = {};
46
# load_classifier_for_info($classifier)
#
# Loads the classifier module called $classifier and returns an object built
# from it with the single option "-gsdlinfo".
#
# The file "<classifier>.pm" is looked for, in this order, under
#   1. $GSDLCOLLECTDIR/custom/$GSDLCOLLECTION/perllib/classify
#      (only when the GSDLCOLLECTION environment variable is defined)
#   2. $GSDLCOLLECTDIR/perllib/classify
#   3. $GSDLHOME/perllib/classify
# and the first one that exists is require'd.
#
# Dies with "\n" (after reporting {classify.could_not_find_classifier}
# through gsprintf) when none of the files exists, and re-dies with the
# constructor's error text when the object cannot be created.
sub load_classifier_for_info {
    my ($classifier) = @_;

    my $module_file = "${classifier}.pm";

    # Candidate locations, most specific first.
    my @candidates;
    if (defined($ENV{'GSDLCOLLECTION'})) {
        push @candidates, scalar(&util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "custom", $ENV{'GSDLCOLLECTION'},
                                                     "perllib", "classify", $module_file));
    }
    push @candidates, scalar(&util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "perllib", "classify", $module_file));
    push @candidates, scalar(&util::filename_cat($ENV{'GSDLHOME'}, "perllib", "classify", $module_file));

    my ($found) = grep { defined $_ && -e $_ } @candidates;
    unless (defined $found) {
        # Report, then always stop: the old "gsprintf(...) && die" form only
        # died when gsprintf happened to return a true value.
        # 'STDERR' is the same string the bareword STDERR used to pass.
        &gsprintf('STDERR', "{classify.could_not_find_classifier}\n", $classifier);
        die "\n";
    }
    require $found;

    # Plain method call inside a block eval; no code string is assembled.
    my $classobj = eval { $classifier->new([], ["-gsdlinfo"]) };
    die "$@" if $@;

    return $classobj;
}
73
# load_classifiers($classify_list, $build_dir, $outhandle)
#
# Builds one classifier object per entry of $classify_list and returns an
# array ref holding them in list order, followed by one extra AllList object.
#
# $classify_list - array ref of array refs; each inner list is a classifier
#                  name followed by that classifier's options. Entries whose
#                  name is undefined (e.g. empty lists) are skipped. The
#                  lists are read, not modified.
# $build_dir     - when true, passed to every classifier as "-builddir".
# $outhandle     - when true, passed (stringified) as "-outhandle".
#
# Each listed classifier is numbered through set_number(), starting at 1.
# $GSDLCOLLECTDIR/perllib/classify is put at the front of @INC.
#
# Dies with "\n" (after reporting {classify.could_not_find_classifier}) when
# a classifier module cannot be found, and with the constructor's error text
# when an object cannot be created.
sub load_classifiers {
    my ($classify_list, $build_dir, $outhandle) = @_;
    my @classify_objects = ();
    my $classify_number = 1;

    my $colclassdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"perllib/classify");
    unshift (@INC, $colclassdir);

    foreach my $classifyoption (@$classify_list) {

        # Split name and options from a copy, so the caller's configuration
        # list survives and can be used for another call.
        my ($classname, @useroptions) = @$classifyoption;
        next unless defined $classname;

        # find the classifier: custom collection dir (only when
        # GSDLCOLLECTION is set, as in load_classifier_for_info), then the
        # collection, then the main installation
        my $module_file = "${classname}.pm";
        my @candidates;
        if (defined($ENV{'GSDLCOLLECTION'})) {
            push @candidates, scalar(&util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "custom", $ENV{'GSDLCOLLECTION'},
                                                         "perllib", "classify", $module_file));
        }
        push @candidates, scalar(&util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "perllib", "classify", $module_file));
        push @candidates, scalar(&util::filename_cat($ENV{'GSDLHOME'}, "perllib", "classify", $module_file));

        my ($found) = grep { defined $_ && -e $_ } @candidates;
        unless (defined $found) {
            # Always stop here; the old "gsprintf(...) && die" form depended
            # on gsprintf's return value.
            &gsprintf('STDERR', "{classify.could_not_find_classifier}\n", $classname);
            die "\n";
        }
        require $found;

        # do these first so they can be overridden by user supplied options
        my @newoptions;
        push @newoptions, "-builddir", "$build_dir" if ($build_dir);
        push @newoptions, "-outhandle", "$outhandle" if ($outhandle);
        push @newoptions, "-verbosity", "2";
        push @newoptions, @useroptions;

        # NOTE(review): the options are still spliced into a code string as
        # double-quoted literals and run through a string eval. Values that
        # contain '"', '$', '@' or '\' are therefore interpreted by Perl
        # rather than passed through literally. This is kept on purpose so
        # that existing configurations keep their current meaning; do not
        # feed untrusted option text through here.
        my $options = join (",", map { "\"$_\"" } @newoptions);

        # create the classify object
        my ($classobj);
        eval ("\$classobj = new \$classname([],[$options])");
        die "$@" if $@;

        $classobj->set_number($classify_number);
        $classify_number ++;

        # add this object to the list
        push (@classify_objects, $classobj);
    }

    # the list always ends with an AllList object
    my $alllist = eval { AllList->new() };
    die "$@" if $@;
    push (@classify_objects, $alllist);

    return \@classify_objects;
}
143
# init_classifiers resets all the classifiers and readies them to process
# the documents.
#
# $classifiers - array ref of classifier objects; init() is called on each
#                one, in list order.
sub init_classifiers {
    my ($classifiers) = @_;

    # Lexical loop variable: nothing is exposed through a package global
    # while the init() methods run.
    foreach my $classifier (@$classifiers) {
        $classifier->init();
    }
}
153
154
155
# get_children($doc_db_hash)
#
# Takes a hash ref holding the metadata of one database entry and extracts
# the children's numbers from its 'contains' value.
# Assumes the value looks like ".1;".2;".3 (a trailing '@' is tolerated).
#
# Returns an array ref of the numbers (as strings, in the order listed), or
# undef when the entry has no 'contains' value. The hash is not modified.
sub get_children {
    my ($doc_db_hash) = @_;

    my $children = undef;

    # Work on a lexical copy; this used to go through a package global.
    my $contains = $doc_db_hash->{'contains'};
    if (defined ($contains)) {
        $contains =~ s/\@$//;     # remove trailing @
        $contains =~ s/^\"\.//;   # remove initial ".
        $children = [ split /\;\"\./, $contains ];
    }

    return $children;
}
174
175
# recurse_sections($doc_obj, $children, $parentoid, $parentsection, $gdbm_recs)
#
# Rebuilds the sub-sections of a document, depth first.
#
# $doc_obj       - document object receiving the sections and metadata
# $children      - array ref of child numbers, or undef (nothing to do)
# $parentoid     - OID of the parent record; child records are looked up in
#                  $gdbm_recs under "<parentoid>.<child>"
# $parentsection - section name of the parent; child sections are named
#                  "<parentsection>.<child>"
# $gdbm_recs     - hash ref of raw database records keyed by OID
#
# Children are visited in ascending numeric order.
sub recurse_sections {
    my ($doc_obj, $children, $parentoid, $parentsection, $gdbm_recs) = @_;

    return unless defined $children;

    my @ordered = sort { $a <=> $b } @$children;
    foreach my $child (@ordered) {
        my $section = "$parentsection.$child";
        my $oid     = "$parentoid.$child";

        $doc_obj->create_named_section($section);

        # parse this child's record, note its own children, then copy its
        # metadata into the section that was just created
        my $doc_db_hash   = db_rec_to_hash($gdbm_recs->{$oid});
        my $grandchildren = &get_children($doc_db_hash);
        add_section_content($doc_obj, $section, $doc_db_hash);

        # descend only when the child has children of its own
        next unless defined $grandchildren;
        recurse_sections($doc_obj, $grandchildren, $oid, $section, $gdbm_recs);
    }
}
200
201
# add_section_content($doc_obj, $cursection, $doc_db_hash)
#
# Copies the metadata held in $doc_db_hash onto section $cursection of
# $doc_obj by calling add_metadata once per value.
#
# A hash value may hold several values joined with '@'; each part is added
# separately. Keys whose name contains one of the structural field names
# thistype, childtype, contains, docnum, doctype or classifytype (matched
# case-insensitively) are not copied.
sub add_section_content {
    my ($doc_obj, $cursection, $doc_db_hash) = @_;

    foreach my $key (keys %$doc_db_hash) {
        # don't need to store these metadata
        # (plain '|' alternation: with the bars backslash-escaped the pattern
        # only matched one long literal string and filtered nothing)
        next if $key =~ /(thistype|childtype|contains|docnum|doctype|classifytype)/i;

        # but do want things like hastxt and archivedir
        foreach my $item (split /@/, $doc_db_hash->{$key}) {
            $doc_obj->add_metadata ($cursection, $key, $item);
        }
    }
}
214
215
# db_rec_to_hash($gdb_str_ref)
#
# Gets all the metadata from one database entry and puts it into a hash ref.
#
# $gdb_str_ref - the entry as text (despite the name it is used as a plain
#                string), one "<key>value" pair per line.
#
# Returns a hash ref mapping each key to its value. When a key occurs more
# than once its values are joined with '@' in order of appearance. Lines that
# are not of the form "<key>value" are ignored, and an undefined entry gives
# an empty hash.
sub db_rec_to_hash {
    my ($gdb_str_ref) = @_;

    my $hashref = {};
    return $hashref unless defined $gdb_str_ref;

    foreach my $entry (split(/\n/, $gdb_str_ref)) {
        # The key is everything between '<' and the first '>', the value is
        # the rest of the line. Both parts need their quantifiers: without
        # them only one-character keys and values could ever match.
        my ($key, $value) = ($entry =~ /^<([^>]*)>(.*?)$/ );
        next unless defined $key;

        $hashref->{$key} .= '@' if defined $hashref->{$key};
        $hashref->{$key} .= $value;
    }

    return $hashref;
}
233
234
# reconstruct_doc_objs_metadata($fulldbname)
#
# Rebuilds document objects from the metadata stored in the database
# $fulldbname and returns them as an array ref.
#
# A record counts as a top-level document when its 'doctype' is "doc" and
# its key contains no '.'. One doc object is created per such record, given
# the record key as OID, filled with the record's metadata, and then given
# its sub-sections via recurse_sections. The objects are returned in
# ascending numeric order of the records' 'docnum' values.
sub reconstruct_doc_objs_metadata
{
    my ($fulldbname) = @_;

    # All records are fetched up front through unbuildutil::read_gdbm into a
    # plain hash (presumably keyed by OID, which is how it is used below).
    my %gdbm_recs;
    &unbuildutil::read_gdbm($fulldbname, \%gdbm_recs);

    # dig out top level doc sections
    my (%top_sections, %top_docnums);
    foreach my $key (keys %gdbm_recs) {
        my $md_hash = db_rec_to_hash($gdbm_recs{$key});

        my $doctype = $md_hash->{'doctype'};
        next unless defined $doctype;
        next unless $doctype eq "doc";
        next if ($key =~ m/\./);

        $top_sections{$key} = $md_hash;
        $top_docnums{$key}  = $md_hash->{'docnum'};
    }

    # we need to make sure the documents were processed in the same order as
    # before, so sort based on their docnums
    my @oids_by_docnum = sort { $top_docnums{$a} <=> $top_docnums{$b} } keys %top_sections;

    # form greenstone document objects based on metadata in the database
    my @all_docs;
    foreach my $oid (@oids_by_docnum) {
        my $doc_db_hash = $top_sections{$oid};

        my $doc_obj = doc->new();
        $doc_obj->set_OID($oid);

        my $top = $doc_obj->get_top_section();
        add_section_content ($doc_obj, $top, $doc_db_hash);

        my $children = &get_children($doc_db_hash);
        recurse_sections($doc_obj, $children, $oid, $top, \%gdbm_recs);

        push (@all_docs, $doc_obj);
    }

    return \@all_docs;
}
282
283
284
285
286
# classify_doc lets each of the classifiers classify a document
#
# $classifiers - array ref of classifier objects
# $doc_obj     - the document, handed to every classifier's classify()
#                method in list order
sub classify_doc {
    my ($classifiers, $doc_obj) = @_;

    # Only classify() is called on each classifier; the unused look-up of
    # its 'title' entry (which required a hash-based object) is gone.
    foreach my $classifier (@$classifiers) {
        $classifier->classify($doc_obj);
    }
}
296
# output_classify_info outputs all the info needed for the classification
# to the database text stream.
#
# $classifiers                  - array ref of classifier objects
# $handle                       - output handle, passed on to
#                                 print_classify_info
# $remove_empty_classifications - passed on to print_classify_info
# $gli                          - passed to each classifier's
#                                 get_classify_info(); defaults to 0
#
# Every classifier's info is collected under one top-level node whose
# classifyOID is 'browse', and that node is then printed. An info without a
# classifyOID is given "CL<n>", with <n> taken from the package counter
# $next_classify_num. The counter advances once per classifier whether or not
# it was used, so an info that brings its own classifyOID still uses up a
# number.
sub output_classify_info {
    my ($classifiers, $handle, $remove_empty_classifications, $gli) = @_;
    our $next_classify_num;

    $gli = 0 unless defined $gli;

    # create a classification containing all the info
    my $classifyinfo = {'classifyOID'=>'browse',
                        'contains'=>[]};

    # get each of the classifications
    foreach my $classifier (@$classifiers) {
        my $tempinfo = $classifier->get_classify_info($gli);

        $tempinfo->{'classifyOID'} = "CL$next_classify_num" unless defined($tempinfo->{'classifyOID'});
        $next_classify_num++;

        print STDERR "*** outputting information for classifier: $tempinfo->{'classifyOID'}\n";

        push (@{$classifyinfo->{'contains'}}, $tempinfo);
    }

    &print_classify_info ($handle, $classifyinfo, "", $remove_empty_classifications);
}
324
# _record_clid($member_oid, $parent_oid)
#
# Extra bookkeeping for incremental building: remembers that the node
# $parent_oid lists $member_oid, by appending $parent_oid to the array kept
# for $member_oid in the package-level $oid_to_clids hash (the array is
# created on first use).
sub _record_clid {
    my ($member_oid, $parent_oid) = @_;
    our $oid_to_clids;

    push(@{$oid_to_clids->{$member_oid}}, $parent_oid);
    return;
}

# print_classify_info($handle, $classifyinfo, $OID, $remove_empty_classifications)
#
# Writes the database record for one classification node to $handle,
# after first writing the records of all classification nodes below it.
#
# $handle       - output handle the record text is printed to
# $classifyinfo - hash ref describing the node; its 'contains' list holds
#                 child nodes and documents (undef entries are skipped)
# $OID          - OID to use for this node unless the node carries its own
#                 'classifyOID'; leading dots are stripped
# $remove_empty_classifications - when true, nodes for which check_contents
#                 reports no documents are not printed
#
# Nothing is printed for an entry that has an 'OID' (a document), nor for an
# empty node when removal is requested.
#
# Children are handled according to what they carry:
#   classifyOID - listed in <contains> under that id (except "oai", which is
#                 left out of <contains>) and printed recursively
#   OID         - a document: listed in <contains>, its 'offset' (if any)
#                 added to <mdoffset>; not printed here
#   neither     - an unnamed sub-node: given the id "<OID>.<n>", listed as
#                 ".<n> (unless its Title is "Collage") and printed
#                 recursively
# Every child is also recorded in the reverse lookup through _record_clid.
sub print_classify_info {
    my ($handle, $classifyinfo, $OID, $remove_empty_classifications) = @_;

    $OID =~ s/^\.+//; # just for good luck

    # book information is printed elsewhere
    return if (defined ($classifyinfo->{'OID'}));

    # don't want empty classifications
    # (check_contents is always called: it also stores 'numleafdocs' on the
    # node, which is written out below)
    return if (&check_contents ($classifyinfo, $remove_empty_classifications) == 0 && $remove_empty_classifications);

    $OID = $classifyinfo->{'classifyOID'} if defined ($classifyinfo->{'classifyOID'});

    my $outputtext = "[$OID]\n";
    $outputtext .= "<doctype>classify\n";
    $outputtext .= "<hastxt>0\n";

    # optional fields, written in this fixed order when present
    foreach my $field (qw(childtype Title numleafdocs thistype parameters supportsmemberof)) {
        $outputtext .= "<$field>$classifyinfo->{$field}\n"
            if defined $classifyinfo->{$field};
    }

    my $contains_text = "<contains>";
    my $mdoffset_text = "<mdoffset>";

    my $next_subOID = 1;
    my $first = 1;
    foreach my $tempinfo (@{$classifyinfo->{'contains'}}) {
        # empty contents were made undefined by check_contents()
        next unless defined $tempinfo;

        my $child_clid = $tempinfo->{'classifyOID'};
        my $is_oai = (defined ($child_clid) && $child_clid eq "oai");

        # separators: an "oai" child adds nothing to <contains>, but every
        # child after the first adds a slot to <mdoffset>
        $contains_text .= ";" unless ($first || $is_oai);
        $mdoffset_text .= ";" unless $first;
        $first = 0;

        if (defined ($child_clid)) {
            $contains_text .= $child_clid unless $is_oai;

            _record_clid($child_clid, $OID);

            &print_classify_info ($handle, $tempinfo, $child_clid,
                                  $remove_empty_classifications);
        } elsif (defined ($tempinfo->{'OID'})) {
            $contains_text .= $tempinfo->{'OID'};
            $mdoffset_text .= $tempinfo->{'offset'} if (defined ($tempinfo->{'offset'}));

            # note: we don't want to print the contents of the books
            _record_clid($tempinfo->{'OID'}, $OID);
        } else {
            my $sub_oid = "$OID.$next_subOID";

            # Suppress having top-level node in Collage classifier
            # so no bookshelf icon appears, top-level, along with the
            # applet
            if (!defined ($tempinfo->{'Title'}) || $tempinfo->{'Title'} ne "Collage") {
                $contains_text .= "\".$next_subOID";
            }

            _record_clid($sub_oid, $OID);

            &print_classify_info ($handle, $tempinfo, $sub_oid,
                                  $remove_empty_classifications);
            $next_subOID++;
        }
    }

    $outputtext .= "$contains_text\n";
    $outputtext .= "<mdtype>$classifyinfo->{'mdtype'}\n"
        if defined $classifyinfo->{'mdtype'};

    # leave <mdoffset> out when it consists of separators only
    $outputtext .= "$mdoffset_text\n"
        if ($mdoffset_text !~ m/^<mdoffset>;+$/);

    $outputtext .= '-' x 70 . "\n";

    print $handle $outputtext;
}
450
# check_contents($classifyinfo, $remove_empty_classifications)
#
# Counts the documents (entries carrying an 'OID') found at or below the
# classification node $classifyinfo and returns that number.
#
# The count is stored on the node as 'numleafdocs'; a node that already has
# 'numleafdocs' is not walked again and the stored value is returned.
# When $remove_empty_classifications is true, every child node without any
# document below it is replaced by undef in its parent's 'contains' list.
sub check_contents {
    my ($classifyinfo,$remove_empty_classifications) = @_;
    $remove_empty_classifications = 0 unless ($remove_empty_classifications);

    return $classifyinfo->{'numleafdocs'} if (defined $classifyinfo->{'numleafdocs'});

    my $num_leaf_docs = 0;

    # $content aliases the list element, so assigning undef to it below
    # removes the entry from the parent's list in place.
    foreach my $content (@{$classifyinfo->{'contains'}}) {
        # already removed, nothing to count
        next unless defined $content;

        if (defined $content->{'OID'}) {
            # found a book
            $num_leaf_docs ++;
            next;
        }

        my $sub_num_leaf_docs = &check_contents ($content,$remove_empty_classifications);
        if ($sub_num_leaf_docs > 0) {
            # there's a book somewhere below
            $num_leaf_docs += $sub_num_leaf_docs;
        } elsif ($remove_empty_classifications) {
            # section contains no books so we want to remove
            # it from its parents contents
            $content = undef;
        }
    }

    $classifyinfo->{'numleafdocs'} = $num_leaf_docs;
    return $num_leaf_docs;
}
478
479	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: