Context Navigation

source: trunk/gsdl/perllib/classify.pm@ 13418

Last change on this file since 13418 was 13068, checked in by kjdon, 18 years ago
when unbuilding and rebuilding the gdbm database for incremental building, we need to make sure the old documents are processed in the same order as they were originally added, otherwise the docnums don't match up with the index or text
Property svn:keywords set to `Author Date Id Revision`
File size: 14.3 KB

Line
1	###########################################################################
2	#
3	# classify.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# functions to handle classifiers
27
28	package classify;
29
30	require util;
31	require AllList;
32	use gsprintf;
33
34	#use GDBM_File;
35	use unbuildutil;
36
37
38	sub gsprintf
39	{
40	return &gsprintf::gsprintf(@_);
41	}
42
43
44	$next_classify_num = 1;
45	$oid_to_clids = {};
46
47	sub load_classifier_for_info {
48	my ($classifier) = shift @_;
49
50	# find the classifier
51	my $colclassname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
52	"perllib/classify",
53	"${classifier}.pm");
54	my $mainclassname = &util::filename_cat($ENV{'GSDLHOME'},
55	"perllib/classify",
56	"${classifier}.pm");
57
58	if (-e $colclassname) { require $colclassname; }
59	elsif (-e $mainclassname) { require $mainclassname; }
60	else {
61	&gsprintf(STDERR, "{classify.could_not_find_classifier}\n", $classifier) && die "\n";
62	}
63	my ($classobj);
64	my $options = "-gsdlinfo";
65	eval ("\$classobj = new \$classifier([],[$options])");
66	die "$@" if $@;
67
68	return $classobj;
69	}
70
71	sub load_classifiers {
72	my ($classify_list, $build_dir, $outhandle) = @_;
73	my @classify_objects = ();
74	my $classify_number = 1;
75
76	foreach $classifyoption (@$classify_list) {
77
78	# get the classifier name
79	my $classname = shift @$classifyoption;
80	next unless defined $classname;
81
82	# find the classifier
83	my $colclassname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"perllib/classify",
84	"${classname}.pm");
85	my $mainclassname = &util::filename_cat($ENV{'GSDLHOME'},"perllib/classify",
86	"${classname}.pm");
87
88	if (-e $colclassname) { require $colclassname; }
89	elsif (-e $mainclassname) { require $mainclassname; }
90	else { &gsprintf(STDERR, "{classify.could_not_find_classifier}\n", $classname) && die "\n";
91	# die "ERROR - couldn't find classifier \"$classname\"\n";
92	}
93
94	# create the classify object
95	my ($classobj);
96
97	my @newoptions;
98
99	# do these first so they can be overriden by user supplied options
100	push @newoptions, "-builddir", "$build_dir" if ($build_dir);
101	push @newoptions, "-outhandle", "$outhandle" if ($outhandle);
102	push @newoptions, "-verbosity", "2";
103
104	# backwards compatability hack: if the classifier options are
105	# in "x=y" format, convert them to parsearg ("-x y") format.
106	my ($opt, $key, $value);
107	foreach $opt (@$classifyoption) {
108	# if ($opt =~ /^(\w+)=(.*)$/) {
109	# push @newoptions, "-$1", $2;
110	# } else {
111	push @newoptions, $opt;
112	#}
113	}
114
115	map { $_ = "\"$_\""; } @newoptions;
116	my $options .= join (",", @newoptions);
117
118
119	eval ("\$classobj = new \$classname([],[$options])");
120	die "$@" if $@;
121
122	$classobj->set_number($classify_number);
123	$classify_number ++;
124
125	# add this object to the list
126	push (@classify_objects, $classobj);
127	}
128
129	my ($classobj);
130	eval ("\$classobj = new AllList()");
131	die "$@" if $@;
132	push (@classify_objects, $classobj);
133
134	return \@classify_objects;
135	}
136
137	# init_classifiers resets all the classifiers and readys them to process
138	# the documents.
139	sub init_classifiers {
140	my ($classifiers) = @_;
141
142	foreach $classobj (@$classifiers) {
143	$classobj->init();
144	}
145	}
146
147
148
149	# takes a hashref containing the metadata for a gdbmfile entry, and extracts
150	# the childrens numbers (from the 'contains' entry).
151	# assumes format is ".1;".2;".3
152	sub get_children {
153	my ($doc_db_hash) = @_;
154
155	my $children = undef;
156
157	$childs = $doc_db_hash->{'contains'};
158	if (defined ($childs)) {
159	$childs =~ s/\@$//; #remove trailing @
160	$childs =~ s/^\"\.//; #remove initial ".
161	@$children = split /\;\"\./, $childs;
162
163	}
164
165	return $children;
166	}
167
168
169	sub recurse_sections {
170	my ($doc_obj, $children, $parentoid, $parentsection, $gdbm_recs) = @_;
171
172	return if (!defined $children);
173
174	foreach my $child (sort { $a <=> $b} @$children) {
175	$doc_obj->create_named_section("$parentsection.$child");
176	my $doc_db_rec = $gdbm_recs->{"$parentoid.$child"};
177	my $doc_db_hash = db_rec_to_hash($doc_db_rec);
178
179	# get child's children
180	my $newchildren = &get_children($doc_db_hash);
181
182	# add content for current section
183	add_section_content($doc_obj, "$parentsection.$child", $doc_db_hash);
184
185	# process all the children if there are any
186	if (defined ($newchildren))
187	{
188	recurse_sections($doc_obj, $newchildren, "$parentoid.$child",
189	"$parentsection.$child", $gdbm_recs);
190	}
191	}
192	}
193
194
195	sub add_section_content {
196	my ($doc_obj, $cursection, $doc_db_hash) = @_;
197
198	foreach $key (keys %$doc_db_hash) {
199	#don't need to store these metadata
200	next if $key =~ /(thistype\|childtype\|contains\|docnum\|doctype\|classifytype)/i;
201	# but do want things like hastxt and archivedir
202	my @items = split /@/, $doc_db_hash->{$key};
203	map {$doc_obj->add_metadata ($cursection, $key, $_); } @items;
204
205	}
206	}
207
208
209	# gets all the metadata from a gdbm file entry, and puts it into a hashref
210	sub db_rec_to_hash {
211
212	my ($gdb_str_ref) = @_;
213
214	my $hashref = {};
215
216	my @entries = split(/\n/, $gdb_str_ref);
217	foreach $entry (@entries) {
218	my($key, $value) = ($entry =~ /^<([^>])>(.?)$/ );
219	$hashref->{$key} .= '@' if defined $hashref->{$key};
220	$hashref->{$key} .= $value;
221
222	}
223
224	return $hashref;
225	}
226
227
228	sub reconstruct_doc_objs_metadata
229	{
230	my ($fulldbname) = @_;
231
232	# tie %gdbm_recs, 'GDBM_File', $fulldbname, &GDBM_WRCREAT, 0640;
233
234	my %gdbm_recs;
235	&unbuildutil::read_gdbm($fulldbname,\%gdbm_recs);
236
237
238	# dig out top level doc sections
239	my %top_sections = ();
240	my %top_docnums = ();
241	foreach my $key ( keys %gdbm_recs )
242	{
243	my $md_rec = $gdbm_recs{$key};
244	my $md_hash = db_rec_to_hash($md_rec);
245
246	if ((defined $md_hash->{'doctype'}) && ($md_hash->{'doctype'} eq "doc")) {
247	next if ($key =~ m/\./);
248	$top_sections{$key} = $md_hash;
249	$top_docnums{$key} = $md_hash->{'docnum'};
250	}
251	}
252
253	# for greenstone document objects based on metadata in gdbm file
254	my @all_docs = ();
255	# we need to make sure the documents were processed in the same order as
256	# before, so sort based on their docnums
257	foreach my $oid ( sort { $top_docnums{$a} <=> $top_docnums{$b} } keys %top_sections )
258	{
259	my $doc_db_hash = $top_sections{$oid};
260
261	my $doc_obj = new doc();
262	$doc_obj->set_OID($oid);
263	my $top = $doc_obj->get_top_section();
264	add_section_content ($doc_obj, $top, $doc_db_hash);
265	my $children = &get_children($doc_db_hash);
266	recurse_sections($doc_obj, $children, $oid, $top, \%gdbm_recs);
267
268	push(@all_docs,$doc_obj);
269	}
270
271	# untie %gdbm_recs;
272
273	return \@all_docs;
274	}
275
276
277
278
279
280	# classify_doc lets each of the classifiers classify a document
281	sub classify_doc {
282	my ($classifiers, $doc_obj) = @_;
283
284	foreach $classobj (@$classifiers) {
285	my $title = $classobj->{'title'};
286	$classobj->classify($doc_obj);
287	}
288	}
289
290	# output_classify_info outputs all the info needed for the classification
291	# to the gdbm
292	sub output_classify_info {
293	my ($classifiers, $handle, $remove_empty_classifications, $gli) = @_;
294	# $handle = "main::STDOUT";
295
296	$gli = 0 unless defined $gli;
297
298	# create a classification containing all the info
299	my $classifyinfo = {'classifyOID'=>'browse',
300	'contains'=>[]};
301
302	# get each of the classifications
303	foreach $classobj (@$classifiers) {
304	my $tempinfo = $classobj->get_classify_info($gli);
305	my $classID = $tempinfo->{'classifyOID'};
306
307	$tempinfo->{'classifyOID'} = "CL$next_classify_num" unless defined($tempinfo->{'classifyOID'});
308	$next_classify_num++;
309
310	print STDERR "*** outputting information for classifier: $tempinfo->{'classifyOID'}\n";
311
312	push (@{$classifyinfo->{'contains'}}, $tempinfo);
313	}
314
315	&print_classify_info ($handle, $classifyinfo, "", $remove_empty_classifications);
316	}
317
318	sub print_classify_info {
319	my ($handle, $classifyinfo, $OID, $remove_empty_classifications) = @_;
320
321	$OID =~ s/^\.+//; # just for good luck
322
323	# book information is printed elsewhere
324	return if (defined ($classifyinfo->{'OID'}));
325
326	# don't want empty classifications
327	return if (&check_contents ($classifyinfo, $remove_empty_classifications) == 0 && $remove_empty_classifications);
328
329	$OID = $classifyinfo->{'classifyOID'} if defined ($classifyinfo->{'classifyOID'});
330
331	my $outputtext = "[$OID]\n";
332	$outputtext .= "<doctype>classify\n";
333	$outputtext .= "<hastxt>0\n";
334	$outputtext .= "<childtype>$classifyinfo->{'childtype'}\n"
335	if defined $classifyinfo->{'childtype'};
336	$outputtext .= "<Title>$classifyinfo->{'Title'}\n"
337	if defined $classifyinfo->{'Title'};
338	$outputtext .= "<numleafdocs>$classifyinfo->{'numleafdocs'}\n"
339	if defined $classifyinfo->{'numleafdocs'};
340	$outputtext .= "<thistype>$classifyinfo->{'thistype'}\n"
341	if defined $classifyinfo->{'thistype'};
342	$outputtext .= "<parameters>$classifyinfo->{'parameters'}\n"
343	if defined $classifyinfo->{'parameters'};
344	$outputtext .= "<supportsmemberof>$classifyinfo->{'supportsmemberof'}\n"
345	if defined $classifyinfo->{'supportsmemberof'};
346
347	my $contains_text = "<contains>";
348	my $mdoffset_text = "<mdoffset>";
349
350	my $next_subOID = 1;
351	my $first = 1;
352	foreach $tempinfo (@{$classifyinfo->{'contains'}}) {
353	# empty contents were made undefined by clean_contents()
354	next unless defined $tempinfo;
355
356	if (!defined ($tempinfo->{'classifyOID'}) \|\|
357	$tempinfo->{'classifyOID'} ne "oai") {
358	$contains_text .= ";" unless $first;
359	}
360	$mdoffset_text .= ";" unless $first;
361	$first = 0;
362
363	if (defined ($tempinfo->{'classifyOID'})) {
364	if ($tempinfo->{'classifyOID'} ne "oai") {
365	$contains_text .= $tempinfo->{'classifyOID'};
366	}
367
368	# Extra code for incremental building.
369	# We need to store a listing of the classifiers each DOI is in
370	my $clids = [];
371	#rint STDERR "==1. Recording reverse lookup for " . $tempinfo->{'classifyOID'} . "==\n";
372	if(defined($oid_to_clids->{$tempinfo->{'classifyOID'}})) {
373	#rint STDERR "Found existing array!\n";
374	$clids = $oid_to_clids->{$tempinfo->{'classifyOID'}};
375	}
376	#rint STDERR "Appended $OID to \"" . join(";", @{$clids}) . "\"\n";
377	push(@{$clids}, $OID);
378	$oid_to_clids->{$tempinfo->{'classifyOID'}} = $clids;
379	#rint STDERR "Result: \"" . join(";", @{$clids}) . "\"\n";
380
381	&print_classify_info ($handle, $tempinfo, $tempinfo->{'classifyOID'},
382	$remove_empty_classifications);
383	} elsif (defined ($tempinfo->{'OID'})) {
384	$contains_text .= $tempinfo->{'OID'};
385	$mdoffset_text .= $tempinfo->{'offset'} if (defined ($tempinfo->{'offset'}));
386
387
388	# note: we don't want to print the contents of the books
389	# Extra code for incremental building.
390	# We need to store a listing of the classifiers each DOI is in
391	my $clids = [];
392	#rint STDERR "==2. Recording reverse lookup for " . $tempinfo->{'OID'} . "==\n";
393	if(defined($oid_to_clids->{$tempinfo->{'OID'}})) {
394	#rint STDERR "Found existing array!\n";
395	$clids = $oid_to_clids->{$tempinfo->{'OID'}};
396	}
397	#rint STDERR "Appended $OID to \"" . join(";", @{$clids}) . "\"\n";
398	push(@{$clids}, $OID);
399	$oid_to_clids->{$tempinfo->{'OID'}} = $clids;
400	#rint STDERR "Result: \"" . join(";", @{$clids}) . "\"\n";
401
402
403	} else {
404
405	# Supress having top-level node in Collage classifier
406	# so no bookshelf icon appears, top-level, along with the
407	# applet
408
409	if (!defined ($tempinfo->{'Title'}) \|\| $tempinfo->{'Title'} ne "Collage") {
410	$contains_text .= "\".$next_subOID";
411	}
412
413	# Extra code for incremental building.
414	# We need to store a listing of the classifiers each DOI is in
415	my $clids = [];
416	#rint STDERR "==3. Recording reverse lookup for $OID.$next_subOID==\n";
417	if(defined($oid_to_clids->{$OID . "." . $next_subOID})) {
418	#rint STDERR "Found existing array!\n";
419	$clids = $oid_to_clids->{$OID . "." . $next_subOID};
420	}
421	#rint STDERR "Appended $OID to \"" . join(";", @{$clids}) . "\"\n";
422	push(@{$clids}, $OID);
423	$oid_to_clids->{$OID . "." . $next_subOID} = $clids;
424	#rint STDERR "Result: \"" . join(";", @{$clids}) . "\"\n";
425
426	&print_classify_info ($handle, $tempinfo, "$OID.$next_subOID",
427	$remove_empty_classifications);
428	$next_subOID++;
429	}
430	}
431
432	$outputtext .= "$contains_text\n";
433	$outputtext .= "<mdtype>$classifyinfo->{'mdtype'}\n"
434	if defined $classifyinfo->{'mdtype'};
435	$outputtext .= "$mdoffset_text\n"
436	if ($mdoffset_text !~ m/^<mdoffset>;+$/);
437
438	$outputtext .= '-' x 70 . "\n";
439
440	print $handle $outputtext;
441
442	}
443
444	sub check_contents {
445	my ($classifyinfo,$remove_empty_classifications) = @_;
446	$remove_empty_classifications = 0 unless ($remove_empty_classifications);
447	my $num_leaf_docs = 0;
448	my $sub_num_leaf_docs = 0;
449
450	return $classifyinfo->{'numleafdocs'} if (defined $classifyinfo->{'numleafdocs'});
451
452	foreach $content (@{$classifyinfo->{'contains'}}) {
453	if (defined $content->{'OID'}) {
454	# found a book
455	$num_leaf_docs ++;
456	} elsif (($sub_num_leaf_docs = &check_contents ($content,$remove_empty_classifications)) > 0) {
457	# there's a book somewhere below
458	$num_leaf_docs += $sub_num_leaf_docs;
459	} else {
460	if ($remove_empty_classifications){
461	# section contains no books so we want to remove
462	# it from its parents contents
463	$content = undef;
464	}
465	}
466	}
467
468	$classifyinfo->{'numleafdocs'} = $num_leaf_docs;
469	return $num_leaf_docs;
470	}
471
472	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: