Context Navigation

source: gsdl/trunk/perllib/classify.pm@ 20575

Last change on this file since 20575 was 20575, checked in by davidb, 15 years ago
Opening of txt2db moved to earlier in the buildcol process. This was done to avoid a huge memory spike that occurred with incremental building. Previously we recoconstructed all the documents from the GDBM database. Then the code added, edited, removed documents as required (i.e. the incremental bit), then it wrote it all out to GDBM. The problem was that the reconstructed phase could grow quite large -- an example PagedImage collection of 100000 documents took 2.4 GB when read in. When it got to the stage of opening a pipe to the datbase with open('\|txt2db'), the fork() call that occurs inside this function requires the system to (briefly) have two 2.4 GB processes, before quickly replacing the child process with the much smalled 'txt2db' process. It is at the point of the duplication of the two processes that can cause a computer to run out of memory. In the PagedImage example, the machine had 2 GB of main memory and 2 GB of swap. Therefore there was no way it could sustain two 2.4 GB processes.\n Long explanation. The good news is shifting the open() to be before the documents are reconstructed solves the problem.
Property svn:keywords set to `Author Date Id Revision`
File size: 13.0 KB

Line
1	###########################################################################
2	#
3	# classify.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# functions to handle classifiers
27
28	package classify;
29
30	require util;
31	require AllList;
32
33	use dbutil;
34	use gsprintf;
35	use strict; no strict 'subs';
36
37
38	sub gsprintf
39	{
40	return &gsprintf::gsprintf(@_);
41	}
42
43
44	sub load_classifier_for_info {
45	my ($classifier) = shift @_;
46
47	# find the classifier
48	my $customclassname;
49	if (defined($ENV{'GSDLCOLLECTION'}))
50	{
51	$customclassname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "custom", $ENV{'GSDLCOLLECTION'},
52	"perllib", "classify", "${classifier}.pm");
53	}
54	my $colclassname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "perllib", "classify", "${classifier}.pm");
55	my $mainclassname = &util::filename_cat($ENV{'GSDLHOME'}, "perllib", "classify", "${classifier}.pm");
56
57	if (defined($customclassname) && -e $customclassname) { require $customclassname; }
58	elsif (-e $colclassname) { require $colclassname; }
59	elsif (-e $mainclassname) { require $mainclassname; }
60	else {
61	&gsprintf(STDERR, "{classify.could_not_find_classifier}\n", $classifier) && die "\n";
62	}
63	my ($classobj);
64	my $options = "-gsdlinfo";
65	eval ("\$classobj = new \$classifier([],[$options])");
66	die "$@" if $@;
67
68	return $classobj;
69	}
70
71	sub load_classifiers {
72	my ($classify_list, $build_dir, $outhandle) = @_;
73	my @classify_objects = ();
74	my $classify_number = 1;
75
76	my $colclassdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"perllib/classify");
77	unshift (@INC, $colclassdir);
78
79	foreach my $classifyoption (@$classify_list) {
80
81	# get the classifier name
82	my $classname = shift @$classifyoption;
83	next unless defined $classname;
84
85	# find the classifier
86	my $customclassname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "custom", $ENV{'GSDLCOLLECTION'},
87	"perllib", "classify", "${classname}.pm");
88	my $colclassname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "perllib", "classify", "${classname}.pm");
89	my $mainclassname = &util::filename_cat($ENV{'GSDLHOME'}, "perllib", "classify", "${classname}.pm");
90
91	if (-e $customclassname) { require $customclassname; }
92	elsif (-e $colclassname) { require $colclassname; }
93	elsif (-e $mainclassname) { require $mainclassname; }
94	else { &gsprintf(STDERR, "{classify.could_not_find_classifier}\n", $classname) && die "\n";
95	# die "ERROR - couldn't find classifier \"$classname\"\n";
96	}
97
98	# create the classify object
99	my ($classobj);
100
101	my @newoptions;
102
103	# do these first so they can be overriden by user supplied options
104	push @newoptions, "-builddir", "$build_dir" if ($build_dir);
105	push @newoptions, "-outhandle", "$outhandle" if ($outhandle);
106	push @newoptions, "-verbosity", "2";
107
108	# backwards compatability hack: if the classifier options are
109	# in "x=y" format, convert them to parsearg ("-x y") format.
110	my ($opt, $key, $value);
111	foreach $opt (@$classifyoption) {
112	# if ($opt =~ /^(\w+)=(.*)$/) {
113	# push @newoptions, "-$1", $2;
114	# } else {
115	push @newoptions, $opt;
116	#}
117	}
118
119	map { $_ = "\"$_\""; } @newoptions;
120	my $options .= join (",", @newoptions);
121
122
123	eval ("\$classobj = new \$classname([],[$options])");
124	die "$@" if $@;
125
126	$classobj->set_number($classify_number);
127	$classify_number ++;
128
129	# add this object to the list
130	push (@classify_objects, $classobj);
131	}
132
133	my ($classobj);
134	eval ("\$classobj = new AllList()");
135	die "$@" if $@;
136	push (@classify_objects, $classobj);
137
138	return \@classify_objects;
139	}
140
141	# init_classifiers resets all the classifiers and readys them to process
142	# the documents.
143	sub init_classifiers {
144	my ($classifiers) = @_;
145
146	foreach my $classobj (@$classifiers) {
147	$classobj->init();
148	}
149	}
150
151
152
153	# takes a hashref containing the metadata for an infodb entry, and extracts
154	# the childrens numbers (from the 'contains' entry).
155	# assumes format is ".1;".2;".3
156	sub get_children {
157	my ($doc_db_hash) = @_;
158
159	my $children = undef;
160
161	my $contains = $doc_db_hash->{'contains'};
162	if (defined ($contains)) {
163	$contains =~ s/\@$//; #remove trailing @
164	$contains =~ s/^\"\.//; #remove initial ".
165	@$children = split /\;\"\./, $contains;
166	}
167
168	return $children;
169	}
170
171
172	sub recurse_sections {
173	my ($doc_obj, $children, $parentoid, $parentsection, $database_recs) = @_;
174
175	return if (!defined $children);
176
177	foreach my $child (sort { $a <=> $b} @$children) {
178	$doc_obj->create_named_section("$parentsection.$child");
179	my $doc_db_rec = $database_recs->{"$parentoid.$child"};
180	my $doc_db_hash = db_rec_to_hash($doc_db_rec);
181
182	# get child's children
183	my $newchildren = &get_children($doc_db_hash);
184
185	# add content for current section
186	add_section_content($doc_obj, "$parentsection.$child", $doc_db_hash);
187
188	# process all the children if there are any
189	if (defined ($newchildren))
190	{
191	recurse_sections($doc_obj, $newchildren, "$parentoid.$child",
192	"$parentsection.$child", $database_recs);
193	}
194	}
195	}
196
197
198	sub add_section_content {
199	my ($doc_obj, $cursection, $doc_db_hash) = @_;
200
201	foreach my $key (keys %$doc_db_hash) {
202	#don't need to store these metadata
203	next if $key =~ /(thistype\|childtype\|contains\|docnum\|doctype\|classifytype)/i;
204	# but do want things like hastxt and archivedir
205	my @items = split /@/, $doc_db_hash->{$key};
206	# metadata is all from gdbm so should already be in utf8
207	map {$doc_obj->add_utf8_metadata ($cursection, $key, $_); } @items;
208
209	}
210	}
211
212
213	# gets all the metadata from an infodb entry, and puts it into a hashref
214	sub db_rec_to_hash {
215
216	my ($gdb_str_ref) = @_;
217
218	my $hashref = {};
219
220	my @entries = split(/\n/, $gdb_str_ref);
221	foreach my $entry (@entries) {
222	my($key, $value) = ($entry =~ /^<([^>])>(.?)$/ );
223	$hashref->{$key} .= '@' if defined $hashref->{$key};
224	$hashref->{$key} .= $value;
225
226	}
227
228	return $hashref;
229	}
230
231
232	sub reconstruct_doc_objs_metadata
233	{
234	my $infodb_type = shift(@_);
235	my $infodb_file_path = shift(@_);
236	my $database_recs = shift(@_);
237
238	# dig out top level doc sections
239	my %top_sections = ();
240	my %top_docnums = ();
241	foreach my $key ( keys %$database_recs )
242	{
243	my $md_rec = $database_recs->{$key};
244	my $md_hash = db_rec_to_hash($md_rec);
245
246	if ((defined $md_hash->{'doctype'}) && ($md_hash->{'doctype'} eq "doc")) {
247	next if ($key =~ m/\./);
248	$top_sections{$key} = $md_hash;
249	$top_docnums{$key} = $md_hash->{'docnum'};
250	}
251	}
252
253	# for greenstone document objects based on metadata in database file
254	my @all_docs = ();
255	# we need to make sure the documents were processed in the same order as
256	# before, so sort based on their docnums
257	foreach my $oid ( sort { $top_docnums{$a} <=> $top_docnums{$b} } keys %top_sections )
258	{
259	my $doc_db_hash = $top_sections{$oid};
260
261	my $doc_obj = new doc();
262	$doc_obj->set_OID($oid);
263	my $top = $doc_obj->get_top_section();
264	add_section_content ($doc_obj, $top, $doc_db_hash);
265	my $children = &get_children($doc_db_hash);
266	recurse_sections($doc_obj, $children, $oid, $top, $database_recs);
267
268	push(@all_docs,$doc_obj);
269	}
270
271	return \@all_docs;
272	}
273
274
275
276
277
278	# classify_doc lets each of the classifiers classify a document
279	sub classify_doc {
280	my ($classifiers, $doc_obj, $edit_mode) = @_;
281
282	foreach my $classobj (@$classifiers) {
283	my $title = $classobj->{'title'};
284
285	$classobj->classify($doc_obj,$edit_mode);
286	}
287	}
288
289
290	our $next_classify_num = 1;
291
292	# output_classify_info outputs all the info needed for the classification
293	# to the database
294	sub output_classify_info
295	{
296	my ($classifiers, $infodb_type, $infodb_handle, $remove_empty_classifications, $gli) = @_;
297
298	$gli = 0 unless defined $gli;
299
300	# create a classification containing all the info
301	my $classifyinfo = { 'classifyOID'=> 'browse',
302	'contains' => [] };
303
304	# get each of the classifications
305	foreach my $classifier (@$classifiers)
306	{
307	my $classifier_info = $classifier->get_classify_info($gli);
308	$classifier_info->{'classifyOID'} = "CL$next_classify_num" unless defined($classifier_info->{'classifyOID'});
309	print STDERR "*** outputting information for classifier: $classifier_info->{'classifyOID'}\n";
310
311	push(@{$classifyinfo->{'contains'}}, $classifier_info);
312	$next_classify_num++;
313	}
314
315	&print_classify_info($infodb_type, $infodb_handle, $classifyinfo, "", $remove_empty_classifications);
316	}
317
318
319	sub print_classify_info
320	{
321	my ($infodb_type, $infodb_handle, $classifyinfo, $OID, $remove_empty_classifications) = @_;
322
323	$OID =~ s/^\.+//; # just for good luck
324
325	# book information is printed elsewhere
326	return if (defined ($classifyinfo->{'OID'}));
327
328	# don't want empty classifications
329	return if (&check_contents ($classifyinfo, $remove_empty_classifications) == 0 && $remove_empty_classifications);
330
331	$OID = $classifyinfo->{'classifyOID'} if defined ($classifyinfo->{'classifyOID'});
332
333	my %classify_infodb = ();
334	$classify_infodb{"doctype"} = [ "classify" ];
335	$classify_infodb{"hastxt"} = [ "0" ];
336	$classify_infodb{"childtype"} = [ $classifyinfo->{'childtype'} ]
337	if defined $classifyinfo->{'childtype'};
338	$classify_infodb{"Title"} = [ $classifyinfo->{'Title'} ]
339	if defined $classifyinfo->{'Title'};
340	$classify_infodb{"numleafdocs"} = [ $classifyinfo->{'numleafdocs'} ]
341	if defined $classifyinfo->{'numleafdocs'};
342	$classify_infodb{"thistype"} = [ $classifyinfo->{'thistype'} ]
343	if defined $classifyinfo->{'thistype'};
344	$classify_infodb{"parameters"} = [ $classifyinfo->{'parameters'} ]
345	if defined $classifyinfo->{'parameters'};
346	$classify_infodb{"supportsmemberof"} = [ $classifyinfo->{'supportsmemberof'} ]
347	if defined $classifyinfo->{'supportsmemberof'};
348
349	my $contains_text = "";
350	my $mdoffset_text = "";
351
352	my $next_subOID = 1;
353	my $first = 1;
354	foreach my $tempinfo (@{$classifyinfo->{'contains'}}) {
355	# empty contents were made undefined by clean_contents()
356	next unless defined $tempinfo;
357
358	if (!defined ($tempinfo->{'classifyOID'}) \|\|
359	$tempinfo->{'classifyOID'} ne "oai") {
360	$contains_text .= ";" unless $first;
361	}
362	$mdoffset_text .= ";" unless $first;
363	$first = 0;
364
365	if (defined ($tempinfo->{'classifyOID'}))
366	{
367	if ($tempinfo->{'classifyOID'} ne "oai")
368	{
369	$contains_text .= $tempinfo->{'classifyOID'};
370	}
371
372	&print_classify_info ($infodb_type, $infodb_handle, $tempinfo, $tempinfo->{'classifyOID'},
373	$remove_empty_classifications);
374	}
375	elsif (defined ($tempinfo->{'OID'}))
376	{
377	$contains_text .= $tempinfo->{'OID'};
378	$mdoffset_text .= $tempinfo->{'offset'} if (defined ($tempinfo->{'offset'}));
379	}
380	else
381	{
382	# Supress having top-level node in Collage classifier
383	# so no bookshelf icon appears, top-level, along with the
384	# applet
385	if (!defined ($tempinfo->{'Title'}) \|\| $tempinfo->{'Title'} ne "Collage")
386	{
387	$contains_text .= "\".$next_subOID";
388	}
389
390	&print_classify_info ($infodb_type, $infodb_handle, $tempinfo, "$OID.$next_subOID",
391	$remove_empty_classifications);
392	$next_subOID++;
393	}
394	}
395
396	$classify_infodb{"contains"} = [ $contains_text ];
397	$classify_infodb{"mdtype"} = [ $classifyinfo->{'mdtype'} ]
398	if defined $classifyinfo->{'mdtype'};
399	$classify_infodb{"mdoffset"} = [ $mdoffset_text ]
400	if ($mdoffset_text !~ m/^;+$/);
401
402	&dbutil::write_infodb_entry($infodb_type, $infodb_handle, $OID, \%classify_infodb);
403	}
404
405
406	sub check_contents {
407	my ($classifyinfo,$remove_empty_classifications) = @_;
408	$remove_empty_classifications = 0 unless ($remove_empty_classifications);
409	my $num_leaf_docs = 0;
410	my $sub_num_leaf_docs = 0;
411
412	return $classifyinfo->{'numleafdocs'} if (defined $classifyinfo->{'numleafdocs'});
413
414	foreach my $content (@{$classifyinfo->{'contains'}}) {
415	if (defined $content->{'OID'}) {
416	# found a book
417	$num_leaf_docs ++;
418	} elsif (($sub_num_leaf_docs = &check_contents ($content,$remove_empty_classifications)) > 0) {
419	# there's a book somewhere below
420	$num_leaf_docs += $sub_num_leaf_docs;
421	} else {
422	if ($remove_empty_classifications){
423	# section contains no books so we want to remove
424	# it from its parents contents
425	$content = undef;
426	}
427	}
428	}
429
430	$classifyinfo->{'numleafdocs'} = $num_leaf_docs;
431	return $num_leaf_docs;
432	}
433
434	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: