source: gsdl/trunk/perllib/classify.pm@ 20575

Last change on this file since 20575 was 20575, checked in by davidb, 15 years ago

Opening of txt2db moved to earlier in the buildcol process. This was done to avoid a huge memory spike that occurred with incremental building. Previously we recoconstructed all the documents from the GDBM database. Then the code added, edited, removed documents as required (i.e. the incremental bit), then it wrote it all out to GDBM. The problem was that the reconstructed phase could grow quite large -- an example PagedImage collection of 100000 documents took 2.4 GB when read in. When it got to the stage of opening a pipe to the datbase with open('|txt2db'), the fork() call that occurs inside this function requires the system to (briefly) have *two* 2.4 GB processes, before quickly replacing the child process with the much smalled 'txt2db' process. It is at the point of the duplication of the two processes that can cause a computer to run out of memory. In the PagedImage example, the machine had 2 GB of main memory and 2 GB of swap. Therefore there was no way it could sustain two 2.4 GB processes.\n Long explanation. The good news is shifting the open() to be before the documents are reconstructed solves the problem.

  • Property svn:keywords set to Author Date Id Revision
File size: 13.0 KB
Line 
1###########################################################################
2#
3# classify.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# functions to handle classifiers
27
28package classify;
29
30require util;
31require AllList;
32
33use dbutil;
34use gsprintf;
35use strict; no strict 'subs';
36
37
38sub gsprintf
39{
40 return &gsprintf::gsprintf(@_);
41}
42
43
44sub load_classifier_for_info {
45 my ($classifier) = shift @_;
46
47 # find the classifier
48 my $customclassname;
49 if (defined($ENV{'GSDLCOLLECTION'}))
50 {
51 $customclassname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "custom", $ENV{'GSDLCOLLECTION'},
52 "perllib", "classify", "${classifier}.pm");
53 }
54 my $colclassname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "perllib", "classify", "${classifier}.pm");
55 my $mainclassname = &util::filename_cat($ENV{'GSDLHOME'}, "perllib", "classify", "${classifier}.pm");
56
57 if (defined($customclassname) && -e $customclassname) { require $customclassname; }
58 elsif (-e $colclassname) { require $colclassname; }
59 elsif (-e $mainclassname) { require $mainclassname; }
60 else {
61 &gsprintf(STDERR, "{classify.could_not_find_classifier}\n", $classifier) && die "\n";
62 }
63 my ($classobj);
64 my $options = "-gsdlinfo";
65 eval ("\$classobj = new \$classifier([],[$options])");
66 die "$@" if $@;
67
68 return $classobj;
69}
70
71sub load_classifiers {
72 my ($classify_list, $build_dir, $outhandle) = @_;
73 my @classify_objects = ();
74 my $classify_number = 1;
75
76 my $colclassdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"perllib/classify");
77 unshift (@INC, $colclassdir);
78
79 foreach my $classifyoption (@$classify_list) {
80
81 # get the classifier name
82 my $classname = shift @$classifyoption;
83 next unless defined $classname;
84
85 # find the classifier
86 my $customclassname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "custom", $ENV{'GSDLCOLLECTION'},
87 "perllib", "classify", "${classname}.pm");
88 my $colclassname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "perllib", "classify", "${classname}.pm");
89 my $mainclassname = &util::filename_cat($ENV{'GSDLHOME'}, "perllib", "classify", "${classname}.pm");
90
91 if (-e $customclassname) { require $customclassname; }
92 elsif (-e $colclassname) { require $colclassname; }
93 elsif (-e $mainclassname) { require $mainclassname; }
94 else { &gsprintf(STDERR, "{classify.could_not_find_classifier}\n", $classname) && die "\n";
95 # die "ERROR - couldn't find classifier \"$classname\"\n";
96 }
97
98 # create the classify object
99 my ($classobj);
100
101 my @newoptions;
102
103 # do these first so they can be overriden by user supplied options
104 push @newoptions, "-builddir", "$build_dir" if ($build_dir);
105 push @newoptions, "-outhandle", "$outhandle" if ($outhandle);
106 push @newoptions, "-verbosity", "2";
107
108 # backwards compatability hack: if the classifier options are
109 # in "x=y" format, convert them to parsearg ("-x y") format.
110 my ($opt, $key, $value);
111 foreach $opt (@$classifyoption) {
112 # if ($opt =~ /^(\w+)=(.*)$/) {
113 # push @newoptions, "-$1", $2;
114 # } else {
115 push @newoptions, $opt;
116 #}
117 }
118
119 map { $_ = "\"$_\""; } @newoptions;
120 my $options .= join (",", @newoptions);
121
122
123 eval ("\$classobj = new \$classname([],[$options])");
124 die "$@" if $@;
125
126 $classobj->set_number($classify_number);
127 $classify_number ++;
128
129 # add this object to the list
130 push (@classify_objects, $classobj);
131 }
132
133 my ($classobj);
134 eval ("\$classobj = new AllList()");
135 die "$@" if $@;
136 push (@classify_objects, $classobj);
137
138 return \@classify_objects;
139}
140
141# init_classifiers resets all the classifiers and readys them to process
142# the documents.
143sub init_classifiers {
144 my ($classifiers) = @_;
145
146 foreach my $classobj (@$classifiers) {
147 $classobj->init();
148 }
149}
150
151
152
153# takes a hashref containing the metadata for an infodb entry, and extracts
154# the childrens numbers (from the 'contains' entry).
155# assumes format is ".1;".2;".3
156sub get_children {
157 my ($doc_db_hash) = @_;
158
159 my $children = undef;
160
161 my $contains = $doc_db_hash->{'contains'};
162 if (defined ($contains)) {
163 $contains =~ s/\@$//; #remove trailing @
164 $contains =~ s/^\"\.//; #remove initial ".
165 @$children = split /\;\"\./, $contains;
166 }
167
168 return $children;
169}
170
171
172sub recurse_sections {
173 my ($doc_obj, $children, $parentoid, $parentsection, $database_recs) = @_;
174
175 return if (!defined $children);
176
177 foreach my $child (sort { $a <=> $b} @$children) {
178 $doc_obj->create_named_section("$parentsection.$child");
179 my $doc_db_rec = $database_recs->{"$parentoid.$child"};
180 my $doc_db_hash = db_rec_to_hash($doc_db_rec);
181
182 # get child's children
183 my $newchildren = &get_children($doc_db_hash);
184
185 # add content for current section
186 add_section_content($doc_obj, "$parentsection.$child", $doc_db_hash);
187
188 # process all the children if there are any
189 if (defined ($newchildren))
190 {
191 recurse_sections($doc_obj, $newchildren, "$parentoid.$child",
192 "$parentsection.$child", $database_recs);
193 }
194 }
195}
196
197
198sub add_section_content {
199 my ($doc_obj, $cursection, $doc_db_hash) = @_;
200
201 foreach my $key (keys %$doc_db_hash) {
202 #don't need to store these metadata
203 next if $key =~ /(thistype|childtype|contains|docnum|doctype|classifytype)/i;
204 # but do want things like hastxt and archivedir
205 my @items = split /@/, $doc_db_hash->{$key};
206 # metadata is all from gdbm so should already be in utf8
207 map {$doc_obj->add_utf8_metadata ($cursection, $key, $_); } @items;
208
209 }
210}
211
212
213# gets all the metadata from an infodb entry, and puts it into a hashref
214sub db_rec_to_hash {
215
216 my ($gdb_str_ref) = @_;
217
218 my $hashref = {};
219
220 my @entries = split(/\n/, $gdb_str_ref);
221 foreach my $entry (@entries) {
222 my($key, $value) = ($entry =~ /^<([^>]*)>(.*?)$/ );
223 $hashref->{$key} .= '@' if defined $hashref->{$key};
224 $hashref->{$key} .= $value;
225
226 }
227
228 return $hashref;
229}
230
231
232sub reconstruct_doc_objs_metadata
233{
234 my $infodb_type = shift(@_);
235 my $infodb_file_path = shift(@_);
236 my $database_recs = shift(@_);
237
238 # dig out top level doc sections
239 my %top_sections = ();
240 my %top_docnums = ();
241 foreach my $key ( keys %$database_recs )
242 {
243 my $md_rec = $database_recs->{$key};
244 my $md_hash = db_rec_to_hash($md_rec);
245
246 if ((defined $md_hash->{'doctype'}) && ($md_hash->{'doctype'} eq "doc")) {
247 next if ($key =~ m/\./);
248 $top_sections{$key} = $md_hash;
249 $top_docnums{$key} = $md_hash->{'docnum'};
250 }
251 }
252
253 # for greenstone document objects based on metadata in database file
254 my @all_docs = ();
255 # we need to make sure the documents were processed in the same order as
256 # before, so sort based on their docnums
257 foreach my $oid ( sort { $top_docnums{$a} <=> $top_docnums{$b} } keys %top_sections )
258 {
259 my $doc_db_hash = $top_sections{$oid};
260
261 my $doc_obj = new doc();
262 $doc_obj->set_OID($oid);
263 my $top = $doc_obj->get_top_section();
264 add_section_content ($doc_obj, $top, $doc_db_hash);
265 my $children = &get_children($doc_db_hash);
266 recurse_sections($doc_obj, $children, $oid, $top, $database_recs);
267
268 push(@all_docs,$doc_obj);
269 }
270
271 return \@all_docs;
272}
273
274
275
276
277
278# classify_doc lets each of the classifiers classify a document
279sub classify_doc {
280 my ($classifiers, $doc_obj, $edit_mode) = @_;
281
282 foreach my $classobj (@$classifiers) {
283 my $title = $classobj->{'title'};
284
285 $classobj->classify($doc_obj,$edit_mode);
286 }
287}
288
289
290our $next_classify_num = 1;
291
292# output_classify_info outputs all the info needed for the classification
293# to the database
294sub output_classify_info
295{
296 my ($classifiers, $infodb_type, $infodb_handle, $remove_empty_classifications, $gli) = @_;
297
298 $gli = 0 unless defined $gli;
299
300 # create a classification containing all the info
301 my $classifyinfo = { 'classifyOID'=> 'browse',
302 'contains' => [] };
303
304 # get each of the classifications
305 foreach my $classifier (@$classifiers)
306 {
307 my $classifier_info = $classifier->get_classify_info($gli);
308 $classifier_info->{'classifyOID'} = "CL$next_classify_num" unless defined($classifier_info->{'classifyOID'});
309 print STDERR "*** outputting information for classifier: $classifier_info->{'classifyOID'}\n";
310
311 push(@{$classifyinfo->{'contains'}}, $classifier_info);
312 $next_classify_num++;
313 }
314
315 &print_classify_info($infodb_type, $infodb_handle, $classifyinfo, "", $remove_empty_classifications);
316}
317
318
319sub print_classify_info
320{
321 my ($infodb_type, $infodb_handle, $classifyinfo, $OID, $remove_empty_classifications) = @_;
322
323 $OID =~ s/^\.+//; # just for good luck
324
325 # book information is printed elsewhere
326 return if (defined ($classifyinfo->{'OID'}));
327
328 # don't want empty classifications
329 return if (&check_contents ($classifyinfo, $remove_empty_classifications) == 0 && $remove_empty_classifications);
330
331 $OID = $classifyinfo->{'classifyOID'} if defined ($classifyinfo->{'classifyOID'});
332
333 my %classify_infodb = ();
334 $classify_infodb{"doctype"} = [ "classify" ];
335 $classify_infodb{"hastxt"} = [ "0" ];
336 $classify_infodb{"childtype"} = [ $classifyinfo->{'childtype'} ]
337 if defined $classifyinfo->{'childtype'};
338 $classify_infodb{"Title"} = [ $classifyinfo->{'Title'} ]
339 if defined $classifyinfo->{'Title'};
340 $classify_infodb{"numleafdocs"} = [ $classifyinfo->{'numleafdocs'} ]
341 if defined $classifyinfo->{'numleafdocs'};
342 $classify_infodb{"thistype"} = [ $classifyinfo->{'thistype'} ]
343 if defined $classifyinfo->{'thistype'};
344 $classify_infodb{"parameters"} = [ $classifyinfo->{'parameters'} ]
345 if defined $classifyinfo->{'parameters'};
346 $classify_infodb{"supportsmemberof"} = [ $classifyinfo->{'supportsmemberof'} ]
347 if defined $classifyinfo->{'supportsmemberof'};
348
349 my $contains_text = "";
350 my $mdoffset_text = "";
351
352 my $next_subOID = 1;
353 my $first = 1;
354 foreach my $tempinfo (@{$classifyinfo->{'contains'}}) {
355 # empty contents were made undefined by clean_contents()
356 next unless defined $tempinfo;
357
358 if (!defined ($tempinfo->{'classifyOID'}) ||
359 $tempinfo->{'classifyOID'} ne "oai") {
360 $contains_text .= ";" unless $first;
361 }
362 $mdoffset_text .= ";" unless $first;
363 $first = 0;
364
365 if (defined ($tempinfo->{'classifyOID'}))
366 {
367 if ($tempinfo->{'classifyOID'} ne "oai")
368 {
369 $contains_text .= $tempinfo->{'classifyOID'};
370 }
371
372 &print_classify_info ($infodb_type, $infodb_handle, $tempinfo, $tempinfo->{'classifyOID'},
373 $remove_empty_classifications);
374 }
375 elsif (defined ($tempinfo->{'OID'}))
376 {
377 $contains_text .= $tempinfo->{'OID'};
378 $mdoffset_text .= $tempinfo->{'offset'} if (defined ($tempinfo->{'offset'}));
379 }
380 else
381 {
382 # Supress having top-level node in Collage classifier
383 # so no bookshelf icon appears, top-level, along with the
384 # applet
385 if (!defined ($tempinfo->{'Title'}) || $tempinfo->{'Title'} ne "Collage")
386 {
387 $contains_text .= "\".$next_subOID";
388 }
389
390 &print_classify_info ($infodb_type, $infodb_handle, $tempinfo, "$OID.$next_subOID",
391 $remove_empty_classifications);
392 $next_subOID++;
393 }
394 }
395
396 $classify_infodb{"contains"} = [ $contains_text ];
397 $classify_infodb{"mdtype"} = [ $classifyinfo->{'mdtype'} ]
398 if defined $classifyinfo->{'mdtype'};
399 $classify_infodb{"mdoffset"} = [ $mdoffset_text ]
400 if ($mdoffset_text !~ m/^;+$/);
401
402 &dbutil::write_infodb_entry($infodb_type, $infodb_handle, $OID, \%classify_infodb);
403}
404
405
406sub check_contents {
407 my ($classifyinfo,$remove_empty_classifications) = @_;
408 $remove_empty_classifications = 0 unless ($remove_empty_classifications);
409 my $num_leaf_docs = 0;
410 my $sub_num_leaf_docs = 0;
411
412 return $classifyinfo->{'numleafdocs'} if (defined $classifyinfo->{'numleafdocs'});
413
414 foreach my $content (@{$classifyinfo->{'contains'}}) {
415 if (defined $content->{'OID'}) {
416 # found a book
417 $num_leaf_docs ++;
418 } elsif (($sub_num_leaf_docs = &check_contents ($content,$remove_empty_classifications)) > 0) {
419 # there's a book somewhere below
420 $num_leaf_docs += $sub_num_leaf_docs;
421 } else {
422 if ($remove_empty_classifications){
423 # section contains no books so we want to remove
424 # it from its parents contents
425 $content = undef;
426 }
427 }
428 }
429
430 $classifyinfo->{'numleafdocs'} = $num_leaf_docs;
431 return $num_leaf_docs;
432}
433
4341;
Note: See TracBrowser for help on using the repository browser.