source: trunk/gsdl/perllib/basebuilder.pm@ 12834

Last change on this file since 12834 was 12340, checked in by kjdon, 18 years ago

maxnumeric is set using set_maxnumeric (by buildcol.pl) rather than the builder looking directly in the collect.cfg file

  • Property svn:keywords set to Author Date Id Revision
File size: 17.9 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
# Switch STDOUT and STDERR to autoflush as soon as this module is compiled,
# so that output from mgpp doesn't get out of sync with output from plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Turn autoflush back off on both streams when the interpreter shuts down.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

# Package variable; nothing in the code visible here reads it.
our $maxdocsize = 12000;
48
# Constructor: creates the builder object, reads the collection's
# collect.cfg, and loads the plugins and classifiers it lists.
#
# Positional arguments after the class name:
#   $collection, $source_dir, $build_dir, $verbosity, $maxdocs, $debug,
#   $keepold, $remove_empty_classifications, $outhandle, $no_text,
#   $failhandle, $gli
# $outhandle and $failhandle default to STDERR; $no_text and $gli default to 0.
#
# Dies if $GSDLCOLLECTDIR/etc/collect.cfg does not exist, or if
# plugin::load_plugins returns an empty list.
# Returns the blessed builder object.
sub new {
    my $class = shift(@_);
    my ($collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # defaults for the optional trailing arguments
    $outhandle  = STDERR unless defined $outhandle;
    $no_text    = 0 unless defined $no_text;
    $failhandle = STDERR unless defined $failhandle;
    $gli        = 0 unless defined $gli;

    # create a builder object
    my $self = bless {
        'collection' => $collection,
        'source_dir' => $source_dir,
        'build_dir'  => $build_dir,
        'verbosity'  => $verbosity,
        'maxdocs'    => $maxdocs,
        'debug'      => $debug,
        'keepold'    => $keepold,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'  => $outhandle,
        'no_text'    => $no_text,
        'failhandle' => $failhandle,
        'notbuilt'   => {},    # indexes not built
        'gli'        => $gli
    }, $class;

    # read in the collection configuration file
    # NOTE(review): the message below says mgbuilder::new although this is
    # basebuilder::new; left unchanged in case anything matches on it.
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n"
        unless -e $colcfgname;
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # the list of plugins for this collection (empty when none configured)
    my $plugins = defined $self->{'collect_cfg'}->{'plugin'}
        ? $self->{'collect_cfg'}->{'plugin'}
        : [];

    # extra global options handed to every plugin
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'}
        && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle,
                                                 $failhandle, \@global_opts, $keepold);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # the list of classifiers for this collection (empty when none configured)
    my $classifiers = defined $self->{'collect_cfg'}->{'classify'}
        ? $self->{'collect_cfg'}->{'classify'}
        : [];

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # turn the configured dontgdbm field list into a lookup hash
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        $self->{'dontgdbm'}->{$_} = 1
            foreach @{$self->{'collect_cfg'}->{'dontgdbm'}};
    }

    # default; callers can override it with set_maxnumeric
    $self->{'maxnumeric'} = 4;
    return $self;
}
123
# Second-stage initialisation. This work was moved out of new() so that
# methods supplied by the subclass can be used; generate_index_list and
# default_buildproc are called here but are not defined in this file.
#
# In order, this:
#   1. calls $self->generate_index_list();
#   2. expands collect_cfg->{'indexes'} with every configured subcollection
#      ("index:subcollection") and then every configured language
#      ("index:language", or "index::language" when no subcollections are
#      configured, leaving the subcollection field empty);
#   3. removes duplicate index entries, keeping the first occurrence;
#   4. loads the document processor class and stores an instance in
#      $self->{'buildproc'};
#   5. unless debug or keepold is set, deletes and recreates build_dir and
#      creates its text subdirectory.
#
# Dies if the buildproc module cannot be loaded or its constructor dies.
sub init {
    my $self = shift(@_);

    $self->generate_index_list();

    # sort out subcollection indexes
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes
    if (defined $self->{'collect_cfg'}->{'languages'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
            foreach my $index (@$indexes) {
                if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
                }
                else { # add in an empty subcollection field
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
                }
            }
        }
    }

    if (defined($self->{'collect_cfg'}->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        my %tmphash = ();
        my @tmparray = @{$self->{'collect_cfg'}->{'indexes'}};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $i (@tmparray) {
            if (!defined ($tmphash{$i})) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, $i);
                $tmphash{$i} = 1;
            }
        }
    } else {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use whatever class the subclass names via default_buildproc()
    my ($buildprocdir, $buildproctype);
    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = $self->default_buildproc();
    }
    require "$buildprocdir/$buildproctype.pm";

    # Call the constructor directly on the class name held in $buildproctype.
    # This replaces a string eval that used indirect-object syntax
    # ("new Class(...)"); any exception from the constructor still propagates.
    $self->{'buildproc'} = $buildproctype->new($self->{'collection'},
                                               $self->{'source_dir'},
                                               $self->{'build_dir'},
                                               $self->{'keepold'},
                                               $self->{'verbosity'},
                                               $self->{'outhandle'});

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

}
201
# Hands the loaded plugin list and the document processor to plugin::deinit
# and returns whatever that call returns.
sub deinit {
    my ($self) = @_;

    return &plugin::deinit($self->{'pluginfo'}, $self->{'buildproc'});
}
207
# Forwards $index unchanged to the document processor's method of the same
# name and returns its result.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    return $self->{'buildproc'}->set_sections_index_document_metadata($index);
}
214
# Stores $maxnumeric on the builder, replacing the default of 4 set in new().
# make_auxiliary_files later copies the value into the build configuration.
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    return $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip flag on the builder and passes the same value on to the
# document processor; returns the document processor's result.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    return $self->{'buildproc'}->set_strip_html($strip);
}
228
# Placeholder for subclasses to override. The base version does no work: it
# prints a reminder on STDERR and returns nothing.
#
# $textindex is accepted so the signature matches what callers pass, but it
# is not used here.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    # Terminate the message with a newline, as the other placeholder methods
    # in this file do, so it does not run into whatever is printed next.
    print STDERR "compress_text() should be implemented in subclass!!\n";
    return;
}
236
237
# Builds either the single index named by $indexname (when it contains at
# least one word character) or every index in collect_cfg->{'indexes'}.
#
# The index-to-directory mapping from create_index_mapping is stored in
# $self->{'index_mapping'} first. Each index is then built with build_index
# unless want_built rejects it. build_indexes_extra runs last.
sub build_indexes {
    my $self = shift;
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }

    $self->build_indexes_extra();
}
269
# Hook called at the end of build_indexes. The base version does nothing;
# a subclass can override it to do further work after the indexes are built.
sub build_indexes_extra {
    my $self = shift;

}
274
# Placeholder for subclasses to override. The base version builds nothing:
# it prints a reminder on STDERR and returns nothing. $index is unused here.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
282
283
284
# Creates the collection's info database and processes associated files.
#
# The document processor is switched to 'infodb' mode and fully reset. When
# keepold is set, documents reconstructed from the existing database are fed
# to it first. Then the collection metadata, every document read by the
# plugins, the classifier output and a final [browselist] record are written
# to the output handle.
#
# Output goes to STDOUT in debug mode, otherwise to a pipe into txt2db.
# Dies if the txt2db executable is missing or the pipe cannot be opened.
sub make_infodatabase {
    my $self = shift;
    my $outhandle = $self->{'outhandle'};

    # make sure the text and assoc directories exist under build_dir
    my $textdir  = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name: the extension depends on the byte order util reports
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, $self->{'collection'} . $dbext);
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        # use backslashes in the path on Windows
        $fulldbname =~ s/\//\\/g;
    }

    # full path of the txt2db executable for this OS
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat("$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}", "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # for an incremental build, reconstruct doc_obj metadata from gdbm for all docs
    my $reconstructed_docs = $self->{'keepold'}
        ? &classify::reconstruct_doc_objs_metadata($fulldbname)
        : undef;

    # set up the document processor's output handle
    my $handle;
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        # NOTE(review): the existence check uses the full path in $txt2db_exe,
        # but the pipe runs "txt2db$exe" by bare name, so it relies on PATH.
        my $piped = (-e "$txt2db_exe") && open (PIPEOUT, "| txt2db$exe \"$fulldbname\"");
        unless ($piped) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = basebuilder::PIPEOUT;
    }

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from GDBM and
    # then adding in the new ones
    $buildproc->zero_reset();

    if ($self->{'keepold'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj ( @$reconstructed_docs ) {
            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $self->{'buildproc'}->process($doc_obj,undef);
        }
    }

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($handle);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    # output doclist as an invisible [browselist] record
    my @doclist = $self->{'buildproc'}->get_doc_list();
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . scalar(@doclist) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>" . join (";", @doclist);
    print $handle "\n" . ('-' x 70) . "\n";

    # NOTE(review): the close result is not checked, so a failing txt2db
    # goes unreported here.
    close ($handle) unless $self->{'debug'};

    print STDERR "</Stage>\n" if $self->{'gli'};
}
379
# Assembles the build configuration and writes it out via write_cfg_file.
#
# Starts from $self->{'build_cfg'} when a subclass has already put entries
# there. Adds the build date, build type, index stem, document and byte
# counts, the index/subcollection/language maps, the list of indexes that
# were not built, and maxnumeric. build_cfg_extra is given the chance to add
# more before the file is written.
sub make_auxiliary_files {
    my $self = shift;

    # subclasses may have already defined stuff in here
    my $build_cfg = defined $self->{'build_cfg'} ? $self->{'build_cfg'} : {};

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date, type and stem
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = $self->{'collection'};

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any
    # that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        next if defined ($self->{'notbuilt'}->{$index});
        push (@indexmap, $index . "->" . $self->{'index_mapping'}->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # subcollection map, only stored when non-empty
    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    # language map, only stored when non-empty
    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    # names of the indexes that want_built rejected, only stored when non-empty
    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $self->build_cfg_extra($build_cfg);

    $self->write_cfg_file($build_cfg);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
443
# Empty hook: the base version takes the builder object and does nothing else.
sub collect_specific {
    my $self = shift;
}
447
# Decides whether $index should be built.
#
# Each entry of collect_cfg->{'dontbuild'} is used as a regular expression
# anchored at both ends. On the first entry that matches $index, the index
# is recorded in $self->{'notbuilt'} and 0 is returned. Returns 1 when no
# entry matches or no dontbuild list is configured.
sub want_built {
    my $self = shift;
    my ($index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $checkstr (@$dontbuild) {
        next unless $index =~ /^$checkstr$/;

        # remember the rejection so the index is left out of the index map
        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
463
# Placeholder for subclasses to override. The base version prints a reminder
# on STDERR and returns a reference to a new empty hash. $indexes is unused.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
472
# Returns a two-character processed version of a field name.
# Only letdig (\w) characters are used.
#  - An undefined or all-whitespace field gives "".
#  - A field with two or more comma-separated components gives the first
#    letdig of each of the first two components.
#  - Otherwise the result is the first two letdigs of the whole field.
# Any character that cannot be found falls back to 'a' (first) or '0'
# (second). Note that a single-component field with fewer than two letdigs
# fails the match as a whole and so gives "a0".
sub process_field {
    my $self = shift;
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\S/);

    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar @components < 2) {
        # pick the first two letdig chars
        ($first, $second) = $field =~ /^[^\w]*(\w)[^\w]*?(\w)/i;
    } else {
        # pick the first letdig from the first two field names
        ($first)  = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    }

    # there may not have been any letdigs...
    $first  = 'a' unless defined $first;
    $second = '0' unless defined $second;

    return "$first$second";
}
502
# Advances the name referred to by $nameref to its next version, in place.
#  - A name ending in two digits has those two digits incremented.
#  - A name ending in a single digit has it incremented, 9 becoming 10.
#  - Any other name has its last character replaced by 0.
sub get_next_version {
    my $self = shift;
    my ($nameref) = @_;

    if ($$nameref =~ /\d\d$/) {
        # increment a copy of the captured text so Perl's string increment
        # applies, exactly as when the capture is copied and then ++'d
        $$nameref =~ s/(\d\d)$/my $next = $1; ++$next/e;
    } elsif ($$nameref =~ /\d$/) {
        $$nameref =~ s/(\d)$/$1 == 9 ? 10 : $1 + 1/e;
    } else {
        $$nameref =~ s/.$/0/;
    }
}
518
# Hook called by make_auxiliary_files just before the build configuration is
# written. The base version adds nothing; implement this in a subclass to put
# extra entries into the $build_cfg hash.
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;

}
525
# Writes the build information in $build_cfg to build.cfg inside build_dir
# by delegating to cfgread::write_cfg_file.
#
# Two key patterns are passed along. The first names the keys that
# make_auxiliary_files fills with single values, the second the keys it fills
# with array references (presumably how cfgread tells them apart; confirm
# against cfgread).
sub write_cfg_file {
    my ($self, $build_cfg) = @_;

    my $cfg_file    = "$self->{'build_dir'}/build.cfg";
    my $single_keys = '^(builddate|buildtype|numdocs|numbytes|numwords|numsections|maxnumeric|indexstem)$';
    my $list_keys   = '^(indexmap|subcollectionmap|languagemap|notbuilt)$';

    # write out the build information
    &cfgread::write_cfg_file($cfg_file, $build_cfg, $single_keys, $list_keys);
}
536
# Writes the collection-level record to $handle. The default is an empty
# [collection] entry: the header line followed by a separator of 70 dashes.
sub output_collection_meta {
    my ($self, $handle) = @_;

    my $separator = '-' x 70;
    print $handle "[collection]\n" . $separator . "\n";
}
545
# Prints statistics for the pass the document processor has just finished to
# the builder's output handle: a heading, the total bytes in the collection,
# and the bytes processed for the current index.
#
# When fewer than 50 bytes were processed and text was expected (an indexing
# pass, or no_text not set), a note follows:
#  - with keepold set, a line saying nothing was added, but only when
#    exactly 0 bytes were processed;
#  - otherwise a starred warning asking whether this was intended.
sub print_stats {
    my $self = shift;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    my $text_expected = ($indexing_text || !$self->{'no_text'});
    if ($text_expected && $num_processed_bytes < 50) {

        if (!$self->{'keepold'}) {
            # full build: very little text usually means something is wrong
            print $outhandle "***************\n";
            if ($indexing_text) {
                print $outhandle "WARNING: There is very little or no text to process for $index\n";
            } elsif (!$self->{'no_text'}) {
                print $outhandle "WARNING: There is very little or no text to compress\n";
            }
            print $outhandle " Was this your intention?\n";
            print $outhandle "***************\n";
        }
        elsif ($num_processed_bytes == 0) {
            # incremental build: adding nothing is normal, so just say so
            if ($indexing_text) {
                print $outhandle "No additional text was added to $index\n";
            } elsif (!$self->{'no_text'}) {
                print $outhandle "No additional text was compressed\n";
            }
        }

    }

}
588
589
5901;
591
Note: See TracBrowser for help on using the repository browser.