source: trunk/gsdl/perllib/basebuilder.pm@ 12891

Last change on this file since 12891 was 12844, checked in by mdewsnip, 18 years ago

Incremental building and dynamic GDBM updating code, many thanks to John Rowe and John Thompson at DL Consulting Ltd.

  • Property svn:keywords set to Author Date Id Revision
File size: 18.1 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
BEGIN {
    # Turn autoflush on for both standard streams so that output from
    # mgpp doesn't get out of sync with output from the plugins.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

END {
    # Switch autoflush back off on both streams when the program exits.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

# Package variable holding a document size setting (units are not
# stated here -- TODO confirm against the code that reads it).
our $maxdocsize = 12000;
48
# Constructor for a collection builder.
#
# Arguments (positional):
#   $class                         class to bless the new object into
#   $collection                    collection name
#   $source_dir, $build_dir        input and output directories
#   $verbosity                     verbosity level used for progress output
#   $maxdocs                       passed on to plugin::read by the build steps
#   $debug, $keepold               build-mode flags
#   $remove_empty_classifications  passed on when classifications are output
#   $outhandle                     progress output handle (default STDERR)
#   $no_text                       flag (default 0)
#   $failhandle                    failure output handle (default STDERR)
#   $gli                           true to emit GLI <Stage> markup (default 0)
#
# Reads $ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg, loads the plugins and
# classifiers listed there and records any 'dontgdbm' fields.
# Dies if collect.cfg is missing or if no plugins could be loaded.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # Default the optional arguments.  Real glob references are used for
    # the handles (rather than the bareword strings "STDERR") so they keep
    # working when stored, passed to other packages, or used under strict.
    $outhandle  = \*STDERR unless defined $outhandle;
    $no_text    = 0        unless defined $no_text;
    $failhandle = \*STDERR unless defined $failhandle;
    $gli        = 0        unless defined $gli;

    # create a builder object
    my $self = bless {
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},     # indexes not built
        'gli'                          => $gli,
    }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        # report the class actually being constructed; this constructor is
        # shared by every builder subclass, not just mgbuilder
        die "${class}::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'}
        && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle,
                                                 $failhandle, \@global_opts, $keepold);

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields as a lookup table
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    $self->{'maxnumeric'} = 4;
    return $self;
}
123
124# stuff has been moved here from new, so we can use subclass methods
# Second-stage initialisation, kept separate from new() so that methods
# supplied by subclasses (generate_index_list, default_buildproc) can be
# used.
#
# - expands the configured index list by subcollection and by language
# - removes duplicate index specifications (first occurrence wins)
# - loads the document processor (a collection-specific
#   "<collection>buildproc" if one exists, otherwise the subclass default)
#   and stores an instance in $self->{'buildproc'}
# - unless debugging or keeping the old build, wipes and recreates the
#   build directory together with its "text" subdirectory
sub init {
    my $self = shift(@_);

    $self->generate_index_list();

    my $cfg = ($self->{'collect_cfg'} ||= {});

    # sort out subcollection indexes: every index is paired with every
    # subcollection as "index:subcollection"
    if (defined $cfg->{'indexsubcollections'}) {
        my $indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$cfg->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes
    if (defined $cfg->{'languages'}) {
        my $indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        foreach my $language (@{$cfg->{'languages'}}) {
            foreach my $index (@$indexes) {
                if (defined ($cfg->{'indexsubcollections'})) {
                    push (@{$cfg->{'indexes'}}, "$index:$language");
                }
                else { # add in an empty subcollection field
                    push (@{$cfg->{'indexes'}}, "$index\:\:$language");
                }
            }
        }
    }

    if (defined($cfg->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        my %seen = ();
        $cfg->{'indexes'} = [ grep { !$seen{$_}++ } @{$cfg->{'indexes'}} ];
    } else {
        $cfg->{'indexes'} = [];
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc named by the subclass
    my ($buildprocdir, $buildproctype);
    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = $self->default_buildproc();
    }
    require "$buildprocdir/$buildproctype.pm";

    # A class-method call on the class name does the same job as the old
    # string eval of "new <class>(...)", without compiling code at run
    # time; any failure in the constructor still propagates as a die.
    $self->{'buildproc'} = $buildproctype->new($self->{'collection'},
                                               $self->{'source_dir'},
                                               $self->{'build_dir'},
                                               $self->{'keepold'},
                                               $self->{'verbosity'},
                                               $self->{'outhandle'});

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

}
201
# Counterpart to init(): passes the loaded plugins and the document
# processor to plugin::deinit.
sub deinit {
    my ($self) = @_;

    my ($pluginfo, $buildproc) = ($self->{'pluginfo'}, $self->{'buildproc'});
    &plugin::deinit($pluginfo, $buildproc);
}
207
# Forwards the given setting to the document processor's method of the
# same name.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    return $self->{'buildproc'}->set_sections_index_document_metadata($index);
}
214
# Stores a new 'maxnumeric' value on the builder (new() sets it to 4).
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and passes the same
# value on to the document processor.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    return $self->{'buildproc'}->set_strip_html($strip);
}
228
# Placeholder: text compression must be provided by a subclass.
#
# Arguments:
#   $textindex - unused by this base implementation
#
# Prints a diagnostic to STDERR and returns nothing.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    # newline-terminated, like the other "implement in subclass" stubs,
    # so that following output doesn't run on from this message
    print STDERR "compress_text() should be implemented in subclass!!\n";
    return;
}
236
237
# Builds either the single index named by $indexname (when it contains
# at least one word character) or every index listed in the collection
# configuration.
#
# The description -> directory mapping obtained from
# create_index_mapping() is stored in $self->{'index_mapping'}; indexes
# rejected by want_built() are reported and skipped.  Ends by calling
# build_indexes_extra().
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            if ($self->{'verbosity'} >= 1) {
                print $outhandle "\n*** ignoring index $index\n";
            }
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory $self->{'index_mapping'}->{$index}\n";
        }
        if ($self->{'gli'}) {
            print STDERR "<Stage name='Index' source='$index'>\n";
        }
        $self->build_index($index);
    }

    $self->build_indexes_extra();
}
269
# Called by build_indexes() once every index has been processed.
# Does nothing in this base class -- presumably a hook for subclasses.
sub build_indexes_extra {
    my $self = $_[0];

}
274
# Placeholder for building one index; the real work has to be supplied
# by a subclass.  Prints a diagnostic to STDERR and returns nothing.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
282
283
284
# Creates the collection information database and processes associated
# files.
#
# Everything is written as text to an output handle: STDOUT when
# $self->{'debug'} is set, otherwise a pipe into the external txt2db
# program, which is given the database file name inside
# <build_dir>/text.  The records written are the collection metadata,
# whatever the plugins/document processor emit, the classification
# information and a final [browselist] record naming every document.
#
# When $self->{'keepold'} is set (incremental build) the documents
# already in the database are reconstructed first and fed through the
# document processor before the new documents are read.
#
# Dies if txt2db is missing or cannot be started.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name (the extension depends on the machine's byte order)
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    my $reconstructed_docs = undef;
    if ($self->{'keepold'}) {
        # reconstruct doc_obj metadata from gdbm for all docs
        $reconstructed_docs = &classify::reconstruct_doc_objs_metadata($fulldbname);
    }

    # set up the document processor
    # A glob reference / lexical handle is used instead of a bareword
    # package handle so that the handle can be passed safely to the other
    # packages that print to it.
    my ($handle);
    if ($self->{'debug'}) {
        $handle = \*STDOUT;
    } else {
        # NOTE(review): existence is checked on the full path but the
        # command itself is resolved through PATH -- kept as it was.
        if (!-e "$txt2db_exe" || !open ($handle, '|-', "txt2db$exe \"$fulldbname\"")) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('infodb');
    $self->{'buildproc'}->set_assocdir ($assocdir);
    $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_store_text(1);

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from GDBM and
    # then adding in the new ones
    $self->{'buildproc'}->zero_reset();

    if ($self->{'keepold'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj ( @$reconstructed_docs ) {
            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $self->{'buildproc'}->process($doc_obj,undef);
        }
    }

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($handle);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    #&classify::print_reverse_lookup($handle);

    # output doclist
    my @doclist = $self->{'buildproc'}->get_doc_list();
    my $docs = join (";",@doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . scalar(@doclist) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";

    if (!$self->{'debug'}) {
        # Closing a pipe waits for the child; a false result means that
        # writing failed or that txt2db exited with a non-zero status.
        # Report it instead of silently producing a broken database.
        if (!close ($handle)) {
            my $reason = $! ? "error closing pipe: $!" : "exit status " . ($? >> 8);
            print $outhandle "WARNING: txt2db did not finish cleanly ($reason)\n";
        }
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
383
# Assembles the build configuration and writes it out through
# write_cfg_file().
#
# Starts from $self->{'build_cfg'} when a subclass has already put
# values there, then records the build date and type, the index stem,
# document/section/byte counts from the document processor, the
# index/subcollection/language maps held in $self->{'index_mapping'},
# the list of indexes that were not built and 'maxnumeric'.
# build_cfg_extra() is called just before writing so that subclasses
# can add further entries.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $build_cfg = {};
    # subclasses may have already defined stuff in here
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = $self->{'collection'};
    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any
    # that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, $index . '->' . $self->{'index_mapping'}->{'indexmap'}->{$index});
        }
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # the subcollection and language maps are only stored when non-empty
    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . '->' .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . '->' .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    # names of the indexes that want_built() rejected
    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $self->build_cfg_extra($build_cfg);

    $self->write_cfg_file($build_cfg);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
447
# Does nothing in this base class -- presumably a hook that subclasses
# or collection-specific builders can override.
sub collect_specific {
    my $self = $_[0];
}
451
# Decides whether an index should be built.
#
# Each 'dontbuild' entry of the collection configuration is used as a
# regular expression anchored at both ends; on the first match the
# index is recorded in $self->{'notbuilt'} and 0 is returned.
# Returns 1 when nothing matches or no 'dontbuild' list is configured.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
467
# Placeholder: the mapping from index descriptions to directory names
# has to be produced by a subclass.  Prints a diagnostic to STDERR and
# returns a reference to a new, empty hash.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";

    my $empty_mapping = {};
    return $empty_mapping;
}
476
# Returns a two-character code derived from a field specification.
#
# - If the field has two or more comma-separated components, the code is
#   the first word character (\w) of each of the first two components.
# - Otherwise it is the first two word characters of the field.
# - A missing first character defaults to 'a', a missing second to '0'.
# - An undefined or all-whitespace field gives the empty string.
#
# Only letter/digit (\w) characters are ever used.
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\S/);

    # ($first, $second) rather than ($a, $b): the latter are Perl's
    # special sort variables and should not be declared with "my"
    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # pick the first letdig from the first two field names
        ($first)  = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    } else {
        # pick the first two letdig chars; the second is optional so that
        # a field with a single letdig still contributes that character
        # instead of failing the whole match and losing it
        ($first, $second) = $field =~ /^[^\w]*(\w)(?:[^\w]*(\w))?/;
    }
    # there may not have been any letdigs...
    $first  = 'a' unless defined $first;
    $second = '0' unless defined $second;

    return "$first$second";

}
506
# Advances the name referenced by $nameref, in place, to its next
# "version":
#   - a name ending in two digits has that two-digit suffix incremented
#   - a name ending in one digit has it incremented (9 becomes 10)
#   - otherwise the last character is replaced by 0
sub get_next_version {
    my ($self, $nameref) = @_;

    if (${$nameref} =~ /(\d\d)$/) {
        # incrementing the captured string keeps a leading zero ("07" -> "08")
        my $next = $1;
        $next++;
        ${$nameref} =~ s/\d\d$/$next/;
    }
    elsif (${$nameref} =~ /(\d)$/) {
        my $digit = $1;
        my $next = ($digit == 9) ? 10 : $digit + 1;
        ${$nameref} =~ s/\d$/$next/;
    }
    else {
        ${$nameref} =~ s/.$/0/;
    }
}
522
# Hook for subclasses that want to add extra entries to build.cfg.
# make_auxiliary_files() calls it with the build configuration hash
# just before the file is written; this base version adds nothing.
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;

}
529
# Writes the build information to <build_dir>/build.cfg using
# cfgread::write_cfg_file.
#
# Two key patterns are handed over with the data: the first names the
# entries that hold a single value, the second those that hold lists
# (how cfgread treats each group is defined there -- TODO confirm).
sub write_cfg_file {
    my ($self, $build_cfg) = @_;

    my $cfg_path = "$self->{'build_dir'}/build.cfg";
    my $single_value_keys = '^(builddate|buildtype|numdocs|numbytes|numwords|numsections|maxnumeric|indexstem)$';
    my $list_value_keys = '^(indexmap|subcollectionmap|languagemap|notbuilt)$';

    # write out the build information
    &cfgread::write_cfg_file($cfg_path, $build_cfg, $single_value_keys, $list_value_keys);

}
540
# Default behaviour: write an empty [collection] record, followed by a
# line of 70 dashes, to the given handle.
sub output_collection_meta {
    my ($self, $handle) = @_;

    my $separator = '-' x 70;
    print $handle "[collection]\n" . $separator . "\n";

}
549
# Prints a short statistics report for the pass just made by the
# document processor to the builder's output handle.
#
# When fewer than 50 bytes were processed (and text was expected, i.e.
# an index was being built or 'no_text' is off) an extra message is
# added: a note for incremental ('keepold') builds that processed
# nothing at all, or a prominent warning for ordinary builds.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text       = $buildproc->get_indexing_text();
    my $index               = $buildproc->get_index();
    my $num_bytes           = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {

        # inside this branch "not indexing" always means that text was
        # being compressed, because the condition above requires
        # either indexing or text to be enabled
        if (!$self->{'keepold'}) {
            my $warning = $indexing_text
                ? "WARNING: There is very little or no text to process for $index\n"
                : "WARNING: There is very little or no text to compress\n";
            print $outhandle "***************\n";
            print $outhandle $warning;
            print $outhandle " Was this your intention?\n";
            print $outhandle "***************\n";
        }
        elsif ($num_processed_bytes == 0) {
            my $note = $indexing_text
                ? "No additional text was added to $index\n"
                : "No additional text was compressed\n";
            print $outhandle $note;
        }

    }

}
592
593
5941;
595
Note: See TracBrowser for help on using the repository browser.