source: trunk/gsdl/perllib/basebuilder.pm@ 13933

Last change on this file since 13933 was 12972, checked in by kjdon, 18 years ago

now has incremental and incremental_dlc argument to new()

  • Property svn:keywords set to Author Date Id Revision
File size: 18.0 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
# While this module is loaded, keep STDOUT and STDERR unbuffered so that
# output from mgpp does not get out of step with output from the plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Switch autoflush off again on both streams when the interpreter exits.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

our $maxdocsize = 12000;
48
# Constructor for a collection builder.
#
# Arguments (positional, after the class name):
#   $collection, $source_dir, $build_dir, $verbosity, $maxdocs, $debug,
#   $keepold, $incremental, $incremental_dlc,
#   $remove_empty_classifications, $outhandle, $no_text, $failhandle, $gli
#
# Defaults: $outhandle and $failhandle fall back to STDERR, $no_text and
# $gli fall back to 0.
#
# Reads $ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg, loads the plugins and
# classifiers it lists, and records the 'dontgdbm' fields.
#
# Dies if collect.cfg cannot be found or if no plugins were loaded.
# Returns the new blessed object.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_dlc,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # handles are referred to by name; print resolves the name at run time
    $outhandle  = 'STDERR' unless defined $outhandle;
    $no_text    = 0 unless defined $no_text;
    $failhandle = 'STDERR' unless defined $failhandle;

    # create a builder object
    my $self = bless {
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_dlc'              => $incremental_dlc,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},    # indexes not built
        'gli'                          => $gli
    }, $class;

    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        # report the package that actually raised the error
        die "basebuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'}
        && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle,
                                                 $failhandle, \@global_opts, $keepold);

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    $self->{'maxnumeric'} = 4;
    return $self;
}
126
# Second-stage initialisation.  This work lives here rather than in new()
# so that subclass methods (generate_index_list, generate_index_options,
# default_buildproc) can be used.
#
# - expands the configured indexes by subcollection and by language
# - removes duplicate index specifications (first occurrence wins)
# - loads the buildproc class and creates the 'buildproc' object
# - unless 'debug' or 'keepold' is set, removes the old build directory
#   and recreates it together with its text subdirectory
#
# Dies if the buildproc module cannot be loaded or constructed.
sub init {
    my $self = shift(@_);

    $self->generate_index_list();
    $self->generate_index_options();

    $self->{'collect_cfg'} = {} unless defined $self->{'collect_cfg'};
    my $collect_cfg = $self->{'collect_cfg'};
    my $has_subcollections = defined $collect_cfg->{'indexsubcollections'};

    # sort out subcollection indexes
    if ($has_subcollections) {
        my $indexes = $collect_cfg->{'indexes'};
        $collect_cfg->{'indexes'} = [];
        foreach my $subcollection (@{$collect_cfg->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$collect_cfg->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes
    if (defined $collect_cfg->{'languages'}) {
        my $indexes = $collect_cfg->{'indexes'};
        $collect_cfg->{'indexes'} = [];
        # without subcollections an empty subcollection field is added,
        # giving "index::language" instead of "index:subcoll:language"
        my $separator = $has_subcollections ? ":" : "::";
        foreach my $language (@{$collect_cfg->{'languages'}}) {
            foreach my $index (@$indexes) {
                push (@{$collect_cfg->{'indexes'}}, $index . $separator . $language);
            }
        }
    }

    if (defined($collect_cfg->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        my %seen = ();
        my @unique = ();
        foreach my $index (@{$collect_cfg->{'indexes'}}) {
            next if defined $seen{$index};
            $seen{$index} = 1;
            push (@unique, $index);
        }
        $collect_cfg->{'indexes'} = \@unique;
    } else {
        $collect_cfg->{'indexes'} = [];
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the one named by the subclass's default_buildproc()
    my ($buildprocdir, $buildproctype);
    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = $self->default_buildproc();
    }
    require "$buildprocdir/$buildproctype.pm";

    # A class name held in a scalar can be used directly as the invocant,
    # so no string eval is needed; a die in the constructor propagates.
    $self->{'buildproc'} = $buildproctype->new($self->{'collection'},
                                               $self->{'source_dir'},
                                               $self->{'build_dir'},
                                               $self->{'keepold'},
                                               $self->{'verbosity'},
                                               $self->{'outhandle'});

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

}
205
# Hands the loaded plugins and the document processor to plugin::deinit
# and returns whatever that call returns.
sub deinit {
    my ($self) = @_;

    my ($pluginfo, $buildproc) = @{$self}{'pluginfo', 'buildproc'};
    return &plugin::deinit($pluginfo, $buildproc);
}
211
# Forwards the given setting to the document processor's method of the
# same name.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    my $buildproc = $self->{'buildproc'};
    return $buildproc->set_sections_index_document_metadata($index);
}
218
# Stores the 'maxnumeric' setting on the builder (new() initialises it to 4).
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and passes it on to the
# document processor.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html($strip);
}
232
# Placeholder: text compression must be provided by a subclass.
# The base version only reports that on STDERR and returns nothing.
#
#   $textindex - accepted for interface compatibility; unused here
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    # newline-terminated, like the other "implement in subclass" stubs,
    # so the next line written to STDERR is not glued onto this one
    print STDERR "compress_text() should be implemented in subclass!!\n";
    return;
}
240
241
# Builds either the single named index or, when no usable name is given,
# every index listed in the collection configuration.
#
# The mapping from index descriptions to directory names (including
# subcollections and languages) is stored in 'index_mapping' first.  Each
# index is then built with build_index() unless want_built() rejects it,
# and build_indexes_extra() is called at the end.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }

    $self->build_indexes_extra();

}
273
# Called at the end of build_indexes().  The base class does nothing here.
sub build_indexes_extra {
    my $self = shift;

}
278
# Placeholder: building a single index must be provided by a subclass.
# The base version only reports that on STDERR and returns nothing.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
286
287
288
# Creates the collection's info database and processes associated files.
#
# Steps, in order:
#   - make sure the "text" and "assoc" directories exist under build_dir
#   - work out the database file name (.ldb on little-endian machines,
#     .bdb otherwise)
#   - initialise the classifiers
#   - when 'keepold' is set, reconstruct the existing documents' metadata
#     from the database and feed them back through the document processor
#   - run the plugins over source_dir in 'infodb' mode, writing to txt2db
#     through a pipe (or to STDOUT when 'debug' is set)
#   - write the classification information and the [browselist] entry
#
# Dies if txt2db cannot be found or started.  A failure reported when the
# pipe to txt2db is closed is written to the output handle as a warning.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    my $textdir  = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    my $reconstructed_docs = undef;
    if ($self->{'keepold'}) {
        # reconstruct doc_obj metadata from gdbm for all docs
        $reconstructed_docs = &classify::reconstruct_doc_objs_metadata($fulldbname);
    }

    # set up the document processor
    # (handles are referred to by name; print and close resolve the name
    # at run time)
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        # the existence check uses the full path, but the command itself
        # is started by bare name
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = 'basebuilder::PIPEOUT';
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('infodb');
    $self->{'buildproc'}->set_assocdir ($assocdir);
    $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_store_text(1);

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from GDBM and
    # then adding in the new ones
    $self->{'buildproc'}->zero_reset();

    if ($self->{'keepold'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj ( @$reconstructed_docs ) {
            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $self->{'buildproc'}->process($doc_obj,undef);
        }
    }

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($handle);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    #&classify::print_reverse_lookup($handle);

    # output doclist
    my @doclist = $self->{'buildproc'}->get_doc_list();
    my $docs = join (";",@doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . ($#doclist+1) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";

    if (!$self->{'debug'}) {
        # close() on a pipe waits for the child and returns false if a
        # system call failed or the program exited with non-zero status;
        # in the latter case $! is 0 and the status is in $?
        if (!close ($handle)) {
            my $reason = $! ? "$!" : "exit status " . ($? >> 8);
            print $outhandle "WARNING: txt2db did not finish cleanly while writing $fulldbname ($reason)\n";
        }
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
387
# Collects the build information and writes it to build.cfg in build_dir.
#
# Recorded: build date, build type, index stem, stem indexes, document /
# section / byte counts from the document processor, the index,
# subcollection and language maps, the list of indexes that were not
# built, and maxnumeric.  build_cfg_extra() is given the chance to add
# further entries before the file is written.
sub make_auxiliary_files {
    my $self = shift (@_);

    # subclasses may have already defined stuff in here
    my $build_cfg = defined $self->{'build_cfg'} ? $self->{'build_cfg'} : {};

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date and general build settings
    $build_cfg->{'builddate'}   = time;
    $build_cfg->{'buildtype'}   = $self->{'buildtype'};
    $build_cfg->{'indexstem'}   = $self->{'collection'};
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'}     = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'}    = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any
    # that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $name (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        next if defined ($self->{'notbuilt'}->{$name});
        push (@indexmap, $name . "->" . $self->{'index_mapping'}->{'indexmap'}->{$name});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # subcollection and language maps are only stored when non-empty
    my @subcollectionmap = ();
    foreach my $name (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap,
              $name . "->" . $self->{'index_mapping'}->{'subcollectionmap'}->{$name});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $name (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap,
              $name . "->" . $self->{'index_mapping'}->{'languagemap'}->{$name});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $self->build_cfg_extra($build_cfg);

    &colcfg::write_build_cfg("$self->{'build_dir'}/build.cfg", $build_cfg);

    print STDERR "</Stage>\n" if $self->{'gli'};
}
454
# Does nothing in the base class.
sub collect_specific {
    my $self = shift;
}
458
# Decides whether an index should be built.
#
# Each 'dontbuild' entry of the collection configuration is used as an
# anchored pattern; the first one matching $index marks the index in
# 'notbuilt' and makes the method return 0.  Otherwise returns 1.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
474
# Placeholder: the index mapping must be provided by a subclass.
# The base version reports that on STDERR and returns a reference to an
# empty hash.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
483
# Returns a two-character abbreviation of a field specification, using
# only letdig (\w) characters.
#
# - undefined or all-whitespace input gives ""
# - with two or more comma-separated components: the first letdig of the
#   first component followed by the first letdig of the second
# - otherwise: the first two letdigs of the field
# A missing first character becomes 'a' and a missing second one '0'.
# Note that in the single-component case the pattern needs two letdigs to
# match at all, so a field containing only one letdig gives "a0".
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\S/);

    my ($first, $second);
    my @components = split (/,/, $field);

    if (@components >= 2) {
        # first letdig of each of the first two components
        ($first)  = $components[0] =~ /^\W*(\w)/;
        ($second) = $components[1] =~ /^\W*(\w)/;
    } else {
        # first two letdigs of the whole field
        ($first, $second) = $field =~ /^\W*(\w)\W*?(\w)/;
    }

    # there may not have been any letdigs...
    $first  = 'a' unless defined $first;
    $second = '0' unless defined $second;

    return $first . $second;
}
513
# Advances the name referenced by $nameref to its next version, in place.
#
# - name ends in two digits: those two digits are incremented
# - name ends in one digit:  that digit is incremented (9 becomes 10)
# - otherwise:               the last character is replaced by 0
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        # ++ on the captured string is a string increment, so a leading
        # zero survives ("05" becomes "06")
        my $suffix = $1;
        $suffix++;
        $$nameref =~ s/\d\d$/$suffix/;
    } elsif ($$nameref =~ /(\d)$/) {
        my $next = $1 + 1;
        $$nameref =~ s/\d$/$next/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}
529
# Called by make_auxiliary_files() just before build.cfg is written.
# Implement this in a subclass to add extra entries to $build_cfg;
# the base version leaves it untouched.
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;

}
536
# Writes the collection-level entry to $handle.  The default is an empty
# [collection] entry followed by a 70-character rule of dashes.
sub output_collection_meta {
    my ($self, $handle) = @_;

    my $rule = '-' x 70;
    print $handle "[collection]\n$rule\n";

}
545
# Prints statistics for the pass the document processor has just finished
# to the builder's output handle.
#
# When fewer than 50 bytes were processed, and text was expected (an
# indexing pass, or a compression pass with 'no_text' unset):
#   - with 'keepold' set, a note is printed only if nothing at all was added
#   - otherwise a warning banner is printed
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text       = $buildproc->get_indexing_text();
    my $index               = $buildproc->get_index();
    my $num_bytes           = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $activity = $indexing_text
        ? "Creating index $index"
        : "Compressing text from $index";
    print $outhandle "Stats ($activity)\n";
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # inside this branch, "not indexing" always means "compressing text
    # with 'no_text' unset", so the messages need only one test
    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {

        if ($self->{'keepold'}) {
            if ($num_processed_bytes == 0) {
                my $note = $indexing_text
                    ? "No additional text was added to $index\n"
                    : "No additional text was compressed\n";
                print $outhandle $note;
            }
        }
        else {
            my $warning = $indexing_text
                ? "WARNING: There is very little or no text to process for $index\n"
                : "WARNING: There is very little or no text to compress\n";
            print $outhandle "***************\n";
            print $outhandle $warning;
            print $outhandle " Was this your intention?\n";
            print $outhandle "***************\n";
        }

    }

}
588
589
5901;
591
Note: See TracBrowser for help on using the repository browser.