source: gsdl/trunk/perllib/basebuilder.pm@ 14374

Last change on this file since 14374 was 14374, checked in by mdewsnip, 15 years ago

Fix to problem where the "indexmap" line would be lost from the build.cfg file when running buildcol.pl with "-mode infodb".

  • Property svn:keywords set to Author Date Id Revision
File size: 19.7 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
34use plugin;
35use util;
36use FileHandle;
37
BEGIN {
    # Unbuffer both standard streams as early as possible so that output
    # from mgpp and output from the plugins do not get out of sync.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

END {
    # Switch autoflush back off again when the interpreter shuts down.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

# Package variable.
# NOTE(review): not referenced anywhere in the visible part of this file.
our $maxdocsize = 12000;

# Either "gs2" (the default) or "gs3". new() overwrites it with the mode
# returned by colcfg::get_collect_cfg_name, and make_auxiliary_files()
# uses it to decide which build configuration file to write.
my $gs_mode = "gs2";
54
# Constructor.
#
# Positional arguments (after the class name): collection, source_dir,
# build_dir, verbosity, maxdocs, debug, keepold, incremental,
# incremental_dlc, remove_empty_classifications, outhandle, no_text,
# failhandle, gli, disable_OAI.
#
# outhandle and failhandle default to STDERR; no_text, gli and disable_OAI
# default to 0. The collection configuration is read, and the plugins and
# classifiers it names are loaded. Dies if no plugin could be loaded.
sub new {
    my $class = shift;
    my ($collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_dlc,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli, $disable_OAI) = @_;

    # fill in defaults for the optional arguments
    if (!defined $outhandle)  { $outhandle  = *STDERR; }
    if (!defined $no_text)    { $no_text    = 0; }
    if (!defined $failhandle) { $failhandle = *STDERR; }
    if (!defined $gli)        { $gli        = 0; }

    # disable_OAI applies to greenstone 3 only and is only passed to
    # &colcfg::write_build_cfg_xml when writing the buildConfig.xml
    if (!defined $disable_OAI) { $disable_OAI = 0; }

    # create a builder object
    my $self = bless {
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_dlc'              => $incremental_dlc,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},    # indexes not built
        'gli'                          => $gli,
        'disable_OAI'                  => $disable_OAI,
    }, $class;

    # Read in the collection configuration file; which reader is used
    # depends on the mode reported alongside the file name.
    my $colcfgname;
    ($colcfgname, $gs_mode) = &colcfg::get_collect_cfg_name($outhandle);
    if ($gs_mode eq "gs2") {
        $self->{'collect_cfg'} = &colcfg::read_collect_cfg($colcfgname);
    }
    elsif ($gs_mode eq "gs3") {
        $self->{'collect_cfg'} = &colcfg::read_collection_cfg_xml($colcfgname);
    }

    # get the list of plugins for this collection
    my $plugins = defined $self->{'collect_cfg'}->{'plugin'}
        ? $self->{'collect_cfg'}->{'plugin'}
        : [];

    # build up the extra global options for the plugins
    my @global_opts = ();
    my $separate_cjk = $self->{'collect_cfg'}->{'separate_cjk'};
    if (defined $separate_cjk && $separate_cjk =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins($plugins, $verbosity, $outhandle, $failhandle, \@global_opts, $keepold);

    unless (@{$self->{'pluginfo'}}) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = defined $self->{'collect_cfg'}->{'classify'}
        ? $self->{'collect_cfg'}->{'classify'}
        : [];

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields, as a lookup table keyed by field name
    $self->{'dontgdbm'} = {};
    if (defined $self->{'collect_cfg'}->{'dontgdbm'}) {
        $self->{'dontgdbm'} = { map { $_ => 1 } @{$self->{'collect_cfg'}->{'dontgdbm'}} };
    }

    $self->{'maxnumeric'} = 4;
    return $self;
}
138
# Second-stage initialisation; stuff has been moved here from new so that
# subclass methods (generate_index_list, generate_index_options and
# default_buildproc) can be used.
#
# Expands the configured indexes by subcollection and by language, removes
# duplicate index specifications, loads and instantiates the buildproc
# class, and (unless debugging or keeping the old build) recreates an empty
# build directory containing a "text" subdirectory.
#
# Dies if the buildproc module cannot be loaded or its constructor dies.
sub init {
    my $self = shift(@_);

    $self->generate_index_list();
    $self->generate_index_options();

    my $cfg = $self->{'collect_cfg'};

    # sort out subcollection indexes: each index is repeated once per
    # subcollection as "index:subcollection"
    if (defined $cfg->{'indexsubcollections'}) {
        my $indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push(@{$cfg->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes
    if (defined $cfg->{'languages'}) {
        my $indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        foreach my $language (@{$cfg->{'languages'}}) {
            foreach my $index (@$indexes) {
                if (defined($cfg->{'indexsubcollections'})) {
                    push(@{$cfg->{'indexes'}}, "$index:$language");
                }
                else {
                    # add in an empty subcollection field
                    push(@{$cfg->{'indexes'}}, "$index\:\:$language");
                }
            }
        }
    }

    if (defined($cfg->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        # (the first occurrence wins, order is otherwise preserved)
        my %seen = ();
        $cfg->{'indexes'} = [ grep { !$seen{$_}++ } @{$cfg->{'indexes'}} ];
    }
    else {
        $cfg->{'indexes'} = [];
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc named by the subclass
    my ($buildprocdir, $buildproctype);
    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir  = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    }
    elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir  = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    }
    elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir  = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    }
    else {
        $buildprocdir  = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = $self->default_buildproc();
    }
    require "$buildprocdir/$buildproctype.pm";

    # Call the constructor as a class method on the computed class name.
    # This avoids compiling a string at run time and avoids the
    # indirect-object form "new CLASS(...)", whose parsing is ambiguous in
    # a package that defines its own new(). Any exception raised by the
    # constructor propagates to the caller.
    $self->{'buildproc'} = $buildproctype->new(
        $self->{'collection'},
        $self->{'source_dir'},
        $self->{'build_dir'},
        $self->{'keepold'},
        $self->{'verbosity'},
        $self->{'outhandle'});

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

}
223
# Hands the loaded plugins and the buildproc to plugin::deinit.
sub deinit {
    my ($self) = @_;

    my $pluginfo  = $self->{'pluginfo'};
    my $buildproc = $self->{'buildproc'};
    return &plugin::deinit($pluginfo, $buildproc);
}
229
# Forwards the given setting to the buildproc's method of the same name.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    my $buildproc = $self->{'buildproc'};
    return $buildproc->set_sections_index_document_metadata($index);
}
236
# Stores the given value as this builder's 'maxnumeric' setting.
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    return $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and passes the same value
# on to the buildproc.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;

    my $buildproc = $self->{'buildproc'};
    return $buildproc->set_strip_html($strip);
}
250
# Placeholder: the base class only reports on STDERR that subclasses are
# expected to provide this method, and returns nothing.
sub compress_text {
    my ($self, $textindex) = @_;

    print STDERR "compress_text() should be implemented in subclass!!";
    return;
}
258
259
# Builds either the single index named by $indexname (when it contains at
# least one word character) or every index listed in the collection
# configuration. Indexes rejected by want_built() are reported and skipped.
# Finishes by calling the build_indexes_extra() hook.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            if ($self->{'verbosity'} >= 1) {
                print $outhandle "\n*** ignoring index $index\n";
            }
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            my $subdir = $self->{'index_mapping'}->{$index};
            print $outhandle "\n*** building index $index in subdirectory " . "$subdir\n";
        }
        if ($self->{'gli'}) {
            print STDERR "<Stage name='Index' source='$index'>\n";
        }
        $self->build_index($index);
    }

    $self->build_indexes_extra();

}
291
# Hook called at the end of build_indexes(); does nothing in the base class.
sub build_indexes_extra {
    my $self = shift;

}
296
# Placeholder: the base class only reports on STDERR that subclasses are
# expected to provide this method, and returns nothing.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
304
305
306
# Creates the collection's info database and processes associated files.
#
# All documents are run through the buildproc in 'infodb' mode and the
# resulting records are piped into the external txt2db program (or printed
# to STDOUT when the 'debug' flag is set). Classification information and a
# final [browselist] record listing every document are appended.
#
# Dies if txt2db is missing or the pipe to it cannot be opened. A failure
# reported when the pipe is closed produces a warning on the output handle.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    my $reconstructed_docs = undef;
    if ($self->{'keepold'}) {
        # reconstruct doc_obj metadata from gdbm for all docs
        $reconstructed_docs = &classify::reconstruct_doc_objs_metadata($fulldbname);
    }

    # set up the document processor
    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        # The existence check uses the full path, but the command itself is
        # invoked by bare name.
        # NOTE(review): presumably the Greenstone bin directory is on PATH - confirm.
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = *PIPEOUT;
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('infodb');
    $self->{'buildproc'}->set_assocdir ($assocdir);
    $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_store_text(1);

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from GDBM and
    # then adding in the new ones
    $self->{'buildproc'}->zero_reset();

    if ($self->{'keepold'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj ( @$reconstructed_docs ) {
            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $self->{'buildproc'}->process($doc_obj,undef);
        }
    }

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($handle);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    #&classify::print_reverse_lookup($handle);

    # output doclist
    my @doclist = $self->{'buildproc'}->get_doc_list();
    my $docs = join (";",@doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . ($#doclist+1) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";

    if (!$self->{'debug'}) {
        # Closing a piped handle waits for the child process; close returns
        # false if a system call failed ($! set) or if the program exited
        # with a non-zero status ($? set). Report it rather than silently
        # leaving a database that may be incomplete.
        if (!close ($handle)) {
            my ($os_error, $child_status) = ($!, $?);
            my $reason = $os_error ? "$os_error" : "exit status " . ($child_status >> 8);
            print $outhandle "WARNING: txt2db did not finish cleanly ($reason); $fulldbname may be incomplete\n";
        }
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
405
# Assembles the build configuration (build date, build type, index stem,
# document/section/byte counts, index, subcollection and language maps, the
# list of indexes that were not built, maxnumeric) and writes it to
# build.cfg in gs2 mode or buildConfig.xml in gs3 mode.
sub make_auxiliary_files {
    my ($self) = @_;

    # subclasses may have already defined stuff in here
    my $build_cfg = defined $self->{'build_cfg'} ? $self->{'build_cfg'} : {};

    my $outhandle = $self->{'outhandle'};

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating auxiliary files \n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreatingAuxilary'>\n";
    }

    # make sure the build directory exists
    &util::mk_all_dir($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'}   = time();
    $build_cfg->{'buildtype'}   = $self->{'buildtype'};
    $build_cfg->{'indexstem'}   = $self->{'collection'};
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'}     = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'}    = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any
    # that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        next if defined($self->{'notbuilt'}->{$index});
        push(@indexmap, $index . '->' . $self->{'index_mapping'}->{'indexmap'}->{$index});
    }
    if (@indexmap) {
        $build_cfg->{'indexmap'} = \@indexmap;
    }

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push(@subcollectionmap,
             $subcollection . '->' . $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    if (@subcollectionmap) {
        $build_cfg->{'subcollectionmap'} = \@subcollectionmap;
    }

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push(@languagemap,
             $language . '->' . $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    if (@languagemap) {
        $build_cfg->{'languagemap'} = \@languagemap;
    }

    # record the indexes that want_built() rejected
    my @notbuilt = keys %{$self->{'notbuilt'}};
    if (@notbuilt) {
        $build_cfg->{'notbuilt'} = \@notbuilt;
    }

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    # give subclasses the chance to add their own entries
    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg("$self->{'build_dir'}/build.cfg", $build_cfg);
    }
    elsif ($gs_mode eq "gs3") {
        &colcfg::write_build_cfg_xml("$self->{'build_dir'}/buildConfig.xml", $build_cfg, $self->{'collect_cfg'}, $self->{'disable_OAI'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
477
# Empty in the base class.
sub collect_specific {
    my $self = shift;
}
481
# Decides whether an index should be built.
#
# Each entry of the collection's 'dontbuild' list is used as a pattern that
# must match the whole index name. On the first match the index is recorded
# in $self->{'notbuilt'} and 0 is returned; otherwise 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $exclusions = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined $exclusions;

    foreach my $pattern (@$exclusions) {
        next unless $index =~ /^$pattern$/;

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
497
# Placeholder: the base class reports on STDERR that subclasses are expected
# to provide this method, and returns a reference to an empty hash.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
506
# Returns a processed, two character version of a field.
#
# If the field has two or more comma separated components, the result is
# the first letdig (\w) character of each of the first two components.
# Otherwise it is the first two letdig characters of the field.
# A missing first character defaults to 'a' and a missing second character
# to '0', so a field with a single letdig keeps it (e.g. "x" gives "x0").
# Returns the empty string for an undefined or all-whitespace field.
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined($field) && $field =~ /\S/);

    # not named $a/$b: those are the package variables used by sort
    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar(@components) >= 2) {
        # pick the first letdig from the first two field names
        ($first)  = $components[0] =~ /^\W*(\w)/;
        ($second) = $components[1] =~ /^\W*(\w)/;
    }
    else {
        # pick the first two letdig chars; matched separately so that a
        # lone letdig is still used as the first character
        ($first)  = $field =~ /^\W*(\w)/;
        ($second) = $field =~ /^\W*\w\W*(\w)/;
    }

    # there may not have been any letdigs...
    $first  = 'a' unless defined $first;
    $second = '0' unless defined $second;

    return "$first$second";
}
536
# Advances the name referred to by $nameref to its next version, in place.
#
# A name ending in two digits has that pair incremented; a name ending in a
# single digit has it incremented (9 becomes 10); any other name has its
# last character replaced by 0.
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $counter = $1;
        $counter++;
        $$nameref =~ s/\d\d$/$counter/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $replacement = ($1 == 9) ? '10' : $1 + 1;
        $$nameref =~ s/\d$/$replacement/;
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
552
# Hook called by make_auxiliary_files() with the build configuration hash
# just before it is written out. Implement this in a subclass to add extra
# stuff to build.cfg; the base class adds nothing.
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;

}
559
# Writes the [collection] record to $handle. The default is an empty entry:
# the record header followed by a separator line of 70 dashes.
sub output_collection_meta {
    my ($self, $handle) = @_;

    my $separator = '-' x 70;
    print $handle "[collection]\n" . $separator . "\n";
}
568
# Prints a short statistics report for the buildproc's current pass to the
# output handle: which index was created (or compressed from), the total
# bytes in the collection and the bytes processed for this index.
#
# When fewer than 50 bytes were processed and text was expected, a further
# note is printed: for a 'keepold' build only if nothing at all was
# processed, otherwise a prominent warning.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text       = $buildproc->get_indexing_text();
    my $index               = $buildproc->get_index();
    my $num_bytes           = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # text is expected when indexing, or when compressing unless no_text is set
    my $text_expected = ($indexing_text || !$self->{'no_text'});

    if ($num_processed_bytes < 50 && $text_expected) {

        if ($self->{'keepold'}) {
            if ($num_processed_bytes == 0) {
                # inside this branch "not indexing" implies no_text is unset
                my $note = $indexing_text
                    ? "No additional text was added to $index\n"
                    : "No additional text was compressed\n";
                print $outhandle $note;
            }
        }
        else {
            my $banner = "***************\n";
            my $warning = $indexing_text
                ? "WARNING: There is very little or no text to process for $index\n"
                : "WARNING: There is very little or no text to compress\n";

            print $outhandle $banner;
            print $outhandle $warning;
            print $outhandle " Was this your intention?\n";
            print $outhandle $banner;
        }

    }

}
611
612
6131;
614
Note: See TracBrowser for help on using the repository browser.