source: trunk/gsdl/perllib/basebuilder.pm@ 14022

Last change on this file since 14022 was 14022, checked in by xiao, 17 years ago
  1. Changes made to look for collectionConfig.xml in gs3 mode and collect.cfg in gs2 mode, rather than presumably only for the file collect.cfg. 2. Changes made to use cfgread4gs3.pm in gs3 mode and cfgread.pm in gs2 mode.
  • Property svn:keywords set to Author Date Id Revision
File size: 19.1 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
BEGIN {
    # Switch both standard streams to autoflush while the builder is
    # loaded, so that output from mgpp and from the plugins does not
    # get out of sync.
    $_->autoflush(1) foreach (\*STDOUT, \*STDERR);
}

END {
    # Put both standard streams back to buffered output on exit.
    $_->autoflush(0) foreach (\*STDOUT, \*STDERR);
}

# Package-level document size limit.
# NOTE(review): not read anywhere in the visible part of this file --
# confirm which callers/subclasses rely on it.
our $maxdocsize = 12000;

# Records which kind of collection configuration was found by new():
# "gs2" (the default, collect.cfg) or "gs3" (collectionConfig.xml).
my $gs_mode = "gs2";
51
# Construct a builder object for one collection.
#
# Arguments (positional, after the class name):
#   $collection, $source_dir, $build_dir, $verbosity, $maxdocs, $debug,
#   $keepold, $incremental, $incremental_dlc,
#   $remove_empty_classifications, $outhandle, $no_text, $failhandle, $gli
#
# $outhandle and $failhandle default to STDERR; $no_text and $gli default
# to 0.
#
# The collection configuration is read from
# $ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg if that file exists (gs2 mode),
# otherwise from $ENV{'GSDLCOLLECTDIR'}/etc/collectionConfig.xml (gs3
# mode).  The plugins and classifiers named in the configuration are then
# loaded.
#
# Dies if neither configuration file exists, or if no plugins could be
# loaded.  Returns the blessed builder object.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_dlc,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    $outhandle  = STDERR unless defined $outhandle;
    $no_text    = 0 unless defined $no_text;
    $failhandle = STDERR unless defined $failhandle;
    $gli        = 0 unless defined $gli;

    # create a builder object
    my $self = bless {'collection'  => $collection,
                      'source_dir'  => $source_dir,
                      'build_dir'   => $build_dir,
                      'verbosity'   => $verbosity,
                      'maxdocs'     => $maxdocs,
                      'debug'       => $debug,
                      'keepold'     => $keepold,
                      'incremental' => $incremental,
                      'incremental_dlc' => $incremental_dlc,
                      'remove_empty_classifications' => $remove_empty_classifications,
                      'outhandle'   => $outhandle,
                      'no_text'     => $no_text,
                      'failhandle'  => $failhandle,
                      'notbuilt'    => {},    # indexes not built
                      'gli'         => $gli
                     }, $class;

    # read in the collection configuration file; which one is present
    # decides whether we are building in gs2 or gs3 mode
    my $gs2_cfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    my $gs3_cfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collectionConfig.xml";
    if (-e $gs2_cfgname) {
        $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($gs2_cfgname);
        $gs_mode = "gs2";
    }
    elsif (-e $gs3_cfgname) {
        $self->{'collect_cfg'} = &colcfg::read_collection_cfg_xml ($gs3_cfgname);
        $gs_mode = "gs3";
    }
    else {
        die "basebuilder::new - couldn't find collect.cfg or collectionConfig.xml for collection $collection\n";
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'}
        && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, $failhandle, \@global_opts, $keepold);

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields as a lookup set
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    $self->{'maxnumeric'} = 4;
    return $self;
}
139
# Second stage of construction.  This work lives here rather than in new()
# so that methods overridden in subclasses (generate_index_list,
# generate_index_options, default_buildproc) can be used.
#
# Steps:
#   1. let the subclass generate the index list and index options;
#   2. expand the configured indexes by subcollection and by language;
#   3. remove duplicate index specifications (first occurrence wins);
#   4. load the build document processor: a collection-specific
#      <collection>buildproc.pm from $ENV{'GSDLCOLLECTDIR'}/perllib if one
#      exists, otherwise the subclass default from $ENV{'GSDLHOME'}/perllib;
#   5. unless debugging or keeping the old build, wipe and recreate the
#      build directory together with its text subdirectory.
#
# Dies if the buildproc module cannot be loaded or constructed.
sub init {
    my $self = shift(@_);

    $self->generate_index_list();
    $self->generate_index_options();

    # sort out subcollection indexes: every index is repeated once per
    # subcollection as "index:subcollection"
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes
    if (defined $self->{'collect_cfg'}->{'languages'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];

        # when there is no subcollection part, "::" leaves an empty
        # subcollection field in front of the language
        my $separator = defined ($self->{'collect_cfg'}->{'indexsubcollections'}) ? ":" : "::";

        foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, $index . $separator . $language);
            }
        }
    }

    if (defined($self->{'collect_cfg'}->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        my %seen = ();
        $self->{'collect_cfg'}->{'indexes'} =
            [ grep { !$seen{$_}++ } @{$self->{'collect_cfg'}->{'indexes'}} ];
    } else {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc of the subclass
    my ($buildprocdir, $buildproctype);
    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = $self->default_buildproc();
    }
    require "$buildprocdir/$buildproctype.pm";

    # the class name is held in a string, so call the constructor as a
    # class method; no string eval is needed for this
    $self->{'buildproc'} = $buildproctype->new($self->{'collection'},
                                               $self->{'source_dir'},
                                               $self->{'build_dir'},
                                               $self->{'keepold'},
                                               $self->{'verbosity'},
                                               $self->{'outhandle'});

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

}
218
# Counterpart of init(): hands the loaded plugins and the build document
# processor to plugin::deinit.
sub deinit {
    my ($self) = @_;

    my $pluginfo  = $self->{'pluginfo'};
    my $buildproc = $self->{'buildproc'};
    &plugin::deinit($pluginfo, $buildproc);
}
224
# Forward the sections_index_document_metadata setting ($index) to the
# build document processor.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_sections_index_document_metadata($index);
}
231
# Store a new 'maxnumeric' setting on the builder (new() initialises it
# to 4).
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Remember the strip_html setting on the builder and pass it on to the
# build document processor.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_strip_html($strip);
}
245
# Placeholder: text compression has to be provided by a subclass.
# The base implementation only reports the omission on STDERR and
# returns nothing.  $textindex is accepted but not used here.
sub compress_text {
    my ($self, $textindex) = @_;

    # terminate the line, as the other "implement in subclass" stubs do,
    # so that following output does not run on from this message
    print STDERR "compress_text() should be implemented in subclass!!\n";
    return;
}
253
254
# Build the indexes of the collection.
#
# If $indexname contains at least one word character only that index is
# built; otherwise every index listed in the collection configuration is
# considered.  Indexes rejected by want_built() are skipped.  Once the
# loop is finished the build_indexes_extra() hook is called.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $out = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $out "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $out "\n*** building index $index in subdirectory "
                . "$self->{'index_mapping'}->{$index}\n";
        }
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }

    $self->build_indexes_extra();
}
286
# Hook called at the end of build_indexes(); the base class does
# nothing here.
sub build_indexes_extra {
    my $self = shift;
}
291
# Placeholder: building a single index ($index) has to be provided by a
# subclass.  The base implementation only reports this on STDERR and
# returns nothing.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
299
300
301
# Create the collection's info database and process associated files.
#
# All documents are run through the build document processor in 'infodb'
# mode.  Its output, followed by the classifier information and a
# [browselist] entry listing every document, is piped into the txt2db
# program, or printed on STDOUT when the builder is in debug mode.
#
# With 'keepold' set, the metadata of previously built documents is first
# reconstructed from the existing database and fed through the document
# processor before any new documents are read.
#
# Dies if txt2db cannot be found or started.  A txt2db run that does not
# finish cleanly is reported on the output handle as a warning.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name; the extension depends on the byte order of the machine
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    my $reconstructed_docs = undef;
    if ($self->{'keepold'}) {
        # reconstruct doc_obj metadata from gdbm for all docs
        $reconstructed_docs = &classify::reconstruct_doc_objs_metadata($fulldbname);
    }

    # set up the document processor
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        # the existence check uses the full path, but the command itself is
        # started by bare name (NOTE(review): presumably resolved through
        # PATH -- confirm)
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = basebuilder::PIPEOUT;
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('infodb');
    $self->{'buildproc'}->set_assocdir ($assocdir);
    $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_store_text(1);

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from GDBM and
    # then adding in the new ones
    $self->{'buildproc'}->zero_reset();

    if ($self->{'keepold'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj ( @$reconstructed_docs ) {
            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $self->{'buildproc'}->process($doc_obj,undef);
        }
    }

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($handle);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    #&classify::print_reverse_lookup($handle);

    # output doclist
    my @doclist = $self->{'buildproc'}->get_doc_list();
    my $docs = join (";",@doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . ($#doclist+1) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";

    if (!$self->{'debug'}) {
        # Closing a pipe waits for the child process.  close() returns
        # false if the child exited with a non-zero status (left in $?)
        # or if the close itself failed (reason in $!).  Report that
        # instead of silently treating the database as complete.
        if (!close ($handle)) {
            print $outhandle "builder::make_infodatabase - WARNING: txt2db did not finish cleanly "
                . "(exit status $?, error '$!')\n";
        }
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
400
# Collect the build information (build date, build type, document and
# byte counts, index / subcollection / language maps, the list of
# indexes that were not built, maxnumeric) and write it to the build
# directory: build.cfg in gs2 mode, buildConfig.xml in gs3 mode.
# Subclasses can pre-populate $self->{'build_cfg'} and can add further
# entries through the build_cfg_extra() hook.
sub make_auxiliary_files {
    my ($self) = @_;

    # subclasses may have already defined stuff in here
    my $build_cfg = {};
    $build_cfg = $self->{'build_cfg'} if defined $self->{'build_cfg'};

    my $out = $self->{'outhandle'};

    print $out "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date and general build properties
    $build_cfg->{'builddate'}   = time;
    $build_cfg->{'buildtype'}   = $self->{'buildtype'};
    $build_cfg->{'indexstem'}   = $self->{'collection'};
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'}     = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'}    = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any
    # that are not built should not be put into the map.
    my @index_entries = ();
    foreach my $name (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        next if defined ($self->{'notbuilt'}->{$name});
        push @index_entries, $name . "->" . $self->{'index_mapping'}->{'indexmap'}->{$name};
    }
    $build_cfg->{'indexmap'} = \@index_entries;

    my @subcollection_entries = ();
    foreach my $name (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push @subcollection_entries,
            $name . "->" . $self->{'index_mapping'}->{'subcollectionmap'}->{$name};
    }
    if (scalar (@subcollection_entries)) {
        $build_cfg->{'subcollectionmap'} = \@subcollection_entries;
    }

    my @language_entries = ();
    foreach my $name (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push @language_entries,
            $name . "->" . $self->{'index_mapping'}->{'languagemap'}->{$name};
    }
    if (scalar (@language_entries)) {
        $build_cfg->{'languagemap'} = \@language_entries;
    }

    my @skipped = keys %{$self->{'notbuilt'}};
    if (scalar (@skipped)) {
        $build_cfg->{'notbuilt'} = \@skipped;
    }

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $self->build_cfg_extra($build_cfg);

    # the output format follows the kind of configuration new() found
    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg("$self->{'build_dir'}/build.cfg", $build_cfg);
    }
    elsif ($gs_mode eq "gs3") {
        &colcfg::write_build_cfg_xml("$self->{'build_dir'}/buildConfig.xml", $build_cfg, $self->{'collect_cfg'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
472
# Hook for collection-specific processing; the base class does nothing
# here.
sub collect_specific {
    my $self = shift;
}
476
# Decide whether $index should be built.
#
# Each entry of the 'dontbuild' list in the collection configuration is
# used as a regular expression that has to match the whole index name.
# On the first match the index is recorded in $self->{'notbuilt'} and 0
# is returned; if nothing matches (or there is no such list) 1 is
# returned.
sub want_built {
    my ($self, $index) = @_;

    my $patterns = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined $patterns;

    foreach my $pattern (@$patterns) {
        next unless $index =~ /^$pattern$/;

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
492
# Placeholder: the mapping from index descriptions to directory names
# has to be provided by a subclass.  The base implementation reports
# this on STDERR and returns a reference to an empty hash.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
501
# Returns a two-character abbreviation of a field specification.
# Only letter/digit (\w) characters are used.
#  - undefined or all-whitespace field: the empty string
#  - two or more comma-separated components: the first \w character of
#    each of the first two components
#  - otherwise: the first two \w characters of the field
# A character that cannot be found is replaced by 'a' (first position)
# or '0' (second position).
sub process_field {
    my ($self, $field) = @_;

    return "" if !defined ($field) || $field !~ /\S/;

    my ($first, $second);
    my @parts = split /,/, $field;
    if (scalar @parts < 2) {
        # a single component: take its first two letdig chars
        ($first, $second) = $field =~ /^[^\w]*(\w)[^\w]*?(\w)/i;
    } else {
        # several components: one letdig from each of the first two
        ($first)  = $parts[0] =~ /^[^\w]*(\w)/;
        ($second) = $parts[1] =~ /^[^\w]*(\w)/;
    }

    # there may not have been any letdigs...
    $first  = 'a' unless defined $first;
    $second = '0' unless defined $second;

    return $first . $second;
}
531
# Advance the name referenced by $nameref, in place, to its next version:
#  - a name ending in two digits has those two digits incremented
#  - a name ending in one digit has that digit incremented (9 becomes 10)
#  - any other name has its last character replaced by 0
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $next = $1;
        $next++;
        $$nameref =~ s/\d\d$/$next/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $digit = $1;
        if ($digit == 9) {
            $$nameref =~ s/\d$/10/;
        }
        else {
            $digit++;
            $$nameref =~ s/\d$/$digit/;
        }
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
547
# Hook called by make_auxiliary_files() with the build configuration hash
# just before it is written out.  Override this in a subclass to add
# extra entries to build.cfg; the base class adds nothing.
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;
}
554
# Write the [collection] entry to $handle.  By default the entry is
# empty: just the header line followed by the 70-dash record separator.
sub output_collection_meta {
    my ($self, $handle) = @_;

    my $separator = '-' x 70;
    print $handle "[collection]\n" . $separator . "\n";
}
563
# Print statistics for the pass just completed by the build document
# processor: the name of the index, the total number of bytes in the
# collection and the number of bytes processed for this index.
#
# When fewer than 50 bytes were processed although text was expected
# (an index is being built, or text is being compressed and 'no_text' is
# off), a note is added:
#  - keeping the old build: only when nothing at all was processed, a
#    line saying that no additional text was added / compressed
#  - otherwise: a warning box asking whether this was intended
sub print_stats {
    my ($self) = @_;

    my $out       = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text       = $buildproc->get_indexing_text();
    my $index               = $buildproc->get_index();
    my $num_bytes           = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $out $heading;
    print $out "Total bytes in collection: $num_bytes\n";
    print $out "Total bytes in $index: $num_processed_bytes\n";

    # inside this branch either an index is being built or text is being
    # compressed, so every message is a choice between those two cases
    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {

        if (!$self->{'keepold'}) {
            my $warning = $indexing_text
                ? "WARNING: There is very little or no text to process for $index\n"
                : "WARNING: There is very little or no text to compress\n";
            print $out "***************\n";
            print $out $warning;
            print $out " Was this your intention?\n";
            print $out "***************\n";
        }
        elsif ($num_processed_bytes == 0) {
            my $note = $indexing_text
                ? "No additional text was added to $index\n"
                : "No additional text was compressed\n";
            print $out $note;
        }

    }

}
606
607
6081;
609
Note: See TracBrowser for help on using the repository browser.