source: trunk/gsdl/perllib/basebuilder.pm@ 14112

Last change on this file since 14112 was 14112, checked in by sjboddie, 17 years ago

More modifications to support additional collection-level customisations
to be put in gsdl/collect/COLLECTION/custom/COLLECTION. basebuilder.pm,
classify.pm, colcfg.pm, and plugin.pm were modified to allow
collection-specific plugins, classifiers, builders, and buildprocs to
be located in the new locations. These changes should not have any effect
on existing collections.

  • Property svn:keywords set to Author Date Id Revision
File size: 19.3 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
34use plugin;
35use util;
36use FileHandle;
37
BEGIN {
    # Switch autoflush on for both standard streams so that output from
    # mgpp doesn't get out of sync with output from the plugins.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

END {
    # Switch autoflush back off again when the program finishes.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

our $maxdocsize = 12000;

# Build mode shared by every builder in this file: "gs2" (the default)
# or "gs3".  It is overwritten in new() once the collection
# configuration file has been located.
my $gs_mode = "gs2";
54
# Constructor.  Stores the build settings, reads the collection
# configuration file and loads the plugins and classifiers named in it.
# $outhandle and $failhandle default to STDERR, $no_text and $gli to 0.
# Dies (after reporting on $outhandle) if no plugins could be loaded.
sub new {
    my $class = shift(@_);
    my ($collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_dlc,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # defaults for the optional trailing arguments
    $outhandle  = *STDERR unless defined $outhandle;
    $failhandle = *STDERR unless defined $failhandle;
    $no_text    = 0 unless defined $no_text;
    $gli        = 0 unless defined $gli;

    # create a builder object
    my %fields = (
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_dlc'              => $incremental_dlc,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},   # indexes not built
        'gli'                          => $gli,
    );
    my $self = bless \%fields, $class;

    # Read in the collection configuration file.  The lookup also tells
    # us which mode ("gs2" or "gs3") we are in, which selects the reader.
    my ($colcfgname);
    ($colcfgname, $gs_mode) = &colcfg::get_collect_cfg_name($outhandle);
    my %cfg_reader = (
        'gs2' => \&colcfg::read_collect_cfg,
        'gs3' => \&colcfg::read_collection_cfg_xml,
    );
    if (exists $cfg_reader{$gs_mode}) {
        $self->{'collect_cfg'} = $cfg_reader{$gs_mode}->($colcfgname);
    }

    # get the list of plugins for this collection
    my $plugins = defined $self->{'collect_cfg'}->{'plugin'}
        ? $self->{'collect_cfg'}->{'plugin'}
        : [];

    # build up the extra global options for the plugins
    my @global_opts = ();
    my $separate_cjk = $self->{'collect_cfg'}->{'separate_cjk'};
    push(@global_opts, "-separate_cjk")
        if (defined $separate_cjk && $separate_cjk =~ /^true$/i);

    # load all the plugins; a build without any plugins is pointless
    $self->{'pluginfo'} = &plugin::load_plugins($plugins, $verbosity, $outhandle,
                                                $failhandle, \@global_opts, $keepold);
    unless (scalar(@{$self->{'pluginfo'}})) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection, then load them
    my $classifiers = defined $self->{'collect_cfg'}->{'classify'}
        ? $self->{'collect_cfg'}->{'classify'}
        : [];
    $self->{'classifiers'} = &classify::load_classifiers($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields as a lookup table
    $self->{'dontgdbm'} = {};
    if (defined($self->{'collect_cfg'}->{'dontgdbm'})) {
        $self->{'dontgdbm'}->{$_} = 1 foreach (@{$self->{'collect_cfg'}->{'dontgdbm'}});
    }

    $self->{'maxnumeric'} = 4;

    return $self;
}
134
# Second stage of construction.  This work was moved out of new() so that
# it can call methods supplied by a subclass: generate_index_list(),
# generate_index_options() and default_buildproc() are not defined in this
# file and must be provided by the concrete builder.
#
# Steps:
#   1. let the subclass generate its index list and index options
#   2. expand the configured indexes by subcollection and by language,
#      then drop duplicates (keeping first-seen order)
#   3. locate, load and instantiate the buildproc class
#   4. unless debugging or keeping the old build, wipe and recreate the
#      build directory together with its text subdirectory
sub init {
    my $self = shift(@_);

    $self->generate_index_list();
    $self->generate_index_options();

    # sort out subcollection indexes
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes
    if (defined $self->{'collect_cfg'}->{'languages'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
            foreach my $index (@$indexes) {
                if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
                    # the index already carries its subcollection part
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
                }
                else { # add in an empty subcollection field
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
                }
            }
        }
    }

    if (defined($self->{'collect_cfg'}->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        my %seen = ();
        my @configured = @{$self->{'collect_cfg'}->{'indexes'}};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $i (@configured) {
            if (!defined ($seen{$i})) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, $i);
                $seen{$i} = 1;
            }
        }
    } else {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc named by the subclass.
    # Search order: collection custom dir, collection perllib
    # (custombuildproc, then <collection>buildproc), then GSDLHOME.
    my ($buildprocdir, $buildproctype);
    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = $self->default_buildproc();
    }
    require "$buildprocdir/$buildproctype.pm";

    # Call the constructor as an ordinary class method on the class name.
    # This avoids a string eval and the indirect-object form
    # "new CLASS(...)", whose parsing depends on which subs and packages
    # perl has already seen.  Any exception propagates to the caller.
    $self->{'buildproc'} = $buildproctype->new($self->{'collection'},
                                               $self->{'source_dir'},
                                               $self->{'build_dir'},
                                               $self->{'keepold'},
                                               $self->{'verbosity'},
                                               $self->{'outhandle'});

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

}
219
# Hands the loaded plugins and the buildproc to plugin::deinit so the
# plugin layer can finish up.
sub deinit {
    my ($self) = @_;

    return &plugin::deinit($self->{'pluginfo'}, $self->{'buildproc'});
}
225
# Forwards the sections_index_document_metadata setting to the buildproc.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    return $self->{'buildproc'}->set_sections_index_document_metadata($index);
}
232
# Records the maxnumeric setting on the builder; it is written into the
# build configuration by make_auxiliary_files.
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Remembers the strip_html flag on the builder and passes it on to the
# buildproc as well.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    return $self->{'buildproc'}->set_strip_html($strip);
}
246
# Placeholder: text compression depends on the indexer, so a subclass is
# expected to override this.  The base version only reports on STDERR that
# it was reached and returns nothing.
#   $textindex - accepted for interface compatibility; unused here
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    # newline-terminated like the other "implement in subclass" stubs,
    # so the message does not run into whatever is printed next
    print STDERR "compress_text() should be implemented in subclass!!\n";
    return;
}
254
255
# Builds either the single index named by $indexname (when it contains at
# least one word character) or every index listed in the collection
# configuration.  Indexes rejected by want_built() are skipped.  Once all
# indexes are done build_indexes_extra() is called.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n";
        }
        # stage marker on STDERR, only emitted when the gli flag is set
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }

    $self->build_indexes_extra();

}
287
# Hook called at the end of build_indexes().  The base class has nothing
# extra to do; override it in a subclass when more work is needed.
sub build_indexes_extra {
    my $self = shift;

}
292
# Placeholder for building one index.  A subclass is expected to override
# this; the base version only reports on STDERR that it was reached and
# returns nothing.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
300
301
302
# Creates the collection's info database and processes associated files.
#
# Every document is run through the buildproc in 'infodb' mode and the
# output is piped into the txt2db program, which writes
# <build_dir>/text/<collection>.ldb (or .bdb on big-endian machines).
# In debug mode the records go to STDOUT instead and txt2db is not run.
# After the documents come the classifier information and a [browselist]
# record listing every document.
#
# Dies if txt2db is missing or cannot be started.  A txt2db that starts
# but does not finish cleanly is reported as a warning on the output
# handle; the build carries on.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    my $reconstructed_docs = undef;
    if ($self->{'keepold'}) {
        # reconstruct doc_obj metadata from gdbm for all docs
        $reconstructed_docs = &classify::reconstruct_doc_objs_metadata($fulldbname);
    }

    # set up the document processor
    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        # A lexical handle keeps the pipe private to this call rather than
        # living in a package-global bareword.  The existence test uses the
        # full path, but the command itself is still found via PATH, as before.
        my $pipe;
        if (!-e $txt2db_exe || !open ($pipe, '|-', "txt2db$exe \"$fulldbname\"")) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = $pipe;
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('infodb');
    $self->{'buildproc'}->set_assocdir ($assocdir);
    $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_store_text(1);

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from GDBM and
    # then adding in the new ones
    $self->{'buildproc'}->zero_reset();

    if ($self->{'keepold'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj ( @$reconstructed_docs ) {
            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $self->{'buildproc'}->process($doc_obj,undef);
        }
    }

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($handle);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    #&classify::print_reverse_lookup($handle);

    # output doclist
    my @doclist = $self->{'buildproc'}->get_doc_list();
    my $docs = join (";",@doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . scalar(@doclist) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";

    if (!$self->{'debug'}) {
        # Closing a pipe waits for the child process.  A false result
        # means a write error ($! set) or a non-zero exit ($! is 0 and
        # the status is in $?).  Report it instead of silently leaving a
        # possibly incomplete database behind.
        if (!close ($handle)) {
            my $reason = $! ? "$!" : "exit status " . ($? >> 8);
            print $outhandle "WARNING: txt2db did not finish cleanly ($reason); " .
                "the info database may be incomplete\n";
        }
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
401
# Collects the build information (date, type, document counts, the index,
# subcollection and language maps, the indexes that were not built,
# maxnumeric) and writes it out as build.cfg in gs2 mode or
# buildConfig.xml in gs3 mode.
sub make_auxiliary_files {
    my $self = shift;
    my $outhandle = $self->{'outhandle'};

    # subclasses may have already defined stuff in here
    my $build_cfg = defined $self->{'build_cfg'} ? $self->{'build_cfg'} : {};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir($self->{'build_dir'});

    # store the build date and the general build settings
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = $self->{'collection'};
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any
    # that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        next if defined($self->{'notbuilt'}->{$index});
        push(@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # the subcollection and language maps are built the same way and are
    # only stored when they have at least one entry
    foreach my $kind ('subcollection', 'language') {
        my @entries = ();
        foreach my $name (@{$self->{'index_mapping'}->{"${kind}maporder"}}) {
            push(@entries, "$name\-\>" .
                 $self->{'index_mapping'}->{"${kind}map"}->{$name});
        }
        $build_cfg->{"${kind}map"} = \@entries if scalar(@entries);
    }

    # list the indexes that were skipped, if any
    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar(@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    # let a subclass add its own entries
    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg("$self->{'build_dir'}/build.cfg", $build_cfg);
    }
    elsif ($gs_mode eq "gs3") {
        &colcfg::write_build_cfg_xml("$self->{'build_dir'}/buildConfig.xml", $build_cfg, $self->{'collect_cfg'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
473
# Hook for collection-specific work.  The base class does nothing here.
sub collect_specific {
    my $self = shift;
}
477
# Decides whether $index should be built.  Each entry of the collection's
# 'dontbuild' list is used as a regular expression that must match the
# whole index name.  On the first match the index is recorded in
# $self->{'notbuilt'} and 0 is returned; otherwise 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined($dontbuild);

    foreach my $checkstr (@$dontbuild) {
        next unless $index =~ /^$checkstr$/;

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
493
# Placeholder for creating the index-description-to-directory mapping.  A
# subclass is expected to override this; the base version reports on
# STDERR that it was reached and returns a reference to an empty hash.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
502
# Returns a two character abbreviation of a field specification.
#  - an undefined or all-whitespace field gives the empty string
#  - if the field has two or more comma separated components, the result
#    is the first letdig (\w) character of each of the first two components
#  - otherwise it is the first two letdig characters of the field
# A missing first character defaults to 'a' and a missing second one to
# '0', so a field with a single letdig character such as "x" gives "x0".
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\S/);

    # not called $a and $b: those names belong to sort
    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # pick the first letdig from the first two field names
        ($first) = $components[0] =~ /^\W*(\w)/;
        ($second) = $components[1] =~ /^\W*(\w)/;
    } else {
        # pick the first two letdig chars; the second one is optional so
        # that a lone letdig is still kept as the first character
        ($first, $second) = $field =~ /^\W*(\w)(?:\W*(\w))?/;
    }
    # there may not have been any letdigs...
    $first = 'a' unless defined $first;
    $second = '0' unless defined $second;

    return "$first$second";

}
532
# Moves the name referenced by $nameref on to its next version, in place:
#  - a name ending in two digits has that number incremented
#  - a name ending in one digit has it incremented (9 becomes 10)
#  - any other name has its last character replaced by 0
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        # ++ on the captured string, as opposed to numeric addition
        my $next = $1;
        $next++;
        $$nameref =~ s/\d\d$/$next/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $next = ($1 == 9) ? 10 : $1 + 1;
        $$nameref =~ s/\d$/$next/;
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
548
# Hook called by make_auxiliary_files() just before the build
# configuration is written.  Override it in a subclass to add extra
# entries to $build_cfg; the base class adds nothing.
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;

}
555
# Writes the collection-level record to $handle.  By default that is an
# empty [collection] entry followed by the 70-dash record separator.
sub output_collection_meta {
    my ($self, $handle) = @_;

    my $separator = '-' x 70;
    print $handle "[collection]\n$separator\n";

}
564
# Prints a short summary of the pass the buildproc has just finished
# (index creation or text compression) to the output handle.  When fewer
# than 50 bytes were processed it adds a note for incremental (keepold)
# builds that processed nothing, or a warning for full builds.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # only comment on a (nearly) empty pass when text was expected
    my $very_little = ($num_processed_bytes < 50
                       && ($indexing_text || !$self->{'no_text'}));

    if ($very_little && $self->{'keepold'}) {
        # incremental build: having nothing new to add is quite normal
        if ($num_processed_bytes == 0) {
            if ($indexing_text) {
                print $outhandle "No additional text was added to $index\n";
            } elsif (!$self->{'no_text'}) {
                print $outhandle "No additional text was compressed\n";
            }
        }
    }
    elsif ($very_little) {
        print $outhandle "***************\n";
        if ($indexing_text) {
            print $outhandle "WARNING: There is very little or no text to process for $index\n";
        } elsif (!$self->{'no_text'}) {
            print $outhandle "WARNING: There is very little or no text to compress\n";
        }
        print $outhandle " Was this your intention?\n";
        print $outhandle "***************\n";
    }

}
607
608
6091;
610
Note: See TracBrowser for help on using the repository browser.