source: main/trunk/greenstone2/perllib/basebuilder.pm@ 27192

Last change on this file since 27192 was 27192, checked in by davidb, 11 years ago

Extra test added to avoid putting 'undef' into an array of values. Problem originally showed up with 'indexoptions'

  • Property svn:keywords set to Author Date Id Revision
File size: 25.9 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
34use dbutil;
35use plugin;
36use util;
37
38
# Switch autoflush on for STDOUT and STDERR as soon as this module is
# compiled, so that output from mgpp doesn't get out of sync with the
# output written by the plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Switch autoflush back off on both streams when the interpreter exits.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

our $maxdocsize = 12000;

# Which Greenstone generation is being built for: "gs2" (the default) or
# "gs3".  new() switches it to "gs3" when it is handed a non-empty site.
our $gs_mode = "gs2";
55
# Constructor for a builder object.
#
# Positional arguments (after the class name):
#   $site, $collection, $source_dir, $build_dir, $verbosity, $maxdocs,
#   $debug, $keepold, $incremental, $incremental_mode,
#   $remove_empty_classifications, $outhandle, $no_text, $failhandle, $gli
#
# $outhandle and $failhandle fall back to STDERR, $no_text and $gli fall
# back to 0 when they are not supplied.  A defined, non-empty $site puts the
# package-wide $gs_mode into "gs3".
#
# Reads the collection configuration file (twice in gs3 mode, the second
# copy being kept untouched in 'collect_cfg_preserve'), works out the info
# database type, records the 'dontgdbm' fields and returns the blessed
# object.  Plugin/classifier loading is left to init().
sub new {
    my $class = shift(@_);
    my ($site, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # fill in defaults for the optional trailing arguments
    if (!defined $outhandle)  { $outhandle  = *STDERR; }
    if (!defined $no_text)    { $no_text    = 0; }
    if (!defined $failhandle) { $failhandle = *STDERR; }
    if (!defined $gli)        { $gli        = 0; }

    # create a builder object
    my $self = {
        'site'                         => $site,   # will be undef for Greenstone 2
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_mode'             => $incremental_mode,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},      # indexes not built
        'gli'                          => $gli,
    };
    bless $self, $class;

    # A site name means we are building for Greenstone 3.
    if ((defined $site) && ($site ne "")) {
        $gs_mode = "gs3";
    }

    # Read in the collection configuration file.
    my $colcfgname = &colcfg::get_collect_cfg_name($outhandle, $gs_mode);
    $self->{'collect_cfg'} = &colcfg::read_collection_cfg ($colcfgname, $gs_mode);

    if ($gs_mode eq "gs3") {
        # Read it in a second time so that the original form is still
        # available when buildConfig.xml is written out later: the
        # 'classify' entry of 'collect_cfg' gets modified while
        # &classify::load_classifiers runs.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg ($colcfgname, $gs_mode);
    }

    # database type for this collection: taken from the configuration file
    # when it names one, otherwise the dbutil default
    if ($self->{'collect_cfg'}->{'infodbtype'}) {
        $self->{'infodbtype'} = $self->{'collect_cfg'}->{'infodbtype'};
    }
    else {
        $self->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }

    # load up any dontdb fields
    my %dontdb = ();
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        %dontdb = map { $_ => 1 } @{$self->{'collect_cfg'}->{'dontgdbm'}};
    }
    $self->{'dontdb'} = \%dontdb;

    $self->{'maxnumeric'} = 4;
    return $self;
}
117
# Second-stage initialisation.  This lives outside new() so that methods
# overridden in subclasses (generate_index_list, is_incremental_capable,
# default_buildproc, init_for_incremental_build) can be used.
#
# In order, it:
#   - expands the configured index list with subcollection and language
#     parts and removes duplicate entries,
#   - falls back to a non-incremental build when the indexer cannot build
#     incrementally,
#   - loads the plugins (dies if none load) and the classifiers,
#   - requires and instantiates the document processor (a custom buildproc
#     found under $ENV{'GSDLCOLLECTDIR'} if one exists, otherwise the
#     builder's default one),
#   - clears and recreates the build directory unless debugging or keeping
#     the old build.
sub init {
    my $self = shift(@_);

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    $self->generate_index_list();

    my $cfg = $self->{'collect_cfg'};
    my $indexes = $cfg->{'indexes'};
    if (defined $indexes) {
        # sort out subcollection indexes
        if (defined $cfg->{'indexsubcollections'}) {
            $cfg->{'indexes'} = [];
            foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
                foreach my $index (@$indexes) {
                    push (@{$cfg->{'indexes'}}, "$index:$subcollection");
                }
            }
        }

        # sort out language subindexes
        if (defined $cfg->{'languages'}) {
            $indexes = $cfg->{'indexes'};
            $cfg->{'indexes'} = [];
            foreach my $language (@{$cfg->{'languages'}}) {
                foreach my $index (@$indexes) {
                    if (defined ($cfg->{'indexsubcollections'})) {
                        push (@{$cfg->{'indexes'}}, "$index:$language");
                    }
                    else { # add in an empty subcollection field
                        push (@{$cfg->{'indexes'}}, "$index\:\:$language");
                    }
                }
            }
        }
    }

    if (defined($cfg->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        # (the first occurrence of each entry is the one that is kept)
        my %seen = ();
        $cfg->{'indexes'} = [ grep { !$seen{$_}++ } @{$cfg->{'indexes'}} ];
    } else {
        $cfg->{'indexes'} = [];
    }

    # check incremental against whether builder can cope or not.
    if ($self->{'incremental'} && !$self->is_incremental_capable()) {
        print $outhandle "WARNING: The indexer used is not capable of incremental building. Reverting to -removeold\n";
        $self->{'keepold'} = 0;
        $self->{'incremental'} = 0;
        $self->{'incremental_mode'} = "none";
    }

    # gs_version for plugins
    my $gs_version = "2";
    if ($gs_mode eq "gs3") {
        $gs_version = "3";
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $cfg->{'plugin'}) {
        $plugins = $cfg->{'plugin'};
    }

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $cfg->{'separate_cjk'} && $cfg->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $self->{'verbosity'}, $outhandle, $failhandle, \@global_opts, $self->{'incremental_mode'}, $gs_version);

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $cfg->{'classify'}) {
        $classifiers = $cfg->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $self->{'build_dir'}, $outhandle);

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc for the builder we are initialising
    my $buildprocdir = undef;
    my $buildproctype;

    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildproctype = $self->default_buildproc();
    }
    if (defined $buildprocdir) {
        require "$buildprocdir/$buildproctype.pm";
    }
    else {
        require "$buildproctype.pm";
    }

    # Instantiate the document processor through an ordinary class-method
    # call on the class name.  No string eval is needed for this, and any
    # exception raised by the constructor simply propagates to our caller.
    $self->{'buildproc'} = $buildproctype->new($self->{'collection'},
                                               $self->{'source_dir'},
                                               $self->{'build_dir'},
                                               $self->{'keepold'},
                                               $self->{'verbosity'},
                                               $self->{'outhandle'});

    # We call set_infodbtype() now so the buildproc knows the infodbtype for all phases of the build
    $self->{'buildproc'}->set_infodbtype($self->{'infodbtype'});

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

    if ($self->{'incremental'}) {
        # some classes may need to do some additional initialisation
        $self->init_for_incremental_build();
    }

}
264
# Reports whether this builder can build incrementally.
# The base class always answers 0 ("no"): it is safer to assume a builder
# is not incremental and have the inherited classes that are capable
# override this method.
sub is_incremental_capable
{
    my $incremental_capable = 0;
    return $incremental_capable;
}
273
# Hook called from init() when an incremental build is requested.
# Does nothing here; a subclass implements it when it needs additional
# initialisation for an incremental build.
sub init_for_incremental_build {
    my $self = shift;
}
279
# Hands the loaded plugin list and the document processor over to
# plugin::deinit.
sub deinit {
    my ($self) = @_;

    my $pluginfo  = $self->{'pluginfo'};
    my $buildproc = $self->{'buildproc'};
    &plugin::deinit($pluginfo, $buildproc);
}
285
# Works out the separate_cjk setting from the 'indexoptions' list of the
# collection configuration: it is 1 when any option contains the text
# "separate_cjk", otherwise 0.  The result is passed to the buildproc (for
# building) and stored in $self->{'separate_cjk'} (for build.cfg).
sub generate_index_options {
    my ($self) = @_;

    my $option_list = $self->{'collect_cfg'}->{'indexoptions'};

    my $cjk_flag = 0;
    if (defined($option_list)) {
        $cjk_flag = 1 if grep { $_ =~ /separate_cjk/ } @$option_list;
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($cjk_flag);
    # record it for build.cfg
    $self->{'separate_cjk'} = $cjk_flag;
}
305
# Forwards the given setting to the buildproc's method of the same name.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_sections_index_document_metadata($index);
}
312
# Stores the given value in $self->{'maxnumeric'}.
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on this builder and then passes it on to
# the buildproc.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_strip_html($strip);
}
326
# Forwards the given setting to the buildproc's method of the same name.
sub set_store_metadata_coverage {
    my ($self, $store_metadata_coverage) = @_;

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_store_metadata_coverage($store_metadata_coverage);
}
333
# Placeholder: a subclass is expected to provide the real implementation.
# This version only writes a reminder to STDERR and returns nothing.
sub compress_text {
    my ($self, $textindex) = @_;

    print STDERR "compress_text() should be implemented in subclass!!";
    return;
}
341
342
# Builds the indexes of the collection.
#
# When $indexname is given and contains a word character, only that index
# is considered; otherwise every index listed in the collection
# configuration is.  pre_build_indexes() is called first, then the index
# mapping is created, each index for which want_built() is true is handed
# to build_index(), and post_build_indexes() is called last.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes();

    my $indexes;
    if (defined $indexname && $indexname =~ /\w/) {
        $indexes = [ $indexname ];
    }
    else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n";
        }
        if ($self->{'gli'}) {
            print STDERR "<Stage name='Index' source='$index'>\n";
        }
        $self->build_index($index);
    }

    $self->post_build_indexes();

}
376
# Hook that runs before any index is built.  Empty here; a subclass
# implements it when extra work is needed at that point.
sub pre_build_indexes {
    my $self = shift;
    my ($indexname) = @_;    # optional parameter
}
383
# Hook that runs once all the indexes have been built.  Empty here; a
# subclass implements it when extra work is needed at that point.
sub post_build_indexes {
    my $self = shift;
}
389
# Placeholder for building a single index: a subclass is expected to
# provide the real implementation.  This version only writes a reminder to
# STDERR and returns nothing.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
397
398
399
# Creates the collection's info database and processes associated files.
#
# Makes sure the "text" and "assoc" directories exist under the build
# directory, initialises the classifiers, runs every document under the
# source directory through the buildproc in 'infodb' mode, and then writes
# the collection metadata, the classification information and the
# "browselist" entry to the info database.  In debug mode the database
# records go to STDOUT instead.  For an incremental build the existing
# database is read first and its documents are fed back into the
# classifiers.  Dies if the database write handle cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # Get info database file path
    my $infodb_type = $self->{'infodbtype'};
    my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $textdir);

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    my $reconstructed_docs = undef;
    my $database_recs = undef;

    if ($self->{'incremental'}) {
        $database_recs = {};

        &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $database_recs);
    }

    # Important (for memory usage reasons): the filehandle for writing out
    # to the database is obtained here, rather than after
    # $reconstructed_docs has been set up (assuming -incremental is on).
    #
    # Opening a pipe to txt2db [using open()] triggers a fork() followed by
    # exec().  $reconstructed_docs can get very large, so doing the open()
    # after it exists would mean the fork clones the *large* process image,
    # which (admittedly) is then quickly replaced in the execve() with the
    # much smaller image for 'txt2db'.  The trouble is that, for the moment
    # of the fork(), the system really does need to have all that memory
    # available even though it isn't ultimately used.  The result is an out
    # of memory error.

    my $infodb_handle;
    if ($self->{'debug'}) {
        $infodb_handle = *STDOUT;
    }
    else {
        $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
        if (!defined($infodb_handle)) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }

    if ($self->{'incremental'}) {
        # reconstruct doc_obj metadata from database for all docs
        $reconstructed_docs
            = &classify::reconstruct_doc_objs_metadata($infodb_type,
                                                       $infodb_file_path,
                                                       $database_recs);
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($infodb_handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontdb ($self->{'dontdb'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from the database and
    # then adding in the new ones
    $buildproc->zero_reset();

    $buildproc->{'mdprefix_fields'} = {};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $buildproc, $self->{'maxdocs'},0, $self->{'gli'});

    if ($self->{'incremental'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj ( @$reconstructed_docs ) {
            # documents flagged in 'dont_process_reconstructed' are left out
            next if defined $self->{'buildproc'}->{'dont_process_reconstructed'}->{$doc_obj->get_OID()};

            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $self->{'buildproc'}->process($doc_obj,undef);
        }
    }

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $infodb_type, $infodb_handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    ####&classify::print_reverse_lookup($infodb_handle);

    # output doclist
    my @doc_list = $self->{'buildproc'}->get_doc_list();
    my $browselist_infodb = {
        'hastxt'      => [ "0" ],
        'childtype'   => [ "VList" ],
        'numleafdocs' => [ scalar(@doc_list) ],
        'thistype'    => [ "Invisible" ],
        'contains'    => [ join(";", @doc_list) ],
    };
    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", $browselist_infodb);

    # STDOUT (debug mode) is left open
    if (!$self->{'debug'}) {
        &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle);
    }

    if ($infodb_type eq "gdbm-txtgz") {
        # remove a plain gdbm database file if one exists alongside
        my $gdb_infodb_file_path = &dbutil::get_infodb_file_path("gdbm", $self->{'collection'}, $textdir);
        if (-e $gdb_infodb_file_path) {
            &util::rm($gdb_infodb_file_path);
        }
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
527
# Collects the build information and writes it out as build.cfg (gs2) or
# buildConfig.xml (gs3) in the build directory.
#
# Starts from $self->{'build_cfg'} when a subclass has already put values
# there, then adds: build date and type, index stem, stem indexes,
# separate_cjk, document/section/byte counts from the buildproc, the
# index, subcollection and language maps, the list of indexes that were
# not built, maxnumeric, the info database type and the collection's
# earliestDatestamp (read from the "archives" directory, or "export" when
# "archives" does not exist; 0 when the file cannot be read).
# build_cfg_extra() is called just before writing so that subclasses can
# add more.
sub make_auxiliary_files {
    my $self = shift (@_);

    # subclasses may have already defined stuff in here
    my $build_cfg = {};
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    if ($self->{'separate_cjk'}) {
        $build_cfg->{'separate_cjk'} = "true";
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    my $index_mapping = $self->{'index_mapping'};

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$index_mapping->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, $index . "->" . $index_mapping->{'indexmap'}->{$index});
        }
    }

    # store the number of indexes built to later determine whether search serviceracks get written out to buildConfig.xml
    $build_cfg->{'num_indexes'} = scalar (@indexmap);

    $build_cfg->{'indexmap'} = \@indexmap if scalar (@indexmap);

    my @subcollectionmap = ();
    foreach my $subcollection (@{$index_mapping->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $index_mapping->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$index_mapping->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $index_mapping->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    # write out the earliestDatestamp information needed for OAI
    my $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
    if(!-d $archivedir) {
        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
    }
    my $earliestDatestampFile = &util::filename_cat ($archivedir, "earliestDatestamp");
    my $earliestDatestamp = 0;
    # Three-argument open with a lexical handle: the path is never
    # interpreted as part of the open mode, and no package-wide bareword
    # handle is left behind.
    if (open(my $datestamp_fh, '<', $earliestDatestampFile)) {
        {
            # slurp in file as a single line
            local $/ = undef;
            $earliestDatestamp = <$datestamp_fh>;
            #&unicode::ensure_utf8(\$earliestDatestamp); # turn any high bytes that aren't valid utf-8 into utf-8.
        }
        close($datestamp_fh);
    }
    else {
        print $outhandle "Warning: unable to read collection's earliestDatestamp from $earliestDatestampFile.\n";
        print $outhandle "Setting value to 0.\n";
    }
    $build_cfg->{'earliestdatestamp'} = $earliestDatestamp;

    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg(&util::filename_cat($self->{'build_dir'},"build.cfg"), $build_cfg);
    }
    if ($gs_mode eq "gs3") {

        &colcfg::write_build_cfg_xml(&util::filename_cat($self->{'build_dir'}, "buildConfig.xml"), $build_cfg, $self->{'collect_cfg_preserve'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
631
# Hook called by make_auxiliary_files() with the build configuration hash
# just before it is written out.  Empty here; a subclass implements it
# when it wants to add extra entries to build.cfg.
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;

}
638
639
# Empty in the base class: takes the object and does nothing with it.
sub collect_specific {
    my $self = shift;
}
643
# Decides whether the given index should be built.
#
# Each entry of the 'dontbuild' list in the collection configuration is
# used as a pattern that must match the whole index name.  On the first
# match the index is recorded in $self->{'notbuilt'} and 0 is returned;
# when nothing matches (or there is no 'dontbuild' list) 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
659
# Placeholder: a subclass is expected to build the mapping between index
# descriptions and directory names.  This version writes a reminder to
# STDERR and returns a reference to an empty hash.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
668
# Returns a two-character code for a field description.
#
# - An undefined field, or one with no non-whitespace character, gives "".
# - When the field has two or more comma-separated components, the code is
#   the first letdig (\w) character of each of the first two components.
# - Otherwise the code is the first two letdig characters of the field.
# - A character that cannot be found is replaced by 'a' (first position)
#   or '0' (second position).
# - A code made of two digits is turned into "a" followed by the first
#   digit, because the Greenstone runtime doesn't like digit-only codes.
sub process_field {
    my ($self, $field) = @_;

    if (!defined ($field) || $field !~ /\S/) {
        return "";
    }

    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # pick the first letdig from the first two field names
        ($first)  = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    }
    else {
        # pick the first two letdig chars
        ($first, $second) = $field =~ /^[^\w]*(\w)[^\w]*?(\w)/i;
    }

    # there may not have been any letdigs...
    $first  = 'a' unless defined $first;
    $second = '0' unless defined $second;

    my $code = $first . $second;
    # digits only - Greenstone runtime doesn't like this.
    $code = "a" . $first if ($code =~ /^\d\d$/);

    return $code;

}
703
# Advances the name referenced by $nameref to its next version, in place.
#
# - A name ending in two digits has those two digits replaced by their
#   incremented value.
# - A name ending in a single digit has that digit incremented, with 9
#   becoming 10.
# - Any other name has its last character replaced by 0.
sub get_next_version {
    my ($self, $nameref) = @_;

    my $num = 0;
    if ($$nameref =~ /(\d\d)$/) {
        $num = $1;
        $num++;
        $$nameref =~ s/\d\d$/$num/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        $num = $1;
        if ($num == 9) {
            $$nameref =~ s/\d$/10/;
        }
        else {
            $num++;
            $$nameref =~ s/\d$/$num/;
        }
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
719
720
721
# Adds the metadata sets recorded by the buildproc to $collection_infodb.
#
# For every prefix in the buildproc's 'mdprefix_fields' hash the prefix is
# appended to the "metadataset" list; for every field under that prefix the
# field name is appended to "metadatalist-<prefix>" and its stored value to
# "metadatafreq-<prefix>-<field>".
sub get_collection_meta_sets
{
    my ($self, $collection_infodb) = @_;

    my $fields_by_prefix = $self->{'buildproc'}->{'mdprefix_fields'};
    foreach my $prefix (keys %$fields_by_prefix)
    {
        push(@{$collection_infodb->{"metadataset"}}, $prefix);

        my $list_key = "metadatalist-$prefix";
        foreach my $field (keys %{$fields_by_prefix->{$prefix}})
        {
            push(@{$collection_infodb->{$list_key}}, $field);

            my $freq_key = "metadatafreq-$prefix-$field";
            push(@{$collection_infodb->{$freq_key}}, $fields_by_prefix->{$prefix}->{$field});
        }
    }
}
741
742
# Writes the "collection" entry to the info database.  By default the
# entry holds the metadata sets (prefixes) used in the collection, as
# gathered by get_collection_meta_sets().
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);

    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
753
# Reads an existing build configuration file - needed, for example, when
# each stage of building is run separately, or for incremental building.
#
# The file is "build.cfg" in gs2 mode and "buildConfig.xml" otherwise.  It
# is looked for in the build directory first and then in the "index"
# directory under $ENV{'GSDLCOLLECTDIR'}.  Returns whatever
# colcfg::read_building_cfg returns for the file, or undef when the file
# exists in neither place.
sub read_build_cfg {
    my ($self) = @_;

    my $buildconfigfilename = ($gs_mode eq "gs2") ? "build.cfg" : "buildConfig.xml";

    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, $buildconfigfilename);

    if (!-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", $buildconfigfilename);

        # we can't find a config file - just ignore the field list
        return undef if (!-e $buildconfigfile);
    }

    return &colcfg::read_building_cfg( $buildconfigfile, $gs_mode);

}
781
# Prints the statistics of the buildproc's last pass to the output handle:
# which index was created (or compressed from), the total bytes in the
# collection and the bytes processed for the index.
#
# When fewer than 50 bytes were processed (and text was either being
# indexed or not suppressed by 'no_text') an extra note follows: for an
# incremental build a short "nothing added" message, shown only if exactly
# zero bytes were processed; otherwise a starred warning banner.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text       = $buildproc->get_indexing_text();
    my $index               = $buildproc->get_index();
    my $num_bytes           = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {

        if (!$self->{'incremental'}) {
            print $outhandle "***************\n";
            if ($indexing_text) {
                print $outhandle "WARNING: There is very little or no text to process for $index\n";
            } elsif (!$self->{'no_text'}) {
                print $outhandle "WARNING: There is very little or no text to compress\n";
            }
            print $outhandle " Was this your intention?\n";
            print $outhandle "***************\n";
        }
        elsif ($num_processed_bytes == 0) {
            if ($indexing_text) {
                print $outhandle "No additional text was added to $index\n";
            } elsif (!$self->{'no_text'}) {
                print $outhandle "No additional text was compressed\n";
            }
        }

    }

}
824
825
8261;
827
Note: See TracBrowser for help on using the repository browser.