source: main/trunk/greenstone2/perllib/basebuilder.pm@ 27192

Last change on this file since 27192 was 27192, checked in by davidb, 8 years ago

Extra test added to avoid putting 'undef' into an array of values. Problem originally showed up with 'indexoptions'

  • Property svn:keywords set to Author Date Id Revision
File size: 25.9 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
34use dbutil;
35use plugin;
36use util;
37
38
BEGIN {
    # Switch autoflush on for both standard streams so that output from
    # mgpp and output from the plugins do not get out of sync.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

END {
    # Put both standard streams back into buffered mode on exit.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

our $maxdocsize = 12000;

# Signifies which Greenstone flavour is in use: "gs2" (default) or "gs3"
our $gs_mode = "gs2";
55
sub new {
    # Constructor.  Stores the build parameters on the object, reads the
    # collection configuration file, works out the info database type and
    # records which metadata fields must be kept out of the database.
    my $class = shift(@_);
    my ($site, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # optional arguments fall back to these defaults
    if (!defined $outhandle)  { $outhandle  = *STDERR; }
    if (!defined $no_text)    { $no_text    = 0; }
    if (!defined $failhandle) { $failhandle = *STDERR; }
    if (!defined $gli)        { $gli        = 0; }

    # create a builder object
    my %fields = (
        'site'                         => $site, # will be undef for Greenstone 2
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_mode'             => $incremental_mode,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {}, # indexes not built
        'gli'                          => $gli,
    );
    my $self = bless \%fields, $class;

    # A non-empty site name means we are building for Greenstone 3.
    $gs_mode = "gs3" if ((defined $site) && ($site ne ""));

    # Read in the collection configuration file.
    my $colcfgname = &colcfg::get_collect_cfg_name($outhandle, $gs_mode);
    $self->{'collect_cfg'} = &colcfg::read_collection_cfg ($colcfgname, $gs_mode);

    if ($gs_mode eq "gs3") {
        # Read it in a second time to keep an untouched copy for writing
        # out buildConfig.xml later: the 'classify' entry of the working
        # copy gets modified when &classify::load_classifiers is called.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg ($colcfgname, $gs_mode);
    }

    # database type for this collection comes from the config file when it
    # is set there, otherwise the dbutil default is used
    $self->{'infodbtype'} = $self->{'collect_cfg'}->{'infodbtype'} || &dbutil::get_default_infodb_type();

    # load up any dontdb fields
    my %dontdb = ();
    my $dontgdbm = $self->{'collect_cfg'}->{'dontgdbm'};
    if (defined ($dontgdbm)) {
        $dontdb{$_} = 1 foreach (@$dontgdbm);
    }
    $self->{'dontdb'} = \%dontdb;

    $self->{'maxnumeric'} = 4;
    return $self;
}
117
118# stuff has been moved here from new, so we can use subclass methods
sub init {
    # Second-stage initialisation, kept separate from new() so that
    # methods overridden in subclasses can be used.
    #
    # Steps, in order:
    #   1. expand the configured indexes with subcollection and language
    #      parts and drop duplicates
    #   2. fall back to a non-incremental build if the indexer cannot cope
    #   3. load the plugins and classifiers named in the collection config
    #   4. locate, load and instantiate the buildproc class
    #   5. clear out the old build directory unless debugging or keeping it
    #
    # Dies if no plugins could be loaded or the buildproc cannot be created.
    my $self = shift(@_);

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    $self->generate_index_list();
    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    if (defined $indexes) {
        # sort out subcollection indexes
        if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
                foreach my $index (@$indexes) {
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
                }
            }
        }

        # sort out language subindexes
        if (defined $self->{'collect_cfg'}->{'languages'}) {
            $indexes = $self->{'collect_cfg'}->{'indexes'};
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
                foreach my $index (@$indexes) {
                    if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
                    }
                    else { # add in an empty subcollection field
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
                    }
                }
            }
        }
    }

    if (defined($self->{'collect_cfg'}->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        # (first occurrence wins, original order is kept)
        my %seen = ();
        my @unique = grep { !$seen{$_}++ } @{$self->{'collect_cfg'}->{'indexes'}};
        $self->{'collect_cfg'}->{'indexes'} = \@unique;
    } else {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }

    # check incremental against whether builder can cope or not.
    if ($self->{'incremental'} && !$self->is_incremental_capable()) {
        print $outhandle "WARNING: The indexer used is not capable of incremental building. Reverting to -removeold\n";
        $self->{'keepold'} = 0;
        $self->{'incremental'} = 0;
        $self->{'incremental_mode'} = "none";
    }

    # gs_version for plugins
    my $gs_version = ($gs_mode eq "gs3") ? "3" : "2";

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $self->{'verbosity'}, $outhandle, $failhandle, \@global_opts, $self->{'incremental_mode'}, $gs_version);

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $self->{'build_dir'}, $outhandle);

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc for the builder we are initialising
    my $buildprocdir = undef;
    my $buildproctype;

    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildproctype = $self->default_buildproc();
    }
    if (defined $buildprocdir) {
        require "$buildprocdir/$buildproctype.pm";
    }
    else {
        require "$buildproctype.pm";
    }

    # Instantiate the buildproc through an ordinary class-method call on
    # the class name.  This avoids compiling a code string (which fails
    # when the class name is not a bare identifier) and the ambiguous
    # indirect-object "new Class(...)" syntax.  Any failure is re-thrown
    # as a string, as before.
    my $buildproc = eval {
        $buildproctype->new($self->{'collection'},
                            $self->{'source_dir'}, $self->{'build_dir'},
                            $self->{'keepold'}, $self->{'verbosity'},
                            $self->{'outhandle'});
    };
    die "$@" if $@;
    $self->{'buildproc'} = $buildproc;

    # We call set_infodbtype() now so the buildproc knows the infodbtype for all phases of the build
    $self->{'buildproc'}->set_infodbtype($self->{'infodbtype'});

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

    if ($self->{'incremental'}) {
        # some classes may need to do some additional initialisation
        $self->init_for_incremental_build();
    }

}
264
sub is_incremental_capable
{
    # The base class answers 'no'.  It is safer to assume the indexer
    # cannot build incrementally; subclasses whose indexer can should
    # override this method.
    my $capable = 0;

    return $capable;
}
273
274# implement this in subclass if want to do additional initialisation for an
275# incremental build
sub init_for_incremental_build {
    # Hook called by init() when an incremental build is requested.
    # Nothing to do in the base class.
    my $self = shift;
}
279
sub deinit {
    # Hands the loaded plugins and the buildproc to plugin::deinit.
    my ($self) = @_;

    my $pluginfo = $self->{'pluginfo'};
    my $buildproc = $self->{'buildproc'};

    &plugin::deinit($pluginfo, $buildproc);
}
285
sub generate_index_options {
    # Looks through the collection's 'indexoptions' for anything that
    # mentions separate_cjk, passes the resulting flag (1 or 0) to the
    # buildproc and records it on the builder for build.cfg.
    my ($self) = @_;

    my $indexoptions = $self->{'collect_cfg'}->{'indexoptions'};

    my $separate_cjk = 0;
    if (defined($indexoptions)) {
        $separate_cjk = 1 if (grep { $_ =~ /separate_cjk/ } @$indexoptions);
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
    # record it for build.cfg
    $self->{'separate_cjk'} = $separate_cjk;
}
305
sub set_sections_index_document_metadata {
    # Forwards the given setting straight to the buildproc.
    my ($self, $index) = @_;

    $self->{'buildproc'}->set_sections_index_document_metadata($index);
}
312
sub set_maxnumeric {
    # Stores the maxnumeric value on the builder; it is written to the
    # build configuration later on.
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
sub set_strip_html {
    # Remembers the strip_html setting on the builder and passes it on
    # to the buildproc.
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html($strip);
}
326
sub set_store_metadata_coverage {
    # Forwards the given setting straight to the buildproc.
    my ($self, $store_metadata_coverage) = @_;

    $self->{'buildproc'}->set_store_metadata_coverage($store_metadata_coverage);
}
333
sub compress_text {
    # Placeholder: text compression depends on the indexer, so subclasses
    # are expected to override this.  The base version only reports that
    # it was reached.
    #
    # $textindex - the text index to compress (unused here)
    my $self = shift (@_);
    my ($textindex) = @_;

    # Newline-terminated like the other "implement in subclass" messages
    # in this file, so it does not run into whatever is printed next.
    print STDERR "compress_text() should be implemented in subclass!!\n";
    return;
}
341
342
sub build_indexes {
    # Builds either the single index named by $indexname (when it
    # contains at least one word character) or every index listed in the
    # collection configuration.  Indexes rejected by want_built() are
    # skipped.
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes();

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n";
        }
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }

    $self->post_build_indexes();

}
376
377# implement this in a subclass if you want to do extra work before building
378# all the indexes
sub pre_build_indexes {
    # Hook run by build_indexes() before any index is built.
    # Nothing to do in the base class.
    my $self = shift;
    my ($indexname) = @_; # optional parameter
}
383
384# implement this in subclass if want to do extra stuff at the end of building
385# all the indexes
sub post_build_indexes {
    # Hook run by build_indexes() after every index has been handled.
    # Nothing to do in the base class.
    my $self = shift;
}
389
sub build_index {
    # Placeholder: building one index depends on the indexer, so
    # subclasses are expected to override this.  The base version only
    # reports that it was reached.
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
397
398
399
sub make_infodatabase {
    # Creates the collection's info database.  Every document is run
    # through the buildproc in 'infodb' mode; collection metadata,
    # classifier information and the "browselist" entry are written
    # afterwards.  For incremental builds the existing database is read
    # first and its documents are fed back into the classifiers.
    #
    # Dies if the database write handle cannot be opened.
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # Get info database file path
    my $infodb_type = $self->{'infodbtype'};
    my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $textdir);

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    my $reconstructed_docs = undef;
    my $database_recs = undef;

    if ($self->{'incremental'}) {
        $database_recs = {};

        &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $database_recs);
    }

    # Memory usage note: the write handle must be obtained HERE, before
    # $reconstructed_docs is populated (when -incremental is on).
    #
    # Opening a pipe to txt2db with open() triggers a fork() followed by
    # an exec().  $reconstructed_docs can get very large, so opening the
    # pipe afterwards would make fork() clone the *large* process image.
    # That clone is quickly replaced by the much smaller txt2db image,
    # but for that moment the system has to have all of the memory
    # available, and the result is an out of memory error.
    my $infodb_handle;
    if ($self->{'debug'}) {
        $infodb_handle = *STDOUT;
    }
    else {
        $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
        unless (defined($infodb_handle)) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }

    if ($self->{'incremental'}) {
        # reconstruct doc_obj metadata from database for all docs
        $reconstructed_docs = &classify::reconstruct_doc_objs_metadata($infodb_type,
                                                                       $infodb_file_path,
                                                                       $database_recs);
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($infodb_handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontdb ($self->{'dontdb'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from the database and
    # then adding in the new ones
    $buildproc->zero_reset();

    $buildproc->{'mdprefix_fields'} = {};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});

    if ($self->{'incremental'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj ( @$reconstructed_docs ) {
            # skip documents the buildproc has marked as not to be re-processed
            next if (defined $self->{'buildproc'}->{'dont_process_reconstructed'}->{$doc_obj->get_OID()});

            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $self->{'buildproc'}->process($doc_obj,undef);
        }
    }

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $infodb_type, $infodb_handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    ####&classify::print_reverse_lookup($infodb_handle);

    # output doclist
    my @doc_list = $self->{'buildproc'}->get_doc_list();
    my %browselist_infodb = (
        'hastxt'      => [ "0" ],
        'childtype'   => [ "VList" ],
        'numleafdocs' => [ scalar(@doc_list) ],
        'thistype'    => [ "Invisible" ],
        'contains'    => [ join(";", @doc_list) ],
    );
    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", \%browselist_infodb);

    # in debug mode the handle is STDOUT, which is left open
    unless ($self->{'debug'}) {
        &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle);
    }

    if ($infodb_type eq "gdbm-txtgz") {
        # remove the plain gdbm database file for this collection if one exists
        my $gdb_infodb_file_path = &dbutil::get_infodb_file_path("gdbm", $self->{'collection'}, $textdir);
        &util::rm($gdb_infodb_file_path) if (-e $gdb_infodb_file_path);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
527
sub make_auxiliary_files {
    # Gathers the facts about the finished build (date, build type,
    # document/section/byte counts, index, subcollection and language
    # maps, indexes that were not built, earliest datestamp for OAI) and
    # writes them out as build.cfg (gs2) or buildConfig.xml (gs3) in the
    # build directory.
    #
    # Starts from $self->{'build_cfg'} when a subclass has already set it.
    my $self = shift (@_);

    my $build_cfg = {};
    # subclasses may have already defined stuff in here
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    if ($self->{'separate_cjk'}) {
        $build_cfg->{'separate_cjk'} = "true";
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        next if (defined ($self->{'notbuilt'}->{$index}));
        push (@indexmap, $index . "->" . $self->{'index_mapping'}->{'indexmap'}->{$index});
    }

    # store the number of indexes built to later determine whether search serviceracks get written out to buildConfig.xml
    $build_cfg->{'num_indexes'} = scalar (@indexmap);

    $build_cfg->{'indexmap'} = \@indexmap if scalar (@indexmap);

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    # write out the earliestDatestamp information needed for OAI
    # (looked for in "archives", or in "export" when there is no archives dir)
    my $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
    if(!-d $archivedir) {
        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
    }
    my $earliestDatestampFile = &util::filename_cat ($archivedir, "earliestDatestamp");
    my $earliestDatestamp = 0;
    # Three-argument open with a lexical handle: the file name can never
    # be misread as an open mode, and no global FIN handle is left behind.
    if (open(my $fin, '<', $earliestDatestampFile)) {
        {
            # slurp in file as a single line
            local $/ = undef;
            $earliestDatestamp = <$fin>;
            #&unicode::ensure_utf8(\$earliestDatestamp); # turn any high bytes that aren't valid utf-8 into utf-8.
        }
        close($fin);
        # a failed read yields undef; keep the documented fallback of 0
        $earliestDatestamp = 0 unless defined $earliestDatestamp;
    }
    else {
        print $outhandle "Warning: unable to read collection's earliestDatestamp from $earliestDatestampFile.\n";
        print $outhandle "Setting value to 0.\n";
    }
    $build_cfg->{'earliestdatestamp'} = $earliestDatestamp;

    # give subclasses the chance to add their own entries
    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg(&util::filename_cat($self->{'build_dir'},"build.cfg"), $build_cfg);
    }
    if ($gs_mode eq "gs3") {

        &colcfg::write_build_cfg_xml(&util::filename_cat($self->{'build_dir'}, "buildConfig.xml"), $build_cfg, $self->{'collect_cfg_preserve'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
631
632# implement this in subclass if want to add extra stuff to build.cfg
sub build_cfg_extra {
    # Hook called by make_auxiliary_files() just before the build
    # configuration is written.  Nothing to add in the base class.
    my $self = shift;
    my ($build_cfg) = @_;

}
638
639
sub collect_specific {
    # Nothing to do in the base class.
    my $self = shift;
}
643
sub want_built {
    # Decides whether $index should be built.  Each 'dontbuild' entry in
    # the collection configuration is used as a pattern that must match
    # the whole index name.  A matching index is recorded in
    # $self->{'notbuilt'} and 0 is returned; otherwise 1 is returned.
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless (defined ($dontbuild));

    foreach my $checkstr (@$dontbuild) {
        next unless ($index =~ /^$checkstr$/);

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
659
sub create_index_mapping {
    # Placeholder: the mapping from index names to directory names
    # depends on the indexer, so subclasses are expected to override
    # this.  The base version reports that it was reached and returns a
    # reference to an empty hash.
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";

    my %mapping = ();
    return \%mapping;
}
668
669# returns a processed (two character) version of a field.
670# if the field has only one component the processed
671# version will contain the first two letdig characters
672# of that component - otherwise it will contain the first
673# letdig character of each of the first two components
674# only uses letdig (\w) characters now
sub process_field {
    # Reduces a field specification to a two character name built from
    # letdig (\w) characters.  Returns "" for an undefined or blank field.
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\S/);

    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar(@components) < 2) {
        # single component: pick the first two letdig chars
        ($first, $second) = $field =~ /^[^\w]*(\w)[^\w]*?(\w)/i;
    } else {
        # pick the first letdig from the first two field names
        ($first) = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    }

    # there may not have been any letdigs...
    $first = 'a' unless defined $first;
    $second = '0' unless defined $second;

    my $newfield = $first . $second;

    # digits only - Greenstone runtime doesn't like this.
    $newfield = "a$first" if ($newfield =~ /^\d\d$/);

    return $newfield;

}
703
sub get_next_version {
    # Advances the name behind $nameref, in place, to its next version:
    #   - a name ending in two digits has that two digit number incremented
    #   - a name ending in a single digit has that digit incremented
    #     (9 becomes 10)
    #   - any other name has its last character replaced by 0
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        # Increment a copy of the captured text with ++ rather than "+ 1":
        # on a digit string ++ keeps leading zeros ("00" -> "01").
        my $counter = $1;
        $counter++;
        $$nameref =~ s/\d\d$/$counter/;
    } elsif ($$nameref =~ /(\d)$/) {
        my $successor = $1 + 1;
        $$nameref =~ s/\d$/$successor/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}
719
720
721
sub get_collection_meta_sets
{
    # Copies the metadata set information kept by the buildproc in
    # 'mdprefix_fields' into $collection_infodb:
    #   metadataset                   - every prefix
    #   metadatalist-<prefix>         - every field of that prefix
    #   metadatafreq-<prefix>-<field> - the value stored for that field
    my ($self, $collection_infodb) = @_;

    my $mdprefix_fields = $self->{'buildproc'}->{'mdprefix_fields'};
    foreach my $prefix (keys %$mdprefix_fields)
    {
        push(@{$collection_infodb->{"metadataset"}}, $prefix);

        my $fields_of_prefix = $mdprefix_fields->{$prefix};
        foreach my $field (keys %$fields_of_prefix)
        {
            push(@{$collection_infodb->{"metadatalist-$prefix"}}, $field);
            push(@{$collection_infodb->{"metadatafreq-$prefix-$field"}}, $fields_of_prefix->{$field});
        }
    }
}
741
742
743# default is to output the metadata sets (prefixes) used in collection
sub output_collection_meta
{
    # Writes the "collection" entry to the info database.  By default it
    # holds the metadata sets (prefixes) used in the collection, as
    # gathered by get_collection_meta_sets().
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);

    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
753
754# sometimes we need to read in an existing build.cfg - for example,
755# if doing each stage of building separately, or when doing incremental
756# building
sub read_build_cfg {
    # Reads an existing build configuration: build.cfg for gs2,
    # buildConfig.xml otherwise.  The build directory is tried first,
    # then the "index" directory under GSDLCOLLECTDIR.  Returns undef
    # when neither file exists.
    my ($self) = @_;

    my $buildconfigfilename = ($gs_mode eq "gs2") ? "build.cfg" : "buildConfig.xml";

    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, $buildconfigfilename);

    unless (-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", $buildconfigfilename);

        # we can't find a config file - just ignore the field list
        return undef unless (-e $buildconfigfile);
    }

    return &colcfg::read_building_cfg( $buildconfigfile, $gs_mode);

}
781
sub print_stats {
    # Prints byte statistics for the pass the buildproc has just done
    # (index creation or text compression) to the output handle.  When
    # fewer than 50 bytes were processed although text was expected, a
    # note (incremental build with nothing processed) or a warning
    # (non-incremental build) follows.
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $activity = $indexing_text ? "Creating index $index" : "Compressing text from $index";
    print $outhandle "Stats ($activity)\n";
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # text is expected when indexing, or when compressing and text is kept
    my $text_expected = ($indexing_text || !$self->{'no_text'});

    if ($num_processed_bytes < 50 && $text_expected) {

        if (!$self->{'incremental'}) {
            my $warning = $indexing_text
                ? "WARNING: There is very little or no text to process for $index\n"
                : "WARNING: There is very little or no text to compress\n";

            print $outhandle "***************\n";
            print $outhandle $warning;
            print $outhandle " Was this your intention?\n";
            print $outhandle "***************\n";
        }
        elsif ($num_processed_bytes == 0) {
            my $note = $indexing_text
                ? "No additional text was added to $index\n"
                : "No additional text was compressed\n";

            print $outhandle $note;
        }

    }

}
824
825
8261;
827
Note: See TracBrowser for help on using the repository browser.