source: main/trunk/greenstone2/perllib/basebuilder.pm@ 26567

Last change on this file since 26567 was 26567, checked in by ak19, 11 years ago

When a GS2 collection contains both collect.cfg and collectionConfig.xml (as advanced beatles does) the old code used to end up reading in the GS3 collectionConfig.xml instead of the GS2 collect.cfg and set the GS_mode to GS3. Now colcfg::get_collect_cfg_name takes the gs_mode (instead of determining this and returning it) and works out the collectcfg file name for the gs_mode. That means that the calling functions now need to work out the gs_mode. They do so by setting the gs_mode to gs3 if the site flag is present in the commandline, if not then it defaults to gs2. So from now on, the site flag must be specified for GS3 collections.

  • Property svn:keywords set to Author Date Id Revision
File size: 25.9 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
34use dbutil;
35use plugin;
36use util;
37
38
BEGIN {
    # autoflush() is provided by IO::Handle.  Load it explicitly rather than
    # relying on one of the modules used above happening to pull it in.
    require IO::Handle;

    # set autoflush on for STDERR and STDOUT so that mgpp
    # doesn't get out of sync with plugins
    STDOUT->autoflush(1);
    STDERR->autoflush(1);
}

END {
    # switch autoflush back off again when the program finishes
    STDOUT->autoflush(0);
    STDERR->autoflush(0);
}

# NOTE(review): not referenced anywhere in the visible part of this file;
# presumably read by other modules as $basebuilder::maxdocsize - confirm.
our $maxdocsize = 12000;

# used to signify "gs2"(default) or "gs3"
# new() switches this to "gs3" when it is given a non-empty site name.
our $gs_mode = "gs2";
55
# Constructor.
#
# Stores the build parameters on the object, reads the collection
# configuration file and derives the info database type and the set of
# 'dontdb' fields from it.
#
# A defined, non-empty $site switches the package-wide $gs_mode to "gs3".
# $outhandle and $failhandle default to STDERR, $no_text and $gli to 0.
sub new {
    my ($class, $site, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # defaults for the optional trailing arguments
    if (!defined $outhandle)  { $outhandle  = *STDERR; }
    if (!defined $no_text)    { $no_text    = 0; }
    if (!defined $failhandle) { $failhandle = *STDERR; }
    if (!defined $gli)        { $gli        = 0; }

    # create a builder object
    my %fields = (
        'site'                         => $site,    # will be undef for Greenstone 2
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_mode'             => $incremental_mode,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},       # indexes not built
        'gli'                          => $gli,
    );
    my $self = bless \%fields, $class;

    # A site name is only supplied for Greenstone 3 collections.
    $gs_mode = "gs3" if (defined $site && $site ne "");

    # Read in the collection configuration file.
    my $colcfgname = &colcfg::get_collect_cfg_name($outhandle, $gs_mode);
    my $collect_cfg = &colcfg::read_collection_cfg($colcfgname, $gs_mode);
    $self->{'collect_cfg'} = $collect_cfg;

    if ($gs_mode eq "gs3") {
        # Read the file a second time so an untouched copy is available when
        # buildConfig.xml is written out later: the 'classify' entry of the
        # working copy gets modified by &classify::load_classifiers.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);
    }

    # database type for this collection from the config file (may be
    # undefined there, in which case the default type is used)
    $self->{'infodbtype'} = $collect_cfg->{'infodbtype'} || &dbutil::get_default_infodb_type();

    # load up any dontdb fields
    my %dontdb = ();
    if (defined $collect_cfg->{'dontgdbm'}) {
        $dontdb{$_} = 1 foreach (@{$collect_cfg->{'dontgdbm'}});
    }
    $self->{'dontdb'} = \%dontdb;

    $self->{'maxnumeric'} = 4;
    return $self;
}
117
# Second stage of construction.  This work lives here rather than in new()
# so that methods overridden in subclasses (generate_index_list,
# is_incremental_capable, default_buildproc, ...) can be used.
#
# In order, this:
#   - expands the configured index list by subcollection and language and
#     removes duplicate entries
#   - turns incremental building off if the builder cannot support it
#   - loads the plugins (dies if none were loaded) and the classifiers
#   - locates, loads and instantiates the buildproc class
#   - unless -debug or -keepold is in effect, wipes and recreates build_dir
#   - calls init_for_incremental_build() for incremental builds
sub init {
    my $self = shift(@_);

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    $self->generate_index_list();
    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    if (defined $indexes) {
        # sort out subcollection indexes
        if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
                foreach my $index (@$indexes) {
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
                }
            }
        }

        # sort out language subindexes
        if (defined $self->{'collect_cfg'}->{'languages'}) {
            $indexes = $self->{'collect_cfg'}->{'indexes'};
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
                foreach my $index (@$indexes) {
                    if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
                    }
                    else { # add in an empty subcollection field
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
                    }
                }
            }
        }
    }

    if (defined($self->{'collect_cfg'}->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        # (the first occurrence wins, so the configured order is kept)
        my %tmphash = ();
        my @tmparray = @{$self->{'collect_cfg'}->{'indexes'}};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $i (@tmparray) {
            if (!defined ($tmphash{$i})) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, $i);
                $tmphash{$i} = 1;
            }
        }
    } else {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }

    # check incremental against whether builder can cope or not.
    if ($self->{'incremental'} && !$self->is_incremental_capable()) {
        print $outhandle "WARNING: The indexer used is not capable of incremental building. Reverting to -removeold\n";
        $self->{'keepold'} = 0;
        $self->{'incremental'} = 0;
        $self->{'incremental_mode'} = "none";
    }

    # gs_version for plugins
    my $gs_version = "2";
    if ($gs_mode eq "gs3") {
        $gs_version = "3";
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $self->{'verbosity'}, $outhandle, $failhandle, \@global_opts, $self->{'incremental_mode'}, $gs_version);

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $self->{'build_dir'}, $outhandle);

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc for the builder we are initialising
    my $buildprocdir = undef;
    my $buildproctype;

    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildproctype = $self->default_buildproc();
    }
    if (defined $buildprocdir) {
        require "$buildprocdir/$buildproctype.pm";
    }
    else {
        require "$buildproctype.pm";
    }

    # Instantiate the buildproc with an ordinary class-method call on the
    # class name held in $buildproctype.  This avoids compiling an
    # interpolated string and the indirect-object form "new CLASS(...)",
    # which is ambiguous in a package that has its own new().  The block
    # eval keeps the original behaviour of re-throwing any constructor
    # failure as a plain string.
    $self->{'buildproc'} = eval {
        $buildproctype->new($self->{'collection'},
                            $self->{'source_dir'}, $self->{'build_dir'},
                            $self->{'keepold'}, $self->{'verbosity'},
                            $self->{'outhandle'});
    };
    die "$@" if $@;

    # We call set_infodbtype() now so the buildproc knows the infodbtype for all phases of the build
    $self->{'buildproc'}->set_infodbtype($self->{'infodbtype'});

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

    if ($self->{'incremental'}) {
        # some classes may need to do some additional initialisation
        $self->init_for_incremental_build();
    }

}
264
# Reports whether this builder supports incremental building.
#
# The base class always answers 0 ('no'): it is safer to assume a builder
# is non-incremental and have the inherited classes that are capable
# override this method.
sub is_incremental_capable
{
    my $capable = 0;
    return $capable;
}
273
# Hook called at the end of init() when an incremental build is in effect.
# Does nothing in the base class; override it in a subclass that needs
# additional initialisation for incremental building.
sub init_for_incremental_build {
    my $self = shift;
}
279
# Hands the loaded plugins and the buildproc to &plugin::deinit.
sub deinit {
    my ($self) = @_;

    my $pluginfo = $self->{'pluginfo'};
    my $buildproc = $self->{'buildproc'};
    &plugin::deinit($pluginfo, $buildproc);
}
285
# Works out the index options from the collection configuration.
#
# The only option handled here is separate_cjk: it is switched on when any
# entry of the 'indexoptions' list matches /separate_cjk/.  The result is
# passed to the buildproc and remembered on the builder.
sub generate_index_options {
    my ($self) = @_;

    my $options = $self->{'collect_cfg'}->{'indexoptions'};

    my $separate_cjk = 0;
    if (defined $options) {
        $separate_cjk = 1 if scalar(grep { $_ =~ /separate_cjk/ } @$options);
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
    # record it for build.cfg
    $self->{'separate_cjk'} = $separate_cjk;
}
303
# Passes the sections_index_document_metadata setting straight through to
# the buildproc.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    $self->{'buildproc'}->set_sections_index_document_metadata($index);
}
310
# Records the maxnumeric setting on the builder (new() initialises it to 4).
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and passes it on to the
# buildproc.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html($strip);
}
324
# Passes the store_metadata_coverage setting straight through to the
# buildproc.
sub set_store_metadata_coverage {
    my ($self, $store_metadata_coverage) = @_;

    $self->{'buildproc'}->set_store_metadata_coverage($store_metadata_coverage);
}
331
# Placeholder for the text compression pass; builders are expected to
# override it.  The base class only reports on STDERR that it was called.
#
# $textindex is accepted for interface compatibility and is not used here.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    # Terminate the message with a newline, as the other "should be
    # implemented in subclass" stubs in this file do, so that it does not
    # run into whatever is written to STDERR next.
    print STDERR "compress_text() should be implemented in subclass!!\n";
    return;
}
339
340
# Builds the collection's indexes.
#
# When $indexname is defined and contains at least one word character only
# that index is considered; otherwise every index in the collection
# configuration is.  Indexes rejected by want_built() are skipped.
# pre_build_indexes() and post_build_indexes() are called around the loop.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes();

    my $indexes;
    if (defined $indexname && $indexname =~ /\w/) {
        $indexes = [ $indexname ];
    }
    else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            if ($self->{'verbosity'} >= 1) {
                print $outhandle "\n*** ignoring index $index\n";
            }
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n";
        }
        if ($self->{'gli'}) {
            print STDERR "<Stage name='Index' source='$index'>\n";
        }
        $self->build_index($index);
    }

    $self->post_build_indexes();

}
374
# Hook called by build_indexes() before any index is built.  Does nothing
# in the base class; override it in a subclass that needs extra work done
# up front.
sub pre_build_indexes {
    my $self = shift;
    my ($indexname) = @_; # optional parameter
}
381
# Hook called by build_indexes() after the last index has been handled.
# Does nothing in the base class; override it in a subclass that needs
# extra work done at the end.
sub post_build_indexes {
    my $self = shift;
}
387
# Placeholder for building a single index; builders are expected to
# override it.  The base class only reports on STDERR that it was called.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
395
396
397
# Creates the collection's info database and processes associated files.
#
# The documents under source_dir are run through the plugins with the
# buildproc in 'infodb' mode, writing to the info database (or to STDOUT
# when -debug is on).  Afterwards the collection metadata, the
# classification information and the "browselist" entry are written.
# For incremental builds the existing database is read first and the
# documents reconstructed from it are fed back into the classifiers.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # Get info database file path
    my $infodb_type = $self->{'infodbtype'};
    my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $textdir);

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    my $reconstructed_docs = undef;
    my $database_recs = undef;

    if ($self->{'incremental'}) {
        $database_recs = {};
        &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $database_recs);
    }

    # Important (for memory usage reasons) that the write handle for the
    # database is obtained here, rather than after $reconstructed_docs has
    # been set up (assuming -incremental is on).
    #
    # Opening a pipe to txt2db [using open()] triggers a fork() followed by
    # exec().  $reconstructed_docs can get very large, so doing the open()
    # afterwards would make the fork clone the *large* process image, which
    # is then quickly replaced in the execve() by the much smaller image
    # for 'txt2db'.  For that brief moment the system really does need all
    # that memory to be available, even though it is never used, and the
    # result is an out of memory error.
    my $infodb_handle;
    if ($self->{'debug'}) {
        $infodb_handle = *STDOUT;
    }
    else {
        $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
        unless (defined($infodb_handle)) {
            if ($self->{'gli'}) {
                print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n";
            }
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }

    if ($self->{'incremental'}) {
        # reconstruct doc_obj metadata from database for all docs
        $reconstructed_docs
            = &classify::reconstruct_doc_objs_metadata($infodb_type,
                                                       $infodb_file_path,
                                                       $database_recs);
    }

    # set up the document processor
    $buildproc->set_output_handle ($infodb_handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontdb ($self->{'dontdb'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from the database and
    # then adding in the new ones
    $buildproc->zero_reset();

    $buildproc->{'mdprefix_fields'} = {};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    if ($self->{'incremental'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj (@$reconstructed_docs) {
            # documents listed in 'dont_process_reconstructed' are left out
            next if (defined $buildproc->{'dont_process_reconstructed'}->{$doc_obj->get_OID()});

            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $buildproc->process($doc_obj, undef);
        }
    }

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $infodb_type, $infodb_handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    ####&classify::print_reverse_lookup($infodb_handle);

    # output doclist
    my @doc_list = $buildproc->get_doc_list();
    my %browselist_infodb = (
        'hastxt'      => [ "0" ],
        'childtype'   => [ "VList" ],
        'numleafdocs' => [ scalar(@doc_list) ],
        'thistype'    => [ "Invisible" ],
        'contains'    => [ join(";", @doc_list) ],
    );
    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", \%browselist_infodb);

    # in debug mode the handle is STDOUT, which is left open
    unless ($self->{'debug'}) {
        &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle);
    }

    if ($infodb_type eq "gdbm-txtgz") {
        # remove any plain gdbm database file found in the text directory
        my $gdb_infodb_file_path = &dbutil::get_infodb_file_path("gdbm", $self->{'collection'}, $textdir);
        &util::rm($gdb_infodb_file_path) if (-e $gdb_infodb_file_path);
    }

    if ($self->{'gli'}) {
        print STDERR "</Stage>\n";
    }
}
525
# Collects the build information and writes it out as build.cfg (gs2) or
# buildConfig.xml (gs3) in the build directory.
#
# Recorded are: build date and type, index stem, document/section/byte
# counts from the buildproc, the index, subcollection and language maps,
# the indexes that were not built, maxnumeric, the info database type and
# the earliestDatestamp value needed for OAI.  Subclasses can seed
# $self->{'build_cfg'} beforehand and add further entries through
# build_cfg_extra().
sub make_auxiliary_files {
    my $self = shift (@_);

    # subclasses may have already defined stuff in here
    my $build_cfg = {};
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    if ($self->{'separate_cjk'}) {
        $build_cfg->{'separate_cjk'} = "true";
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, $index . "->" . $self->{'index_mapping'}->{'indexmap'}->{$index});
        }
    }

    # store the number of indexes built to later determine whether search serviceracks get written out to buildConfig.xml
    $build_cfg->{'num_indexes'} = scalar (@indexmap);

    $build_cfg->{'indexmap'} = \@indexmap if scalar (@indexmap);

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    # write out the earliestDatestamp information needed for OAI
    # (looked for in "archives", or in "export" when there is no archives
    # directory)
    my $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
    if (!-d $archivedir) {
        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
    }
    my $earliestDatestampFile = &util::filename_cat ($archivedir, "earliestDatestamp");
    my $earliestDatestamp = 0;

    # Three-argument open with a lexical handle: the path is never
    # interpreted as a mode or command, and no package-global FIN handle is
    # left behind.
    if (open(my $datestamp_fh, '<', $earliestDatestampFile)) {
        {
            # slurp in file as a single line
            local $/ = undef;
            $earliestDatestamp = <$datestamp_fh>;
            #&unicode::ensure_utf8(\$earliestDatestamp); # turn any high bytes that aren't valid utf-8 into utf-8.
        }
        close($datestamp_fh);
    }
    else {
        print $outhandle "Warning: unable to read collection's earliestDatestamp from $earliestDatestampFile.\n";
        print $outhandle "Setting value to 0.\n";
    }
    $build_cfg->{'earliestdatestamp'} = $earliestDatestamp;

    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg(&util::filename_cat($self->{'build_dir'},"build.cfg"), $build_cfg);
    }
    if ($gs_mode eq "gs3") {
        # the preserved, unmodified copy of the collection configuration is
        # passed along for writing buildConfig.xml
        &colcfg::write_build_cfg_xml(&util::filename_cat($self->{'build_dir'}, "buildConfig.xml"), $build_cfg, $self->{'collect_cfg_preserve'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
629
# Hook called by make_auxiliary_files() just before the build configuration
# is written.  Does nothing in the base class; override it in a subclass to
# add extra entries to $build_cfg.
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;

}
636
637
# Does nothing in the base class.
# NOTE(review): presumably a hook for collection-specific processing in
# subclasses - confirm against the callers.
sub collect_specific {
    my $self = shift;
}
641
# Decides whether $index should be built.
#
# Each entry of the collection's 'dontbuild' list is used as a pattern that
# has to match the whole index name.  On the first match the index is
# recorded in $self->{'notbuilt'} and 0 is returned; otherwise 1.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless (defined ($dontbuild));

    foreach my $checkstr (@$dontbuild) {
        next unless ($index =~ /^$checkstr$/);

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
657
# Placeholder for creating the mapping between index descriptions and
# directory names; builders are expected to override it.  The base class
# reports on STDERR that it was called and returns an empty mapping.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
666
# Returns a two character short form of a field description.
#
# - undefined or blank input gives ""
# - with two or more comma separated components, the first letdig (\w)
#   character of each of the first two components is used
# - otherwise the first two letdig characters of the field are used
# - 'a' and '0' stand in for a first or second character that could not
#   be found
# - a result made up of two digits is replaced by 'a' followed by the
#   first digit, because the Greenstone runtime doesn't like digits only
sub process_field {
    my ($self, $field) = @_;

    if (!defined ($field) || $field !~ /\S/) {
        return "";
    }

    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar @components < 2) {
        # pick the first two letdig chars
        ($first, $second) = $field =~ /^[^\w]*(\w)[^\w]*?(\w)/i;
    }
    else {
        # pick the first letdig from the first two field names
        ($first) = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    }

    # there may not have been any letdigs...
    $first = 'a' if !defined $first;
    $second = '0' if !defined $second;

    my $newfield = $first . $second;
    # digits only - Greenstone runtime doesn't like this.
    $newfield = "a" . $first if ($newfield =~ /^\d\d$/);

    return $newfield;

}
701
# Moves the name referenced by $nameref on to its next version, in place.
#
# - a name ending in two digits has those two digits incremented
# - a name ending in a single digit has it incremented (9 becomes 10)
# - any other name has its last character replaced by 0
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        # incremented as a string so that a leading zero is kept (00 -> 01)
        my $suffix = $1;
        $suffix++;
        $$nameref =~ s/\d\d$/$suffix/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $suffix = ($1 == 9) ? 10 : $1 + 1;
        $$nameref =~ s/\d$/$suffix/;
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
717
718
719
# Fills $collection_infodb (a hash reference) from the buildproc's
# 'mdprefix_fields' table.
#
# For every prefix in the table the prefix is appended to "metadataset";
# for every field under it the field name is appended to
# "metadatalist-<prefix>" and the stored value to
# "metadatafreq-<prefix>-<field>".
sub get_collection_meta_sets
{
    my ($self, $collection_infodb) = @_;

    my $mdprefix_fields = $self->{'buildproc'}->{'mdprefix_fields'};
    foreach my $prefix (keys %$mdprefix_fields)
    {
        push(@{$collection_infodb->{"metadataset"}}, $prefix);

        my $list_key = "metadatalist-$prefix";
        foreach my $field (keys %{$mdprefix_fields->{$prefix}})
        {
            my $freq_key = "metadatafreq-$prefix-$field";

            push(@{$collection_infodb->{$list_key}}, $field);
            push(@{$collection_infodb->{$freq_key}}, $mdprefix_fields->{$prefix}->{$field});
        }
    }
}
739
740
# Writes the "collection" entry to the info database.
# By default that entry holds the metadata sets (prefixes) used in the
# collection, as gathered by get_collection_meta_sets().
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);

    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
751
# Reads in an existing build configuration - needed, for example, when each
# stage of building is run separately, or for incremental building.
#
# The file is build.cfg in gs2 mode and buildConfig.xml otherwise.  It is
# looked for in the build directory first and then in the "index" directory
# under GSDLCOLLECTDIR.  Returns undef when neither exists, otherwise
# whatever &colcfg::read_building_cfg returns.
sub read_build_cfg {
    my ($self) = @_;

    my $buildconfigfilename = ($gs_mode eq "gs2") ? "build.cfg" : "buildConfig.xml";

    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, $buildconfigfilename);

    unless (-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", $buildconfigfilename);

        # we cant find a config file - just ignore the field list
        return undef unless (-e $buildconfigfile);
    }

    return &colcfg::read_building_cfg( $buildconfigfile, $gs_mode);

}
779
# Prints the byte counts for the pass the buildproc has just completed.
#
# When fewer than 50 bytes were processed, and text was expected (an
# indexing pass, or a compression pass without no_text), an extra message
# follows: a note for incremental builds that processed nothing at all, or
# a warning banner for non-incremental builds.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {

        if (!$self->{'incremental'}) {
            print $outhandle "***************\n";
            if ($indexing_text) {
                print $outhandle "WARNING: There is very little or no text to process for $index\n";
            } elsif (!$self->{'no_text'}) {
                print $outhandle "WARNING: There is very little or no text to compress\n";
            }
            print $outhandle " Was this your intention?\n";
            print $outhandle "***************\n";
        }
        elsif ($num_processed_bytes == 0) {
            if ($indexing_text) {
                print $outhandle "No additional text was added to $index\n";
            } elsif (!$self->{'no_text'}) {
                print $outhandle "No additional text was compressed\n";
            }
        }

    }

}
822
823
8241;
825
Note: See TracBrowser for help on using the repository browser.