source: gs2-extensions/parallel-building/trunk/src/perllib/basebuilder.pm@ 25115

Last change on this file since 25115 was 25115, checked in by jmt12, 12 years ago

This is sometimes called without the site argument... detect and match up remaining arguments appropriately

File size: 26.2 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
34use dbutil;
35use plugin;
36use util;
37
38
# Turn on autoflush for STDOUT and STDERR as soon as this module is compiled,
# so that output from mgpp doesn't get out of sync with output from plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Turn autoflush back off for both streams when the interpreter shuts down.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

# Package-level setting; not read anywhere in this file.
our $maxdocsize = 12000;

# Which flavour of Greenstone is in use: "gs2" (the default) or "gs3".
# Overwritten in new() from the collection configuration lookup.
our $gs_mode = "gs2";
55
# Constructor.
#
# Expected arguments, in order:
#   $class, [$site,] $collection, $source_dir, $build_dir, $verbosity,
#   $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
#   $remove_empty_classifications, $outhandle, $no_text, $failhandle, $gli
#
# $site is only taken from the argument list when exactly 16 arguments
# (class included) are passed; for any other count the list is read as
# having no site and 'site' is stored as undef.
#
# Reads the collection configuration via colcfg and updates the package
# variable $gs_mode as a side effect. Returns the blessed builder object.
sub new {
    my @params = @_;
    my $class  = shift(@params);

    # Somehow this is sometimes called without the site argument, so only a
    # complete 16-entry argument list is treated as carrying one.
    my $site;
    if (scalar(@_) == 16) {
        $site = shift(@params);
    }

    my ($collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @params;

    # fall back to sensible defaults for the optional trailing arguments
    if (!defined $outhandle)  { $outhandle  = *STDERR; }
    if (!defined $no_text)    { $no_text    = 0; }
    if (!defined $failhandle) { $failhandle = *STDERR; }
    if (!defined $gli)        { $gli        = 0; }

    # create a builder object
    my %fields = (
        'site'                         => $site,    # will be undef for Greenstone 2
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_mode'             => $incremental_mode,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},       # indexes not built
        'gli'                          => $gli,
    );
    my $self = bless \%fields, $class;

    # Read in the collection configuration file.
    my $colcfgname;
    ($colcfgname, $gs_mode) = &colcfg::get_collect_cfg_name($outhandle);
    $self->{'collect_cfg'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);

    if ($gs_mode eq "gs3") {
        # Read the configuration a second time and keep that copy untouched:
        # $self->{'collect_cfg'}->{'classify'} gets modified while the
        # classifiers are loaded, and the original form is needed later when
        # buildConfig.xml is written out.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);
    }

    # the database type comes from the configuration when it is set there,
    # otherwise the dbutil default is used
    $self->{'infodbtype'} = $self->{'collect_cfg'}->{'infodbtype'} || &dbutil::get_default_infodb_type();

    # load up any dontdb fields as a lookup table
    my %dontdb  = ();
    my $dontgdbm = $self->{'collect_cfg'}->{'dontgdbm'};
    if (defined $dontgdbm) {
        %dontdb = map { $_ => 1 } @$dontgdbm;
    }
    $self->{'dontdb'} = \%dontdb;

    $self->{'maxnumeric'} = 4;
    return $self;
}
131
# Second-stage initialisation. This work lives here rather than in new() so
# that methods overridden in subclasses can be used.
#
# In order, this:
#   - expands the configured index list by subcollection and language and
#     removes duplicates,
#   - falls back to a non-incremental build when the indexer reports that it
#     cannot build incrementally,
#   - loads the plugins (dies if none load) and the classifiers,
#   - locates, loads and instantiates the buildproc class,
#   - clears and recreates the build directory unless debugging or keeping
#     the old build.
#
# generate_index_list() and default_buildproc() are not defined in this
# file; presumably they are supplied by subclasses.
sub init {
    my $self = shift(@_);

    my $outhandle  = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    $self->generate_index_list();
    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    if (defined $indexes) {
        # sort out subcollection indexes
        if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
                foreach my $index (@$indexes) {
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
                }
            }
        }

        # sort out language subindexes
        if (defined $self->{'collect_cfg'}->{'languages'}) {
            $indexes = $self->{'collect_cfg'}->{'indexes'};
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
                foreach my $index (@$indexes) {
                    if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
                    }
                    else { # add in an empty subcollection field
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, $index . "::" . $language);
                    }
                }
            }
        }
    }

    if (defined($self->{'collect_cfg'}->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        # (the first occurrence wins and the original order is kept)
        my %seen = ();
        my @unique = grep { !$seen{$_}++ } @{$self->{'collect_cfg'}->{'indexes'}};
        $self->{'collect_cfg'}->{'indexes'} = \@unique;
    } else {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }

    # check incremental against whether builder can cope or not.
    if ($self->{'incremental'} && !$self->is_incremental_capable()) {
        print $outhandle "WARNING: The indexer used is not capable of incremental building. Reverting to -removeold\n";
        $self->{'keepold'} = 0;
        $self->{'incremental'} = 0;
        $self->{'incremental_mode'} = "none";
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $self->{'verbosity'}, $outhandle, $failhandle, \@global_opts, $self->{'incremental_mode'});

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $self->{'build_dir'}, $outhandle);

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc for the builder we are initialising
    my $buildprocdir = undef;
    my $buildproctype;

    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildproctype = $self->default_buildproc();
    }
    if (defined $buildprocdir) {
        require "$buildprocdir/$buildproctype.pm";
    }
    else {
        require "$buildproctype.pm";
    }

    # Instantiate the buildproc with an ordinary class-method call on the
    # class name. This avoids compiling a code string at run time and avoids
    # indirect-object syntax, whose parsing depends on which subs are
    # visible in the current package. Any failure is rethrown as a string.
    eval {
        $self->{'buildproc'} = $buildproctype->new(
            $self->{'collection'}, $self->{'source_dir'}, $self->{'build_dir'},
            $self->{'keepold'}, $self->{'verbosity'}, $self->{'outhandle'});
    };
    die "$@" if $@;

    # We call set_infodbtype() now so the buildproc knows the infodbtype for all phases of the build
    $self->{'buildproc'}->set_infodbtype($self->{'infodbtype'});

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

    if ($self->{'incremental'}) {
        # some classes may need to do some additional initialisation
        $self->init_for_incremental_build();
    }
}
274
# Reports whether this builder can build incrementally.
# The base class always answers 'no' (0): it is safer to assume that a
# builder is not incremental and let the subclasses that are override this.
sub is_incremental_capable {
    return 0;
}
283
# Hook called from init() when an incremental build is requested.
# Does nothing here; override in a subclass that needs extra
# initialisation for incremental builds.
sub init_for_incremental_build {
    my $self = $_[0];
}
289
# Hands the loaded plugins and the buildproc to plugin::deinit() so that
# they can be shut down.
sub deinit {
    my ($self) = @_;

    my $pluginfo  = $self->{'pluginfo'};
    my $buildproc = $self->{'buildproc'};
    &plugin::deinit($pluginfo, $buildproc);
}
295
# Works out the index options from the collection configuration.
# Only 'separate_cjk' is handled here: it is switched on when any entry of
# the 'indexoptions' list contains that string.
sub generate_index_options {
    my ($self) = @_;

    my $separate_cjk = 0;
    my $options = $self->{'collect_cfg'}->{'indexoptions'};
    if (defined($options)) {
        $separate_cjk = 1 if grep { $_ =~ /separate_cjk/ } @{$options};
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
    # record it for build.cfg
    $self->{'separate_cjk'} = $separate_cjk;
}
313
# Passes the given setting straight on to the buildproc method of the
# same name.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    $self->{'buildproc'}->set_sections_index_document_metadata($index);
}
320
# Stores the 'maxnumeric' setting on the builder; it is later copied into
# the build configuration by make_auxiliary_files().
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and passes it on to the
# buildproc as well.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html($strip);
}
334
# Placeholder: text compression has to be provided by a subclass.
# The base version only reports that on STDERR and returns nothing.
sub compress_text {
    my ($self, $textindex) = @_;

    print STDERR 'compress_text() should be implemented in subclass!!';
    return;
}
342
343
# Builds the indexes for the collection.
#
# When $indexname contains at least one word character only that index is
# built; otherwise every index in the collection configuration is. Indexes
# rejected by want_built() are skipped. $indexlevel is accepted but not
# used here.
sub build_indexes {
    my ($self, $indexname, $indexlevel) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes();

    my $indexes;
    if (defined $indexname && $indexname =~ /\w/) {
        $indexes = [$indexname];
    }
    else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            if ($self->{'verbosity'} >= 1) {
                print $outhandle "\n*** ignoring index $index\n";
            }
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory $self->{'index_mapping'}->{$index}\n";
        }
        if ($self->{'gli'}) {
            print STDERR "<Stage name='Index' source='$index'>\n";
        }
        $self->build_index($index);
    }

    $self->post_build_indexes();
}
377
# Hook called by build_indexes() before any index is built.
# Does nothing here; override in a subclass to do extra work up front.
sub pre_build_indexes {
    my $self = shift;
    my ($indexname) = @_;    # optional parameter
}
384
# Hook called by build_indexes() once every index has been dealt with.
# Does nothing here; override in a subclass to do extra work at the end.
sub post_build_indexes {
    my $self = $_[0];
}
390
# Placeholder: building a single index has to be provided by a subclass.
# The base version only reports that on STDERR and returns nothing.
sub build_index {
    my ($self, $index) = @_;

    print STDERR 'build_index should be implemented in subclass', "\n";
    return;
}
398
399
400
# Creates the collection's info database and processes associated files.
#
# Runs every document through the buildproc in 'infodb' mode, then writes
# the collection metadata, the classifier information and the "browselist"
# entry. In debug mode the database records go to STDOUT instead of a real
# database handle. For incremental builds the documents already in the
# database are reconstructed and fed back into the classify structures.
# Dies if the database write handle cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    # make sure the text and assoc directories exist under the build directory
    my $textdir  = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir($textdir);
    &util::mk_all_dir($assocdir);

    # Get info database file path
    my $infodb_type      = $self->{'infodbtype'};
    my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $textdir);

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # init all the classifiers
    &classify::init_classifiers($self->{'classifiers'});

    my $reconstructed_docs = undef;
    my $database_recs      = undef;

    if ($self->{'incremental'}) {
        # pull in the records that are already in the database
        $database_recs = {};
        &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $database_recs);
    }

    # Memory matters here: the write handle must be obtained BEFORE
    # $reconstructed_docs is populated (when -incremental is on).
    #
    # Opening a pipe to txt2db with open() triggers a fork() followed by an
    # exec(). $reconstructed_docs can become very large, so opening the
    # handle afterwards would make fork() clone the *large* process image.
    # That clone is (admittedly) replaced almost immediately by the much
    # smaller 'txt2db' image, but for that moment the system really does
    # need all of the memory to be available, and the result is an out of
    # memory error.
    my $infodb_handle;
    if (!$self->{'debug'}) {
        $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
        if (!defined($infodb_handle)) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }
    else {
        $infodb_handle = *STDOUT;
    }

    if ($self->{'incremental'}) {
        # reconstruct doc_obj metadata from database for all docs
        $reconstructed_docs = &classify::reconstruct_doc_objs_metadata($infodb_type,
                                                                       $infodb_file_path,
                                                                       $database_recs);
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle($infodb_handle);
    $buildproc->set_mode('infodb');
    $buildproc->set_assocdir($assocdir);
    $buildproc->set_dontdb($self->{'dontdb'});
    $buildproc->set_classifiers($self->{'classifiers'});
    $buildproc->set_indexing_text(0);
    $buildproc->set_store_text(1);
    $buildproc->set_store_metadata_coverage($self->{'collect_cfg'}->{'store_metadata_coverage'});

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from the database and
    # then adding in the new ones
    $buildproc->zero_reset();

    $buildproc->{'mdprefix_fields'} = {};

    &plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    if ($self->{'incremental'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj (@$reconstructed_docs) {
            # documents flagged by the buildproc are left out
            next if defined $buildproc->{'dont_process_reconstructed'}->{$doc_obj->get_OID()};

            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $buildproc->process($doc_obj, undef);
        }
    }

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info($self->{'classifiers'}, $infodb_type, $infodb_handle,
                                    $self->{'remove_empty_classifications'},
                                    $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    ####&classify::print_reverse_lookup($infodb_handle);

    # output doclist
    my @doc_list = $self->{'buildproc'}->get_doc_list();
    my %browselist = (
        'hastxt'      => [ "0" ],
        'childtype'   => [ "VList" ],
        'numleafdocs' => [ scalar(@doc_list) ],
        'thistype'    => [ "Invisible" ],
        'contains'    => [ join(";", @doc_list) ],
    );
    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", \%browselist);

    # STDOUT is left open in debug mode
    if (!$self->{'debug'}) {
        &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle);
    }

    if ($infodb_type eq "gdbm-txtgz") {
        # remove the plain gdbm database file if one exists in the text directory
        my $gdb_infodb_file_path = &dbutil::get_infodb_file_path("gdbm", $self->{'collection'}, $textdir);
        &util::rm($gdb_infodb_file_path) if (-e $gdb_infodb_file_path);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
529
# Collects the build information and writes it out as the build
# configuration file: build.cfg when $gs_mode is "gs2", buildConfig.xml
# when it is "gs3".
#
# The configuration records the build date and type, the index stem, the
# document/section/byte counts, the index, subcollection and language maps,
# the indexes that were not built, maxnumeric, the info database type and
# the earliestDatestamp needed for OAI. Subclasses can seed the values
# through $self->{'build_cfg'} and add more through build_cfg_extra().
sub make_auxiliary_files {
    my $self = shift (@_);

    # subclasses may have already defined stuff in here
    my $build_cfg = {};
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    if ($self->{'separate_cjk'}) {
        $build_cfg->{'separate_cjk'} = "true";
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, $index . "->" . $self->{'index_mapping'}->{'indexmap'}->{$index});
        }
    }
    $build_cfg->{'indexmap'} = \@indexmap if scalar (@indexmap);

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    # write out the earliestDatestamp information needed for OAI
    # (looked for in the archives directory, or in export if there is no
    # archives directory)
    my $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
    if (!-d $archivedir) {
        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
    }
    my $earliestDatestampFile = &util::filename_cat ($archivedir, "earliestDatestamp");
    my $earliestDatestamp = 0;
    # Three-argument open with a lexical handle: the path is never parsed
    # for mode characters and no package-wide filehandle is clobbered.
    if (open(my $datestamp_fh, '<', $earliestDatestampFile)) {
        {
            # slurp in file as a single line
            local $/ = undef;
            $earliestDatestamp = <$datestamp_fh>;
            #&unicode::ensure_utf8(\$earliestDatestamp); # turn any high bytes that aren't valid utf-8 into utf-8.
        }
        close($datestamp_fh);
    }
    else {
        print $outhandle "Warning: unable to read collection's earliestDatestamp from $earliestDatestampFile.\n";
        print $outhandle "Setting value to 0.\n";
    }
    $build_cfg->{'earliestdatestamp'} = $earliestDatestamp;

    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg(&util::filename_cat($self->{'build_dir'},"build.cfg"), $build_cfg);
    }
    if ($gs_mode eq "gs3") {
        &colcfg::write_build_cfg_xml(&util::filename_cat($self->{'build_dir'}, "buildConfig.xml"), $build_cfg, $self->{'collect_cfg_preserve'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
629
# Hook called by make_auxiliary_files() just before the build configuration
# is written. Does nothing here; override in a subclass to add extra
# entries to $build_cfg.
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;
}
636
637
# Empty hook; the base class does nothing here.
sub collect_specific {
    my $self = $_[0];
}
641
# Decides whether the given index should be built.
# Each entry of the 'dontbuild' configuration list is used as a pattern that
# must match the whole index name. On a match the index is recorded in
# $self->{'notbuilt'} and 0 is returned; otherwise 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined($dontbuild);

    foreach my $pattern (@{$dontbuild}) {
        next unless $index =~ /^$pattern$/;

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
657
# Placeholder: the mapping from index descriptions to directory names has to
# be provided by a subclass. The base version reports that on STDERR and
# returns a reference to an empty hash.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR 'create_index_mapping should be implemented in subclass', "\n";
    return {};
}
666
# Returns a two-character short form of a field specification.
#   - Empty, undefined or whitespace-only input gives "".
#   - With two or more comma-separated components, the result is the first
#     letdig (\w) character of each of the first two components.
#   - With a single component, the result is its first two letdig characters.
#   - A missing first character defaults to 'a', a missing second to '0'.
#   - A result made up of two digits is replaced by 'a' plus the first digit.
sub process_field {
    my ($self, $field) = @_;

    if (!defined($field) || $field !~ /\S/) {
        return "";
    }

    my ($first, $second);
    my @parts = split(/,/, $field);
    if (scalar(@parts) < 2) {
        # pick the first two letdig chars
        ($first, $second) = ($field =~ /^[^\w]*(\w)[^\w]*?(\w)/i);
    }
    else {
        # pick the first letdig from the first two field names
        ($first)  = ($parts[0] =~ /^[^\w]*(\w)/);
        ($second) = ($parts[1] =~ /^[^\w]*(\w)/);
    }

    # there may not have been any letdigs...
    $first  = 'a' if !defined($first);
    $second = '0' if !defined($second);

    my $short = $first . $second;

    # digits only - Greenstone runtime doesn't like this.
    return ($short =~ /^\d\d$/) ? "a$first" : $short;
}
701
# Advances the name referenced by $nameref to its next version, in place:
#   - a name ending in two digits has those two digits incremented,
#   - a name ending in one digit has that digit incremented (9 becomes 10),
#   - any other name has its last character replaced by 0.
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        # ++ on the captured string keeps a leading zero ("00" becomes "01")
        my $counter = $1;
        $counter++;
        $$nameref =~ s/\d\d$/$counter/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $digit = $1;
        if ($digit == 9) {
            $$nameref =~ s/\d$/10/;
        }
        else {
            $digit++;
            $$nameref =~ s/\d$/$digit/;
        }
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
717
718
719
# Adds the metadata set information gathered by the buildproc (its
# 'mdprefix_fields' table) to the hash referenced by $collection_infodb:
#   "metadataset"                     - one entry per prefix
#   "metadatalist-<prefix>"           - one entry per field of that prefix
#   "metadatafreq-<prefix>-<field>"   - the value stored for that field
sub get_collection_meta_sets {
    my ($self, $collection_infodb) = @_;

    my $mdprefix_fields = $self->{'buildproc'}->{'mdprefix_fields'};
    foreach my $prefix (keys %$mdprefix_fields) {
        push(@{$collection_infodb->{"metadataset"}}, $prefix);

        my $fields = $mdprefix_fields->{$prefix};
        foreach my $field (keys %{$fields}) {
            push(@{$collection_infodb->{"metadatalist-$prefix"}}, $field);
            push(@{$collection_infodb->{"metadatafreq-$prefix-$field"}}, $fields->{$field});
        }
    }
}
739
740
# Writes the "collection" entry to the info database.
# By default the entry holds the metadata sets (prefixes) used in the
# collection, as gathered by get_collection_meta_sets().
sub output_collection_meta {
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
751
# Reads in an existing build configuration. This is needed, for example,
# when each stage of building is run separately or when building
# incrementally.
#
# The file (build.cfg in "gs2" mode, buildConfig.xml otherwise) is looked
# for in the build directory first and then in $GSDLCOLLECTDIR/index.
# Returns undef when neither location has it, otherwise the result of
# colcfg::read_building_cfg().
sub read_build_cfg {
    my ($self) = @_;

    my $buildconfigfilename = ($gs_mode eq "gs2") ? "build.cfg" : "buildConfig.xml";

    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, $buildconfigfilename);
    unless (-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", $buildconfigfilename);

        # we cant find a config file - just ignore the field list
        return undef unless (-e $buildconfigfile);
    }

    return &colcfg::read_building_cfg($buildconfigfile, $gs_mode);
}
779
# Prints a short statistics report for the current buildproc pass to the
# output handle, followed by a notice when fewer than 50 bytes were
# processed (unless text is switched off and this was not an indexing pass).
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text       = $buildproc->get_indexing_text();
    my $index               = $buildproc->get_index();
    my $num_bytes           = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text ? "Stats (Creating index $index)\n"
                                 : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # Nothing more to report when enough text was processed, or when this
    # was a compression pass with text switched off.
    return unless ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'}));

    # From here on either $indexing_text is true or 'no_text' is false, so
    # the message only depends on $indexing_text.
    if ($self->{'incremental'}) {
        if ($num_processed_bytes == 0) {
            print $outhandle ($indexing_text
                              ? "No additional text was added to $index\n"
                              : "No additional text was compressed\n");
        }
    }
    else {
        print $outhandle "***************\n";
        print $outhandle ($indexing_text
                          ? "WARNING: There is very little or no text to process for $index\n"
                          : "WARNING: There is very little or no text to compress\n");
        print $outhandle " Was this your intention?\n";
        print $outhandle "***************\n";
    }
}
822
# Placeholder: preparing a build recipe has to be provided by a subclass.
# The base version only prints a warning to the output handle.
sub prepare_build_recipe {
    my $self = shift(@_);

    print { $self->{'outhandle'} } "WARNING: prepare_build_recipe() should be implemented in subclass!!";
}
829
8301;
831
Note: See TracBrowser for help on using the repository browser.