source: main/trunk/greenstone2/perllib/basebuilder.pm@ 30517

Last change on this file since 30517 was 30517, checked in by ak19, 8 years ago

Fixing incremental-rebuild when the database is gdbm. At this point (see buildcolutils.pm), the code needs to deactivate the collection before calling make_infodatabase(), since otherwise there's a lock on the gdbm database which prevents successful incremental-rebuild and activation.

  • Property svn:keywords set to Author Date Id Revision
File size: 26.2 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
34use dbutil;
35use plugin;
36use util;
37use FileUtils;
38
39
BEGIN {
    # Switch autoflush on for both standard streams so that output from
    # mgpp and from the plugins does not get out of sync.
    for my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

END {
    # Switch autoflush back off again when the program finishes.
    for my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

# Package-level setting; no sub in the visible source reads it.
our $maxdocsize = 12000;

# Which Greenstone flavour is in use: "gs2" (the default) or "gs3".
# new() switches this to "gs3" when it is given a non-empty site name.
our $gs_mode = "gs2";
56
# Constructor: records the build parameters on a new builder object and reads
# the collection configuration.
#
# Positional arguments (after the class name):
#   $site, $collection, $source_dir, $build_dir, $verbosity, $maxdocs,
#   $debug, $keepold, $incremental, $incremental_mode,
#   $remove_empty_classifications, $outhandle, $no_text, $failhandle, $gli
#
# Defaults: $outhandle and $failhandle fall back to STDERR, $no_text and
# $gli fall back to 0.
#
# Side effect: a non-empty $site switches the package-level $gs_mode to "gs3".
#
# Returns the blessed builder object.
sub new {
    my $class = shift(@_);
    my ($site, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # fill in the optional arguments
    $outhandle  = *STDERR if !defined $outhandle;
    $no_text    = 0       if !defined $no_text;
    $failhandle = *STDERR if !defined $failhandle;
    $gli        = 0       if !defined $gli;

    # create a builder object
    my %fields = (
        'site'                         => $site,    # will be undef for Greenstone 2
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_mode'             => $incremental_mode,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},       # indexes not built
        'gli'                          => $gli,
    );
    my $self = bless \%fields, $class;

    # A site name is only supplied for Greenstone 3.
    if (defined($site) && $site ne "") {
        $gs_mode = "gs3";
    }

    # Read in the collection configuration file.
    my $colcfgname = &colcfg::get_collect_cfg_name($outhandle, $gs_mode);
    $self->{'collect_cfg'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);

    if ($gs_mode eq "gs3") {
        # Read it a second time to keep a pristine copy for writing out
        # buildConfig.xml later: $self->{'collect_cfg'}->{'classify'} gets
        # modified while &classify::load_classifiers runs.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);
    }

    # The database type comes from the collection configuration when it is
    # set there, otherwise the dbutil default is used.
    $self->{'infodbtype'} = $self->{'collect_cfg'}->{'infodbtype'}
        || &dbutil::get_default_infodb_type();

    # Turn the configured "dontgdbm" list into a lookup of dontdb fields.
    my $dontgdbm = $self->{'collect_cfg'}->{'dontgdbm'};
    $self->{'dontdb'} = {};
    if (defined $dontgdbm) {
        $self->{'dontdb'}->{$_} = 1 foreach (@$dontgdbm);
    }

    $self->{'maxnumeric'} = 4;
    return $self;
}
118
# Second-stage initialisation. This lives outside new() so that methods
# overridden in a subclass can be used.
#
# Steps, in order:
#   1. calls generate_index_list() (not defined in the visible source), then
#      expands the configured indexes by subcollection and by language and
#      removes duplicates;
#   2. falls back to a full rebuild if an incremental build was requested
#      but is_incremental_capable() says no;
#   3. loads the plugins (dies if none load) and the classifiers;
#   4. locates, loads and instantiates the buildproc class, preferring a
#      collection-specific one over default_buildproc() (which is not
#      defined in the visible source);
#   5. unless debugging or keeping the old build, empties and recreates the
#      build directory and its "text" subdirectory;
#   6. calls init_for_incremental_build() for incremental builds.
#
# Dies if no plugin loads, or if loading or constructing the buildproc fails.
sub init {
    my $self = shift(@_);

    my $outhandle  = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    $self->generate_index_list();

    # Only look at the configuration after generate_index_list() has run.
    my $cfg     = $self->{'collect_cfg'};
    my $indexes = $cfg->{'indexes'};
    if (defined $indexes) {
        my $has_subcollections = defined $cfg->{'indexsubcollections'};

        # sort out subcollection indexes
        if ($has_subcollections) {
            $cfg->{'indexes'} = [];
            foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
                foreach my $index (@$indexes) {
                    push(@{$cfg->{'indexes'}}, "$index:$subcollection");
                }
            }
        }

        # sort out language subindexes
        if (defined $cfg->{'languages'}) {
            # Without subcollections an empty subcollection field is
            # inserted, hence the doubled colon.
            my $separator = $has_subcollections ? ":" : "::";

            $indexes = $cfg->{'indexes'};
            $cfg->{'indexes'} = [];
            foreach my $language (@{$cfg->{'languages'}}) {
                foreach my $index (@$indexes) {
                    push(@{$cfg->{'indexes'}}, $index . $separator . $language);
                }
            }
        }
    }

    if (defined $cfg->{'indexes'}) {
        # make sure that the same index isn't specified more than once
        # (first occurrence wins, order is kept)
        my %seen = ();
        $cfg->{'indexes'} = [ grep { !$seen{$_}++ } @{$cfg->{'indexes'}} ];
    }
    else {
        $cfg->{'indexes'} = [];
    }

    # check incremental against whether builder can cope or not.
    if ($self->{'incremental'} && !$self->is_incremental_capable()) {
        print $outhandle "WARNING: The indexer used is not capable of incremental building. Reverting to -removeold\n";
        $self->{'keepold'}          = 0;
        $self->{'incremental'}      = 0;
        $self->{'incremental_mode'} = "none";
    }

    # gs_version for plugins
    my $gs_version = ($gs_mode eq "gs3") ? "3" : "2";

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'}
        && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push(@global_opts, "-separate_cjk");
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins($plugins, $self->{'verbosity'}, $outhandle, $failhandle, \@global_opts, $self->{'incremental_mode'}, $gs_version);

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers($classifiers, $self->{'build_dir'}, $outhandle);

    # Load up the document processor for building. If a buildproc class has
    # been created for this collection, use it; otherwise use the default
    # buildproc for the builder being initialised.
    my $buildprocdir = undef;
    my $buildproctype;

    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir  = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    }
    elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir  = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    }
    elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir  = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    }
    else {
        $buildproctype = $self->default_buildproc();
    }

    if (defined $buildprocdir) {
        require "$buildprocdir/$buildproctype.pm";
    }
    else {
        require "$buildproctype.pm";
    }

    # Instantiate through an ordinary class-method call. The class name is
    # held in a string, which Perl accepts as an invocant, so there is no
    # need to assemble source text for a string eval (the name may be
    # derived from the collection name). Any exception raised by the
    # constructor propagates to the caller, as it did before.
    $self->{'buildproc'} = $buildproctype->new($self->{'collection'},
                                               $self->{'source_dir'},
                                               $self->{'build_dir'},
                                               $self->{'keepold'},
                                               $self->{'verbosity'},
                                               $self->{'outhandle'});

    # We call set_infodbtype() now so the buildproc knows the infodbtype
    # for all phases of the build
    $self->{'buildproc'}->set_infodbtype($self->{'infodbtype'});

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &FileUtils::removeFilesRecursive($self->{'build_dir'});
        &FileUtils::makeAllDirectories($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &FileUtils::makeAllDirectories($textdir);
    }

    if ($self->{'incremental'}) {
        # some classes may need to do some additional initialisation
        $self->init_for_incremental_build();
    }

}
265
# Reports whether this builder can do incremental builds.
# The base class always answers 0 ("no"): it is safer to assume a builder is
# non-incremental and let the subclasses that are capable override this.
sub is_incremental_capable {
    return 0;
}
274
# Hook called at the end of init() when an incremental build is in progress.
# The base implementation does nothing; override it in a subclass that needs
# additional initialisation for incremental builds.
sub init_for_incremental_build {
    my $self = shift(@_);
}
280
# Counterpart of init(): passes the loaded plugins and the buildproc to
# &plugin::deinit.
sub deinit {
    my ($self) = @_;

    return &plugin::deinit($self->{'pluginfo'}, $self->{'buildproc'});
}
286
# Works out whether CJK text separation is switched on. It is on when any
# entry of the collection's "indexoptions" list contains the text
# "separate_cjk". The result (1 or 0) is passed to the buildproc and stored
# in $self->{'separate_cjk'}.
sub generate_index_options {
    my ($self) = @_;

    my $indexoptions = $self->{'collect_cfg'}->{'indexoptions'};

    my $separate_cjk = 0;
    if (defined($indexoptions)) {
        $separate_cjk = 1 if grep { $_ =~ /separate_cjk/ } @$indexoptions;
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
    # record it for build.cfg
    $self->{'separate_cjk'} = $separate_cjk;
}
306
# Passes the given value straight through to the buildproc's method of the
# same name.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    return $self->{'buildproc'}->set_sections_index_document_metadata($index);
}
313
# Stores the given value as $self->{'maxnumeric'} (new() initialises it
# to 4; make_auxiliary_files() copies it into the build configuration).
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and passes the same value
# on to the buildproc.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    return $self->{'buildproc'}->set_strip_html($strip);
}
327
# Passes the given value straight through to the buildproc's
# set_store_metadata_coverage(); nothing is kept on the builder itself.
sub set_store_metadata_coverage {
    my ($self, $store_metadata_coverage) = @_;

    return $self->{'buildproc'}->set_store_metadata_coverage($store_metadata_coverage);
}
334
# Placeholder for the text-compression stage. Subclasses are expected to
# override it; the base version only prints a reminder to STDERR and
# returns nothing.
sub compress_text {
    my ($self, $textindex) = @_;

    print STDERR "compress_text() should be implemented in subclass!!";
    return;
}
342
343
# Builds the collection's indexes.
#
# If $indexname contains at least one word character, only that index is
# considered; otherwise every index listed in the collection configuration
# is. Each index for which want_built() is true is handed to build_index();
# the others are reported as ignored. pre_build_indexes() and
# post_build_indexes() run before and after the loop.
#
# NOTE(review): $indexname is not forwarded to pre_build_indexes(), although
# that hook accepts an optional index name - confirm this is intended.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes();

    my $indexes;
    if (defined $indexname && $indexname =~ /\w/) {
        $indexes = [ $indexname ];
    }
    else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n";
        }
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }

    $self->post_build_indexes();

}
377
# Hook that build_indexes() calls before it builds any index. The base
# implementation does nothing; override it in a subclass that needs extra
# work done at that point.
sub pre_build_indexes {
    my $self = shift(@_);
    my ($indexname) = @_; # optional parameter
}
384
# Hook that build_indexes() calls after all the indexes have been built.
# The base implementation does nothing; override it in a subclass that
# needs extra work done at that point.
sub post_build_indexes {
    my $self = shift(@_);
}
390
# Placeholder for building a single index. Subclasses are expected to
# override it; the base version only prints a reminder to STDERR and
# returns nothing.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
398
# Reports whether this builder provides make_infodatabase().
# The base class answers 1 ("yes"), so builders support it by default.
sub supports_make_infodatabase {
    return 1;
}
403
404
# Creates the collection's info database and processes associated files.
#
# Steps, in order:
#   1. make sure the "text" and "assoc" directories exist in the build dir;
#   2. initialise the classifiers;
#   3. for an incremental build, read the existing database records;
#   4. open the database write handle (STDOUT in debug mode);
#   5. for an incremental build, reconstruct document metadata from the
#      records read in step 3;
#   6. run all documents through the buildproc in 'infodb' mode, then feed
#      it the reconstructed documents it has not flagged to be skipped;
#   7. write the collection metadata, the classification information and
#      the "browselist" entry, then close the handle (unless debugging).
#
# Dies if the database write handle cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    my $textdir  = &FileUtils::filenameConcatenate($self->{'build_dir'}, "text");
    my $assocdir = &FileUtils::filenameConcatenate($self->{'build_dir'}, "assoc");
    &FileUtils::makeAllDirectories($textdir);
    &FileUtils::makeAllDirectories($assocdir);

    # Get info database file path
    my $infodb_type      = $self->{'infodbtype'};
    my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $textdir);

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers($self->{'classifiers'});

    my $rebuilt_docs  = undef;
    my $existing_recs = undef;

    if ($self->{'incremental'}) {
        $existing_recs = {};
        &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $existing_recs);
    }

    # The write handle MUST be obtained here, before the reconstructed
    # documents are set up (when -incremental is on), for memory reasons.
    #
    # Opening a pipe to txt2db with open() triggers a fork() followed by an
    # exec(). The reconstructed documents can get very large, so forking
    # after they exist would clone a *large* process image. That image is
    # quickly replaced by the much smaller txt2db one, but for the moment
    # of the fork() the system needs all that memory to be available, and
    # the result is an out of memory error.
    my $infodb_handle;
    if (!$self->{'debug'}) {
        $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
        if (!defined($infodb_handle)) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }
    else {
        $infodb_handle = *STDOUT;
    }

    if ($self->{'incremental'}) {
        # reconstruct doc_obj metadata from database for all docs
        $rebuilt_docs = &classify::reconstruct_doc_objs_metadata($infodb_type,
                                                                 $infodb_file_path,
                                                                 $existing_recs);
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle($infodb_handle);
    $buildproc->set_mode('infodb');
    $buildproc->set_assocdir($assocdir);
    $buildproc->set_dontdb($self->{'dontdb'});
    $buildproc->set_classifiers($self->{'classifiers'});
    $buildproc->set_indexing_text(0);
    $buildproc->set_store_text(1);

    # make_infodatabase needs a full reset even for an incremental build,
    # because incremental works by reconstructing all docs from the
    # database and then adding in the new ones
    $buildproc->zero_reset();

    $buildproc->{'mdprefix_fields'} = {};

    &plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    if ($self->{'incremental'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj (@$rebuilt_docs) {
            next if defined $self->{'buildproc'}->{'dont_process_reconstructed'}->{$doc_obj->get_OID()};

            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $self->{'buildproc'}->process($doc_obj, undef);
        }
    }

    # this has changed to only output collection meta if it's
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info($self->{'classifiers'}, $infodb_type, $infodb_handle,
                                    $self->{'remove_empty_classifications'},
                                    $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    ####&classify::print_reverse_lookup($infodb_handle);

    # output doclist
    my @doc_list = $self->{'buildproc'}->get_doc_list();
    my %browselist_infodb = (
        'hastxt'      => [ "0" ],
        'childtype'   => [ "VList" ],
        'numleafdocs' => [ scalar(@doc_list) ],
        'thistype'    => [ "Invisible" ],
        'contains'    => [ join(";", @doc_list) ],
    );
    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", \%browselist_infodb);

    # STDOUT (debug mode) is left open
    if (!$self->{'debug'}) {
        &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle);
    }

    # For the "gdbm-txtgz" type, delete any file found at the plain "gdbm"
    # database path.
    if ($infodb_type eq "gdbm-txtgz") {
        my $gdb_infodb_file_path = &dbutil::get_infodb_file_path("gdbm", $self->{'collection'}, $textdir);
        &FileUtils::removeFiles($gdb_infodb_file_path) if (-e $gdb_infodb_file_path);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
532
# Assembles the build configuration and writes it into the build directory:
# build.cfg when $gs_mode is "gs2", buildConfig.xml when it is "gs3".
#
# The configuration starts from $self->{'build_cfg'} if a subclass has
# already set one, and is filled in with the build date, build type, index
# stem, document/section/byte counts, the index, subcollection and language
# maps, the list of indexes that were not built, maxnumeric, the info
# database type and the earliest datestamp (needed for OAI).
# build_cfg_extra() is called just before writing so that subclasses can
# add more.
#
# The earliest datestamp is read from the file "earliestDatestamp" in
# GSDLCOLLECTDIR/archives, or in GSDLCOLLECTDIR/export if the archives
# directory does not exist. If the file cannot be opened, a warning is
# printed and 0 is used.
sub make_auxiliary_files {
    my $self = shift (@_);

    # subclasses may have already defined stuff in here
    my $build_cfg = {};
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &FileUtils::makeAllDirectories ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    if ($self->{'separate_cjk'}) {
        $build_cfg->{'separate_cjk'} = "true";
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # Store the mapping between the index names and the directory names.
    # The index map is used to determine what indexes there are, so any
    # that were not built must not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        next if defined ($self->{'notbuilt'}->{$index});
        push (@indexmap, $index . "->" . $self->{'index_mapping'}->{'indexmap'}->{$index});
    }

    # store the number of indexes built to later determine whether search
    # serviceracks get written out to buildConfig.xml
    $build_cfg->{'num_indexes'} = scalar (@indexmap);

    $build_cfg->{'indexmap'} = \@indexmap if scalar (@indexmap);

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    # write out the earliestDatestamp information needed for OAI
    my $archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "archives");
    if (!-d $archivedir) {
        $archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "export");
    }
    my $earliestDatestampFile = &FileUtils::filenameConcatenate($archivedir, "earliestDatestamp");
    my $earliestDatestamp = 0;
    # Three-argument open with a lexical handle: the file name is never
    # parsed for a mode or trimmed of whitespace, and no package-wide
    # bareword handle is left behind.
    if (open(my $datestamp_fh, '<', $earliestDatestampFile)) {
        {
            # slurp in file as a single line
            local $/ = undef;
            $earliestDatestamp = <$datestamp_fh>;
            #&unicode::ensure_utf8(\$earliestDatestamp); # turn any high bytes that aren't valid utf-8 into utf-8.
        }
        close($datestamp_fh);
    }
    else {
        print $outhandle "Warning: unable to read collection's earliestDatestamp from $earliestDatestampFile.\n";
        print $outhandle "Setting value to 0.\n";
    }
    $build_cfg->{'earliestdatestamp'} = $earliestDatestamp;

    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg(&FileUtils::filenameConcatenate($self->{'build_dir'},"build.cfg"), $build_cfg);
    }
    if ($gs_mode eq "gs3") {

        &colcfg::write_build_cfg_xml(&FileUtils::filenameConcatenate($self->{'build_dir'}, "buildConfig.xml"), $build_cfg, $self->{'collect_cfg_preserve'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
636
# Hook that make_auxiliary_files() calls just before the build configuration
# is written out. The base implementation does nothing; override it in a
# subclass to add extra entries to the $build_cfg hash reference.
sub build_cfg_extra {
    my $self = shift(@_);
    my ($build_cfg) = @_;

}
643
644
# Empty hook: the base implementation does nothing, and nothing in the
# visible source calls it.
sub collect_specific {
    my $self = shift(@_);
}
648
# Decides whether the given index should be built.
#
# Each entry of the collection's "dontbuild" list is used as a regular
# expression that must match the whole index name. On the first match the
# index is recorded in $self->{'notbuilt'} and 0 is returned; if nothing
# matches, or there is no "dontbuild" list, 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
664
# Placeholder for creating the mapping between index descriptions and their
# directory names. Subclasses are expected to override it; the base version
# prints a reminder to STDERR and returns a reference to an empty hash.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
673
# Returns a two-character code derived from a field description.
#
# - An undefined or all-whitespace field gives the empty string.
# - If the field has two or more comma-separated components, the code is
#   the first word character (\w) of each of the first two components.
# - Otherwise the code is the first two word characters of the field.
# - A missing first character defaults to 'a', a missing second one to '0'.
# - A code made up of two digits is replaced by 'a' followed by its first
#   digit, because the Greenstone runtime doesn't like digits-only codes.
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined($field) && $field =~ /\S/);

    my ($first, $second);
    my @components = split(/,/, $field);
    if (scalar(@components) >= 2) {
        # one character from each of the first two components
        ($first)  = $components[0] =~ /^\W*(\w)/;
        ($second) = $components[1] =~ /^\W*(\w)/;
    }
    else {
        # the first two word characters of the whole field
        ($first, $second) = $field =~ /^\W*(\w)\W*?(\w)/i;
    }

    # there may not have been any word characters...
    $first  = 'a' unless defined($first);
    $second = '0' unless defined($second);

    my $code = $first . $second;

    # digits only - Greenstone runtime doesn't like this.
    return ($code =~ /^\d\d$/) ? "a$first" : $code;
}
708
# Advances the name held in the scalar that $nameref points to, editing it
# in place:
#   - a name ending in two digits has that two-digit number incremented;
#   - a name ending in a single digit has it incremented, 9 becoming 10;
#   - any other name has its last character replaced by 0.
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $next = $1;
        $next++;
        $$nameref =~ s/\d\d$/$next/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $digit = $1;
        if ($digit == 9) {
            $$nameref =~ s/\d$/10/;
        }
        else {
            $digit++;
            $$nameref =~ s/\d$/$digit/;
        }
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
724
725
726
# Copies the metadata-set usage recorded by the buildproc (its
# 'mdprefix_fields' hash: prefix -> field -> value) into the supplied
# $collection_infodb hash reference. For every prefix it appends
#   - the prefix to the "metadataset" list,
#   - each of its fields to the "metadatalist-<prefix>" list,
#   - each field's stored value to "metadatafreq-<prefix>-<field>".
sub get_collection_meta_sets {
    my ($self, $collection_infodb) = @_;

    my $mdprefix_fields = $self->{'buildproc'}->{'mdprefix_fields'};

    foreach my $prefix (keys %$mdprefix_fields) {
        push(@{$collection_infodb->{"metadataset"}}, $prefix);

        foreach my $field (keys %{$mdprefix_fields->{$prefix}}) {
            my $freq = $mdprefix_fields->{$prefix}->{$field};

            push(@{$collection_infodb->{"metadatalist-$prefix"}}, $field);
            push(@{$collection_infodb->{"metadatafreq-$prefix-$field"}}, $freq);
        }
    }
}
746
747
# Writes the "collection" entry to the info database through the given
# handle. By default the entry holds the metadata sets (prefixes) used in
# the collection, as gathered by get_collection_meta_sets().
sub output_collection_meta {
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);

    return &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
758
# Reads an existing build configuration. This is needed, for example, when
# each stage of building is run separately, or for incremental building.
#
# The file is build.cfg when $gs_mode is "gs2" and buildConfig.xml
# otherwise. It is looked for in the build directory first and then in
# GSDLCOLLECTDIR/index.
#
# Returns whatever &colcfg::read_building_cfg returns for the first file
# that exists, or undef when neither exists.
sub read_build_cfg {
    my ($self) = @_;

    my $buildconfigfilename = ($gs_mode eq "gs2") ? "build.cfg" : "buildConfig.xml";

    # The index directory is a guess - we do not really know where it is.
    my @locations = (
        [ $self->{'build_dir'} ],
        [ $ENV{'GSDLCOLLECTDIR'}, "index" ],
    );

    foreach my $location (@locations) {
        my $buildconfigfile = &FileUtils::filenameConcatenate(@$location, $buildconfigfilename);
        if (-e $buildconfigfile) {
            return &colcfg::read_building_cfg($buildconfigfile, $gs_mode);
        }
    }

    # we can't find a config file - just ignore the field list
    return undef;
}
786
# Prints a short statistics report for the pass just completed to the
# builder's output handle: which index was created (or compressed from),
# the total bytes in the collection and the bytes processed.
#
# When fewer than 50 bytes were processed, and the pass either indexed text
# or text is not switched off with 'no_text', an extra message follows:
#   - incremental build: a note that nothing was added, but only when the
#     processed byte count is exactly 0;
#   - otherwise: a starred warning that there is little or no text.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text       = $buildproc->get_indexing_text();
    my $index               = $buildproc->get_index();
    my $num_bytes           = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # Nothing more to say unless suspiciously little text was processed.
    return unless ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'}));

    # Past the guard above, a pass that did not index text is necessarily a
    # compression pass with text switched on, so two messages cover all cases.
    if ($self->{'incremental'}) {
        if ($num_processed_bytes == 0) {
            print $outhandle ($indexing_text
                ? "No additional text was added to $index\n"
                : "No additional text was compressed\n");
        }
    }
    else {
        print $outhandle "***************\n";
        print $outhandle ($indexing_text
            ? "WARNING: There is very little or no text to process for $index\n"
            : "WARNING: There is very little or no text to compress\n");
        print $outhandle " Was this your intention?\n";
        print $outhandle "***************\n";
    }

}
829
830
8311;
832
Note: See TracBrowser for help on using the repository browser.