source: main/trunk/greenstone2/perllib/basebuilder.pm@ 20999

Last change on this file since 20999 was 20686, checked in by kjdon, 15 years ago

use incremental instead of keepold to determine whether we are incremental or not. keepold can be used for other things

  • Property svn:keywords set to Author Date Id Revision
File size: 24.0 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
34use dbutil;
35use plugin;
36use util;
37
38
BEGIN {
    # The method calls on the bareword handles below are provided by
    # IO::Handle.  Perls older than 5.14 do not load it on demand, so load
    # it here instead of relying on some other module having pulled it in.
    require IO::Handle;

    # set autoflush on for STDERR and STDOUT so that mgpp
    # doesn't get out of sync with plugins
    STDOUT->autoflush(1);
    STDERR->autoflush(1);
}

END {
    # switch autoflush back off when the interpreter shuts down
    require IO::Handle;
    STDOUT->autoflush(0);
    STDERR->autoflush(0);
}

# NOTE(review): not referenced anywhere else in the visible part of this
# file -- presumably read by subclasses or build scripts; confirm.
our $maxdocsize = 12000;

# used to signify "gs2"(default) or "gs3"
# (overwritten in new() with the mode reported by
# colcfg::get_collect_cfg_name)
our $gs_mode = "gs2";
55
# Constructor.  Stores the build settings passed by the caller on the new
# object, reads the collection configuration, and derives the info
# database type and the set of "dontdb" fields from it.
#
# Positional arguments (after the class name):
#   $collection, $source_dir, $build_dir, $verbosity, $maxdocs, $debug,
#   $keepold, $incremental, $incremental_mode,
#   $remove_empty_classifications, $outhandle, $no_text, $failhandle,
#   $gli, $disable_OAI
#
# Defaults applied when an argument is undefined: $outhandle and
# $failhandle fall back to STDERR; $no_text, $gli and $disable_OAI fall
# back to 0.
#
# Side effect: sets the package variable $gs_mode from
# colcfg::get_collect_cfg_name.
#
# Returns the blessed builder object.  init() must be called afterwards to
# complete the set-up.
sub new {
    my $class = shift;
    my ($collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli, $disable_OAI) = @_;

    # fill in defaults for the optional trailing arguments
    $outhandle  = *STDERR unless defined $outhandle;
    $no_text    = 0       unless defined $no_text;
    $failhandle = *STDERR unless defined $failhandle;
    $gli        = 0       unless defined $gli;

    # disable_OAI applies to greenstone 3 only and is only passed to
    # &colcfg::write_build_cfg_xml (then
    # buildConfigxml::write_build_cfg_file) when writing the buildConfig.xml
    $disable_OAI = 0 unless defined $disable_OAI;

    # create a builder object
    my %fields = (
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_mode'             => $incremental_mode,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},    # indexes not built
        'gli'                          => $gli,
        'disable_OAI'                  => $disable_OAI,
    );
    my $self = bless \%fields, $class;

    # Read in the collection configuration file.
    my $colcfgname;
    ($colcfgname, $gs_mode) = &colcfg::get_collect_cfg_name($outhandle);
    $self->{'collect_cfg'} = &colcfg::read_collection_cfg ($colcfgname, $gs_mode);

    if ($gs_mode eq "gs3") {
        # Read it in a second time and keep that copy untouched for writing
        # out buildConfig.xml later: $self->{'collect_cfg'}->{'classify'}
        # gets modified during the call to &classify::load_classifiers.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg ($colcfgname, $gs_mode);
    }

    # get the database type for this collection from the collect.cfg file
    # (may be undefined, in which case the dbutil default is used)
    $self->{'infodbtype'} = $self->{'collect_cfg'}->{'infodbtype'} || &dbutil::get_default_infodb_type();

    # load up any dontdb fields
    my %dontdb = ();
    my $dontgdbm = $self->{'collect_cfg'}->{'dontgdbm'};
    if (defined $dontgdbm) {
        $dontdb{$_} = 1 foreach @$dontgdbm;
    }
    $self->{'dontdb'} = \%dontdb;

    $self->{'maxnumeric'} = 4;
    return $self;
}
117
# Second-stage set-up, to be run after new().  This work lives here rather
# than in the constructor so that the methods it calls on $self can be
# supplied or overridden by a subclass.  (generate_index_list and
# default_buildproc are not defined in this file -- presumably the
# subclass provides them.)
#
# Steps, in order:
#   1. expand the configured indexes by subcollection and by language, then
#      remove duplicates;
#   2. fall back to a non-incremental build when the builder reports that
#      it cannot build incrementally;
#   3. load the plugins (dies if none were loaded) and the classifiers;
#   4. locate, load and construct the buildproc object;
#   5. unless debugging or keeping the old build, wipe and recreate the
#      build directory together with its "text" subdirectory;
#   6. for incremental builds, call init_for_incremental_build().
sub init {
    my $self = shift(@_);

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    $self->generate_index_list();
    my $cfg = $self->{'collect_cfg'};

    my $indexes = $cfg->{'indexes'};
    if (defined $indexes) {
        # sort out subcollection indexes
        if (defined $cfg->{'indexsubcollections'}) {
            $cfg->{'indexes'} = [];
            foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
                foreach my $index (@$indexes) {
                    push (@{$cfg->{'indexes'}}, "$index:$subcollection");
                }
            }
        }

        # sort out language subindexes
        if (defined $cfg->{'languages'}) {
            $indexes = $cfg->{'indexes'};
            $cfg->{'indexes'} = [];
            foreach my $language (@{$cfg->{'languages'}}) {
                foreach my $index (@$indexes) {
                    if (defined ($cfg->{'indexsubcollections'})) {
                        push (@{$cfg->{'indexes'}}, "$index:$language");
                    }
                    else { # add in an empty subcollection field
                        push (@{$cfg->{'indexes'}}, "$index\:\:$language");
                    }
                }
            }
        }
    }

    if (defined($cfg->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        # (first occurrence wins, original order is kept)
        my %seen = ();
        $cfg->{'indexes'} = [ grep { !$seen{$_}++ } @{$cfg->{'indexes'}} ];
    } else {
        $cfg->{'indexes'} = [];
    }

    # check incremental against whether builder can cope or not.
    if ($self->{'incremental'} && !$self->is_incremental_capable()) {
        print $outhandle "WARNING: The indexer used is not capable of incremental building. Reverting to -removeold\n";
        $self->{'keepold'} = 0;
        $self->{'incremental'} = 0;
        $self->{'incremental_mode'} = "none";
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $cfg->{'plugin'}) {
        $plugins = $cfg->{'plugin'};
    }

    # load all the plugins

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $cfg->{'separate_cjk'} && $cfg->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $self->{'verbosity'}, $outhandle, $failhandle, \@global_opts, $self->{'incremental_mode'});

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $cfg->{'classify'}) {
        $classifiers = $cfg->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $self->{'build_dir'}, $outhandle);

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc for the builder we are initialising
    my ($buildprocdir, $buildproctype);
    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = $self->default_buildproc();
    }
    require "$buildprocdir/$buildproctype.pm";

    # Construct the buildproc with an ordinary class-method call on the
    # class name held in $buildproctype.  The name can be derived from the
    # collection name, so it must never be spliced into a string that is
    # then eval'd as code.  Any exception raised by the constructor
    # propagates to our caller.
    $self->{'buildproc'} = $buildproctype->new($self->{'collection'},
                                               $self->{'source_dir'},
                                               $self->{'build_dir'},
                                               $self->{'keepold'},
                                               $self->{'verbosity'},
                                               $self->{'outhandle'});

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

    if ($self->{'incremental'}) {
        # some classes may need to do some additional initialisation
        $self->init_for_incremental_build();
    }

}
252
# Says whether this builder can build incrementally.
# The base class conservatively answers no (0): it is safer to assume a
# builder is non-incremental and let the subclasses that do support it
# override this method.
sub is_incremental_capable {
    return 0;
}
261
# Hook for incremental builds: override in a subclass that needs extra
# initialisation when building incrementally.  The base implementation
# does nothing.
sub init_for_incremental_build {
    my $self = shift;
}
267
# Tear-down counterpart of init(): hands the loaded plugins and the
# buildproc object to plugin::deinit.
sub deinit {
    my $self = shift;

    my ($pluginfo, $buildproc) = @{$self}{'pluginfo', 'buildproc'};
    &plugin::deinit($pluginfo, $buildproc);
}
273
# Works out the index options for this build.  Currently the only option
# looked at is separate_cjk: it is switched on when any entry of the
# collection's 'indexoptions' list contains the text "separate_cjk".
# The resulting flag (0 or 1) is passed to the buildproc and remembered on
# the builder so it can be written to the build config later.
sub generate_index_options {
    my $self = shift;

    my $options = $self->{'collect_cfg'}->{'indexoptions'};

    my $separate_cjk = 0;
    if (defined $options) {
        $separate_cjk = 1 if grep { /separate_cjk/ } @$options;
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
    # record it for build.cfg
    $self->{'separate_cjk'} = $separate_cjk;
}
291
# Forwards the given setting to the buildproc's method of the same name.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    $self->{'buildproc'}->set_sections_index_document_metadata($index);
}
298
# Records the 'maxnumeric' setting on the builder (new() initialises it to
# 4); make_auxiliary_files() later copies it into the build config.
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and passes the same value
# on to the buildproc.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html($strip);
}
312
# Placeholder: text compression is builder-specific, so subclasses are
# expected to override this.  The base version only reports on STDERR that
# it was reached and returns nothing.
#
#   $textindex - accepted for interface compatibility; unused here.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    # Newline-terminated, like the other "implement in subclass" stubs in
    # this file, so the message does not run into the next line of output.
    print STDERR "compress_text() should be implemented in subclass!!\n";
    return;
}
320
321
# Builds the collection's indexes.
#
#   $indexname - optional; when it contains at least one word character
#                only that index is considered, otherwise every index
#                listed in the collection configuration is.
#
# The index-to-directory mapping is (re)computed first and stored in
# $self->{'index_mapping'}.  Each index is then either built via
# build_index() or skipped, depending on want_built().  Finally
# build_indexes_extra() is called.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n";
        }
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }

    $self->build_indexes_extra();
}
353
# Hook run by build_indexes() once every index has been processed.
# Override in a subclass to do extra work at that point; the base
# implementation does nothing.
sub build_indexes_extra {
    my $self = shift;

}
360
# Placeholder: building a single index is builder-specific, so subclasses
# are expected to override this.  The base version only reports on STDERR
# that it was reached and returns nothing.
#
#   $index - the index that would be built; unused here.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
368
369
370
# Creates the collection's info database and processes associated files.
#
# Outline:
#   - make sure the "text" and "assoc" directories exist under the build
#     directory and work out the database file path;
#   - initialise the classifiers;
#   - for incremental builds, read the existing database records and
#     reconstruct document objects from them;
#   - open the database write handle (STDOUT when debugging);
#   - configure the buildproc for 'infodb' mode and run the plugins over
#     the source directory;
#   - write the collection metadata, the classification information and
#     the "browselist" entry, then close the handle (unless debugging).
#
# Dies if the database write handle cannot be opened.  When running under
# GLI, progress is reported as <Stage> markup on STDERR.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # NOTE(review): unconditional diagnostic on STDERR (ignores verbosity)
    # -- looks like leftover debugging output; confirm before relying on it.
    print STDERR "BuildDir: $self->{'build_dir'}\n";

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # Get info database file path
    my $infodb_type = $self->{'infodbtype'};
    my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $textdir);

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    my $reconstructed_docs = undef;
    my $database_recs = undef;

    if ($self->{'incremental'}) {
        # presumably read_infodb_file fills this hash in place with the
        # records of the existing database -- verify against dbutil
        $database_recs = {};

        &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $database_recs);
    }


    # Important (for memory usage reasons) that we obtain the filehandle
    # here for writing out to the database, rather than after
    # $reconstructed_docs has been set up (assuming -incremental is on)
    #
    # This is because when we open a pipe to txt2db [using open()]
    # this triggers a fork() followed by exec().  $reconstructed_docs
    # can get very large, and so if we did the open() after this, it means
    # the fork creates a clone of the *large* process image which (admittedly)
    # is then quickly replaced in the execve() with the much smaller image for
    # 'txt2db'.  The trouble is, in that seismic second caused by
    # the fork(), the system really does need to have all that memory available
    # even though it isn't ultimately used.  The result is an out of memory
    # error.

    my ($infodb_handle);
    if ($self->{'debug'}) {
        # debugging: database entries go to STDOUT instead of a database
        $infodb_handle = *STDOUT;
    }
    else {
        $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
        if (!defined($infodb_handle))
        {
            # close the GLI stage opened above before giving up
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }

    if ($self->{'incremental'}) {
        # reconstruct doc_obj metadata from database for all docs
        $reconstructed_docs
            = &classify::reconstruct_doc_objs_metadata($infodb_type,
                                                       $infodb_file_path,
                                                       $database_recs);
    }

    # set up the document processor

    $self->{'buildproc'}->set_infodbtype ($infodb_type);
    $self->{'buildproc'}->set_output_handle ($infodb_handle);
    $self->{'buildproc'}->set_mode ('infodb');
    $self->{'buildproc'}->set_assocdir ($assocdir);
    $self->{'buildproc'}->set_dontdb ($self->{'dontdb'});
    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_store_text(1);
    $self->{'buildproc'}->set_store_metadata_coverage ($self->{'collect_cfg'}->{'store_metadata_coverage'});

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from the database and
    # then adding in the new ones
    $self->{'buildproc'}->zero_reset();

    # start with an empty record of metadata prefixes/fields;
    # get_collection_meta_sets() reads this back after processing
    $self->{'buildproc'}->{'mdprefix_fields'} = {};

    if ($self->{'incremental'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj ( @$reconstructed_docs ) {
            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $self->{'buildproc'}->process($doc_obj,undef);
        }
    }


    # run the plugins over the source directory, feeding the buildproc
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $infodb_type, $infodb_handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    ####&classify::print_reverse_lookup($infodb_handle);

    # output doclist: a single "browselist" entry whose 'contains' field is
    # the ';'-joined list reported by the buildproc
    my @doc_list = $self->{'buildproc'}->get_doc_list();
    my $browselist_infodb = { 'hastxt' => [ "0" ],
                              'childtype' => [ "VList" ],
                              'numleafdocs' => [ scalar(@doc_list) ],
                              'thistype' => [ "Invisible" ],
                              'contains' => [ join(";", @doc_list) ] };
    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", $browselist_infodb);

    # in debug mode the handle is STDOUT and must stay open
    &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle) if !$self->{'debug'};

    print STDERR "</Stage>\n" if $self->{'gli'};
}
494
# Assembles the build configuration and writes it into the build
# directory: build.cfg in "gs2" mode, buildConfig.xml in "gs3" mode.
#
# The configuration starts from $self->{'build_cfg'} when a subclass has
# already set one, and is then filled with the build date and type, the
# index stem, document/section/byte counts from the buildproc, the
# index/subcollection/language maps, the list of indexes not built,
# maxnumeric and the info database type.  build_cfg_extra() is called
# last so subclasses can add their own entries.
#
# When running under GLI, progress is reported as <Stage> markup on STDERR.
sub make_auxiliary_files {
    my $self = shift;

    # subclasses may have already defined stuff in here
    my $build_cfg = defined $self->{'build_cfg'} ? $self->{'build_cfg'} : {};

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    $build_cfg->{'separate_cjk'} = "true" if $self->{'separate_cjk'};

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any
    # that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $name (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        next if defined ($self->{'notbuilt'}->{$name});
        push (@indexmap, $name . "->" . $self->{'index_mapping'}->{'indexmap'}->{$name});
    }
    $build_cfg->{'indexmap'} = \@indexmap if scalar (@indexmap);

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap,
              $subcollection . "->" . $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap,
              $language . "->" . $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    # indexes that want_built() rejected
    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg(&util::filename_cat($self->{'build_dir'},"build.cfg"), $build_cfg);
    }
    elsif ($gs_mode eq "gs3") {
        &colcfg::write_build_cfg_xml(&util::filename_cat($self->{'build_dir'}, "buildConfig.xml"), $build_cfg, $self->{'collect_cfg_preserve'}, $self->{'disable_OAI'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
572
# Hook called by make_auxiliary_files() just before the build config is
# written.  Override in a subclass to add extra entries to $build_cfg;
# the base implementation adds nothing.
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;

}
579
580
# Empty hook; the base implementation does nothing.
# NOTE(review): presumably a place for collection-specific processing in
# subclasses -- no caller is visible in this file, confirm.
sub collect_specific {
    my $self = shift;
}
584
# Decides whether an index should be built.
#
# Every entry of the collection's 'dontbuild' list is used as a regular
# expression that has to match the whole index name.  On the first match
# the index is recorded in $self->{'notbuilt'} and 0 is returned;
# otherwise (or when there is no 'dontbuild' list) 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $checkstr (@$dontbuild) {
        next unless $index =~ /^$checkstr$/;

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
600
# Placeholder: the mapping between index descriptions and directory names
# is builder-specific, so subclasses are expected to override this.  The
# base version reports on STDERR that it was reached and returns a
# reference to an empty hash.
#
#   $indexes - reference to the list of index descriptions; unused here.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
609
# Returns a two-character code derived from a field specification.
#
# - An undefined or all-whitespace field gives the empty string.
# - With two or more comma-separated components, the code is the first
#   word character (\w) of the first component followed by the first word
#   character of the second component.
# - Otherwise the code is the first two word characters of the field,
#   provided only non-word characters separate them.
# - Any character that could not be found is replaced by a default: 'a'
#   for the first position, '0' for the second.  Note that in the
#   single-component case both characters come from one match, so a field
#   with fewer than two word characters gets both defaults.
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\S/);

    my ($first, $second);
    my @components = split /,/, $field;
    if (@components > 1) {
        # pick the first letdig from the first two field names
        ($first)  = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    }
    else {
        # pick the first two letdig chars
        ($first, $second) = $field =~ /^[^\w]*(\w)[^\w]*?(\w)/i;
    }

    # there may not have been any letdigs...
    $first  = 'a' if !defined $first;
    $second = '0' if !defined $second;

    return $first . $second;
}
639
# Advances the version suffix of a name in place.
#
#   $nameref - reference to the name string; it is modified directly.
#
# - Name ends in two digits: those digits are incremented.  The increment
#   is applied to the captured string with ++, so a carry can lengthen the
#   suffix (e.g. "99" becomes "100").
# - Name ends in a single digit: it is incremented, with 9 becoming 10.
# - Otherwise: the last character of the name is replaced by "0".
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $version = $1;
        $version++;
        $$nameref =~ s/\d\d$/$version/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $digit = $1;
        my $replacement = ($digit == 9) ? 10 : $digit + 1;
        $$nameref =~ s/\d$/$replacement/;
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
655
656
657
# Adds a summary of the metadata sets used in the collection to
# $collection_infodb (a hash reference whose values are array references).
#
# The data comes from the buildproc's 'mdprefix_fields' hash, which maps
# prefix => { field => value }.  For each prefix this appends
#   "metadataset"                    <- the prefix
#   "metadatalist-<prefix>"          <- each field name
#   "metadatafreq-<prefix>-<field>"  <- the value stored for that field
#
# Prefixes and fields are visited in sorted order so that the lists come
# out the same on every run; plain hash order varies between runs.
sub get_collection_meta_sets
{
    my $self = shift(@_);
    my $collection_infodb = shift(@_);

    my $mdprefix_fields = $self->{'buildproc'}->{'mdprefix_fields'};
    foreach my $prefix (sort keys %$mdprefix_fields)
    {
        push(@{$collection_infodb->{"metadataset"}}, $prefix);

        my $fields = $mdprefix_fields->{$prefix};
        foreach my $field (sort keys %$fields)
        {
            push(@{$collection_infodb->{"metadatalist-$prefix"}}, $field);

            my $val = $fields->{$field};
            push(@{$collection_infodb->{"metadatafreq-$prefix-$field"}}, $val);
        }
    }
}
677
678
# Writes the "collection" entry to the info database.  By default the
# entry holds only the metadata sets (prefixes) used in the collection, as
# gathered by get_collection_meta_sets().
#
#   $infodb_handle - the open database write handle to write to.
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
689
# Reads an existing build configuration -- needed, for example, when each
# stage of building is run separately, or when building incrementally.
#
# The file is build.cfg in "gs2" mode and buildConfig.xml otherwise.  It
# is looked for in the build directory first and then under
# $GSDLCOLLECTDIR/index.
#
# Returns whatever colcfg::read_building_cfg returns for the file found,
# or undef when neither location has one.
sub read_build_cfg {
    my $self = shift;

    my $buildconfigfilename = ($gs_mode eq "gs2") ? "build.cfg" : "buildConfig.xml";

    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, $buildconfigfilename);

    unless (-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", $buildconfigfilename);

        # we cant find a config file - just ignore the field list
        return undef unless (-e $buildconfigfile);
    }

    return &colcfg::read_building_cfg( $buildconfigfile, $gs_mode);
}
717
# Prints a short statistics report for the pass the buildproc has just
# completed (index creation or text compression) to the builder's output
# handle.
#
# When fewer than 50 bytes were processed, and text was expected (we are
# indexing, or 'no_text' is off):
#   - a non-incremental build prints a prominent warning;
#   - an incremental build prints a one-line note, and only when exactly
#     0 bytes were processed.
sub print_stats {
    my $self = shift;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text       = $buildproc->get_indexing_text();
    my $index               = $buildproc->get_index();
    my $num_bytes           = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    if ($indexing_text) {
        print $outhandle "Stats (Creating index $index)\n";
    }
    else {
        print $outhandle "Stats (Compressing text from $index)\n";
    }
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    my $text_expected = ($indexing_text || !$self->{'no_text'});
    if ($text_expected && $num_processed_bytes < 50) {

        if (!$self->{'incremental'}) {
            print $outhandle "***************\n";
            if ($indexing_text) {
                print $outhandle "WARNING: There is very little or no text to process for $index\n";
            }
            elsif (!$self->{'no_text'}) {
                print $outhandle "WARNING: There is very little or no text to compress\n";
            }
            print $outhandle " Was this your intention?\n";
            print $outhandle "***************\n";
        }
        elsif ($num_processed_bytes == 0) {
            # incremental build with nothing new: a note, not a warning
            if ($indexing_text) {
                print $outhandle "No additional text was added to $index\n";
            }
            elsif (!$self->{'no_text'}) {
                print $outhandle "No additional text was compressed\n";
            }
        }

    }

}
760
761
7621;
763
Note: See TracBrowser for help on using the repository browser.