source: gsdl/trunk/perllib/basebuilder.pm@ 20575

Last change on this file since 20575 was 20575, checked in by davidb, 15 years ago

Opening of txt2db moved to earlier in the buildcol process. This was done to avoid a huge memory spike that occurred with incremental building. Previously we reconstructed all the documents from the GDBM database. Then the code added, edited and removed documents as required (i.e. the incremental bit), and finally it wrote everything out to GDBM. The problem was that the reconstruction phase could grow quite large -- an example PagedImage collection of 100000 documents took 2.4 GB when read in. When it got to the stage of opening a pipe to the database with open('|txt2db'), the fork() call that occurs inside this function requires the system to (briefly) have *two* 2.4 GB processes, before quickly replacing the child process with the much smaller 'txt2db' process. It is this momentary duplication of the process that can cause a computer to run out of memory. In the PagedImage example, the machine had 2 GB of main memory and 2 GB of swap, so there was no way it could sustain two 2.4 GB processes. Long explanation. The good news is that shifting the open() to before the documents are reconstructed solves the problem.

  • Property svn:keywords set to Author Date Id Revision
File size: 23.2 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
34use dbutil;
35use plugin;
36use util;
37
38
# Turn autoflush on for STDOUT and STDERR while the builder is loaded, so
# that mgpp doesn't get out of sync with the plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Switch autoflush back off again on the way out.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

our $maxdocsize = 12000;

# used to signify "gs2" (default) or "gs3"
our $gs_mode = "gs2";
55
# Constructor.  Stores the build parameters, reads the collection
# configuration file, and loads the plugins and classifiers that it lists.
# Dies (after reporting on $outhandle) if no plugins could be loaded.
#
# $outhandle and $failhandle default to STDERR; $no_text, $gli and
# $disable_OAI default to 0.
sub new {
    my $class = shift(@_);
    my ($collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli, $disable_OAI) = @_;

    # fill in defaults for the optional arguments
    if (!defined $outhandle)  { $outhandle  = *STDERR; }
    if (!defined $no_text)    { $no_text    = 0; }
    if (!defined $failhandle) { $failhandle = *STDERR; }
    if (!defined $gli)        { $gli        = 0; }

    # disable_OAI applies to greenstone 3 only and is only passed to
    # &colcfg::write_build_cfg_xml (then buildConfigxml::write_build_cfg_file)
    # when writing the buildConfig.xml
    if (!defined $disable_OAI) { $disable_OAI = 0; }

    # create a builder object
    my $self = {
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},    # indexes not built
        'gli'                          => $gli,
        'disable_OAI'                  => $disable_OAI,
    };
    bless $self, $class;

    # Read in the collection configuration file.  This also sets the
    # package-wide $gs_mode.
    my $colcfgname;
    ($colcfgname, $gs_mode) = &colcfg::get_collect_cfg_name($outhandle);
    $self->{'collect_cfg'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);

    if ($gs_mode eq "gs3") {
        # Read it in a second time to keep the original form for later
        # writing out of buildConfig.xml: $self->{'collect_cfg'}->{'classify'}
        # gets modified during the call to &classify::load_classifiers.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);
    }

    # database type for this collection, from collect.cfg (may be undefined
    # there, in which case the default type is used)
    $self->{'infodbtype'} = $self->{'collect_cfg'}->{'infodbtype'}
        || &dbutil::get_default_infodb_type();

    # the list of plugins for this collection
    my $plugins = defined($self->{'collect_cfg'}->{'plugin'})
        ? $self->{'collect_cfg'}->{'plugin'}
        : [];

    # extra global options handed to every plugin
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'}
        && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push(@global_opts, "-separate_cjk");
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins($plugins, $verbosity, $outhandle, $failhandle, \@global_opts, $keepold);

    unless (scalar(@{$self->{'pluginfo'}})) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # the list of classifiers for this collection
    my $classifiers = defined($self->{'collect_cfg'}->{'classify'})
        ? $self->{'collect_cfg'}->{'classify'}
        : [];

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers($classifiers, $build_dir, $outhandle);

    # load up any dontdb fields
    $self->{'dontdb'} = {};
    if (defined $self->{'collect_cfg'}->{'dontgdbm'}) {
        $self->{'dontdb'}->{$_} = 1 foreach (@{$self->{'collect_cfg'}->{'dontgdbm'}});
    }

    $self->{'maxnumeric'} = 4;
    return $self;
}
144
# Second-stage initialisation.  This was moved out of new() so that subclass
# methods (generate_index_list, default_buildproc, ...) can be used.
#
# Steps:
#   * expands the configured indexes by subcollection and by language, and
#     removes duplicate index specifications;
#   * locates, loads and instantiates the document processor ("buildproc"),
#     preferring a collection-specific custom class over the default one;
#   * unless debugging or keeping the old build, wipes and recreates the
#     build directory (including its text subdirectory);
#   * for incremental builds, calls init_for_incremental_build().
#
# Dies if the buildproc module cannot be loaded or constructed.
sub init {
    my $self = shift(@_);

    $self->generate_index_list();

    my $collect_cfg = ($self->{'collect_cfg'} ||= {});
    my $indexes = $collect_cfg->{'indexes'};
    if (defined $indexes) {
        # sort out subcollection indexes
        if (defined $collect_cfg->{'indexsubcollections'}) {
            my @expanded = ();
            foreach my $subcollection (@{$collect_cfg->{'indexsubcollections'}}) {
                push(@expanded, map { "$_:$subcollection" } @$indexes);
            }
            $collect_cfg->{'indexes'} = \@expanded;
        }

        # sort out language subindexes
        if (defined $collect_cfg->{'languages'}) {
            $indexes = $collect_cfg->{'indexes'};

            # when there are no subcollections, add in an empty
            # subcollection field so the language is always the third part
            my $separator = defined($collect_cfg->{'indexsubcollections'}) ? ":" : "::";

            my @expanded = ();
            foreach my $language (@{$collect_cfg->{'languages'}}) {
                push(@expanded, map { $_ . $separator . $language } @$indexes);
            }
            $collect_cfg->{'indexes'} = \@expanded;
        }
    }

    if (defined $collect_cfg->{'indexes'}) {
        # make sure that the same index isn't specified more than once
        # (first occurrence wins, order is preserved)
        my %seen = ();
        $collect_cfg->{'indexes'} = [ grep { !$seen{$_}++ } @{$collect_cfg->{'indexes'}} ];
    } else {
        $collect_cfg->{'indexes'} = [];
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc of the subclass
    my ($buildprocdir, $buildproctype);
    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = $self->default_buildproc();
    }
    require "$buildprocdir/$buildproctype.pm";

    # A class-method call on the string holding the class name; no string
    # eval or indirect-object syntax is needed, and any exception raised by
    # the constructor propagates to the caller as before.
    $self->{'buildproc'} = $buildproctype->new($self->{'collection'},
                                               $self->{'source_dir'},
                                               $self->{'build_dir'},
                                               $self->{'keepold'},
                                               $self->{'verbosity'},
                                               $self->{'outhandle'});

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

    if ($self->{'incremental'}) {
        # some classes may need to do some additional initialisation
        $self->init_for_incremental_build();
    }

}
237
# Hook for subclasses that need extra initialisation before an incremental
# build; init() calls it when the 'incremental' flag is set.  The base class
# does nothing.
sub init_for_incremental_build {
    my $self = shift;
}
243
# Counterpart of init(): passes the loaded plugin list and the document
# processor to plugin::deinit.
sub deinit {
    my ($self) = @_;

    my $pluginfo  = $self->{'pluginfo'};
    my $buildproc = $self->{'buildproc'};
    &plugin::deinit($pluginfo, $buildproc);
}
249
# Works out whether any configured 'indexoptions' entry mentions
# separate_cjk, passes the resulting flag (1 or 0) to the document
# processor, and stores it on the builder for build.cfg.
sub generate_index_options {
    my ($self) = @_;

    my $separate_cjk = 0;

    if (defined($self->{'collect_cfg'}->{'indexoptions'})) {
        my @matching = grep(/separate_cjk/, @{$self->{'collect_cfg'}->{'indexoptions'}});
        $separate_cjk = 1 if (scalar(@matching) > 0);
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
    # record it for build.cfg
    $self->{'separate_cjk'} = $separate_cjk;
}
267
# Passes the given setting straight through to the document processor's
# method of the same name.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_sections_index_document_metadata($index);
}
274
# Overrides the 'maxnumeric' value stored on the builder (new() sets it
# to 4); make_auxiliary_files() copies it into the build configuration.
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and then passes the same
# value on to the document processor.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_strip_html($strip);
}
288
# Placeholder: the base class cannot compress text itself, so it only
# reports on STDERR that a subclass should have overridden this method.
#
# The message now ends in a newline, like the equivalent messages from
# build_index() and create_index_mapping(), so that whatever is printed to
# STDERR next does not run on from it.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    print STDERR "compress_text() should be implemented in subclass!!\n";
    return;
}
296
297
# Builds the indexes.  If $indexname contains at least one word character,
# only that index is considered; otherwise every index listed in the
# collection configuration is.  Indexes rejected by want_built() are
# skipped.  Calls build_indexes_extra() once the loop has finished.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            if ($self->{'verbosity'} >= 1) {
                print $outhandle "\n*** ignoring index $index\n";
            }
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory "
                . "$self->{'index_mapping'}->{$index}\n";
        }
        if ($self->{'gli'}) {
            print STDERR "<Stage name='Index' source='$index'>\n";
        }
        $self->build_index($index);
    }

    $self->build_indexes_extra();

}
329
# Hook for subclasses that want to do extra work once all the indexes have
# been built; build_indexes() calls it last.  The base class does nothing.
sub build_indexes_extra {
    my $self = shift;

}
336
# Placeholder for building a single index: the base class only reports on
# STDERR that a subclass should have overridden this method.
sub build_index {
    my ($self, $index) = @_;

    my $message = "build_index should be implemented in subclass\n";
    print STDERR $message;
    return;
}
344
345
346
# Creates the collection's info database and processes associated files.
#
# The documents are run through the document processor in 'infodb' mode,
# after which the collection metadata, the classifier information and a
# "browselist" entry listing every document are written out.  With keepold
# set, the documents already in the database are reconstructed and
# processed first, and the new ones are then added on top.  In debug mode
# the records are written to STDOUT instead of the database.
#
# Dies if the database write handle cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    # make sure the text and assoc directories exist
    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    foreach my $dir ($textdir, $assocdir) {
        &util::mk_all_dir($dir);
    }

    # Get info database file path
    my $infodb_type = $self->{'infodbtype'};
    my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $textdir);

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # init all the classifiers
    &classify::init_classifiers($self->{'classifiers'});

    my $old_docs = undef;       # doc objects rebuilt from the existing database
    my $old_records = undef;    # records read from the existing database

    if ($self->{'keepold'}) {
        $old_records = {};

        &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $old_records);
    }

    # Important (for memory usage reasons): the filehandle for writing to
    # the database is obtained here, rather than after the reconstructed
    # documents have been set up (assuming -keepold is on).
    #
    # Opening a pipe to txt2db [using open()] triggers a fork() followed by
    # an exec().  The reconstructed documents can get very large, so if the
    # open() came after them, the fork would clone the *large* process
    # image, which (admittedly) is then quickly replaced in the execve()
    # by the much smaller image for 'txt2db'.  The trouble is that, for
    # the brief moment caused by the fork(), the system really does need
    # to have all that memory available even though it isn't ultimately
    # used.  The result is an out of memory error.

    my $infodb_handle;
    if ($self->{'debug'}) {
        $infodb_handle = *STDOUT;
    }
    else {
        $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
        unless (defined $infodb_handle) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }

    if ($self->{'keepold'}) {
        # reconstruct doc_obj metadata from database for all docs
        $old_docs = &classify::reconstruct_doc_objs_metadata($infodb_type,
                                                             $infodb_file_path,
                                                             $old_records);
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_infodbtype($infodb_type);
    $buildproc->set_output_handle($infodb_handle);
    $buildproc->set_mode('infodb');
    $buildproc->set_assocdir($assocdir);
    $buildproc->set_dontdb($self->{'dontdb'});
    $buildproc->set_classifiers($self->{'classifiers'});
    $buildproc->set_indexing_text(0);
    $buildproc->set_store_text(1);
    $buildproc->set_store_metadata_coverage($self->{'collect_cfg'}->{'store_metadata_coverage'});

    # make_infodatabase needs a full reset even for an incremental build,
    # as incremental works by reconstructing all docs from the database and
    # then adding in the new ones
    $buildproc->zero_reset();

    $buildproc->{'mdprefix_fields'} = {};

    if ($self->{'keepold'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $old_doc (@$old_docs) {
            print $outhandle " Adding reconstructed ", $old_doc->get_OID(), " into classify structures\n";
            $buildproc->process($old_doc, undef);
        }
    }

    # run the plugins over the source directory
    &plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    # this has changed to only output collection meta if it's
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info($self->{'classifiers'}, $infodb_type, $infodb_handle,
                                    $self->{'remove_empty_classifications'},
                                    $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    ####&classify::print_reverse_lookup($infodb_handle);

    # output doclist
    my @doc_list = $buildproc->get_doc_list();
    my %browselist_infodb = (
        'hastxt'      => [ "0" ],
        'childtype'   => [ "VList" ],
        'numleafdocs' => [ scalar(@doc_list) ],
        'thistype'    => [ "Invisible" ],
        'contains'    => [ join(";", @doc_list) ],
    );
    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", \%browselist_infodb);

    # STDOUT (debug mode) is left open
    unless ($self->{'debug'}) {
        &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle);
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
470
# Writes the build configuration file: build.cfg in gs2 mode,
# buildConfig.xml in gs3 mode.  The configuration starts from any
# $self->{'build_cfg'} a subclass has prepared, and gains the build date and
# type, index stem, document/section/byte counts, the index, subcollection
# and language maps, the list of indexes not built, maxnumeric and the info
# database type.  build_cfg_extra() is called before the file is written.
sub make_auxiliary_files {
    my ($self) = @_;

    # subclasses may have already defined stuff in here
    my $build_cfg = defined($self->{'build_cfg'}) ? $self->{'build_cfg'} : {};

    my $outhandle = $self->{'outhandle'};

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating auxiliary files \n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreatingAuxilary'>\n";
    }

    # make sure the build directory exists
    &util::mk_all_dir($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    $build_cfg->{'separate_cjk'} = "true" if $self->{'separate_cjk'};

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any
    # that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        next if defined($self->{'notbuilt'}->{$index});
        push(@indexmap, $index . "->" . $self->{'index_mapping'}->{'indexmap'}->{$index});
    }
    if (scalar(@indexmap)) {
        $build_cfg->{'indexmap'} = \@indexmap;
    }

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push(@subcollectionmap,
             $subcollection . "->" . $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    if (scalar(@subcollectionmap)) {
        $build_cfg->{'subcollectionmap'} = \@subcollectionmap;
    }

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push(@languagemap,
             $language . "->" . $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    if (scalar(@languagemap)) {
        $build_cfg->{'languagemap'} = \@languagemap;
    }

    # the indexes that want_built() rejected
    my @notbuilt = keys %{$self->{'notbuilt'}};
    if (scalar(@notbuilt)) {
        $build_cfg->{'notbuilt'} = \@notbuilt;
    }

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        my $cfg_file = &util::filename_cat($self->{'build_dir'}, "build.cfg");
        &colcfg::write_build_cfg($cfg_file, $build_cfg);
    }
    elsif ($gs_mode eq "gs3") {
        my $cfg_file = &util::filename_cat($self->{'build_dir'}, "buildConfig.xml");
        &colcfg::write_build_cfg_xml($cfg_file, $build_cfg, $self->{'collect_cfg_preserve'}, $self->{'disable_OAI'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
548
# Hook for subclasses that want to add extra entries to the build
# configuration; make_auxiliary_files() calls it with the $build_cfg hash
# reference just before the file is written.  The base class adds nothing.
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;

}
555
556
# Empty in the base class.
# NOTE(review): presumably a hook for collection-specific processing in
# subclasses -- nothing in this file calls it; confirm against the callers.
sub collect_specific {
    my $self = shift;
}
560
# Decides whether $index should be built.  Each 'dontbuild' entry of the
# collection configuration is used as a regular expression that has to
# match the whole index name; on the first match the index is recorded in
# $self->{'notbuilt'} and 0 is returned.  Returns 1 otherwise.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined($dontbuild);

    foreach my $checkstr (@$dontbuild) {
        next unless ($index =~ /^$checkstr$/);

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
576
# Placeholder: reports on STDERR that a subclass should have overridden
# this method, and returns a reference to an empty hash.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
585
# Returns a processed, two-character version of a field.
#
# If the field has two or more comma-separated components, the result is
# the first letdig (\w) character of each of the first two components.
# Otherwise it is the first two letdig characters of the field.  A missing
# first character is replaced by 'a' and a missing second one by '0'.
# Returns "" for an undefined field or one with no non-whitespace in it.
#
# A single-component field with only one letdig character (e.g. "x") now
# keeps that character ("x0"); previously the whole match failed and the
# result fell back to "a0".
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\S/);

    # not named $a/$b: those are the package variables sort() uses
    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # pick the first letdig from the first two field names
        ($first) = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    } else {
        # pick the first two letdig chars; the second one is optional so
        # that a lone letdig is still captured as the first character
        ($first, $second) = $field =~ /^[^\w]*(\w)(?:[^\w]*?(\w))?/;
    }
    # there may not have been any letdigs...
    $first = 'a' unless defined $first;
    $second = '0' unless defined $second;

    return "$first$second";

}
615
# Advances, in place, the name that $nameref refers to:
#   * a name ending in two digits has those two digits incremented;
#   * a name ending in a single digit has that digit incremented
#     (9 becomes 10);
#   * otherwise the last character of the name is replaced by 0.
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        # incremented with ++ on the captured string, as before
        my $num = $1;
        $num++;
        $$nameref =~ s/\d\d$/$num/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $digit = $1;
        my $next = ($digit == 9) ? 10 : $digit + 1;
        $$nameref =~ s/\d$/$next/;
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
631
632
633
# Adds, to the $collection_infodb hash reference, one "metadataset" value
# per metadata prefix recorded in the document processor's
# 'mdprefix_fields', and for each field under a prefix a
# "metadatalist-<prefix>" value (the field name) and a
# "metadatafreq-<prefix>-<field>" value (the value stored for that field).
sub get_collection_meta_sets
{
    my ($self, $collection_infodb) = @_;

    my $fields_of_prefix = $self->{'buildproc'}->{'mdprefix_fields'};
    foreach my $prefix (keys %$fields_of_prefix)
    {
        push(@{$collection_infodb->{"metadataset"}}, $prefix);

        foreach my $field (keys %{$fields_of_prefix->{$prefix}})
        {
            my $freq = $fields_of_prefix->{$prefix}->{$field};

            push(@{$collection_infodb->{"metadatalist-$prefix"}}, $field);
            push(@{$collection_infodb->{"metadatafreq-$prefix-$field"}}, $freq);
        }
    }
}
653
654
# Writes the "collection" entry to the info database.  By default the entry
# holds the metadata sets (prefixes) used in the collection, as gathered by
# get_collection_meta_sets().
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);

    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
665
# Reads in an existing build configuration file -- needed, for example,
# when each stage of building is done separately, or for incremental
# building.
#
# The file is build.cfg in gs2 mode and buildConfig.xml otherwise.  It is
# looked for in the build directory first and then in the "index"
# directory under GSDLCOLLECTDIR.  Returns undef if neither exists,
# otherwise whatever colcfg::read_building_cfg returns.
sub read_build_cfg {
    my ($self) = @_;

    my $buildconfigfilename = ($gs_mode eq "gs2") ? "build.cfg" : "buildConfig.xml";

    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, $buildconfigfilename);

    unless (-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", $buildconfigfilename);

        # we can't find a config file - just ignore the field list
        return undef unless (-e $buildconfigfile);
    }

    return &colcfg::read_building_cfg($buildconfigfile, $gs_mode);

}
693
# Prints, on the builder's output handle, the byte counts reported by the
# document processor for the pass just completed.  When fewer than 50 bytes
# were processed (and text was expected), either notes that nothing was
# added (keepold builds, and only when the count is exactly 0) or prints a
# warning banner asking whether that was intended.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    my $suspiciously_small =
        ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'}));

    if ($suspiciously_small) {

        if (!$self->{'keepold'}) {
            print $outhandle "***************\n";
            if ($indexing_text) {
                print $outhandle "WARNING: There is very little or no text to process for $index\n";
            } elsif (!$self->{'no_text'}) {
                print $outhandle "WARNING: There is very little or no text to compress\n";
            }
            print $outhandle " Was this your intention?\n";
            print $outhandle "***************\n";
        }
        elsif ($num_processed_bytes == 0) {
            if ($indexing_text) {
                print $outhandle "No additional text was added to $index\n";
            } elsif (!$self->{'no_text'}) {
                print $outhandle "No additional text was compressed\n";
            }
        }

    }

}
736
737
7381;
739
Note: See TracBrowser for help on using the repository browser.