source: main/trunk/greenstone2/perllib/basebuilder.pm@ 21607

Last change on this file since 21607 was 21607, checked in by mdewsnip, 14 years ago

Changed basebuilder.pm so set_infodbtype() is called on the buildproc object as soon as it is created, instead of just for the infodb phase. This is so the buildproc knows the infodbtype for all phases of the build. Part of making the code less GDBM-specific.

  • Property svn:keywords set to Author Date Id Revision
File size: 24.1 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
34use dbutil;
35use plugin;
36use util;
37
38
BEGIN {
    # Switch autoflush on for both standard streams so that output from
    # mgpp and output from the plugins does not get out of sync.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

END {
    # Switch autoflush back off when the interpreter shuts down.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

our $maxdocsize = 12000;

# Configuration flavour in use: "gs2" (the default) or "gs3".
# new() overwrites this with the mode reported by colcfg.
our $gs_mode = "gs2";
55
# Constructor. Stores the build parameters on a new builder object, reads
# the collection configuration, and derives the info database type and the
# set of "dontdb" fields. Work that needs subclass methods is left to init().
sub new {
    my $class = shift(@_);
    my ($collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli, $disable_OAI) = @_;

    # defaults for the optional trailing arguments
    if (!defined $outhandle)  { $outhandle  = *STDERR; }
    if (!defined $no_text)    { $no_text    = 0; }
    if (!defined $failhandle) { $failhandle = *STDERR; }
    if (!defined $gli)        { $gli        = 0; }

    # disable_OAI applies to greenstone 3 only; it is only handed on to
    # &colcfg::write_build_cfg_xml when the buildConfig.xml is written
    if (!defined $disable_OAI) { $disable_OAI = 0; }

    # create a builder object
    my %fields = (
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_mode'             => $incremental_mode,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},    # indexes not built
        'gli'                          => $gli,
        'disable_OAI'                  => $disable_OAI,
    );
    my $self = bless \%fields, $class;

    # Read in the collection configuration file. This also updates the
    # package-wide $gs_mode.
    my $colcfgname;
    ($colcfgname, $gs_mode) = &colcfg::get_collect_cfg_name($outhandle);
    $self->{'collect_cfg'} = &colcfg::read_collection_cfg ($colcfgname, $gs_mode);

    if ($gs_mode eq "gs3") {
        # Read it a second time to keep an untouched copy for writing out
        # buildConfig.xml later: $self->{'collect_cfg'}->{'classify'} gets
        # modified while &classify::load_classifiers runs.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg ($colcfgname, $gs_mode);
    }

    # database type for this collection, taken from the collection
    # configuration when present, otherwise the dbutil default
    $self->{'infodbtype'} = $self->{'collect_cfg'}->{'infodbtype'} || &dbutil::get_default_infodb_type();

    # load up any dontdb fields (configured under the 'dontgdbm' key)
    my %dontdb = ();
    my $dontgdbm = $self->{'collect_cfg'}->{'dontgdbm'};
    if (defined $dontgdbm) {
        $dontdb{$_} = 1 foreach @$dontgdbm;
    }
    $self->{'dontdb'} = \%dontdb;

    $self->{'maxnumeric'} = 4;
    return $self;
}
117
# Second-stage initialisation. This work was moved out of new() so that
# methods supplied by subclasses (generate_index_list, default_buildproc,
# is_incremental_capable, ...) can be used.
#
# Steps, in order:
#   - expand the configured indexes by subcollection and language, and
#     remove duplicates
#   - fall back to a non-incremental build if the indexer cannot cope
#   - load the plugins (dies if none could be loaded) and the classifiers
#   - locate, load and construct the buildproc object
#   - clear out the build directory unless debugging or keeping old builds
sub init {
    my $self = shift(@_);

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    $self->generate_index_list();
    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    if (defined $indexes) {
        # sort out subcollection indexes
        if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
                foreach my $index (@$indexes) {
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
                }
            }
        }

        # sort out language subindexes
        if (defined $self->{'collect_cfg'}->{'languages'}) {
            $indexes = $self->{'collect_cfg'}->{'indexes'};
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
                foreach my $index (@$indexes) {
                    if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
                    }
                    else { # add in an empty subcollection field
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
                    }
                }
            }
        }
    }

    if (defined($self->{'collect_cfg'}->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        # (the first occurrence wins, order is preserved)
        my %seen = ();
        my @unique = grep { !$seen{$_}++ } @{$self->{'collect_cfg'}->{'indexes'}};
        $self->{'collect_cfg'}->{'indexes'} = \@unique;
    } else {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }

    # check incremental against whether builder can cope or not.
    if ($self->{'incremental'} && !$self->is_incremental_capable()) {
        print $outhandle "WARNING: The indexer used is not capable of incremental building. Reverting to -removeold\n";
        $self->{'keepold'} = 0;
        $self->{'incremental'} = 0;
        $self->{'incremental_mode'} = "none";
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $self->{'verbosity'}, $outhandle, $failhandle, \@global_opts, $self->{'incremental_mode'});

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $self->{'build_dir'}, $outhandle);

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc for the builder we are initialising
    my ($buildprocdir, $buildproctype);
    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = $self->default_buildproc();
    }
    require "$buildprocdir/$buildproctype.pm";

    # Construct the buildproc with an ordinary class-method call on the
    # class name held in $buildproctype. This replaces a string eval of
    # "new $buildproctype(...)": no code is generated at run time, the call
    # no longer relies on indirect-object parsing inside a package that has
    # its own new(), and a failing constructor propagates its own exception.
    $self->{'buildproc'} = $buildproctype->new($self->{'collection'},
                                               $self->{'source_dir'},
                                               $self->{'build_dir'},
                                               $self->{'keepold'},
                                               $self->{'verbosity'},
                                               $self->{'outhandle'});

    # We call set_infodbtype() now so the buildproc knows the infodbtype for all phases of the build
    $self->{'buildproc'}->set_infodbtype($self->{'infodbtype'});

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

    if ($self->{'incremental'}) {
        # some classes may need to do some additional initialisation
        $self->init_for_incremental_build();
    }

}
254
sub is_incremental_capable
{
    # The base builder answers 'no'. It is safer to assume a builder cannot
    # build incrementally; subclasses that can should override this method.
    my $capable = 0;

    return $capable;
}
263
# Hook called by init() when an incremental build is requested. The base
# class does nothing; override it in a subclass that needs additional
# initialisation for an incremental build.
sub init_for_incremental_build {
    my $self = shift;
}
269
# Hands the loaded plugins and the buildproc object to plugin::deinit.
sub deinit {
    my $self = shift;

    my $pluginfo  = $self->{'pluginfo'};
    my $buildproc = $self->{'buildproc'};

    &plugin::deinit($pluginfo, $buildproc);
}
275
# Works out whether the 'indexoptions' of the collection configuration ask
# for separate_cjk, tells the buildproc, and remembers the answer on the
# builder so it can be written to the build configuration.
sub generate_index_options {
    my ($self) = @_;

    my $options = $self->{'collect_cfg'}->{'indexoptions'};

    # 1 if any configured option mentions separate_cjk, otherwise 0
    my $separate_cjk = 0;
    if (defined $options) {
        $separate_cjk = 1 if grep { $_ =~ /separate_cjk/ } @$options;
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
    # record it for build.cfg
    $self->{'separate_cjk'} = $separate_cjk;
}
293
# Passes the given setting straight through to the buildproc.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_sections_index_document_metadata($index);
}
300
# Stores the maxnumeric setting on the builder (new() initialises it to 4).
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and forwards it to the
# buildproc.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_strip_html($strip);
}
314
# Placeholder: the base class only complains on STDERR and returns nothing.
# Subclasses are expected to provide the real implementation.
sub compress_text {
    my ($self, $textindex) = @_;

    print STDERR "compress_text() should be implemented in subclass!!";

    return;
}
322
323
# Builds either the single named index or, when no usable name is given,
# every index listed in the collection configuration. Indexes rejected by
# want_built() are reported and skipped. build_indexes_extra() runs last.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    # a name containing at least one word character selects just that index
    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n";
        }
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }

    $self->build_indexes_extra();

}
355
# Hook called by build_indexes() after all the indexes have been built.
# The base class does nothing; override it in a subclass to do extra work.
sub build_indexes_extra {
    my $self = shift;
}
362
# Placeholder: the base class only complains on STDERR and returns nothing.
# Subclasses are expected to provide the real implementation.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";

    return;
}
370
371
372
# Creates the info database for the collection: makes the text and assoc
# directories, runs every document through the buildproc in 'infodb' mode,
# then writes the collection metadata, the classification information and
# the "browselist" entry. In debug mode the records go to STDOUT instead of
# a database write handle. Dies if the write handle cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $incremental = $self->{'incremental'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # Get info database file path
    my $infodb_type = $self->{'infodbtype'};
    my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $textdir);

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # for an incremental build, read in the records already in the database
    my $reconstructed_docs = undef;
    my $database_recs = undef;
    if ($incremental) {
        $database_recs = {};

        &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $database_recs);
    }

    # Important (for memory usage reasons): obtain the filehandle for
    # writing out to the database here, rather than after
    # $reconstructed_docs has been set up (assuming -incremental is on).
    #
    # Opening a pipe to txt2db [using open()] triggers a fork() followed by
    # an exec(). $reconstructed_docs can get very large, so doing the open()
    # afterwards would make the fork clone a *large* process image. That
    # image is quickly replaced in the execve() by the much smaller one for
    # 'txt2db', but for the moment of the fork() the system really does need
    # all that memory available, even though it isn't ultimately used. The
    # result is an out of memory error.
    my $infodb_handle;
    if ($self->{'debug'}) {
        $infodb_handle = *STDOUT;
    }
    else {
        $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
        unless (defined($infodb_handle)) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }

    if ($incremental) {
        # reconstruct doc_obj metadata from database for all docs
        $reconstructed_docs
            = &classify::reconstruct_doc_objs_metadata($infodb_type,
                                                       $infodb_file_path,
                                                       $database_recs);
    }

    # set up the document processor
    $buildproc->set_output_handle ($infodb_handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontdb ($self->{'dontdb'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);
    $buildproc->set_store_metadata_coverage ($self->{'collect_cfg'}->{'store_metadata_coverage'});

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from the database and
    # then adding in the new ones
    $buildproc->zero_reset();

    $buildproc->{'mdprefix_fields'} = {};

    if ($incremental) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj ( @$reconstructed_docs ) {
            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $buildproc->process($doc_obj,undef);
        }
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $buildproc, $self->{'maxdocs'},0, $self->{'gli'});

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $infodb_type, $infodb_handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    ####&classify::print_reverse_lookup($infodb_handle);

    # output doclist
    my @doc_list = $buildproc->get_doc_list();
    my %browselist_infodb = (
        'hastxt'      => [ "0" ],
        'childtype'   => [ "VList" ],
        'numleafdocs' => [ scalar(@doc_list) ],
        'thistype'    => [ "Invisible" ],
        'contains'    => [ join(";", @doc_list) ],
    );
    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", \%browselist_infodb);

    # STDOUT was only borrowed in debug mode, so it is not closed here
    &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle) if !$self->{'debug'};

    print STDERR "</Stage>\n" if $self->{'gli'};
}
495
# Assembles the build configuration (build date, build type, document
# counts, index/subcollection/language maps, unbuilt indexes, ...) and
# writes it to build.cfg in "gs2" mode or buildConfig.xml in "gs3" mode.
sub make_auxiliary_files {
    my ($self) = @_;

    # subclasses may have already defined stuff in here
    my $build_cfg = defined $self->{'build_cfg'} ? $self->{'build_cfg'} : {};

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date and general build settings
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    $build_cfg->{'separate_cjk'} = "true" if $self->{'separate_cjk'};

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any
    # that are not built should not be put into the map.
    # (the foreach loops are deliberate: they cope with an index_mapping
    # that has not been created yet)
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        next if defined ($self->{'notbuilt'}->{$index});
        push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
    }
    if (scalar (@indexmap)) {
        $build_cfg->{'indexmap'} = \@indexmap;
    }

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        my $target = $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection};
        push (@subcollectionmap, "$subcollection\-\>" . $target);
    }
    if (scalar (@subcollectionmap)) {
        $build_cfg->{'subcollectionmap'} = \@subcollectionmap;
    }

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        my $target = $self->{'index_mapping'}->{'languagemap'}->{$language};
        push (@languagemap, "$language\-\>" . $target);
    }
    if (scalar (@languagemap)) {
        $build_cfg->{'languagemap'} = \@languagemap;
    }

    # names of the indexes that were deliberately left unbuilt
    my @notbuilt = keys %{$self->{'notbuilt'}};
    if (scalar (@notbuilt)) {
        $build_cfg->{'notbuilt'} = \@notbuilt;
    }

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    # give subclasses the chance to add their own entries
    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        my $cfg_path = &util::filename_cat($self->{'build_dir'},"build.cfg");
        &colcfg::write_build_cfg($cfg_path, $build_cfg);
    }
    elsif ($gs_mode eq "gs3") {
        my $cfg_path = &util::filename_cat($self->{'build_dir'}, "buildConfig.xml");
        &colcfg::write_build_cfg_xml($cfg_path, $build_cfg, $self->{'collect_cfg_preserve'}, $self->{'disable_OAI'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
573
# Hook called by make_auxiliary_files() just before the build configuration
# is written. The base class does nothing; override it in a subclass to add
# extra entries to the $build_cfg hash.
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;
}
580
581
# Hook that does nothing in the base class.
sub collect_specific {
    my $self = shift;
}
585
# Decides whether an index should be built. Each 'dontbuild' entry of the
# collection configuration is used as an anchored pattern; on the first one
# that matches, the index is recorded in 'notbuilt' and 0 is returned.
# Returns 1 when nothing matches or no 'dontbuild' list is configured.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
601
# Placeholder: the base class complains on STDERR and returns a reference
# to an empty hash. Subclasses are expected to provide the real mapping.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";

    my $mapping = {};
    return $mapping;
}
610
# Returns a two-character processed version of a field name.
#
# - An undefined or whitespace-only field gives "".
# - If the field has two or more comma-separated components, the result is
#   the first letdig (\w) character of each of the first two components.
# - Otherwise the result is the first two letdig characters of the field.
#
# A missing first character is replaced by 'a' and a missing second
# character by '0', so "x" gives "x0" and "!!" gives "a0".
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\S/);

    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # pick the first letdig from the first two field names
        ($first) = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    } else {
        # pick the first two letdig chars; the second one is optional so
        # that a field with a single letdig still keeps that character
        # instead of failing the whole match
        ($first, $second) = $field =~ /^[^\w]*(\w)(?:[^\w]*?(\w))?/;
    }
    # there may not have been any letdigs...
    $first = 'a' unless defined $first;
    $second = '0' unless defined $second;

    return "$first$second";

}
640
# Advances the name behind $nameref, in place, to its next version:
#   - a name ending in two digits has those two digits incremented
#   - a name ending in one digit has it incremented, with 9 becoming 10
#   - any other name has its last character replaced by 0
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        # ++ is applied directly to the captured two-digit string
        my $next = $1;
        $next++;
        $$nameref =~ s/\d\d$/$next/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $digit = $1;
        my $next = ($digit == 9) ? 10 : $digit + 1;
        $$nameref =~ s/\d$/$next/;
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
656
657
658
# Copies the metadata-set statistics held in the buildproc's
# 'mdprefix_fields' into $collection_infodb. For every prefix it appends:
#   "metadataset"                    the prefix
#   "metadatalist-<prefix>"          each field seen under that prefix
#   "metadatafreq-<prefix>-<field>"  the value stored for that field
sub get_collection_meta_sets
{
    my ($self, $collection_infodb) = @_;

    my $mdprefix_fields = $self->{'buildproc'}->{'mdprefix_fields'};

    for my $prefix (keys %$mdprefix_fields)
    {
        push(@{$collection_infodb->{"metadataset"}}, $prefix);

        for my $field (keys %{$mdprefix_fields->{$prefix}})
        {
            my $val = $mdprefix_fields->{$prefix}->{$field};

            push(@{$collection_infodb->{"metadatalist-$prefix"}}, $field);
            push(@{$collection_infodb->{"metadatafreq-$prefix-$field"}}, $val);
        }
    }
}
678
679
# Writes the "collection" entry to the info database. By default the entry
# holds the metadata sets (prefixes) used in the collection, as gathered by
# get_collection_meta_sets().
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);

    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
690
# Reads an existing build configuration. This is needed, for example, when
# the stages of a build are run separately or when building incrementally.
# The file is build.cfg in "gs2" mode and buildConfig.xml otherwise. It is
# looked for in the build directory first and then in
# $ENV{'GSDLCOLLECTDIR'}/index. Returns undef when neither file exists.
sub read_build_cfg {
    my ($self) = @_;

    my $cfgname = ($gs_mode eq "gs2") ? "build.cfg" : "buildConfig.xml";

    my $cfgpath = &util::filename_cat($self->{'build_dir'}, $cfgname);

    unless (-e $cfgpath) {
        # try the index dir - but do we know where it is?? try here
        $cfgpath = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", $cfgname);

        # we cant find a config file - just ignore the field list
        return undef unless -e $cfgpath;
    }

    return &colcfg::read_building_cfg( $cfgpath, $gs_mode);

}
718
# Prints byte statistics for the pass the buildproc has just finished
# (creating an index or compressing text) to the builder's output handle.
# When fewer than 50 bytes were processed and text was expected, an
# incremental build with 0 bytes notes that nothing new was added, and a
# non-incremental build prints a warning.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # text is expected when indexing, or when compressing with text enabled
    my $text_expected = ($indexing_text || !$self->{'no_text'});

    if ($num_processed_bytes < 50 && $text_expected) {

        if ($self->{'incremental'}) {
            if ($num_processed_bytes == 0) {
                my $note = $indexing_text
                    ? "No additional text was added to $index\n"
                    : "No additional text was compressed\n";
                print $outhandle $note;
            }
        }
        else {
            my $warning = $indexing_text
                ? "WARNING: There is very little or no text to process for $index\n"
                : "WARNING: There is very little or no text to compress\n";
            print $outhandle "***************\n";
            print $outhandle $warning;
            print $outhandle " Was this your intention?\n";
            print $outhandle "***************\n";
        }

    }

}
761
762
7631;
764
Note: See TracBrowser for help on using the repository browser.