source: main/trunk/greenstone2/perllib/basebuilder.pm@ 23182

Last change on this file since 23182 was 23172, checked in by kjdon, 14 years ago

When using the gdbm-txtgz infodb type, the runtime system generates the gdb version the first time it accesses the collection, and then it doesn't check again. So if we modify the txtgz version, we need to delete the gdb version so that it can be regenerated.

  • Property svn:keywords set to Author Date Id Revision
File size: 24.3 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
34use dbutil;
35use plugin;
36use util;
37
38
BEGIN {
    # Method calls on the STDOUT/STDERR handles are provided by IO::Handle.
    # Perls older than 5.14 do not load it on demand, so load it here
    # instead of relying on some other module having pulled it in.
    require IO::Handle;

    # set autoflush on for STDERR and STDOUT so that mgpp
    # doesn't get out of sync with plugins
    STDOUT->autoflush(1);
    STDERR->autoflush(1);
}

END {
    # switch autoflush back off when the program exits
    STDOUT->autoflush(0);
    STDERR->autoflush(0);
}

# NOTE(review): not referenced anywhere else in this file; presumably read
# by subclasses or other modules - confirm before changing.
our $maxdocsize = 12000;

# used to signify "gs2"(default) or "gs3"
# (overwritten by new() from the collection configuration lookup)
our $gs_mode = "gs2";
55
# Constructor.  Stores the build settings on the object, reads the
# collection configuration (which also sets the package-wide $gs_mode),
# works out the info database type and records the 'dontgdbm' fields.
#
# $outhandle and $failhandle default to STDERR; $no_text and $gli
# default to 0.  Returns the blessed builder object.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # fill in defaults for the optional arguments
    if (!defined $outhandle)  { $outhandle  = *STDERR; }
    if (!defined $no_text)    { $no_text    = 0; }
    if (!defined $failhandle) { $failhandle = *STDERR; }
    if (!defined $gli)        { $gli        = 0; }

    # create a builder object
    my %fields = (
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_mode'             => $incremental_mode,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},    # indexes not built
        'gli'                          => $gli,
    );
    my $self = bless \%fields, $class;

    # Read in the collection configuration file.
    my $colcfgname;
    ($colcfgname, $gs_mode) = &colcfg::get_collect_cfg_name($outhandle);
    $self->{'collect_cfg'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);

    if ($gs_mode eq "gs3") {
        # Read it in a second time to keep an untouched copy for writing
        # out buildConfig.xml later: $self->{'collect_cfg'}->{'classify'}
        # gets modified while &classify::load_classifiers runs.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);
    }

    # database type comes from collect.cfg when set there, otherwise
    # from the dbutil default
    $self->{'infodbtype'} = $self->{'collect_cfg'}->{'infodbtype'}
        || &dbutil::get_default_infodb_type();

    # load up any dontdb fields
    my @dont_fields = ();
    if (defined($self->{'collect_cfg'}->{'dontgdbm'})) {
        @dont_fields = @{$self->{'collect_cfg'}->{'dontgdbm'}};
    }
    $self->{'dontdb'} = { map { $_ => 1 } @dont_fields };

    $self->{'maxnumeric'} = 4;
    return $self;
}
113
# Second-stage initialisation.  This work lives here rather than in new()
# so that it can use methods supplied or overridden by subclasses.
#
# It expands the configured indexes with subcollection and language
# parts, removes duplicate indexes, falls back to a non-incremental build
# when the indexer cannot build incrementally, loads the plugins and
# classifiers, creates the document processor ('buildproc') and, unless
# -debug or -keepold is in force, recreates the build directory.
#
# Dies if no plugins were loaded or the buildproc cannot be loaded/created.
sub init {
    my $self = shift(@_);

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    # generate_index_list() is not defined in this file; it is expected
    # to come from the subclass
    $self->generate_index_list();
    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    if (defined $indexes) {
        # sort out subcollection indexes
        if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
                foreach my $index (@$indexes) {
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
                }
            }
        }

        # sort out language subindexes
        if (defined $self->{'collect_cfg'}->{'languages'}) {
            $indexes = $self->{'collect_cfg'}->{'indexes'};
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
                foreach my $index (@$indexes) {
                    if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
                    }
                    else { # add in an empty subcollection field
                        # the colons are escaped so that "$index::" is not
                        # parsed as a package-qualified variable name
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
                    }
                }
            }
        }
    }

    if (defined($self->{'collect_cfg'}->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        # (the first occurrence wins, order is preserved)
        my %seen = ();
        my @unique = grep { !$seen{$_}++ } @{$self->{'collect_cfg'}->{'indexes'}};
        $self->{'collect_cfg'}->{'indexes'} = \@unique;
    } else {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }

    # check incremental against whether builder can cope or not.
    if ($self->{'incremental'} && !$self->is_incremental_capable()) {
        print $outhandle "WARNING: The indexer used is not capable of incremental building. Reverting to -removeold\n";
        $self->{'keepold'} = 0;
        $self->{'incremental'} = 0;
        $self->{'incremental_mode'} = "none";
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $self->{'verbosity'}, $outhandle, $failhandle, \@global_opts, $self->{'incremental_mode'});

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $self->{'build_dir'}, $outhandle);

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc for the builder we are initialising
    my ($buildprocdir, $buildproctype);
    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        # default_buildproc() is not defined in this file; it is expected
        # to come from the subclass
        $buildproctype = $self->default_buildproc();
    }
    require "$buildprocdir/$buildproctype.pm";

    # Call the constructor as a class method on the package name.  This
    # avoids a string eval and the indirect-object form "new CLASS(...)",
    # which is ambiguous here because this package defines its own new().
    # Any exception raised by the constructor propagates to the caller.
    $self->{'buildproc'} = $buildproctype->new($self->{'collection'},
                                               $self->{'source_dir'},
                                               $self->{'build_dir'},
                                               $self->{'keepold'},
                                               $self->{'verbosity'},
                                               $self->{'outhandle'});

    # We call set_infodbtype() now so the buildproc knows the infodbtype for all phases of the build
    $self->{'buildproc'}->set_infodbtype($self->{'infodbtype'});

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

    if ($self->{'incremental'}) {
        # some classes may need to do some additional initialisation
        $self->init_for_incremental_build();
    }

}
250
# Reports whether this builder can build incrementally.
# The base class always answers 0 ('no'): it is safer to assume a
# non-incremental indexer, and let capable subclasses override this.
sub is_incremental_capable
{
    return 0;
}
259
# Hook called by init() when an incremental build is being done.
# Does nothing here; override it in a subclass that needs extra
# initialisation for incremental building.
sub init_for_incremental_build {
    my $self = shift;
}
265
# Counterpart of init(): hands the loaded plugins and the buildproc
# to &plugin::deinit.
sub deinit {
    my ($self) = @_;

    &plugin::deinit($self->{'pluginfo'}, $self->{'buildproc'});
}
271
# Works out whether 'separate_cjk' is switched on, i.e. whether any entry
# of the collection's 'indexoptions' contains the text "separate_cjk".
# The resulting 0/1 flag is passed to the buildproc and also stored on
# the builder.
sub generate_index_options {
    my ($self) = @_;

    my $separate_cjk = 0;

    if (defined($self->{'collect_cfg'}->{'indexoptions'})) {
        my @matching = grep { $_ =~ /separate_cjk/ } @{$self->{'collect_cfg'}->{'indexoptions'}};
        $separate_cjk = 1 if scalar(@matching) > 0;
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
    # record it for build.cfg
    $self->{'separate_cjk'} = $separate_cjk;
}
289
# Forwards the given setting unchanged to the buildproc's method of the
# same name.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    $self->{'buildproc'}->set_sections_index_document_metadata($index);
}
296
# Stores the 'maxnumeric' value on the builder (new() initialises it to 4).
# The value is later copied into the build configuration.
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the 'strip_html' setting on the builder and passes the same
# value on to the buildproc.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html($strip);
}
310
# Placeholder for text compression: subclasses are expected to override it.
# The base version only reports on STDERR that it is unimplemented and
# returns nothing.
#
# $textindex - received but not used by this base implementation
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    # newline-terminated like the other "implement in subclass" messages,
    # so the text does not run into whatever is printed to STDERR next
    print STDERR "compress_text() should be implemented in subclass!!\n";
    return;
}
318
319
# Builds the collection's indexes.
#
# $indexname - optional; when it contains a word character only that one
#              index is handled, otherwise every index listed in the
#              collection configuration is.
#
# The index-to-directory mapping is created first and stored in
# $self->{'index_mapping'}.  Each index accepted by want_built() is passed
# to build_index(); the others are reported as ignored.  Finally
# build_indexes_extra() is called.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes;
    if (defined $indexname && $indexname =~ /\w/) {
        $indexes = [ $indexname ];
    }
    else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            if ($self->{'verbosity'} >= 1) {
                print $outhandle "\n*** ignoring index $index\n";
            }
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n";
        }
        if ($self->{'gli'}) {
            print STDERR "<Stage name='Index' source='$index'>\n";
        }
        $self->build_index($index);
    }

    $self->build_indexes_extra();

}
351
# Hook called at the very end of build_indexes().  Does nothing here;
# override it in a subclass to do extra work once all the indexes have
# been built.
sub build_indexes_extra {
    my $self = shift;

}
358
# Placeholder for building one index: subclasses are expected to override
# it.  The base version only reports on STDERR that it is unimplemented
# and returns nothing.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
366
367
368
# Creates the collection's info database and processes associated files.
#
# Steps, in order: make the text and assoc directories, initialise the
# classifiers, (incremental builds) read the existing database records,
# open the database write handle (STDOUT in debug mode), (incremental
# builds) reconstruct the existing documents' metadata, run every plugin
# over the source directory through the buildproc in 'infodb' mode,
# re-process the reconstructed documents, then write the collection
# metadata, the classification information and the "browselist" entry.
# For the "gdbm-txtgz" type any existing "gdbm" database file is removed
# afterwards.
#
# Dies if the database write handle cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assoc_dir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($text_dir);
    &util::mk_all_dir ($assoc_dir);

    # Get info database file path
    my $infodb_type = $self->{'infodbtype'};
    my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $text_dir);

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    my $reconstructed_docs = undef;
    my $database_recs = undef;

    if ($self->{'incremental'}) {
        $database_recs = {};

        &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $database_recs);
    }

    # Important (for memory usage reasons): the write handle must be
    # opened HERE, before $reconstructed_docs is built up (when
    # -incremental is on).
    #
    # Opening a pipe to txt2db [using open()] triggers a fork() followed
    # by exec().  $reconstructed_docs can get very large, so doing the
    # open() afterwards would fork a clone of that *large* process image.
    # The clone is quickly replaced by the much smaller 'txt2db' image in
    # the execve(), but for that moment the system really does need all
    # that memory to be available, and the result is an out of memory
    # error.
    my $infodb_handle;
    if ($self->{'debug'}) {
        $infodb_handle = *STDOUT;
    }
    else {
        $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
        if (!defined($infodb_handle)) {
            if ($self->{'gli'}) {
                print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n";
            }
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }

    if ($self->{'incremental'}) {
        # reconstruct doc_obj metadata from database for all docs
        $reconstructed_docs = &classify::reconstruct_doc_objs_metadata($infodb_type,
                                                                       $infodb_file_path,
                                                                       $database_recs);
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($infodb_handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assoc_dir);
    $buildproc->set_dontdb ($self->{'dontdb'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);
    $buildproc->set_store_metadata_coverage ($self->{'collect_cfg'}->{'store_metadata_coverage'});

    # make_infodatabase needs a full reset even for an incremental build,
    # as incremental works by reconstructing all docs from the database
    # and then adding in the new ones
    $buildproc->zero_reset();

    $buildproc->{'mdprefix_fields'} = {};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $buildproc, $self->{'maxdocs'},0, $self->{'gli'});

    if ($self->{'incremental'}) {
        # create flat classify structure, ready for new docs to be added;
        # documents flagged in 'dont_process_reconstructed' are left out
        foreach my $doc_obj ( @$reconstructed_docs ) {
            next if defined $buildproc->{'dont_process_reconstructed'}->{$doc_obj->get_OID()};

            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $buildproc->process($doc_obj,undef);
        }
    }

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $infodb_type, $infodb_handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    ####&classify::print_reverse_lookup($infodb_handle);

    # output doclist
    my @doc_list = $buildproc->get_doc_list();
    my %browselist_infodb = (
        'hastxt'      => [ "0" ],
        'childtype'   => [ "VList" ],
        'numleafdocs' => [ scalar(@doc_list) ],
        'thistype'    => [ "Invisible" ],
        'contains'    => [ join(";", @doc_list) ],
    );
    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", \%browselist_infodb);

    # in debug mode the handle is STDOUT, which must stay open
    if (!$self->{'debug'}) {
        &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle);
    }

    if ($infodb_type eq "gdbm-txtgz") {
        # remove any existing gdbm file for this collection
        my $gdb_infodb_file_path = &dbutil::get_infodb_file_path("gdbm", $self->{'collection'}, $text_dir);
        if (-e $gdb_infodb_file_path) {
            &util::rm($gdb_infodb_file_path);
        }
    }

    if ($self->{'gli'}) {
        print STDERR "</Stage>\n";
    }
}
497
# Writes the build configuration file into the build directory:
# build.cfg when $gs_mode is "gs2", buildConfig.xml when it is "gs3".
#
# The configuration starts from $self->{'build_cfg'} if a subclass has
# already set that up, and is filled in with the build date and type,
# index stem, document/section/byte counts from the buildproc, the
# index, subcollection and language maps, the list of indexes that were
# not built, maxnumeric and the infodb type.  build_cfg_extra() is called
# just before writing so subclasses can add more.
sub make_auxiliary_files {
    my ($self) = @_;

    # subclasses may have already defined stuff in here
    my $build_cfg = {};
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating auxiliary files \n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreatingAuxilary'>\n";
    }

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    if ($self->{'separate_cjk'}) {
        $build_cfg->{'separate_cjk'} = "true";
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any
    # that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        next if defined ($self->{'notbuilt'}->{$index});
        push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
    }
    if (scalar (@indexmap)) {
        $build_cfg->{'indexmap'} = \@indexmap;
    }

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    if (scalar (@subcollectionmap)) {
        $build_cfg->{'subcollectionmap'} = \@subcollectionmap;
    }

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    if (scalar (@languagemap)) {
        $build_cfg->{'languagemap'} = \@languagemap;
    }

    # names of the indexes that want_built() rejected
    my @notbuilt = keys %{$self->{'notbuilt'}};
    if (scalar (@notbuilt)) {
        $build_cfg->{'notbuilt'} = \@notbuilt;
    }

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg(&util::filename_cat($self->{'build_dir'},"build.cfg"), $build_cfg);
    }
    if ($gs_mode eq "gs3") {
        &colcfg::write_build_cfg_xml(&util::filename_cat($self->{'build_dir'}, "buildConfig.xml"), $build_cfg, $self->{'collect_cfg_preserve'});
    }

    if ($self->{'gli'}) {
        print STDERR "</Stage>\n";
    }
}
575
# Hook called by make_auxiliary_files() just before the build
# configuration is written.  Does nothing here; override it in a subclass
# to add extra entries to $build_cfg.
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;

}
582
583
# Empty in the base class.
# NOTE(review): presumably a hook for collection-specific processing in
# subclasses - nothing in this file calls it; confirm against callers.
sub collect_specific {
    my $self = shift;
}
587
# Decides whether the given index should be built.
#
# Each entry of the collection's 'dontbuild' list is used as a regular
# expression that must match the whole index name.  On the first match
# the index is recorded in $self->{'notbuilt'} and 0 is returned;
# otherwise the result is 1.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $checkstr (@$dontbuild) {
        next unless $index =~ /^$checkstr$/;

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
603
# Placeholder for creating the index-name to directory-name mapping:
# subclasses are expected to override it.  The base version reports on
# STDERR that it is unimplemented and returns a reference to an empty hash.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
612
# Returns a two-character processed version of a field name.
#
# - An undefined or all-whitespace field gives "".
# - If the field has two or more comma-separated components, the result
#   is the first letdig (\w) character of each of the first two components.
# - Otherwise it is the first two letdig characters of the field, which
#   need not be adjacent.
# - A character that cannot be found is replaced by a default: 'a' for
#   the first, '0' for the second.  Note that a single-component field
#   with fewer than two letdig characters fails to match as a whole, so
#   it comes out as "a0".
# - An all-digit result is rewritten as 'a' followed by its first digit,
#   because the Greenstone runtime doesn't like digits-only names.
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\S/);

    # deliberately not called $a/$b: lexicals with those names would
    # shadow the package variables that sort() relies on
    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # pick the first letdig from the first two field names
        ($first) = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    } else {
        # pick the first two letdig chars
        ($first, $second) = $field =~ /^[^\w]*(\w)[^\w]*?(\w)/;
    }
    # there may not have been any letdigs...
    $first = 'a' unless defined $first;
    $second = '0' unless defined $second;

    my $newfield = "$first$second";
    if ($newfield =~ /^\d\d$/) {
        # digits only - Greenstone runtime doesn't like this.
        $newfield = "a$first";
    }
    return $newfield;

}
647
# Moves the name referenced by $nameref on to its next version, editing
# the string in place:
#   - a name ending in two digits has those two digits incremented;
#   - a name ending in one digit has that digit incremented (9 becomes 10);
#   - any other name has its last character replaced by 0.
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $version = $1;
        $version++;
        $$nameref =~ s/\d\d$/$version/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        # a single digit plus one is 1..10, which covers the 9 -> 10 step
        my $version = $1 + 1;
        $$nameref =~ s/\d$/$version/;
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
663
664
665
# Copies the metadata-set usage held in the buildproc's 'mdprefix_fields'
# (prefix => { field => value }) into $collection_infodb.  For each prefix
# this appends:
#   "metadataset"                   - the prefix
#   "metadatalist-<prefix>"         - each field recorded for the prefix
#   "metadatafreq-<prefix>-<field>" - the value stored for that field
sub get_collection_meta_sets
{
    my ($self, $collection_infodb) = @_;

    my $mdprefix_fields = $self->{'buildproc'}->{'mdprefix_fields'};
    for my $prefix (keys %$mdprefix_fields) {
        push(@{$collection_infodb->{"metadataset"}}, $prefix);

        for my $field (keys %{$mdprefix_fields->{$prefix}}) {
            push(@{$collection_infodb->{"metadatalist-$prefix"}}, $field);
            push(@{$collection_infodb->{"metadatafreq-$prefix-$field"}},
                 $mdprefix_fields->{$prefix}->{$field});
        }
    }
}
685
686
# Writes the "collection" entry to the info database.  By default the
# entry holds the metadata sets (prefixes) used in the collection, as
# gathered by get_collection_meta_sets().
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
697
# Reads an existing build configuration - needed, for example, when each
# stage of building is run separately, or for incremental building.
#
# The file is build.cfg when $gs_mode is "gs2" and buildConfig.xml
# otherwise.  It is looked for in the build directory first and then in
# $ENV{'GSDLCOLLECTDIR'}/index.  Returns whatever
# &colcfg::read_building_cfg returns, or undef when no file is found.
sub read_build_cfg {
    my ($self) = @_;

    my $buildconfigfilename = ($gs_mode eq "gs2") ? "build.cfg" : "buildConfig.xml";

    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, $buildconfigfilename);

    if (!-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", $buildconfigfilename);

        # we cant find a config file - just ignore the field list
        return undef if (!-e $buildconfigfile);
    }

    return &colcfg::read_building_cfg( $buildconfigfile, $gs_mode);

}
725
# Prints byte statistics for the buildproc's current pass to the output
# handle: a heading (index creation or text compression), the total bytes
# in the collection and the bytes processed for the index.
#
# When fewer than 50 bytes were processed (and text is wanted at all) a
# warning banner is added for a normal build; for an incremental build
# there is only a note, and only when nothing at all was processed.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    if ($indexing_text) {
        print $outhandle "Stats (Creating index $index)\n";
    }
    else {
        print $outhandle "Stats (Compressing text from $index)\n";
    }
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {

        # Inside this branch "not indexing text" already implies that
        # text is being kept, so a plain else covers the compression case.
        if (!$self->{'incremental'}) {
            print $outhandle "***************\n";
            if ($indexing_text) {
                print $outhandle "WARNING: There is very little or no text to process for $index\n";
            }
            else {
                print $outhandle "WARNING: There is very little or no text to compress\n";
            }
            print $outhandle " Was this your intention?\n";
            print $outhandle "***************\n";
        }
        elsif ($num_processed_bytes == 0) {
            if ($indexing_text) {
                print $outhandle "No additional text was added to $index\n";
            }
            else {
                print $outhandle "No additional text was compressed\n";
            }
        }

    }

}
768
769
7701;
771
Note: See TracBrowser for help on using the repository browser.