source: main/trunk/greenstone2/perllib/basebuilder.pm@ 21715

Last change on this file since 21715 was 21607, checked in by mdewsnip, 14 years ago

Changed basebuilder.pm so set_infodbtype() is called on the buildproc object as soon as it is created, instead of just for the infodb phase. This is so the buildproc knows the infodbtype for all phases of the build. Part of making the code less GDBM-specific.

  • Property svn:keywords set to Author Date Id Revision
File size: 24.1 KB
RevLine 
[14930]1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
[15709]34use dbutil;
[14930]35use plugin;
36use util;
37
[15709]38
BEGIN {
    # The autoflush() method lives in IO::Handle. Perl only loads that
    # module on demand from 5.14 onwards, so load it explicitly to keep the
    # calls below working on older interpreters as well.
    require IO::Handle;

    # set autoflush on for STDERR and STDOUT so that mgpp
    # doesn't get out of sync with plugins
    STDOUT->autoflush(1);
    STDERR->autoflush(1);
}

END {
    # switch autoflush off again when the interpreter shuts down
    STDOUT->autoflush(0);
    STDERR->autoflush(0);
}

# NOTE(review): not referenced in the visible part of this file; presumably
# read by subclasses or callers - confirm before changing.
our $maxdocsize = 12000;

# used to signify "gs2"(default) or "gs3"
# (overwritten in new() with the mode reported by colcfg::get_collect_cfg_name)
our $gs_mode = "gs2";
[14930]55
# Constructor. Arguments are positional; $outhandle and $failhandle default
# to STDERR, and $no_text, $gli and $disable_OAI default to 0 when undefined.
# Also reads the collection configuration (setting the package-level
# $gs_mode as a side effect) and works out the info database type.
sub new {
    my $class = shift(@_);
    my ($collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli, $disable_OAI) = @_;

    # fill in defaults for the optional arguments
    if (!defined $outhandle)  { $outhandle  = *STDERR; }
    if (!defined $no_text)    { $no_text    = 0; }
    if (!defined $failhandle) { $failhandle = *STDERR; }
    if (!defined $gli)        { $gli        = 0; }

    # disable_OAI applies to greenstone 3 only and is only passed to
    # &colcfg::write_build_cfg_xml (then buildConfigxml::write_build_cfg_file)
    # when writing the buildConfig.xml
    if (!defined $disable_OAI) { $disable_OAI = 0; }

    # create a builder object
    my %fields = (
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_mode'             => $incremental_mode,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},    # indexes not built
        'gli'                          => $gli,
        'disable_OAI'                  => $disable_OAI,
    );
    my $self = bless \%fields, $class;

    # Read in the collection configuration file.
    my $colcfgname;
    ($colcfgname, $gs_mode) = &colcfg::get_collect_cfg_name($outhandle);
    $self->{'collect_cfg'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);

    if ($gs_mode eq "gs3") {
        # read it in again to save the original form for later writing out
        # of buildConfig.xml
        # we use this preserve object because $self->{'collect_cfg'}->{'classify'}
        # somewhat gets modified during the calling of &classify::load_classifiers.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);
    }

    # get the database type for this collection from the collect.cfg file
    # (may be undefined, in which case the dbutil default is used)
    $self->{'infodbtype'} = $self->{'collect_cfg'}->{'infodbtype'}
        || &dbutil::get_default_infodb_type();

    # load up any dontdb fields (configured under the 'dontgdbm' key)
    $self->{'dontdb'} = {};
    if (defined($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $field (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontdb'}->{$field} = 1;
        }
    }

    $self->{'maxnumeric'} = 4;
    return $self;
}
117
# stuff has been moved here from new, so we can use subclass methods
#
# Second-stage initialisation, to be run on an object returned by new():
#   - calls generate_index_list() (not defined in the visible part of this
#     file; presumably supplied by subclasses), then expands the configured
#     indexes with subcollection and language partitions and removes
#     duplicates
#   - reverts to a non-incremental build if is_incremental_capable() says no
#   - loads the plugins and classifiers named in the collection configuration
#   - locates, loads and instantiates the buildproc class
#   - unless 'debug' or 'keepold' is set, removes and recreates the build
#     directory
# Dies if no plugins were loaded or if the buildproc cannot be created.
sub init {
    my $self = shift(@_);

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    $self->generate_index_list();
    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    if (defined $indexes) {
        # sort out subcollection indexes
        if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
                foreach my $index (@$indexes) {
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
                }
            }
        }

        # sort out language subindexes
        if (defined $self->{'collect_cfg'}->{'languages'}) {
            $indexes = $self->{'collect_cfg'}->{'indexes'};
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
                foreach my $index (@$indexes) {
                    if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
                    }
                    else { # add in an empty subcollection field
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
                    }
                }
            }
        }
    }

    if (defined($self->{'collect_cfg'}->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        # (first occurrence wins, order is preserved)
        my %tmphash = ();
        my @tmparray = @{$self->{'collect_cfg'}->{'indexes'}};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $i (@tmparray) {
            if (!defined ($tmphash{$i})) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, $i);
                $tmphash{$i} = 1;
            }
        }
    } else {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }

    # check incremental against whether builder can cope or not.
    if ($self->{'incremental'} && !$self->is_incremental_capable()) {
        print $outhandle "WARNING: The indexer used is not capable of incremental building. Reverting to -removeold\n";
        $self->{'keepold'} = 0;
        $self->{'incremental'} = 0;
        $self->{'incremental_mode'} = "none";
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $self->{'verbosity'}, $outhandle, $failhandle, \@global_opts, $self->{'incremental_mode'});

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $self->{'build_dir'}, $outhandle);

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc for the builder we are initialising
    my ($buildprocdir, $buildproctype);
    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = $self->default_buildproc();
    }
    require "$buildprocdir/$buildproctype.pm";

    # Instantiate the buildproc class by name. A class-method call on the
    # string held in $buildproctype does the same job as the former string
    # eval with indirect-object syntax, without compiling code at run time.
    # The block eval keeps the old behaviour of re-throwing the stringified
    # error.
    $self->{'buildproc'} = eval {
        $buildproctype->new($self->{'collection'},
                            $self->{'source_dir'}, $self->{'build_dir'},
                            $self->{'keepold'}, $self->{'verbosity'},
                            $self->{'outhandle'});
    };
    die "$@" if $@;

    # We call set_infodbtype() now so the buildproc knows the infodbtype for all phases of the build
    $self->{'buildproc'}->set_infodbtype($self->{'infodbtype'});

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

    if ($self->{'incremental'}) {
        # some classes may need to do some additional initialisation
        $self->init_for_incremental_build();
    }
}
254
# Reports whether this builder supports incremental building.
# The base class answers 'no' (0): it is safer to assume a builder cannot
# build incrementally and let the inherited classes that can override this.
sub is_incremental_capable
{
    my $capable = 0;
    return $capable;
}
263
# Hook called from init() when an incremental build has been requested.
# Does nothing here; implement this in a subclass that needs additional
# initialisation for an incremental build.
sub init_for_incremental_build {
    my $self = shift;
}
269
# Counterpart of init(): hands the loaded plugin list and the buildproc
# object to plugin::deinit.
sub deinit {
    my ($self) = @_;

    return &plugin::deinit($self->{'pluginfo'}, $self->{'buildproc'});
}
275
# Works out the separate_cjk setting from the 'indexoptions' entries of the
# collection configuration (any option containing "separate_cjk" turns it
# on), passes it to the buildproc and stores it on the builder.
sub generate_index_options {
    my ($self) = @_;

    my $separate_cjk = 0;
    my $indexoptions = $self->{'collect_cfg'}->{'indexoptions'};
    if (defined $indexoptions) {
        $separate_cjk = 1 if grep { $_ =~ /separate_cjk/ } @$indexoptions;
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
    # record it for build.cfg
    $self->{'separate_cjk'} = $separate_cjk;
}
293
# Forwards the given setting to the buildproc's method of the same name.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    return $self->{'buildproc'}->set_sections_index_document_metadata($index);
}
300
# Stores the 'maxnumeric' setting on the builder (new() initialises it to 4;
# make_auxiliary_files() writes it into the build configuration).
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and passes it on to the
# buildproc.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    return $self->{'buildproc'}->set_strip_html($strip);
}
314
# Placeholder for the text compression stage: subclasses are expected to
# override it. The base implementation only reports on STDERR that it was
# reached and returns nothing.
#
# $textindex - argument accepted for the benefit of overriding
#              implementations; unused here.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    # terminate the line, as the other "implement in subclass" stubs do, so
    # that whatever is printed next does not run on from this message
    print STDERR "compress_text() should be implemented in subclass!!\n";
    return;
}
322
323
# Builds either the single index named by $indexname (when it contains at
# least one word character) or every index listed in the collection
# configuration. Indexes rejected by want_built() are skipped. Afterwards
# build_indexes_extra() is called.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n";
        }
        if ($self->{'gli'}) {
            print STDERR "<Stage name='Index' source='$index'>\n";
        }
        $self->build_index($index);
    }

    $self->build_indexes_extra();
}
355
# Hook called by build_indexes() once all the indexes have been handled.
# Does nothing here; implement this in a subclass that wants to do extra
# work at the end of index building.
sub build_indexes_extra {
    my $self = shift;
}
362
# Placeholder for building one index; subclasses are expected to override
# it. The base implementation only reports on STDERR that it was reached.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
370
371
372
# Creates the collection's info database: runs every document through the
# buildproc in 'infodb' mode, then writes the collection metadata, the
# classification information and the "browselist" entry.
# In debug mode the records go to STDOUT instead of a database handle.
# Dies if the database write handle cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    # make sure the text and assoc directories exist
    my $text_dir  = &util::filename_cat($self->{'build_dir'}, "text");
    my $assoc_dir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($text_dir);
    &util::mk_all_dir ($assoc_dir);

    # Get info database file path
    my $infodb_type = $self->{'infodbtype'};
    my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $text_dir);

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # for an incremental build, read in the existing database records first
    my $database_recs = undef;
    if ($self->{'incremental'}) {
        $database_recs = {};
        &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $database_recs);
    }

    # Important (for memory usage reasons) that we obtain the filehandle
    # here for writing out to the database, rather than after
    # $reconstructed_docs has been set up (assuming -incremental is on)
    #
    # This is because when we open a pipe to txt2db [using open()]
    # this triggers a fork() followed by exec(). $reconstructed_docs
    # can get very large, and so if we did the open() after this, it means
    # the fork creates a clone of the *large* process image which (admittedly)
    # is then quickly replaced in the execve() with the much smaller image for
    # 'txt2db'. The trouble is, in that seismic second caused by
    # the fork(), the system really does need to have all that memory available
    # even though it isn't ultimately used. The result is an out of memory
    # error.
    my $infodb_handle;
    if ($self->{'debug'}) {
        $infodb_handle = *STDOUT;
    }
    else {
        $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
        unless (defined($infodb_handle)) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }

    my $reconstructed_docs = undef;
    if ($self->{'incremental'}) {
        # reconstruct doc_obj metadata from database for all docs
        $reconstructed_docs
            = &classify::reconstruct_doc_objs_metadata($infodb_type,
                                                       $infodb_file_path,
                                                       $database_recs);
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($infodb_handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assoc_dir);
    $buildproc->set_dontdb ($self->{'dontdb'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);
    $buildproc->set_store_metadata_coverage ($self->{'collect_cfg'}->{'store_metadata_coverage'});

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from the database and
    # then adding in the new ones
    $buildproc->zero_reset();

    $buildproc->{'mdprefix_fields'} = {};

    if ($self->{'incremental'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj ( @$reconstructed_docs ) {
            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $buildproc->process($doc_obj,undef);
        }
    }

    # run the plugins over the source directory
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $buildproc, $self->{'maxdocs'},0, $self->{'gli'});

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $infodb_type, $infodb_handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    ####&classify::print_reverse_lookup($infodb_handle);

    # output doclist
    my @doc_list = $buildproc->get_doc_list();
    my %browselist_infodb = (
        'hastxt'      => [ "0" ],
        'childtype'   => [ "VList" ],
        'numleafdocs' => [ scalar(@doc_list) ],
        'thistype'    => [ "Invisible" ],
        'contains'    => [ join(";", @doc_list) ],
    );
    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", \%browselist_infodb);

    # STDOUT (debug mode) is left open
    &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle) if !$self->{'debug'};

    print STDERR "</Stage>\n" if $self->{'gli'};
}
495
# Assembles the build configuration (build date, build type, document
# counts, index/subcollection/language maps, the indexes that were not
# built, ...) and writes it into the build directory: build.cfg when
# $gs_mode is "gs2", buildConfig.xml when it is "gs3".
# Anything a subclass has already placed in $self->{'build_cfg'} is kept,
# and build_cfg_extra() is called just before writing.
sub make_auxiliary_files {
    my $self = shift (@_);

    # subclasses may have already defined stuff in here
    my $build_cfg = defined $self->{'build_cfg'} ? $self->{'build_cfg'} : {};

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    if ($self->{'separate_cjk'}) {
        $build_cfg->{'separate_cjk'} = "true";
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any that
    # are not built should not be put into the map.
    # (the foreach loops below are kept as loops on purpose: unlike map, they
    # tolerate a missing ...maporder entry in the index mapping)
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
        }
    }
    $build_cfg->{'indexmap'} = \@indexmap if scalar (@indexmap);

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    # the indexes that want_built() rejected
    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg(&util::filename_cat($self->{'build_dir'},"build.cfg"), $build_cfg);
    }
    if ($gs_mode eq "gs3") {
        &colcfg::write_build_cfg_xml(&util::filename_cat($self->{'build_dir'}, "buildConfig.xml"), $build_cfg, $self->{'collect_cfg_preserve'}, $self->{'disable_OAI'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
573
# Hook called by make_auxiliary_files() just before the build configuration
# is written out. Does nothing here; implement this in a subclass that wants
# to add extra entries to $build_cfg (build.cfg).
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;
}
580
581
# Empty hook; does nothing in the base class.
# NOTE(review): no caller is visible in this file - presumably invoked by the
# build scripts and overridden per collection; confirm.
sub collect_specific {
    my $self = shift;
}
585
# Decides whether $index should be built. Each 'dontbuild' entry of the
# collection configuration is used as a regular expression that must match
# the whole index name; on the first match the index is recorded in
# $self->{'notbuilt'} and 0 is returned. Otherwise returns 1.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
601
# Placeholder for creating the mapping between index descriptions and their
# directory names; subclasses are expected to override it. The base
# implementation reports on STDERR that it was reached and returns a
# reference to a new, empty hash.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
610
# returns a processed (two character) version of a field.
# if the field has two or more comma-separated components the result is the
# first letdig (\w) character of each of the first two components;
# otherwise it is the first two letdig characters of the field.
# A character that cannot be found is replaced by a default: 'a' for the
# first position, '0' for the second. Note that a single-component field
# containing fewer than two letdig characters yields "a0", because the
# pattern needs both characters in order to match at all.
# Returns "" if the field is undefined or contains only whitespace.
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\S/);

    # deliberately not called $a/$b: lexicals with those names hide the
    # package variables that sort() uses
    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # pick the first letdig from the first two field names
        ($first) = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    } else {
        # pick the first two letdig chars
        ($first, $second) = $field =~ /^[^\w]*(\w)[^\w]*?(\w)/i;
    }
    # there may not have been any letdigs...
    $first = 'a' unless defined $first;
    $second = '0' unless defined $second;

    return "$first$second";
}
640
# Advances the version suffix of the name referred to by $nameref, editing
# the string in place:
#   - a name ending in two digits has those two digits incremented
#   - a name ending in one digit has it incremented (9 becomes 10)
#   - otherwise the last character of the name is replaced by 0
sub get_next_version {
    my ($self, $nameref) = @_;

    my $version = 0;
    if ($$nameref =~ /(\d\d)$/) {
        # $version holds the captured string, so ++ keeps a leading zero
        # ("08" becomes "09")
        $version = $1;
        $version++;
        $$nameref =~ s/\d\d$/$version/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        $version = $1;
        if ($version == 9) {
            $$nameref =~ s/\d$/10/;
        }
        else {
            $version++;
            $$nameref =~ s/\d$/$version/;
        }
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
656
657
[14934]658
# Adds the metadata sets recorded by the buildproc (its 'mdprefix_fields'
# hash, keyed by prefix and then by field) to $collection_infodb:
#   "metadataset"                  - one entry per prefix
#   "metadatalist-<prefix>"        - one entry per field of that prefix
#   "metadatafreq-<prefix>-<field>" - the value stored for that field
sub get_collection_meta_sets
{
    my ($self, $collection_infodb) = @_;

    my $mdprefix_fields = $self->{'buildproc'}->{'mdprefix_fields'};
    foreach my $prefix (keys %$mdprefix_fields)
    {
        push(@{$collection_infodb->{"metadataset"}}, $prefix);

        foreach my $field (keys %{$mdprefix_fields->{$prefix}})
        {
            my $frequency = $mdprefix_fields->{$prefix}->{$field};

            push(@{$collection_infodb->{"metadatalist-$prefix"}}, $field);
            push(@{$collection_infodb->{"metadatafreq-$prefix-$field"}}, $frequency);
        }
    }
}
[14934]678
679
# default is to output the metadata sets (prefixes) used in collection:
# gathers them with get_collection_meta_sets() and writes them to the info
# database as the "collection" entry.
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    return &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
[14934]690
# sometimes we need to read in an existing build.cfg - for example,
# if doing each stage of building separately, or when doing incremental
# building
#
# Looks for build.cfg ($gs_mode "gs2") or buildConfig.xml (any other mode)
# in the build directory first and then in $GSDLCOLLECTDIR/index.
# Returns whatever colcfg::read_building_cfg gives for the file found, or
# undef if neither location has one.
sub read_build_cfg {
    my ($self) = @_;

    my $buildconfigfilename = ($gs_mode eq "gs2") ? "build.cfg" : "buildConfig.xml";

    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, $buildconfigfilename);

    unless (-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", $buildconfigfilename);

        # we cant find a config file - just ignore the field list
        return undef unless (-e $buildconfigfile);
    }

    return &colcfg::read_building_cfg( $buildconfigfile, $gs_mode);
}
718
# Prints the byte counts reported by the buildproc to the output handle.
# When fewer than 50 bytes were processed (and text was being indexed or is
# wanted at all) an extra note is printed: a quiet remark for incremental
# builds that processed nothing, a prominent warning otherwise.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text   = $buildproc->get_indexing_text();
    my $index           = $buildproc->get_index();
    my $total_bytes     = $buildproc->get_num_bytes();
    my $processed_bytes = $buildproc->get_num_processed_bytes();

    my $headline = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $headline;
    print $outhandle "Total bytes in collection: $total_bytes\n";
    print $outhandle "Total bytes in $index: $processed_bytes\n";

    if ($processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {

        if ($self->{'incremental'}) {
            # nothing new to add is normal for an incremental build
            if ($processed_bytes == 0) {
                if ($indexing_text) {
                    print $outhandle "No additional text was added to $index\n";
                }
                elsif (!$self->{'no_text'}) {
                    print $outhandle "No additional text was compressed\n";
                }
            }
        }
        else {
            print $outhandle "***************\n";
            if ($indexing_text) {
                print $outhandle "WARNING: There is very little or no text to process for $index\n";
            }
            elsif (!$self->{'no_text'}) {
                print $outhandle "WARNING: There is very little or no text to compress\n";
            }
            print $outhandle " Was this your intention?\n";
            print $outhandle "***************\n";
        }

    }

}
761
762
7631;
764
Note: See TracBrowser for help on using the repository browser.