source: gs2-extensions/parallel-building/trunk/src/perllib/basebuilder.pm@ 24668

Last change on this file since 24668 was 24668, checked in by jmt12, 13 years ago

changed warning message when basebuilder asked to generate build recipe (which it can't) to complain that such a function should be implemented in the subclass (similar to warnings elsewhere)

File size: 25.6 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
34use dbutil;
35use plugin;
36use util;
37
38
BEGIN {
    # Switch autoflush on for both standard output streams as early as
    # possible, so that output from mgpp doesn't get out of sync with the
    # output written by the plugins.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

END {
    # Switch autoflush back off on both streams when the program ends.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

# Package-wide maximum document size (not referenced elsewhere in this file;
# units are not established here).
our $maxdocsize = 12000;

# Signifies "gs2" (the default) or "gs3". new() overwrites this with the mode
# reported by colcfg::get_collect_cfg_name().
our $gs_mode = "gs2";
55
# Constructor for a collection builder.
#
# Positional arguments (after the class name): $site, $collection,
# $source_dir, $build_dir, $verbosity, $maxdocs, $debug, $keepold,
# $incremental, $incremental_mode, $remove_empty_classifications,
# $outhandle, $no_text, $failhandle, $gli.
#
# $outhandle and $failhandle default to STDERR, $no_text and $gli default
# to 0. The collection configuration is read here and, as a side effect,
# the package variable $gs_mode is set to the mode reported by
# colcfg::get_collect_cfg_name().
#
# Returns the new blessed builder object.
sub new {
    my $class = shift(@_);
    my ($site, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # defaults for the optional trailing arguments
    $outhandle  = *STDERR unless defined $outhandle;
    $failhandle = *STDERR unless defined $failhandle;
    $no_text    = 0       unless defined $no_text;
    $gli        = 0       unless defined $gli;

    # create a builder object
    my $self = {
        'site'                         => $site, # will be undef for Greenstone 2
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_mode'             => $incremental_mode,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {}, # indexes not built
        'gli'                          => $gli,
    };
    bless $self, $class;

    # Read in the collection configuration file.
    my ($colcfgname);
    ($colcfgname, $gs_mode) = &colcfg::get_collect_cfg_name($outhandle);
    $self->{'collect_cfg'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);

    if ($gs_mode eq "gs3") {
        # Read the configuration a second time so that an unmodified copy is
        # available later when buildConfig.xml is written out: the 'classify'
        # entry of $self->{'collect_cfg'} gets modified while
        # &classify::load_classifiers runs.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);
    }

    # database type for this collection, taken from the configuration when
    # it is set there, otherwise the dbutil default
    $self->{'infodbtype'} = $self->{'collect_cfg'}->{'infodbtype'} || &dbutil::get_default_infodb_type();

    # load up any dontdb fields (configured under the name 'dontgdbm')
    my %dontdb = ();
    if (defined($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $fieldname (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $dontdb{$fieldname} = 1;
        }
    }
    $self->{'dontdb'} = \%dontdb;

    $self->{'maxnumeric'} = 4;
    return $self;
}
114
# Second-stage initialisation. This work was moved out of new() so that it
# can call methods which subclasses override (generate_index_list(),
# default_buildproc() and is_incremental_capable() are all invoked here;
# the first two are not defined in this base class).
#
# In order, this method:
#   - expands the configured indexes by subcollection and by language and
#     removes duplicate index specifications;
#   - falls back to a non-incremental build if the builder cannot build
#     incrementally;
#   - loads the plugins (dies if none load) and the classifiers;
#   - locates, loads and instantiates the buildproc class;
#   - unless debugging or keeping the old build, wipes and recreates the
#     build directory;
#   - calls init_for_incremental_build() for incremental builds.
sub init {
    my $self = shift(@_);

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    $self->generate_index_list();
    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    if (defined $indexes) {
        # sort out subcollection indexes
        if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
                foreach my $index (@$indexes) {
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
                }
            }
        }

        # sort out language subindexes
        if (defined $self->{'collect_cfg'}->{'languages'}) {
            # start from the (possibly subcollection-expanded) list
            $indexes = $self->{'collect_cfg'}->{'indexes'};
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
                foreach my $index (@$indexes) {
                    if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
                    }
                    else { # add in an empty subcollection field
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
                    }
                }
            }
        }
    }

    if (defined($self->{'collect_cfg'}->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        # (the first occurrence wins, order is preserved)
        my %tmphash = ();
        my @tmparray = @{$self->{'collect_cfg'}->{'indexes'}};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $i (@tmparray) {
            if (!defined ($tmphash{$i})) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, $i);
                $tmphash{$i} = 1;
            }
        }
    } else {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }

    # check incremental against whether builder can cope or not.
    if ($self->{'incremental'} && !$self->is_incremental_capable()) {
        print $outhandle "WARNING: The indexer used is not capable of incremental building. Reverting to -removeold\n";
        $self->{'keepold'} = 0;
        $self->{'incremental'} = 0;
        $self->{'incremental_mode'} = "none";
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $self->{'verbosity'}, $outhandle, $failhandle, \@global_opts, $self->{'incremental_mode'});

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $self->{'build_dir'}, $outhandle);

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc for the builder we are initialising
    my $buildprocdir = undef;
    my $buildproctype;

    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildproctype = $self->default_buildproc();
    }
    if (defined $buildprocdir) {
        require "$buildprocdir/$buildproctype.pm";
    }
    else {
        require "$buildproctype.pm";
    }

    # Instantiate the buildproc with an ordinary class-method call on the
    # class name held in $buildproctype. No string eval is needed for this,
    # and the arrow form avoids the parsing ambiguity of "new CLASS(...)".
    # Any error raised by the constructor propagates to our caller.
    $self->{'buildproc'} = $buildproctype->new(
        $self->{'collection'}, $self->{'source_dir'}, $self->{'build_dir'},
        $self->{'keepold'}, $self->{'verbosity'}, $self->{'outhandle'});

    # We call set_infodbtype() now so the buildproc knows the infodbtype for all phases of the build
    $self->{'buildproc'}->set_infodbtype($self->{'infodbtype'});

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

    if ($self->{'incremental'}) {
        # some classes may need to do some additional initialisation
        $self->init_for_incremental_build();
    }

}
257
# Reports whether this builder can build incrementally.
# The base class always answers 0 ("no"): it is safer to assume a builder
# is not incremental and let the subclasses that are override this method.
sub is_incremental_capable
{
    my $incremental_capable = 0;
    return $incremental_capable;
}
266
# Hook called by init() when an incremental build is being done.
# The base class does nothing here; override it in a subclass that needs
# extra initialisation for incremental building.
sub init_for_incremental_build {
    my $self = shift;
}
272
# Counterpart of init(): hands the loaded plugin info and the buildproc
# over to plugin::deinit().
sub deinit {
    my ($self) = @_;

    &plugin::deinit($self->{'pluginfo'}, $self->{'buildproc'});
}
278
# Works out the index options for this build. Currently only one option is
# recognised: if any entry of the configured 'indexoptions' list contains
# "separate_cjk", the flag is switched on. The result is passed to the
# buildproc and also kept on the builder so it can go into build.cfg.
sub generate_index_options {
    my ($self) = @_;

    my $separate_cjk = 0;

    my $configured_options = $self->{'collect_cfg'}->{'indexoptions'};
    if (defined $configured_options) {
        $separate_cjk = 1 if grep { $_ =~ /separate_cjk/ } @{$configured_options};
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
    # record it for build.cfg
    $self->{'separate_cjk'} = $separate_cjk;
}
296
# Forwards the given setting straight to the buildproc's method of the
# same name; nothing is stored on the builder itself.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    return $self->{'buildproc'}->set_sections_index_document_metadata($index);
}
303
# Stores a new 'maxnumeric' value on the builder (new() initialises it to
# 4; make_auxiliary_files() copies it into the build configuration).
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and passes the same value
# on to the buildproc.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    return $self->{'buildproc'}->set_strip_html($strip);
}
317
# Placeholder for the text compression stage: the base class cannot do
# this, so it only warns on STDERR that a subclass should implement it.
#
# $textindex is accepted for interface compatibility with subclass
# implementations and is not used here. Returns nothing.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    # end the warning with a newline (as the other "implement in subclass"
    # stubs do) so later output doesn't run on from the same line
    print STDERR "compress_text() should be implemented in subclass!!\n";
    return;
}
325
326
# Builds the collection's indexes.
#
# If $indexname is given (and contains a word character) only that index
# is built, otherwise every index listed in the collection configuration.
# $indexlevel is accepted but not used by this base implementation.
#
# pre_build_indexes() is called before, and post_build_indexes() after,
# the indexes are processed; the actual work for each wanted index is done
# by build_index().
sub build_indexes {
    my ($self, $indexname, $indexlevel) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes();

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes, skipping those want_built() rejects
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }

    $self->post_build_indexes();
}
360
# Hook that build_indexes() calls before any index is built.
# Does nothing in the base class; override it in a subclass to do extra
# work at that point. $indexname is an optional parameter.
sub pre_build_indexes {
    my $self = shift;
    my ($indexname) = @_; # optional parameter
}
367
# Hook that build_indexes() calls once all the indexes have been processed.
# Does nothing in the base class; override it in a subclass to do extra
# work at the end of index building.
sub post_build_indexes {
    my $self = shift;
}
373
# Placeholder for building a single index: the base class cannot do this,
# so it only warns on STDERR that a subclass should implement it.
# $index is the index specification handed over by build_indexes().
# Returns nothing.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
381
382
383
# Creates the collection's info database.
#
# Outline: make sure the text and assoc directories exist inside the build
# directory, initialise the classifiers, open the database write handle
# (STDOUT when debugging), run every document through the buildproc in
# 'infodb' mode, then write the collection metadata, the classification
# information and the "browselist" entry before closing the handle.
#
# For incremental builds the existing database is read first and the
# documents reconstructed from it are fed back into the classifiers after
# the plugins have run.
#
# Dies if the database write handle cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};
    my $incremental = $self->{'incremental'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # Get info database file path
    my $infodb_type = $self->{'infodbtype'};
    my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $textdir);

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # for incremental builds, pull in the records already in the database
    my $database_recs = undef;
    if ($incremental) {
        $database_recs = {};
        &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $database_recs);
    }

    # IMPORTANT (memory usage): the write handle must be obtained HERE,
    # before $reconstructed_docs is built for an incremental build.
    #
    # Opening a pipe to txt2db [using open()] triggers a fork() followed by
    # an exec(). $reconstructed_docs can get very large, so an open() after
    # it has been set up would fork a clone of the *large* process image.
    # That image is (admittedly) quickly replaced in the execve() by the
    # much smaller one for 'txt2db', but for the instant of the fork() the
    # system really does need all that memory to be available, even though
    # it is never used. The result is an out of memory error.
    my $infodb_handle;
    if ($self->{'debug'}) {
        $infodb_handle = *STDOUT;
    }
    else {
        $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
        unless (defined($infodb_handle)) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }

    # reconstruct doc_obj metadata from database for all docs
    my $reconstructed_docs = undef;
    if ($incremental) {
        $reconstructed_docs
            = &classify::reconstruct_doc_objs_metadata($infodb_type,
                                                       $infodb_file_path,
                                                       $database_recs);
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($infodb_handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontdb ($self->{'dontdb'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);
    $buildproc->set_store_metadata_coverage ($self->{'collect_cfg'}->{'store_metadata_coverage'});

    # A full reset is needed here even for an incremental build, because
    # incremental building works by reconstructing all the docs from the
    # database and then adding in the new ones.
    $buildproc->zero_reset();

    $buildproc->{'mdprefix_fields'} = {};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $buildproc, $self->{'maxdocs'},0, $self->{'gli'});

    if ($incremental) {
        # create flat classify structure, ready for new docs to be added;
        # documents listed in 'dont_process_reconstructed' are left out
        foreach my $doc_obj ( @$reconstructed_docs ) {
            next if defined $buildproc->{'dont_process_reconstructed'}->{$doc_obj->get_OID()};

            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $buildproc->process($doc_obj,undef);
        }
    }

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $infodb_type, $infodb_handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    ####&classify::print_reverse_lookup($infodb_handle);

    # output doclist as the "browselist" entry
    my @doc_list = $self->{'buildproc'}->get_doc_list();
    my $browselist_infodb = {
        'hastxt'      => [ "0" ],
        'childtype'   => [ "VList" ],
        'numleafdocs' => [ scalar(@doc_list) ],
        'thistype'    => [ "Invisible" ],
        'contains'    => [ join(";", @doc_list) ],
    };
    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", $browselist_infodb);

    # in debug mode the handle is STDOUT, which must stay open
    unless ($self->{'debug'}) {
        &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle);
    }

    if ($infodb_type eq "gdbm-txtgz") {
        # remove any plain "gdbm" database file found in the text directory
        my $gdb_infodb_file_path = &dbutil::get_infodb_file_path("gdbm", $self->{'collection'}, $textdir);
        &util::rm($gdb_infodb_file_path) if (-e $gdb_infodb_file_path);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
512
# Writes the build configuration file for the collection: build.cfg in
# "gs2" mode, buildConfig.xml in "gs3" mode (decided by the package
# variable $gs_mode).
#
# The configuration starts from $self->{'build_cfg'} when a subclass has
# already put values there, and is then filled in with the build date,
# build type, index stem, document/section/byte counts, the index,
# subcollection and language maps, the list of indexes not built,
# maxnumeric, the infodb type and the earliestDatestamp needed for OAI.
# build_cfg_extra() is called just before writing so that subclasses can
# add to it.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $build_cfg = {};
    # subclasses may have already defined stuff in here
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    if ($self->{'separate_cjk'}) {
        $build_cfg->{'separate_cjk'} = "true";
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
        }
    }
    $build_cfg->{'indexmap'} = \@indexmap if scalar (@indexmap);

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    # write out the earliestDatestamp information needed for OAI
    # (looked for in the archives directory, or in export if there is no
    # archives directory)
    my $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
    if(!-d $archivedir) {
        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
    }
    my $earliestDatestampFile = &util::filename_cat ($archivedir, "earliestDatestamp");
    my $earliestDatestamp = 0;
    # Three-argument open with a lexical handle: the path is always taken
    # literally as a file to read (no mode characters or whitespace in it
    # are interpreted) and no global bareword handle is used.
    if (open(my $datestamp_fh, '<', $earliestDatestampFile)) {
        {
            # slurp in file as a single line
            local $/ = undef;
            $earliestDatestamp = <$datestamp_fh>;
            #&unicode::ensure_utf8(\$earliestDatestamp); # turn any high bytes that aren't valid utf-8 into utf-8.
        }
        close($datestamp_fh);
    }
    else {
        print $outhandle "Warning: unable to read collection's earliestDatestamp from $earliestDatestampFile.\n";
        print $outhandle "Setting value to 0.\n";
    }
    $build_cfg->{'earliestdatestamp'} = $earliestDatestamp;

    # give subclasses the chance to add their own entries
    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg(&util::filename_cat($self->{'build_dir'},"build.cfg"), $build_cfg);
    }
    if ($gs_mode eq "gs3") {

        &colcfg::write_build_cfg_xml(&util::filename_cat($self->{'build_dir'}, "buildConfig.xml"), $build_cfg, $self->{'collect_cfg_preserve'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
612
# Hook that make_auxiliary_files() calls just before the build
# configuration is written. Does nothing in the base class; override it in
# a subclass to add extra entries to the $build_cfg hash reference.
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;
}
619
620
# Empty hook: does nothing in the base class and is not called from
# anywhere in this file.
sub collect_specific {
    my $self = shift;
}
624
# Decides whether the given index should be built.
#
# Each entry of the configured 'dontbuild' list is used as a pattern that
# must match the whole index name. On the first match the index is
# recorded in $self->{'notbuilt'} and 0 is returned; if nothing matches
# (or no 'dontbuild' list is configured) the result is 1.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $checkstr (@{$dontbuild}) {
        next unless $index =~ /^$checkstr$/;

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
640
# Placeholder for creating the mapping between index descriptions and
# their directory names: the base class only warns on STDERR that a
# subclass should implement it, and returns a reference to an empty hash.
# $indexes is the array reference of index specifications handed over by
# build_indexes().
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
649
# Returns a two-character abbreviation of a field specification.
#
# If the field has two or more comma-separated components, the result is
# the first letdig (\w) character of each of the first two components.
# Otherwise it is the first two letdig characters of the field.
# A character that cannot be found defaults to 'a' (first) or '0' (second);
# note that in the single-component case both defaults are used unless two
# letdig characters are present. A result made of two digits is replaced by
# 'a' followed by the first character, as the Greenstone runtime doesn't
# like digits-only names.
#
# An undefined or all-whitespace field gives the empty string.
sub process_field {
    my ($self, $field) = @_;

    return "" if (!defined ($field) || $field !~ /\S/);

    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar @components > 1) {
        # pick the first letdig from the first two field names
        ($first)  = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    }
    else {
        # pick the first two letdig chars
        ($first, $second) = $field =~ /^[^\w]*(\w)[^\w]*?(\w)/i;
    }

    # there may not have been any letdigs...
    $first  = 'a' unless defined $first;
    $second = '0' unless defined $second;

    my $abbreviation = $first . $second;
    # digits only - Greenstone runtime doesn't like this.
    $abbreviation = "a$first" if ($abbreviation =~ /^\d\d$/);

    return $abbreviation;
}
684
# Advances the name that $nameref points to (a reference to a string) to
# its next version, modifying the string in place:
#   - a name ending in two digits has that number incremented;
#   - a name ending in a single digit has it incremented, with 9 becoming
#     10 (so the name grows by one character);
#   - any other name has its last character replaced by 0.
sub get_next_version {
    my ($self, $nameref) = @_;

    if (${$nameref} =~ /(\d\d)$/) {
        my $version = $1;
        $version++;
        ${$nameref} =~ s/\d\d$/$version/;
    }
    elsif (${$nameref} =~ /(\d)$/) {
        my $digit = $1;
        if ($digit == 9) {
            ${$nameref} =~ s/\d$/10/;
        }
        else {
            $digit++;
            ${$nameref} =~ s/\d$/$digit/;
        }
    }
    else {
        ${$nameref} =~ s/.$/0/;
    }
}
700
701
702
# Adds the metadata set information gathered by the buildproc (its
# 'mdprefix_fields' hash: prefix -> field -> value) to the hash that
# $collection_infodb refers to. For every prefix this pushes
#   - the prefix onto "metadataset",
#   - each of its fields onto "metadatalist-<prefix>",
#   - each field's value onto "metadatafreq-<prefix>-<field>".
sub get_collection_meta_sets
{
    my ($self, $collection_infodb) = @_;

    my $mdprefix_fields = $self->{'buildproc'}->{'mdprefix_fields'};
    foreach my $prefix (keys %$mdprefix_fields)
    {
        push(@{$collection_infodb->{"metadataset"}}, $prefix);

        my $fields_of_prefix = $mdprefix_fields->{$prefix};
        foreach my $field (keys %{$fields_of_prefix})
        {
            push(@{$collection_infodb->{"metadatalist-$prefix"}}, $field);
            push(@{$collection_infodb->{"metadatafreq-$prefix-$field"}}, $fields_of_prefix->{$field});
        }
    }
}
722
723
# Writes the "collection" entry of the info database to $infodb_handle.
# By default that entry holds the metadata sets (prefixes) used in the
# collection, as gathered by get_collection_meta_sets().
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
734
# Reads in an existing build configuration. This is sometimes needed - for
# example when each stage of building is run separately, or when building
# incrementally.
#
# The file is build.cfg in "gs2" mode and buildConfig.xml otherwise. It is
# looked for in the build directory first and then in the "index"
# directory under GSDLCOLLECTDIR. Returns undef if neither exists,
# otherwise whatever colcfg::read_building_cfg() returns for the file.
sub read_build_cfg {
    my ($self) = @_;

    my $buildconfigfilename = ($gs_mode eq "gs2") ? "build.cfg" : "buildConfig.xml";

    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, $buildconfigfilename);

    unless (-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", $buildconfigfilename);

        # we can't find a config file - just ignore the field list
        return undef unless (-e $buildconfigfile);
    }

    return &colcfg::read_building_cfg( $buildconfigfile, $gs_mode);
}
762
# Prints a short statistics report for the stage just run to the builder's
# output handle: a heading that depends on whether text was being indexed
# or compressed, the total bytes in the collection and the bytes processed
# for the current index.
#
# When fewer than 50 bytes were processed (and text was either being
# indexed or is being kept), an extra note follows: for incremental builds
# a plain "nothing added" line when the count is exactly 0, for other
# builds a boxed warning asking whether that was intended.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {

        if (!$self->{'incremental'}) {
            print $outhandle "***************\n";
            if ($indexing_text) {
                print $outhandle "WARNING: There is very little or no text to process for $index\n";
            }
            elsif (!$self->{'no_text'}) {
                print $outhandle "WARNING: There is very little or no text to compress\n";
            }
            print $outhandle " Was this your intention?\n";
            print $outhandle "***************\n";
        }
        elsif ($num_processed_bytes == 0) {
            # incremental build in which nothing new was processed
            if ($indexing_text) {
                print $outhandle "No additional text was added to $index\n";
            }
            elsif (!$self->{'no_text'}) {
                print $outhandle "No additional text was compressed\n";
            }
        }

    }

}
805
# Placeholder for generating a build recipe: the base class cannot do
# this, so it only writes a warning to the builder's output handle saying
# that a subclass should implement it.
sub prepare_build_recipe
{
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};
    # end the warning with a newline (as the other "implement in subclass"
    # stubs do) so later output doesn't run on from the same line
    print $outhandle "WARNING: prepare_build_recipe() should be implemented in subclass!!\n";
}
812
8131;
814
Note: See TracBrowser for help on using the repository browser.