source: main/trunk/greenstone2/perllib/basebuilder.pm@ 24460

Last change on this file since 24460 was 24460, checked in by davidb, 13 years ago

Code changes to support indexers that are provided through the extension mechanism

  • Property svn:keywords set to Author Date Id Revision
File size: 25.4 KB
RevLine 
[14930]1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
[15709]34use dbutil;
[14930]35use plugin;
36use util;
37
[15709]38
BEGIN {
    # Method calls on the bareword handles below only resolve once
    # IO::Handle has been loaded; perls older than 5.14 do not load it
    # on demand, so pull it in explicitly.
    require IO::Handle;

    # set autoflush on for STDERR and STDOUT so that mgpp
    # doesn't get out of sync with plugins
    STDOUT->autoflush(1);
    STDERR->autoflush(1);
}

END {
    # switch autoflush back off when the interpreter shuts down
    STDOUT->autoflush(0);
    STDERR->autoflush(0);
}

our $maxdocsize = 12000;

# used to signify "gs2"(default) or "gs3"
our $gs_mode = "gs2";
[14930]55
# Construct a builder object for one collection.
#
# All arguments are positional.  $outhandle and $failhandle default to
# STDERR; $no_text and $gli default to 0.  The collection configuration is
# read here (which also sets the package-level $gs_mode), the info database
# type is resolved, and the 'dontgdbm' configuration entries are turned into
# the 'dontdb' lookup hash.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # defaults for the optional trailing arguments
    $outhandle  = *STDERR unless defined $outhandle;
    $no_text    = 0       unless defined $no_text;
    $failhandle = *STDERR unless defined $failhandle;
    $gli        = 0       unless defined $gli;

    # create a builder object
    my %fields = (
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_mode'             => $incremental_mode,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},    # indexes not built
        'gli'                          => $gli,
    );
    my $self = bless \%fields, $class;

    # Read in the collection configuration file.
    my $colcfgname;
    ($colcfgname, $gs_mode) = &colcfg::get_collect_cfg_name($outhandle);
    $self->{'collect_cfg'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);

    if ($gs_mode eq "gs3") {
        # Read it in a second time to keep the original form for writing
        # out buildConfig.xml later: $self->{'collect_cfg'}->{'classify'}
        # gets modified while &classify::load_classifiers runs.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);
    }

    # database type comes from the collection configuration when it is set
    # there, otherwise the dbutil default is used
    my $configured_dbtype = $self->{'collect_cfg'}->{'infodbtype'};
    $self->{'infodbtype'} = $configured_dbtype || &dbutil::get_default_infodb_type();

    # load up any dontdb fields
    my %dontdb = ();
    my $dontgdbm = $self->{'collect_cfg'}->{'dontgdbm'};
    if (defined $dontgdbm) {
        $dontdb{$_} = 1 foreach @$dontgdbm;
    }
    $self->{'dontdb'} = \%dontdb;

    $self->{'maxnumeric'} = 4;
    return $self;
}
113
# Second-stage initialisation, kept separate from new() so that subclass
# methods (generate_index_list, is_incremental_capable, default_buildproc,
# generate_index_options, init_for_incremental_build) can be used.
#
# Steps, in order:
#   - expand the configured indexes by subcollection and language, and drop
#     duplicate entries
#   - fall back to a non-incremental build if the builder cannot do one
#   - load the plugins (dies if none could be loaded) and the classifiers
#   - locate, load and instantiate the buildproc class
#   - unless debugging or keeping the old build, wipe and recreate build_dir
sub init {
    my $self = shift(@_);

    my $outhandle  = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    $self->generate_index_list();
    my $cfg = $self->{'collect_cfg'};

    my $indexes = $cfg->{'indexes'};
    if (defined $indexes) {
        # sort out subcollection indexes
        if (defined $cfg->{'indexsubcollections'}) {
            $cfg->{'indexes'} = [];
            foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
                foreach my $index (@$indexes) {
                    push (@{$cfg->{'indexes'}}, "${index}:${subcollection}");
                }
            }
        }

        # sort out language subindexes
        if (defined $cfg->{'languages'}) {
            $indexes = $cfg->{'indexes'};
            $cfg->{'indexes'} = [];
            # with no subcollections an empty subcollection field is added,
            # hence the doubled separator
            my $separator = defined ($cfg->{'indexsubcollections'}) ? ":" : "::";
            foreach my $language (@{$cfg->{'languages'}}) {
                foreach my $index (@$indexes) {
                    push (@{$cfg->{'indexes'}}, $index . $separator . $language);
                }
            }
        }
    }

    # make sure that the same index isn't specified more than once
    # (the first occurrence wins, order is preserved)
    my @listed = defined ($cfg->{'indexes'}) ? @{$cfg->{'indexes'}} : ();
    my %seen = ();
    $cfg->{'indexes'} = [];
    foreach my $index (@listed) {
        next if defined $seen{$index};
        push (@{$cfg->{'indexes'}}, $index);
        $seen{$index} = 1;
    }

    # check incremental against whether builder can cope or not.
    if ($self->{'incremental'} && !$self->is_incremental_capable()) {
        print $outhandle "WARNING: The indexer used is not capable of incremental building. Reverting to -removeold\n";
        $self->{'keepold'} = 0;
        $self->{'incremental'} = 0;
        $self->{'incremental_mode'} = "none";
    }

    # get the list of plugins for this collection
    my $plugins = defined $cfg->{'plugin'} ? $cfg->{'plugin'} : [];

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $cfg->{'separate_cjk'} && $cfg->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $self->{'verbosity'}, $outhandle, $failhandle, \@global_opts, $self->{'incremental_mode'});

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = defined $cfg->{'classify'} ? $cfg->{'classify'} : [];

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $self->{'build_dir'}, $outhandle);

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc for the builder we are initialising
    my $buildprocdir = undef;
    my $buildproctype;

    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildproctype = $self->default_buildproc();
    }

    if (defined $buildprocdir) {
        require "$buildprocdir/$buildproctype.pm";
    }
    else {
        require "$buildproctype.pm";
    }

    # Instantiate through an explicit class-method call.  The indirect
    # "new CLASS(...)" form inside a string eval relies on parser heuristics
    # (and this package has its own 'new'), so it is avoided here.  Errors
    # are still re-thrown in stringified form, as before.
    eval {
        $self->{'buildproc'} = $buildproctype->new($self->{'collection'},
                                                   $self->{'source_dir'},
                                                   $self->{'build_dir'},
                                                   $self->{'keepold'},
                                                   $self->{'verbosity'},
                                                   $self->{'outhandle'});
    };
    die "$@" if $@;

    # We call set_infodbtype() now so the buildproc knows the infodbtype for all phases of the build
    $self->{'buildproc'}->set_infodbtype($self->{'infodbtype'});

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

    if ($self->{'incremental'}) {
        # some classes may need to do some additional initialisation
        $self->init_for_incremental_build();
    }

}
256
# Says whether this builder can do incremental builds.
# The base class conservatively answers 'no' (0); builders that do support
# incremental building override this in their own class.
sub is_incremental_capable
{
    my $capable = 0;
    return $capable;
}
265
# Hook called by init() when an incremental build was requested.
# Does nothing here; override it in a subclass that needs additional
# initialisation for an incremental build.
sub init_for_incremental_build {
    my $self = shift;
}
271
# Hands the loaded plugins and the buildproc to plugin::deinit.
sub deinit {
    my ($self) = @_;

    my $pluginfo  = $self->{'pluginfo'};
    my $buildproc = $self->{'buildproc'};
    &plugin::deinit($pluginfo, $buildproc);
}
277
# Works out the index options from the collection configuration.
# Currently only 'separate_cjk' is looked for: it is switched on when any
# entry of 'indexoptions' contains that text.  The result is passed to the
# buildproc and remembered on the builder for build.cfg.
sub generate_index_options {
    my ($self) = @_;

    my $separate_cjk = 0;
    my $options = $self->{'collect_cfg'}->{'indexoptions'};
    if (defined $options) {
        $separate_cjk = 1 if grep { /separate_cjk/ } @$options;
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
    # record it for build.cfg
    $self->{'separate_cjk'} = $separate_cjk;
}
295
# Passes the sections_index_document_metadata setting on to the buildproc.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    return $self->{'buildproc'}->set_sections_index_document_metadata($index);
}
302
# Records the maxnumeric setting on the builder (it is later written to
# the build configuration by make_auxiliary_files).
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and passes it on to the
# buildproc.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html($strip);
}
316
# Placeholder: text compression is builder specific, so subclasses are
# expected to override this.  The base version only reports that on STDERR.
#
# $textindex - accepted for interface compatibility; unused here.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    # newline-terminated like the other "implement in subclass" messages in
    # this file, so the message doesn't run into later output
    print STDERR "compress_text() should be implemented in subclass!!\n";
    return;
}
324
325
# Builds the indexes of the collection.
#
# $indexname - optional; when it contains a word character only that index
#              is built, otherwise every index in the collection
#              configuration is.
#
# pre_build_indexes() is called first and post_build_indexes() last.
# Indexes rejected by want_built() are reported and skipped.
sub build_indexes {
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    # pre_build_indexes takes the index name as an optional parameter, so
    # hand over the one we were given
    $self->pre_build_indexes($indexname);

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if ($self->want_built($index)) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
            print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
            $self->build_index($index);
        } else {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
        }
    }

    $self->post_build_indexes();

}
359
# Hook run by build_indexes() before any index is built.
# Does nothing here; implement it in a subclass to do extra work before
# building all the indexes.
sub pre_build_indexes {
    my $self = shift;
    my ($indexname) = @_;    # optional parameter
}
366
# Hook run by build_indexes() after all the indexes have been dealt with.
# Does nothing here; implement it in a subclass to do extra work at the end
# of building all the indexes.
sub post_build_indexes {
    my $self = shift;
}
372
# Placeholder for building a single index; the real work is builder
# specific and belongs in a subclass.  The base version only says so on
# STDERR.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
380
381
382
# Creates the collection's info database and processes associated files.
#
# The plugins are run over the source directory with the buildproc in
# 'infodb' mode; afterwards the collection metadata, the classifier
# information and the "browselist" entry are written out.  For an
# incremental build the existing database is read first and the documents
# reconstructed from it are fed through the buildproc again so that they
# end up in the classifiers as well.  In debug mode everything is written
# to STDOUT instead of a database.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    # make sure the text and assoc directories exist
    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # Get info database file path
    my $infodb_type = $self->{'infodbtype'};
    my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $textdir);

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    my $reconstructed_docs = undef;
    my $database_recs = undef;

    if ($self->{'incremental'}) {
        # pull in what the existing database holds
        $database_recs = {};
        &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $database_recs);
    }

    # Important (for memory usage reasons): the write handle is obtained
    # here, BEFORE $reconstructed_docs is set up (assuming -incremental is
    # on).
    #
    # Opening a pipe to txt2db [using open()] triggers a fork() followed by
    # exec().  $reconstructed_docs can get very large, so doing the open()
    # afterwards would make the fork clone the *large* process image, which
    # (admittedly) is then quickly replaced in the execve() with the much
    # smaller image for 'txt2db'.  The trouble is that for that moment the
    # system really does need all that memory to be available, even though
    # it isn't ultimately used, and the result is an out of memory error.
    my $infodb_handle;
    if ($self->{'debug'}) {
        $infodb_handle = *STDOUT;
    }
    else {
        $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
        unless (defined($infodb_handle)) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }

    if ($self->{'incremental'}) {
        # reconstruct doc_obj metadata from database for all docs
        $reconstructed_docs
            = &classify::reconstruct_doc_objs_metadata($infodb_type,
                                                       $infodb_file_path,
                                                       $database_recs);
    }

    # set up the document processor
    $buildproc->set_output_handle ($infodb_handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontdb ($self->{'dontdb'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);
    $buildproc->set_store_metadata_coverage ($self->{'collect_cfg'}->{'store_metadata_coverage'});

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from the database and
    # then adding in the new ones
    $buildproc->zero_reset();

    $buildproc->{'mdprefix_fields'} = {};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $buildproc, $self->{'maxdocs'},0, $self->{'gli'});

    if ($self->{'incremental'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj ( @$reconstructed_docs ) {
            # documents flagged in dont_process_reconstructed are left out
            next if defined $buildproc->{'dont_process_reconstructed'}->{$doc_obj->get_OID()};

            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $buildproc->process($doc_obj,undef);
        }
    }

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $infodb_type, $infodb_handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    ####&classify::print_reverse_lookup($infodb_handle);

    # output doclist
    my @doc_list = $buildproc->get_doc_list();
    my %browselist_infodb = (
        'hastxt'      => [ "0" ],
        'childtype'   => [ "VList" ],
        'numleafdocs' => [ scalar(@doc_list) ],
        'thistype'    => [ "Invisible" ],
        'contains'    => [ join(";", @doc_list) ],
    );
    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", \%browselist_infodb);

    # in debug mode the handle is STDOUT and is left open
    unless ($self->{'debug'}) {
        &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle);
    }

    if ($infodb_type eq "gdbm-txtgz") {
        # remove a gdbm database file if one exists at the gdbm path
        my $gdb_infodb_file_path = &dbutil::get_infodb_file_path("gdbm", $self->{'collection'}, $textdir);
        if (-e $gdb_infodb_file_path) {
            &util::rm($gdb_infodb_file_path);
        }
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
511
# Writes the build configuration file for the collection.
#
# Starts from $self->{'build_cfg'} when a subclass has already filled one
# in, adds the build date, build type, index stem, document/section/byte
# counts, the index, subcollection and language maps, the list of indexes
# that were not built, maxnumeric, the info database type and the
# earliestDatestamp needed for OAI, lets build_cfg_extra() add to it, and
# then writes build.cfg (gs2) or buildConfig.xml (gs3) into the build
# directory.
sub make_auxiliary_files {
    my $self = shift (@_);

    # subclasses may have already defined stuff in here
    my $build_cfg = {};
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    if ($self->{'separate_cjk'}) {
        $build_cfg->{'separate_cjk'} = "true";
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
        }
    }
    $build_cfg->{'indexmap'} = \@indexmap if scalar (@indexmap);

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    # write out the earliestDatestamp information needed for OAI
    # (looked for in the archives directory, or in export if there is no
    # archives directory)
    my $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
    if(!-d $archivedir) {
        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
    }
    my $earliestDatestampFile = &util::filename_cat ($archivedir, "earliestDatestamp");
    my $earliestDatestamp = 0;
    # three-argument open with a lexical handle: the path is never
    # interpreted as an open mode and no global filehandle is clobbered
    if (open(my $fin, '<', $earliestDatestampFile)) {
        {
            # slurp in file as a single line
            local $/ = undef;
            $earliestDatestamp = <$fin>;
            #&unicode::ensure_utf8(\$earliestDatestamp); # turn any high bytes that aren't valid utf-8 into utf-8.
        }
        close($fin);
    }
    else {
        print $outhandle "Warning: unable to read collection's earliestDatestamp from $earliestDatestampFile.\n";
        print $outhandle "Setting value to 0.\n";
    }
    $build_cfg->{'earliestdatestamp'} = $earliestDatestamp;

    # give subclasses the chance to add their own entries
    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg(&util::filename_cat($self->{'build_dir'},"build.cfg"), $build_cfg);
    }
    if ($gs_mode eq "gs3") {

        &colcfg::write_build_cfg_xml(&util::filename_cat($self->{'build_dir'}, "buildConfig.xml"), $build_cfg, $self->{'collect_cfg_preserve'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
611
# Hook called by make_auxiliary_files() just before the build configuration
# is written.  Does nothing here; implement it in a subclass to add extra
# entries to $build_cfg.
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;

}
618
619
# Empty in the base class.
# NOTE(review): presumably a hook for collection-specific processing in
# subclasses - nothing in this file calls it; confirm against the callers.
sub collect_specific {
    my $self = shift;
}
623
# Decides whether $index should be built.
# Every entry of the 'dontbuild' configuration list is used as a pattern
# that has to match the whole index name.  On the first match the index is
# recorded in $self->{'notbuilt'} and 0 is returned; otherwise 1.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    if (defined ($dontbuild)) {
        foreach my $checkstr (@$dontbuild) {
            next unless $index =~ /^$checkstr$/;

            $self->{'notbuilt'}->{$index} = 1;
            return 0;
        }
    }

    return 1;
}
639
# Placeholder for mapping index descriptions to directory names; the real
# mapping is builder specific and belongs in a subclass.  The base version
# says so on STDERR and returns a reference to an empty hash.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    my %empty_mapping = ();
    return \%empty_mapping;
}
648
# Returns a two character short form of a field specification.
#
# - An undefined or all-whitespace field gives "".
# - With two or more comma separated components, the short form is the
#   first letdig (\w) character of each of the first two components.
# - Otherwise it is the first two letdig characters of the field.
# - A missing first character falls back to 'a', a missing second one
#   to '0'.
# - If the result would be two digits it becomes 'a' followed by the first
#   character, since the Greenstone runtime doesn't like digits-only names.
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\S/);

    # deliberately not called $a/$b: lexical variables of those names
    # shadow the package variables sort() uses
    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # pick the first letdig from the first two field names
        ($first) = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    } else {
        # pick the first two letdig chars
        ($first, $second) = $field =~ /^[^\w]*(\w)[^\w]*?(\w)/i;
    }
    # there may not have been any letdigs...
    $first = 'a' unless defined $first;
    $second = '0' unless defined $second;

    my $newfield = "$first$second";
    if ($newfield =~ /^\d\d$/) {
        # digits only - Greenstone runtime doesn't like this.
        $newfield = "a$first";
    }
    return $newfield;

}
683
# Moves the name referenced by $nameref on to its next version, in place:
#   - a name ending in two digits has that two digit suffix incremented
#   - a name ending in a single digit has that digit incremented (9 -> 10)
#   - any other name has its last character replaced by 0
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        # string increment, so a leading zero survives ("08" -> "09")
        my $suffix = $1;
        $suffix++;
        $$nameref =~ s/\d\d$/$suffix/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $next_digit = $1 + 1;
        $$nameref =~ s/\d$/$next_digit/;
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
699
700
[14934]701
# Adds the metadata sets seen by the buildproc to $collection_infodb.
# For every prefix in the buildproc's 'mdprefix_fields' the prefix is
# appended to "metadataset"; for every field under that prefix the field is
# appended to "metadatalist-<prefix>" and its stored value to
# "metadatafreq-<prefix>-<field>".
sub get_collection_meta_sets
{
    my ($self, $collection_infodb) = @_;

    my $mdprefix_fields = $self->{'buildproc'}->{'mdprefix_fields'};
    for my $prefix (keys %$mdprefix_fields)
    {
        push(@{$collection_infodb->{"metadataset"}}, $prefix);

        my $fields_of_prefix = $mdprefix_fields->{$prefix};
        for my $field (keys %{$fields_of_prefix})
        {
            push(@{$collection_infodb->{"metadatalist-$prefix"}}, $field);
            push(@{$collection_infodb->{"metadatafreq-$prefix-$field"}}, $fields_of_prefix->{$field});
        }
    }
}
[14934]721
722
# Writes the "collection" entry to the info database.
# By default that entry holds the metadata sets (prefixes) used in the
# collection, as gathered by get_collection_meta_sets().
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
[14934]733
# Reads an existing build configuration - needed, for example, when the
# stages of building are run separately, or for incremental building.
#
# The file is build.cfg in gs2 mode and buildConfig.xml otherwise.  It is
# looked for in the build directory first and then in the collection's
# index directory.  Returns what colcfg::read_building_cfg gives back, or
# undef when neither file exists.
sub read_build_cfg {
    my ($self) = @_;

    my $buildconfigfilename = ($gs_mode eq "gs2") ? "build.cfg" : "buildConfig.xml";

    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, $buildconfigfilename);

    unless (-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", $buildconfigfilename);
        unless (-e $buildconfigfile) {
            # we can't find a config file - just ignore the field list
            return undef;
        }
    }

    return &colcfg::read_building_cfg( $buildconfigfile, $gs_mode);
}
761
# Prints the byte statistics of the pass the buildproc has just done to the
# builder's output handle.  When fewer than 50 bytes were processed (and the
# pass was either indexing, or compressing with text wanted) a note is
# added: for incremental builds only when nothing at all was processed,
# otherwise a conspicuous warning.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {

        if (!$self->{'incremental'}) {
            my $banner = "***************\n";
            print $outhandle $banner;
            if ($indexing_text) {
                print $outhandle "WARNING: There is very little or no text to process for $index\n";
            } elsif (!$self->{'no_text'}) {
                print $outhandle "WARNING: There is very little or no text to compress\n";
            }
            print $outhandle " Was this your intention?\n";
            print $outhandle $banner;
        }
        elsif ($num_processed_bytes == 0) {
            # incremental build and nothing new was processed
            if ($indexing_text) {
                print $outhandle "No additional text was added to $index\n";
            } elsif (!$self->{'no_text'}) {
                print $outhandle "No additional text was compressed\n";
            }
        }

    }

}
804
805
8061;
807
Note: See TracBrowser for help on using the repository browser.