source: main/trunk/greenstone2/perllib/basebuilder.pm@ 26052

Last change on this file since 26052 was 25958, checked in by kjdon, 12 years ago

pass gs_version to loading plugins

  • Property svn:keywords set to Author Date Id Revision
File size: 25.6 KB
RevLine 
[14930]1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
[15709]34use dbutil;
[14930]35use plugin;
36use util;
37
[15709]38
# Switch autoflush on for STDOUT and STDERR as soon as this module is
# compiled, so that mgpp doesn't get out of sync with plugins.
BEGIN {
    STDOUT->autoflush(1);
    STDERR->autoflush(1);
}

# Switch autoflush back off for both handles when the interpreter exits.
END {
    STDOUT->autoflush(0);
    STDERR->autoflush(0);
}

# NOTE(review): not referenced in the visible part of this file --
# presumably read by subclasses or callers; confirm before changing.
our $maxdocsize = 12000;

# used to signify "gs2"(default) or "gs3"
# (overwritten in new() with the mode reported by
# colcfg::get_collect_cfg_name)
our $gs_mode = "gs2";
[14930]55
# Constructor.
#
# Stores the build settings on the object, reads the collection
# configuration file and derives the info database type and the set of
# "dontdb" fields from it.
#
# $site is undef for Greenstone 2.  $outhandle and $failhandle default to
# STDERR, $no_text and $gli default to 0.
# Side effect: the package variable $gs_mode is overwritten with the mode
# returned by colcfg::get_collect_cfg_name.
sub new {
    my ($class, $site, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # defaults for the optional trailing arguments
    if (!defined $outhandle)  { $outhandle  = *STDERR; }
    if (!defined $no_text)    { $no_text    = 0; }
    if (!defined $failhandle) { $failhandle = *STDERR; }
    if (!defined $gli)        { $gli        = 0; }

    # create a builder object
    my %fields = (
        'site'                         => $site,    # will be undef for Greenstone 2
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_mode'             => $incremental_mode,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},       # indexes not built
        'gli'                          => $gli,
    );
    my $self = bless \%fields, $class;

    # Read in the collection configuration file.
    my $colcfgname;
    ($colcfgname, $gs_mode) = &colcfg::get_collect_cfg_name($outhandle);
    $self->{'collect_cfg'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);

    if ($gs_mode eq "gs3") {
        # Read it a second time to keep an untouched copy for writing out
        # buildConfig.xml later: $self->{'collect_cfg'}->{'classify'} gets
        # modified during the call to &classify::load_classifiers.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);
    }

    # database type for this collection, from the configuration file
    # (may be undefined there, in which case the default type is used)
    $self->{'infodbtype'} = $self->{'collect_cfg'}->{'infodbtype'}
        || &dbutil::get_default_infodb_type();

    # load up any dontdb fields
    $self->{'dontdb'} = {};
    if (defined $self->{'collect_cfg'}->{'dontgdbm'}) {
        foreach my $fieldname (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontdb'}->{$fieldname} = 1;
        }
    }

    $self->{'maxnumeric'} = 4;

    return $self;
}
114
# Second-stage initialisation.  This work was moved out of new() so that
# subclass methods can be used while it runs (generate_index_list and
# default_buildproc are not defined in this file -- presumably supplied by
# subclasses).
#
# Steps, in order:
#   * expand the configured indexes by subcollection and by language, then
#     drop duplicate index specifications
#   * fall back to a non-incremental build if the builder cannot do
#     incremental building
#   * load the plugins (dies if none were loaded) and the classifiers
#   * load and construct the buildproc (a collection specific one if a
#     matching file exists, the builder's default otherwise)
#   * unless debugging or keeping the old build, clear and recreate the
#     build directory
sub init {
    my $self = shift(@_);

    my $outhandle  = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    $self->generate_index_list();
    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    if (defined $indexes) {
        # sort out subcollection indexes
        if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
                foreach my $index (@$indexes) {
                    push(@{$self->{'collect_cfg'}->{'indexes'}}, $index . ":" . $subcollection);
                }
            }
        }

        # sort out language subindexes
        if (defined $self->{'collect_cfg'}->{'languages'}) {
            $indexes = $self->{'collect_cfg'}->{'indexes'};
            $self->{'collect_cfg'}->{'indexes'} = [];
            # without subcollections an empty subcollection field is added
            my $separator = defined($self->{'collect_cfg'}->{'indexsubcollections'}) ? ":" : "::";
            foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
                foreach my $index (@$indexes) {
                    push(@{$self->{'collect_cfg'}->{'indexes'}}, $index . $separator . $language);
                }
            }
        }
    }

    if (defined($self->{'collect_cfg'}->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        # (the first occurrence wins, order is kept)
        my %seen = ();
        my @unique = grep { !$seen{$_}++ } @{$self->{'collect_cfg'}->{'indexes'}};
        $self->{'collect_cfg'}->{'indexes'} = \@unique;
    } else {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }

    # check incremental against whether builder can cope or not.
    if ($self->{'incremental'} && !$self->is_incremental_capable()) {
        print $outhandle "WARNING: The indexer used is not capable of incremental building. Reverting to -removeold\n";
        $self->{'keepold'} = 0;
        $self->{'incremental'} = 0;
        $self->{'incremental_mode'} = "none";
    }

    # gs_version for plugins
    my $gs_version = ($gs_mode eq "gs3") ? "3" : "2";

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins($plugins, $self->{'verbosity'}, $outhandle, $failhandle, \@global_opts, $self->{'incremental_mode'}, $gs_version);

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers($classifiers, $self->{'build_dir'}, $outhandle);

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc for the builder we are initialising
    my $buildprocdir = undef;
    my $buildproctype;

    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildproctype = $self->default_buildproc();
    }
    if (defined $buildprocdir) {
        require "$buildprocdir/$buildproctype.pm";
    }
    else {
        require "$buildproctype.pm";
    }

    # Construct the buildproc with an ordinary class-method call on the
    # class name held in $buildproctype.  This replaces a string eval that
    # used indirect-object syntax; any error raised by the constructor
    # still propagates to the caller.
    $self->{'buildproc'} = $buildproctype->new($self->{'collection'},
                                               $self->{'source_dir'},
                                               $self->{'build_dir'},
                                               $self->{'keepold'},
                                               $self->{'verbosity'},
                                               $self->{'outhandle'});

    # We call set_infodbtype() now so the buildproc knows the infodbtype for all phases of the build
    $self->{'buildproc'}->set_infodbtype($self->{'infodbtype'});

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

    if ($self->{'incremental'}) {
        # some classes may need to do some additional initialisation
        $self->init_for_incremental_build();
    }

}
261
# Reports whether this builder can do incremental building.
# The base class always answers 'no' (0): it is safer to assume a builder
# is not incremental and let the inherited classes that are override this.
sub is_incremental_capable {
    return 0;
}
270
# Hook called by init() when an incremental build was requested.
# Does nothing here; override it in a subclass that needs additional
# initialisation for an incremental build.
sub init_for_incremental_build {
    my $self = shift;
}
276
# Hands the loaded plugins and the buildproc to plugin::deinit.
sub deinit {
    my ($self) = @_;

    &plugin::deinit($self->{'pluginfo'}, $self->{'buildproc'});
}
282
# Works out the separate_cjk setting from the collection's 'indexoptions'
# list: it is on (1) if any option contains "separate_cjk", off (0)
# otherwise.  The result is passed to the buildproc and kept on the
# builder so it can be recorded in build.cfg.
sub generate_index_options {
    my ($self) = @_;

    my $separate_cjk = 0;

    my $options = $self->{'collect_cfg'}->{'indexoptions'};
    if (defined $options) {
        foreach my $option (@$options) {
            $separate_cjk = 1 if $option =~ /separate_cjk/;
        }
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
    # record it for build.cfg
    $self->{'separate_cjk'} = $separate_cjk;
}
300
# Forwards the given setting to the buildproc's
# set_sections_index_document_metadata method.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    $self->{'buildproc'}->set_sections_index_document_metadata($index);
}
307
# Stores the given value as the builder's 'maxnumeric' setting
# (new() initialises it to 4; it is later written into the build config).
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and passes the same value
# on to the buildproc.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html($strip);
}
321
# Forwards the store_metadata_coverage setting to the buildproc.
sub set_store_metadata_coverage {
    my ($self, $store_metadata_coverage) = @_;

    $self->{'buildproc'}->set_store_metadata_coverage($store_metadata_coverage);
}
328
# Placeholder: text compression has to be provided by a subclass.
# The base version only reports that on STDERR and returns nothing.
# $textindex is accepted for interface compatibility and is not used here.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    # The message is newline-terminated (like the other "implement in
    # subclass" diagnostics in this file) so that it does not run into
    # whatever is printed to STDERR next.
    print STDERR "compress_text() should be implemented in subclass!!\n";
    return;
}
336
337
# Builds either the single index named by $indexname (when it is given and
# contains a word character) or every index listed in the collection
# configuration.
#
# pre_build_indexes() runs first and post_build_indexes() last.  In
# between, the index mapping is created and build_index() is called for
# each index that want_built() accepts; the others are only reported.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes();

    my $indexes;
    if (defined $indexname && $indexname =~ /\w/) {
        $indexes = [$indexname];
    }
    else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            if ($self->{'verbosity'} >= 1) {
                print $outhandle "\n*** ignoring index $index\n";
            }
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n";
        }
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }

    $self->post_build_indexes();

}
371
# Hook run by build_indexes() before any index is built.
# Does nothing here; override it in a subclass to do extra work up front.
sub pre_build_indexes {
    my $self = shift;
    my ($indexname) = @_;    # optional parameter
}
378
# Hook run by build_indexes() after all the indexes have been built.
# Does nothing here; override it in a subclass to do extra work at the end.
sub post_build_indexes {
    my $self = shift;
}
384
# Placeholder: building a single index has to be provided by a subclass.
# The base version only reports that on STDERR and returns nothing.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
392
393
394
# Creates the collection's info database and processes associated files.
#
# The documents are run through the plugins with the buildproc in 'infodb'
# mode; then the collection metadata, the classification information and a
# "browselist" entry naming every document are written to the database.
# For an incremental build the existing database is read first and the
# documents reconstructed from it are fed back into the classifiers.
# In debug mode everything is written to STDOUT instead of a database.
# Dies if the database write handle cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    print STDERR "BuildDir: $build_dir\n";

    my $textdir  = &util::filename_cat($build_dir, "text");
    my $assocdir = &util::filename_cat($build_dir, "assoc");
    &util::mk_all_dir($textdir);
    &util::mk_all_dir($assocdir);

    # Get info database file path
    my $infodb_type = $self->{'infodbtype'};
    my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $textdir);

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers($self->{'classifiers'});

    my $reconstructed_docs = undef;
    my $database_recs = undef;

    if ($self->{'incremental'}) {
        $database_recs = {};
        &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $database_recs);
    }

    # The write handle is deliberately opened HERE, before
    # $reconstructed_docs is populated (when -incremental is on), for
    # memory usage reasons.  Opening a pipe to txt2db with open() triggers
    # a fork() followed by exec(), and the fork clones the current process
    # image.  $reconstructed_docs can get very large; if it already existed
    # at fork time the system would briefly need all that memory a second
    # time, even though the clone is replaced almost immediately by the
    # much smaller txt2db image.  The result is an out of memory error.
    my $infodb_handle;
    if ($self->{'debug'}) {
        $infodb_handle = *STDOUT;
    }
    else {
        $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
        if (!defined($infodb_handle)) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }

    if ($self->{'incremental'}) {
        # reconstruct doc_obj metadata from database for all docs
        $reconstructed_docs = &classify::reconstruct_doc_objs_metadata($infodb_type,
                                                                       $infodb_file_path,
                                                                       $database_recs);
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle($infodb_handle);
    $buildproc->set_mode('infodb');
    $buildproc->set_assocdir($assocdir);
    $buildproc->set_dontdb($self->{'dontdb'});
    $buildproc->set_classifiers($self->{'classifiers'});
    $buildproc->set_indexing_text(0);
    $buildproc->set_store_text(1);

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from the database and
    # then adding in the new ones
    $buildproc->zero_reset();

    $buildproc->{'mdprefix_fields'} = {};

    &plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    if ($self->{'incremental'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj (@$reconstructed_docs) {
            # documents flagged in 'dont_process_reconstructed' are left out
            next if defined $self->{'buildproc'}->{'dont_process_reconstructed'}->{$doc_obj->get_OID()};

            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $self->{'buildproc'}->process($doc_obj, undef);
        }
    }

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info($self->{'classifiers'}, $infodb_type, $infodb_handle,
                                    $self->{'remove_empty_classifications'},
                                    $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    ####&classify::print_reverse_lookup($infodb_handle);

    # output doclist
    my @doc_list = $self->{'buildproc'}->get_doc_list();
    my %browselist_infodb = (
        'hastxt'      => [ "0" ],
        'childtype'   => [ "VList" ],
        'numleafdocs' => [ scalar(@doc_list) ],
        'thistype'    => [ "Invisible" ],
        'contains'    => [ join(";", @doc_list) ],
    );
    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", \%browselist_infodb);

    # in debug mode the handle is STDOUT and is left open
    &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle) if !$self->{'debug'};

    if ($infodb_type eq "gdbm-txtgz") {
        # remove the plain "gdbm" database file for this collection if one
        # is present
        my $gdb_infodb_file_path = &dbutil::get_infodb_file_path("gdbm", $self->{'collection'}, $textdir);
        &util::rm($gdb_infodb_file_path) if (-e $gdb_infodb_file_path);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
522
# Collects the build information and writes the build configuration file:
# build.cfg when $gs_mode is "gs2", buildConfig.xml when it is "gs3".
#
# Starts from $self->{'build_cfg'} if a subclass has already put values
# there, then records the build date, build type, index stem, document /
# section / byte counts, the index, subcollection and language maps, the
# indexes that were not built, maxnumeric, the info database type and the
# collection's earliestDatestamp (0 if its file cannot be read).
# build_cfg_extra() is called just before writing so subclasses can add
# their own entries.
sub make_auxiliary_files {
    my $self = shift (@_);

    my $build_cfg = {};
    # subclasses may have already defined stuff in here
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    if ($self->{'separate_cjk'}) {
        $build_cfg->{'separate_cjk'} = "true";
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, $index . "->" . $self->{'index_mapping'}->{'indexmap'}->{$index});
        }
    }
    $build_cfg->{'indexmap'} = \@indexmap if scalar (@indexmap);

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    # write out the earliestDatestamp information needed for OAI
    # (looked for in the archives directory, or in export if there is no
    # archives directory)
    my $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
    if (!-d $archivedir) {
        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
    }
    my $earliestDatestampFile = &util::filename_cat ($archivedir, "earliestDatestamp");
    my $earliestDatestamp = 0;
    # Three-argument open with a lexical handle: the file name is never
    # interpreted as an open mode, and no global bareword handle is left
    # behind.
    if (open(my $datestamp_fh, '<', $earliestDatestampFile)) {
        {
            # slurp in file as a single line
            local $/ = undef;
            $earliestDatestamp = <$datestamp_fh>;
        }
        close($datestamp_fh);
        # keep the documented fallback if the read produced nothing
        $earliestDatestamp = 0 unless defined $earliestDatestamp;
    }
    else {
        print $outhandle "Warning: unable to read collection's earliestDatestamp from $earliestDatestampFile.\n";
        print $outhandle "Setting value to 0.\n";
    }
    $build_cfg->{'earliestdatestamp'} = $earliestDatestamp;

    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg(&util::filename_cat($self->{'build_dir'},"build.cfg"), $build_cfg);
    }
    if ($gs_mode eq "gs3") {
        # the preserved (unmodified) collection configuration is passed
        # along for writing buildConfig.xml
        &colcfg::write_build_cfg_xml(&util::filename_cat($self->{'build_dir'}, "buildConfig.xml"), $build_cfg, $self->{'collect_cfg_preserve'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
622
# Hook called by make_auxiliary_files() just before the build
# configuration is written.  Does nothing here; override it in a subclass
# to add extra entries to $build_cfg.
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;

}
629
630
# Empty in the base class.
# NOTE(review): presumably a hook for collection specific processing in
# subclasses -- confirm against the callers.
sub collect_specific {
    my $self = shift;
}
634
# Decides whether an index should be built.
# Each entry of the collection's 'dontbuild' list is used as a pattern
# that has to match the whole index name.  On the first match the index is
# recorded in $self->{'notbuilt'} and 0 is returned; without a match (or
# without a 'dontbuild' list) the answer is 1.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined $dontbuild;

    foreach my $checkstr (@$dontbuild) {
        next unless $index =~ /^$checkstr$/;

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
650
# Placeholder: the mapping between index descriptions and directory names
# has to be provided by a subclass.  The base version reports that on
# STDERR and returns a reference to an empty hash.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
659
# Returns a two character short form of a field specification.
#   * an undefined or all-whitespace field gives ""
#   * with two or more comma separated components, the result is the
#     first letdig (\w) character of each of the first two components
#   * with a single component, it is the first two letdig characters
# A character that cannot be found is replaced by 'a' (first position) or
# '0' (second position).  A result made of two digits is turned into 'a'
# followed by the first digit, because the Greenstone runtime doesn't like
# digit-only names.
sub process_field {
    my ($self, $field) = @_;

    if (!defined($field) || $field !~ /\S/) {
        return "";
    }

    my ($first, $second);
    my @components = split(/,/, $field);
    if (@components > 1) {
        # pick the first letdig from the first two field names
        ($first)  = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    }
    else {
        # pick the first two letdig chars
        ($first, $second) = $field =~ /^[^\w]*(\w)[^\w]*?(\w)/i;
    }

    # there may not have been any letdigs...
    $first  = 'a' unless defined $first;
    $second = '0' unless defined $second;

    my $shortname = $first . $second;
    if ($shortname =~ /^\d\d$/) {
        # digits only - Greenstone runtime doesn't like this.
        $shortname = "a" . $first;
    }

    return $shortname;
}
694
# Moves the name referenced by $nameref on to its next version, in place:
#   * a name ending in two digits has that number incremented
#   * a name ending in one digit has that digit incremented (9 becomes 10)
#   * any other name has its last character replaced by 0
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $counter = $1;
        $counter++;
        $$nameref =~ s/\d\d$/$counter/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $counter = $1;
        if ($counter == 9) {
            $$nameref =~ s/\d$/10/;
        }
        else {
            $counter++;
            $$nameref =~ s/\d$/$counter/;
        }
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
710
711
[14934]712
# Adds the metadata sets seen by the buildproc to $collection_infodb
# (a hash reference).  For every prefix in the buildproc's
# 'mdprefix_fields' it appends
#   * the prefix to "metadataset"
#   * each of its fields to "metadatalist-<prefix>"
#   * the value stored for the field to "metadatafreq-<prefix>-<field>"
sub get_collection_meta_sets {
    my ($self, $collection_infodb) = @_;

    my $mdprefix_fields = $self->{'buildproc'}->{'mdprefix_fields'};
    foreach my $prefix (keys %$mdprefix_fields) {
        push(@{$collection_infodb->{"metadataset"}}, $prefix);

        my $fields_of_prefix = $mdprefix_fields->{$prefix};
        foreach my $field (keys %$fields_of_prefix) {
            push(@{$collection_infodb->{"metadatalist-$prefix"}}, $field);
            push(@{$collection_infodb->{"metadatafreq-$prefix-$field"}}, $fields_of_prefix->{$field});
        }
    }
}
[14934]732
733
# Writes the "collection" entry to the info database.
# By default the entry holds the metadata sets (prefixes) used in the
# collection, as gathered by get_collection_meta_sets().
sub output_collection_meta {
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
[14934]744
# Reads an existing build configuration -- needed, for example, when each
# stage of building is run separately or when building incrementally.
#
# The file is build.cfg when $gs_mode is "gs2" and buildConfig.xml
# otherwise.  It is looked for in the build directory first, then in the
# collection's "index" directory.  Returns undef if neither exists,
# otherwise whatever colcfg::read_building_cfg returns for the file.
sub read_build_cfg {
    my ($self) = @_;

    my $buildconfigfilename = ($gs_mode eq "gs2") ? "build.cfg" : "buildConfig.xml";

    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, $buildconfigfilename);

    if (!-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", $buildconfigfilename);

        # we cant find a config file - just ignore the field list
        return undef if (!-e $buildconfigfile);
    }

    return &colcfg::read_building_cfg($buildconfigfile, $gs_mode);

}
772
# Prints the statistics of the current buildproc pass to the output
# handle: which index was created (or compressed from) and the byte
# counts.  When fewer than 50 bytes were processed and text was expected,
# a follow-up is printed: for an incremental build a note that nothing was
# added (only when the count is exactly 0), otherwise a warning that there
# is very little or no text.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # nothing more to say unless suspiciously little text was processed
    # in a pass where text was expected
    my $text_expected = ($indexing_text || !$self->{'no_text'});
    return unless ($num_processed_bytes < 50 && $text_expected);

    if ($self->{'incremental'}) {
        return unless ($num_processed_bytes == 0);

        if ($indexing_text) {
            print $outhandle "No additional text was added to $index\n";
        }
        elsif (!$self->{'no_text'}) {
            print $outhandle "No additional text was compressed\n";
        }
        return;
    }

    print $outhandle "***************\n";
    if ($indexing_text) {
        print $outhandle "WARNING: There is very little or no text to process for $index\n";
    }
    elsif (!$self->{'no_text'}) {
        print $outhandle "WARNING: There is very little or no text to compress\n";
    }
    print $outhandle " Was this your intention?\n";
    print $outhandle "***************\n";

}
815
816
8171;
818
Note: See TracBrowser for help on using the repository browser.