source: main/trunk/greenstone2/perllib/basebuilder.pm@ 26094

Last change on this file since 26094 was 26094, checked in by ak19, 12 years ago

Finally properly fixed the appearance and non-appearance of the quick search form. It should not appear if there are no query elements in the config file. However, even when no search indexes were built, query elements still came through in the XML and it was hard to distinguish at the XSLT stage whether any search indexes were built or not. Kathy suggested that the perl code writing out the buildconfig.xml should not write out the ServiceRacks for searching if no indexes were built and pointed out that the changes were required in buildconfigxml.pm which was called from basebuilder.pm to write out the buildConfig.xml file. Now the XSLT can at last do the right thing: it doesn't display the quick search area if there are no search elements, and doesn't provide the plain text query form in the quick search area if TextQuery isn't one of the query types, but does provide the buttons to other query types like form search if these are meant to be visible according to the SearchType format feature. If there are no search indexes built, then there is no quick search area.

  • Property svn:keywords set to Author Date Id Revision
File size: 25.8 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
34use dbutil;
35use plugin;
36use util;
37
38
# Switch autoflush on for STDOUT and STDERR while the module is in use,
# so that output from mgpp does not get out of sync with the plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Switch autoflush back off again at interpreter shutdown.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

our $maxdocsize = 12000;

# signifies which Greenstone flavour is in use: "gs2" (default) or "gs3"
our $gs_mode = "gs2";
55
# Constructor.
# Stores the positional build settings on a new blessed hash, reads the
# collection configuration (which also sets the package-wide $gs_mode),
# and records the info database type and the "dontgdbm" field set.
# Loading of plugins/classifiers is left to init().
sub new {
    my ($class, $site, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # defaults for the optional trailing arguments
    if (!defined $outhandle)  { $outhandle  = *STDERR; }
    if (!defined $no_text)    { $no_text    = 0; }
    if (!defined $failhandle) { $failhandle = *STDERR; }

    # create a builder object
    my $self = {
        'site'                         => $site,   # will be undef for Greenstone 2
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_mode'             => $incremental_mode,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},      # indexes not built
        'gli'                          => (defined $gli ? $gli : 0),
    };
    bless $self, $class;

    # Read in the collection configuration file.
    my $colcfgname;
    ($colcfgname, $gs_mode) = &colcfg::get_collect_cfg_name($outhandle);
    my $collect_cfg = &colcfg::read_collection_cfg($colcfgname, $gs_mode);
    $self->{'collect_cfg'} = $collect_cfg;

    if ($gs_mode eq "gs3") {
        # Read it in a second time to keep the original form for writing out
        # buildConfig.xml later on: the 'classify' entry of 'collect_cfg'
        # gets modified when &classify::load_classifiers is called.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);
    }

    # database type comes from the collection config when given there,
    # otherwise the dbutil default is used
    $self->{'infodbtype'} = $collect_cfg->{'infodbtype'} || &dbutil::get_default_infodb_type();

    # load up any dontdb fields
    my %dontdb = ();
    if (defined $collect_cfg->{'dontgdbm'}) {
        %dontdb = map { $_ => 1 } @{$collect_cfg->{'dontgdbm'}};
    }
    $self->{'dontdb'} = \%dontdb;

    $self->{'maxnumeric'} = 4;
    return $self;
}
114
# Second-stage initialisation.  This has been moved here from new() so
# that methods overridden in subclasses can be used.
#
# Steps, in order:
#   - asks the (sub)class to generate the index list, then expands it by
#     subcollection and by language and removes duplicate entries
#   - falls back to a non-incremental build if the builder reports that
#     it is not incremental capable
#   - loads the plugins (dies if none were loaded) and the classifiers
#   - locates, loads and instantiates the buildproc class
#   - unless debugging or keeping the old build, empties the build
#     directory and recreates its text subdirectory
#   - runs the incremental-build hook when building incrementally
sub init {
    my $self = shift(@_);

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    $self->generate_index_list();
    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    if (defined $indexes) {
        # sort out subcollection indexes
        if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
                foreach my $index (@$indexes) {
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
                }
            }
        }

        # sort out language subindexes
        if (defined $self->{'collect_cfg'}->{'languages'}) {
            $indexes = $self->{'collect_cfg'}->{'indexes'};
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
                foreach my $index (@$indexes) {
                    if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
                    }
                    else { # add in an empty subcollection field
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
                    }
                }
            }
        }
    }

    if (defined($self->{'collect_cfg'}->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        # (the first occurrence of each entry is kept, order preserved)
        my %seen = ();
        my @unique = grep { !$seen{$_}++ } @{$self->{'collect_cfg'}->{'indexes'}};
        $self->{'collect_cfg'}->{'indexes'} = \@unique;
    } else {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }

    # check incremental against whether builder can cope or not.
    if ($self->{'incremental'} && !$self->is_incremental_capable()) {
        print $outhandle "WARNING: The indexer used is not capable of incremental building. Reverting to -removeold\n";
        $self->{'keepold'} = 0;
        $self->{'incremental'} = 0;
        $self->{'incremental_mode'} = "none";
    }

    # gs_version for plugins
    my $gs_version = "2";
    if ($gs_mode eq "gs3") {
        $gs_version = "3";
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $self->{'verbosity'}, $outhandle, $failhandle, \@global_opts, $self->{'incremental_mode'}, $gs_version);

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $self->{'build_dir'}, $outhandle);

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc for the builder we are initialising
    my $buildprocdir = undef;
    my $buildproctype;

    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildproctype = $self->default_buildproc();
    }
    if (defined $buildprocdir) {
        require "$buildprocdir/$buildproctype.pm";
    }
    else {
        require "$buildproctype.pm";
    }

    # Instantiate the buildproc through an ordinary class-method call.
    # A block eval is used (rather than compiling a string that embeds the
    # class name) so the class name is never re-parsed as Perl source; any
    # error from the constructor is re-thrown as a string, as before.
    eval {
        $self->{'buildproc'} = $buildproctype->new($self->{'collection'},
                                                   $self->{'source_dir'},
                                                   $self->{'build_dir'},
                                                   $self->{'keepold'},
                                                   $self->{'verbosity'},
                                                   $self->{'outhandle'});
    };
    die "$@" if $@;

    # We call set_infodbtype() now so the buildproc knows the infodbtype for all phases of the build
    $self->{'buildproc'}->set_infodbtype($self->{'infodbtype'});

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

    if ($self->{'incremental'}) {
        # some classes may need to do some additional initialisation
        $self->init_for_incremental_build();
    }

}
261
# Says whether this builder can build incrementally.
# The base class answers "no" (0): it is safer to assume a builder is
# non-incremental, and let the subclasses that are capable override this.
sub is_incremental_capable
{
    my $capable = 0;
    return $capable;
}
270
# Hook called by init() when building incrementally.
# Does nothing here; override in a subclass that needs extra
# initialisation for an incremental build.
sub init_for_incremental_build {
    my $self = shift;
}
276
# Hands the loaded plugins and the buildproc to plugin::deinit.
sub deinit {
    my $self = shift;

    my ($pluginfo, $buildproc) = @{$self}{'pluginfo', 'buildproc'};
    &plugin::deinit($pluginfo, $buildproc);
}
282
# Works out the index options from the collection configuration.
# Currently only separate_cjk is looked at: it is switched on when any
# 'indexoptions' entry contains the text "separate_cjk".
sub generate_index_options {
    my $self = shift;

    my $separate_cjk = 0;

    my $options = $self->{'collect_cfg'}->{'indexoptions'};
    if (defined $options) {
        $separate_cjk = 1 if grep { /separate_cjk/ } @$options;
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
    # record it for build.cfg
    $self->{'separate_cjk'} = $separate_cjk;
}
300
# Passes the given setting straight on to the buildproc.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    $self->{'buildproc'}->set_sections_index_document_metadata($index);
}
307
# Records the maxnumeric setting on the builder.
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and passes it on to the
# buildproc.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html($strip);
}
321
# Passes the store_metadata_coverage setting straight on to the buildproc.
sub set_store_metadata_coverage {
    my ($self, $store_metadata_coverage) = @_;

    $self->{'buildproc'}->set_store_metadata_coverage($store_metadata_coverage);
}
328
# Placeholder: text compression has to be provided by the subclass.
# The base version only reports that on STDERR and returns nothing.
sub compress_text {
    my ($self, $textindex) = @_;

    print STDERR "compress_text() should be implemented in subclass!!";
    return;
}
336
337
# Builds the indexes.
# If $indexname is given (and contains a word character) only that index
# is built; otherwise every index listed in the collection configuration
# is considered.  Indexes rejected by want_built() are skipped.
# pre_build_indexes() / post_build_indexes() are called around the loop.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes();

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n";
        }
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }

    $self->post_build_indexes();

}
371
# Hook run by build_indexes() before any index is built.
# Does nothing here; override in a subclass to do extra work first.
sub pre_build_indexes {
    my $self = shift;
    my ($indexname) = @_; # optional parameter
}
378
# Hook run by build_indexes() once all the indexes have been built.
# Does nothing here; override in a subclass to do extra work afterwards.
sub post_build_indexes {
    my $self = shift;
}
384
# Placeholder: building a single index has to be provided by the subclass.
# The base version only reports that on STDERR and returns nothing.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
392
393
394
# Creates the collection's info database and processes associated files.
#
# Makes sure the "text" and "assoc" directories exist under the build
# directory, initialises the classifiers, runs every document through the
# buildproc in 'infodb' mode, and then writes the collection metadata,
# the classification information and the "browselist" entry.
# In debug mode the records go to STDOUT instead of the database.
# For an incremental build the documents already in the database are
# reconstructed and fed back through the buildproc as well.
# Dies if the database write handle cannot be opened.
sub make_infodatabase {
    my $self = shift;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # Get info database file path
    my $infodb_type = $self->{'infodbtype'};
    my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $textdir);

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    my $reconstructed_docs = undef;
    my $database_recs = undef;

    if ($self->{'incremental'}) {
        $database_recs = {};

        &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $database_recs);
    }

    # Important (for memory usage reasons): the database write handle is
    # obtained here, BEFORE $reconstructed_docs is set up (assuming
    # -incremental is on).
    #
    # Opening a pipe to txt2db with open() triggers a fork() followed by an
    # exec().  $reconstructed_docs can get very large, so opening the pipe
    # afterwards would make the fork clone that *large* process image.  The
    # clone is quickly replaced by the much smaller 'txt2db' image, but for
    # that moment the system really needs all of that memory to be
    # available, and the result is an out of memory error.
    my $infodb_handle;
    if ($self->{'debug'}) {
        $infodb_handle = *STDOUT;
    }
    else {
        $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
        if (!defined($infodb_handle)) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }

    if ($self->{'incremental'}) {
        # reconstruct doc_obj metadata from database for all docs
        $reconstructed_docs
            = &classify::reconstruct_doc_objs_metadata($infodb_type,
                                                       $infodb_file_path,
                                                       $database_recs);
    }

    # set up the document processor
    $buildproc->set_output_handle ($infodb_handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontdb ($self->{'dontdb'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from the database and
    # then adding in the new ones
    $buildproc->zero_reset();

    $buildproc->{'mdprefix_fields'} = {};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    if ($self->{'incremental'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj ( @$reconstructed_docs ) {
            next if defined $buildproc->{'dont_process_reconstructed'}->{$doc_obj->get_OID()};

            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $buildproc->process($doc_obj, undef);
        }
    }

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $infodb_type, $infodb_handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    ####&classify::print_reverse_lookup($infodb_handle);

    # output doclist
    my @doc_list = $buildproc->get_doc_list();
    my $browselist_infodb = {
        'hastxt'      => [ "0" ],
        'childtype'   => [ "VList" ],
        'numleafdocs' => [ scalar(@doc_list) ],
        'thistype'    => [ "Invisible" ],
        'contains'    => [ join(";", @doc_list) ],
    };
    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", $browselist_infodb);

    # STDOUT (debug mode) is left open
    if (!$self->{'debug'}) {
        &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle);
    }

    if ($infodb_type eq "gdbm-txtgz") {
        # remove any plain "gdbm" database file found in the text directory
        my $gdb_infodb_file_path = &dbutil::get_infodb_file_path("gdbm", $self->{'collection'}, $textdir);
        &util::rm($gdb_infodb_file_path) if (-e $gdb_infodb_file_path);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
522
# Writes out the build configuration file for the collection.
#
# Starts from $self->{'build_cfg'} if a subclass has already put values
# there, adds the build date/type, document counts, the index,
# subcollection and language maps, the list of indexes that were not
# built, maxnumeric, the info database type and the earliestDatestamp
# needed for OAI, lets build_cfg_extra() add to it, and then writes
# build.cfg (gs2) or buildConfig.xml (gs3) into the build directory.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $build_cfg = {};
    # subclasses may have already defined stuff in here
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    if ($self->{'separate_cjk'}) {
        $build_cfg->{'separate_cjk'} = "true";
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
        }
    }

    # store the number of indexes built to later determine whether search serviceracks get written out to buildConfig.xml
    $build_cfg->{'num_indexes'} = scalar (@indexmap);

    $build_cfg->{'indexmap'} = \@indexmap if scalar (@indexmap);

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    # write out the earliestDatestamp information needed for OAI
    # (looked for in "archives", or in "export" when there is no archives dir)
    my $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
    if (!-d $archivedir) {
        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
    }
    my $earliestDatestampFile = &util::filename_cat ($archivedir, "earliestDatestamp");
    my $earliestDatestamp = 0;
    # Three-argument open on a lexical handle: the path is never parsed for
    # mode characters or surrounding whitespace, and no package-global
    # filehandle is left behind.
    if (open(my $datestamp_fh, '<', $earliestDatestampFile)) {
        {
            # slurp in file as a single line
            local $/ = undef;
            $earliestDatestamp = <$datestamp_fh>;
            #&unicode::ensure_utf8(\$earliestDatestamp); # turn any high bytes that aren't valid utf-8 into utf-8.
        }
        close($datestamp_fh);
    }
    else {
        print $outhandle "Warning: unable to read collection's earliestDatestamp from $earliestDatestampFile.\n";
        print $outhandle "Setting value to 0.\n";
    }
    $build_cfg->{'earliestdatestamp'} = $earliestDatestamp;

    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg(&util::filename_cat($self->{'build_dir'},"build.cfg"), $build_cfg);
    }
    if ($gs_mode eq "gs3") {
        # the preserved (unmodified) collection config is passed along too
        &colcfg::write_build_cfg_xml(&util::filename_cat($self->{'build_dir'}, "buildConfig.xml"), $build_cfg, $self->{'collect_cfg_preserve'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
626
# Hook called by make_auxiliary_files() just before the build
# configuration is written.  Does nothing here; override in a subclass
# to add extra entries to $build_cfg.
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;
}
633
634
# Empty in the base class; does nothing with the builder object.
sub collect_specific {
    my $self = shift;
}
638
# Decides whether the given index should be built.
# Each 'dontbuild' entry of the collection configuration is used as a
# pattern that must match the whole index name.  On the first match the
# index is recorded in $self->{'notbuilt'} and 0 is returned; if nothing
# matches (or there are no 'dontbuild' entries) 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $checkstr (@$dontbuild) {
        next unless $index =~ /^$checkstr$/;

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
654
# Placeholder: the index mapping has to be provided by the subclass.
# The base version reports that on STDERR and returns a reference to an
# empty hash.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
663
# Returns a processed (two character) version of a field.
#
# - If the field has two or more comma separated components, the result
#   is the first letdig (\w) character of each of the first two
#   components.
# - Otherwise it is the first two letdig characters of the field.  Note
#   that when the field holds fewer than two letdig characters the
#   pattern as a whole fails, so neither character is picked up.
# - A character that could not be found defaults to 'a' (first) or
#   '0' (second).
# - A result made up of two digits is replaced by 'a' followed by the
#   first digit, as the Greenstone runtime doesn't like digits only.
#
# Returns "" when the field is undefined or contains only whitespace.
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\S/);

    # Deliberately not named $a/$b: those are the special variables used
    # by sort, and lexical copies of them would break any sort block
    # written in this scope.
    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # pick the first letdig from the first two field names
        ($first)  = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    } else {
        # pick the first two letdig chars
        ($first, $second) = $field =~ /^[^\w]*(\w)[^\w]*?(\w)/;
    }
    # there may not have been any letdigs...
    $first  = 'a' unless defined $first;
    $second = '0' unless defined $second;

    my $newfield = "$first$second";
    if ($newfield =~ /^\d\d$/) {
        # digits only - Greenstone runtime doesn't like this.
        $newfield = "a$first";
    }
    return $newfield;

}
698
# Moves the name referred to by $nameref on to its next version, in place.
#   - name ends in two digits: those two digits are incremented
#   - name ends in a single digit: that digit is incremented (9 becomes 10)
#   - otherwise: the last character of the name is replaced by 0
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $version = $1;
        $version++;
        $$nameref =~ s/\d\d$/$version/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $digit = $1;
        if ($digit == 9) {
            $$nameref =~ s/\d$/10/;
        }
        else {
            $digit++;
            $$nameref =~ s/\d$/$digit/;
        }
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
714
715
716
# Fills $collection_infodb (a hash reference) from the buildproc's
# 'mdprefix_fields' table.  For every prefix it appends:
#   metadataset                    <- the prefix
#   metadatalist-<prefix>          <- each field stored under the prefix
#   metadatafreq-<prefix>-<field>  <- the value stored for that field
sub get_collection_meta_sets
{
    my ($self, $collection_infodb) = @_;

    my $mdprefix_fields = $self->{'buildproc'}->{'mdprefix_fields'};
    foreach my $prefix (keys %$mdprefix_fields) {
        push(@{$collection_infodb->{"metadataset"}}, $prefix);

        my $list_key = "metadatalist-$prefix";
        foreach my $field (keys %{$mdprefix_fields->{$prefix}}) {
            push(@{$collection_infodb->{$list_key}}, $field);

            my $freq_key = "metadatafreq-$prefix-$field";
            push(@{$collection_infodb->{$freq_key}}, $mdprefix_fields->{$prefix}->{$field});
        }
    }
}
736
737
# Writes the "collection" entry to the info database.
# By default this holds the metadata sets (prefixes) used in the
# collection, as gathered by get_collection_meta_sets().
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
748
# Reads in an existing build configuration file - needed, for example,
# when each stage of building is run separately, or when building
# incrementally.
# The file (build.cfg for gs2, buildConfig.xml otherwise) is looked for in
# the build directory first and then in the collection's "index"
# directory.  Returns undef when it is in neither place.
sub read_build_cfg {
    my $self = shift;

    my $buildconfigfilename = ($gs_mode eq "gs2") ? "build.cfg" : "buildConfig.xml";

    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, $buildconfigfilename);

    if (!-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", $buildconfigfilename);

        # we cant find a config file - just ignore the field list
        return undef if (!-e $buildconfigfile);
    }

    return &colcfg::read_building_cfg( $buildconfigfile, $gs_mode);
}
776
# Prints the statistics of the pass just run by the buildproc to the
# output handle: what was done, the total bytes in the collection and the
# bytes processed.
# When fewer than 50 bytes were processed (and text was wanted) a note is
# added: for an incremental build only if nothing at all was processed,
# otherwise a warning banner.
sub print_stats {
    my $self = shift;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $activity = $indexing_text ? "Creating index $index"
                                  : "Compressing text from $index";
    print $outhandle "Stats ($activity)\n";
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # Inside this test either text is being indexed or text is wanted, so
    # "not indexing" always means "compressing text that is wanted".
    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {

        if ($self->{'incremental'}) {
            if ($num_processed_bytes == 0) {
                if ($indexing_text) {
                    print $outhandle "No additional text was added to $index\n";
                } else {
                    print $outhandle "No additional text was compressed\n";
                }
            }
        }
        else {
            print $outhandle "***************\n";
            if ($indexing_text) {
                print $outhandle "WARNING: There is very little or no text to process for $index\n";
            } else {
                print $outhandle "WARNING: There is very little or no text to compress\n";
            }
            print $outhandle " Was this your intention?\n";
            print $outhandle "***************\n";
        }

    }

}
819
820
8211;
822
Note: See TracBrowser for help on using the repository browser.