source: main/trunk/greenstone2/perllib/basebuilder.pm@ 27306

Last change on this file since 27306 was 27306, checked in by jmt12, 11 years ago

Moved the critical file-related functions (copy, rm, etc.) out of util.pm into their own proper class, FileUtils. Use of the old functions in util.pm will now prompt deprecation warnings. There may be further functions that could be moved across in the future, but these are the critical ones when considering support for other filesystems (HTTP, HDFS, WebDAV, etc.). Updated some key files to use the new functions, so no deprecation messages are thrown when importing/building the demo collection 'out of the box'.

  • Property svn:keywords set to Author Date Id Revision
File size: 26.1 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
34use dbutil;
35use plugin;
36use util;
37use FileUtils;
38
39
BEGIN {
    # Switch autoflush on for both standard streams so that output from
    # mgpp does not get out of sync with output from the plugins.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

END {
    # Put both standard streams back into buffered mode on exit.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

our $maxdocsize = 12000;

# Which flavour of Greenstone we are building for: "gs2" (the default)
# or "gs3".
our $gs_mode = "gs2";
56
# Constructor.  Stores the build settings on a new builder object, reads the
# collection configuration file and derives the info database type and the
# set of "dontdb" fields from it.
#
# A defined, non-empty $site switches the package-wide $gs_mode to "gs3".
# $outhandle and $failhandle default to STDERR; $no_text and $gli default
# to 0.
sub new {
    my $class = shift(@_);
    my ($site, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # fill in defaults for the optional trailing arguments
    if (!defined $outhandle)  { $outhandle  = *STDERR; }
    if (!defined $no_text)    { $no_text    = 0; }
    if (!defined $failhandle) { $failhandle = *STDERR; }
    if (!defined $gli)        { $gli        = 0; }

    # create a builder object
    my $self = {
        'site'                         => $site,   # will be undef for Greenstone 2
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_mode'             => $incremental_mode,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},      # indexes not built
        'gli'                          => $gli,
    };
    bless $self, $class;

    # A site name means we are running under Greenstone 3.
    $gs_mode = "gs3" if ((defined $site) && ($site ne ""));

    # Read in the collection configuration file.
    my $colcfgname = &colcfg::get_collect_cfg_name($outhandle, $gs_mode);
    $self->{'collect_cfg'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);

    if ($gs_mode eq "gs3") {
        # Read it in a second time to keep the original form for the later
        # writing out of buildConfig.xml: $self->{'collect_cfg'}->{'classify'}
        # gets modified during the call to &classify::load_classifiers.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);
    }

    # The database type comes from collect.cfg when set there, otherwise
    # from the dbutil default.
    $self->{'infodbtype'} = $self->{'collect_cfg'}->{'infodbtype'}
        || &dbutil::get_default_infodb_type();

    # load up any dontdb fields
    my %dontdb = ();
    if (defined($self->{'collect_cfg'}->{'dontgdbm'})) {
        %dontdb = map { $_ => 1 } @{$self->{'collect_cfg'}->{'dontgdbm'}};
    }
    $self->{'dontdb'} = \%dontdb;

    $self->{'maxnumeric'} = 4;
    return $self;
}
118
# Second-stage initialisation.  This work lives here rather than in new()
# so that it can call methods supplied by subclasses (generate_index_list,
# default_buildproc, is_incremental_capable, ...).
#
# Steps performed, in order:
#   - expand the configured index list by subcollection and language and
#     remove duplicate entries
#   - fall back to a non-incremental build if the builder cannot do one
#   - load the plugins (dies if none could be loaded) and the classifiers
#   - locate, load and instantiate the buildproc class (a collection
#     specific one if present, otherwise the builder's default)
#   - unless debugging or keeping the old build, empty the build directory
#     and recreate it together with its "text" subdirectory
#   - give incremental-capable subclasses a chance to do extra set up
sub init {
    my $self = shift(@_);

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    $self->generate_index_list();
    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    if (defined $indexes) {
        # sort out subcollection indexes
        if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
                foreach my $index (@$indexes) {
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
                }
            }
        }

        # sort out language subindexes
        if (defined $self->{'collect_cfg'}->{'languages'}) {
            $indexes = $self->{'collect_cfg'}->{'indexes'};
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
                foreach my $index (@$indexes) {
                    if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
                    }
                    else { # add in an empty subcollection field
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
                    }
                }
            }
        }
    }

    if (defined($self->{'collect_cfg'}->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        # (the first occurrence wins, so the configured order is kept)
        my %tmphash = ();
        my @tmparray = @{$self->{'collect_cfg'}->{'indexes'}};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $i (@tmparray) {
            if (!defined ($tmphash{$i})) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, $i);
                $tmphash{$i} = 1;
            }
        }
    } else {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }

    # check incremental against whether builder can cope or not.
    if ($self->{'incremental'} && !$self->is_incremental_capable()) {
        print $outhandle "WARNING: The indexer used is not capable of incremental building. Reverting to -removeold\n";
        $self->{'keepold'} = 0;
        $self->{'incremental'} = 0;
        $self->{'incremental_mode'} = "none";
    }

    # gs_version for plugins
    my $gs_version = "2";
    if ($gs_mode eq "gs3") {
        $gs_version = "3";
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $self->{'verbosity'}, $outhandle, $failhandle, \@global_opts, $self->{'incremental_mode'}, $gs_version);

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $self->{'build_dir'}, $outhandle);

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc for the builder we are initialising
    my $buildprocdir = undef;
    my $buildproctype;

    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildproctype = $self->default_buildproc();
    }
    if (defined $buildprocdir) {
        require "$buildprocdir/$buildproctype.pm";
    }
    else {
        require "$buildproctype.pm";
    }

    # Instantiate the buildproc with an ordinary class-method call on the
    # class name held in $buildproctype.  This avoids pasting a name that
    # can be derived from the collection name into a string eval, and does
    # not depend on how the parser resolves indirect object syntax.  Any
    # failure is rethrown stringified, as before.
    my $buildproc = eval {
        $buildproctype->new($self->{'collection'},
                            $self->{'source_dir'}, $self->{'build_dir'},
                            $self->{'keepold'}, $self->{'verbosity'},
                            $self->{'outhandle'});
    };
    die "$@" if $@;
    $self->{'buildproc'} = $buildproc;

    # We call set_infodbtype() now so the buildproc knows the infodbtype for all phases of the build
    $self->{'buildproc'}->set_infodbtype($self->{'infodbtype'});

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &FileUtils::removeFilesRecursive($self->{'build_dir'});
        &FileUtils::makeAllDirectories($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &FileUtils::makeAllDirectories($textdir);
    }

    if ($self->{'incremental'}) {
        # some classes may need to do some additional initialisation
        $self->init_for_incremental_build();
    }

}
265
# Reports whether this builder can build incrementally.  The base class
# answers "no" (0): it is safer to assume a builder is not incremental and
# have the subclasses that are capable override this method.
sub is_incremental_capable
{
    my $capable = 0;
    return $capable;
}
274
# Hook called at the end of init() when an incremental build is being done.
# The base class does nothing; override in a subclass that needs extra
# initialisation for incremental building.
sub init_for_incremental_build {
    my $self = shift;
}
280
# Tear-down counterpart of init(): hands the loaded plugins and the
# buildproc to plugin::deinit.
sub deinit {
    my ($self) = @_;

    my $pluginfo  = $self->{'pluginfo'};
    my $buildproc = $self->{'buildproc'};
    &plugin::deinit($pluginfo, $buildproc);
}
286
# Works out the index options that affect building.  Currently the only
# one handled here is separate_cjk, which is switched on when any entry of
# the collection's 'indexoptions' list mentions it.  The result is passed
# to the buildproc and remembered on the builder for build.cfg.
sub generate_index_options {
    my ($self) = @_;

    my $indexoptions = $self->{'collect_cfg'}->{'indexoptions'};

    my $separate_cjk = 0;
    if (defined($indexoptions)
        && scalar(grep { $_ =~ /separate_cjk/ } @$indexoptions)) {
        $separate_cjk = 1;
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
    # record it for build.cfg
    $self->{'separate_cjk'} = $separate_cjk;
}
306
# Forwards the sections_index_document_metadata setting to the buildproc.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_sections_index_document_metadata($index);
}
313
# Records the maxnumeric setting on the builder (it is written to the
# build configuration by make_auxiliary_files).
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and passes it on to the
# buildproc.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_strip_html($strip);
}
327
# Forwards the store_metadata_coverage setting to the buildproc.
sub set_store_metadata_coverage {
    my ($self, $store_metadata_coverage) = @_;

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_store_metadata_coverage($store_metadata_coverage);
}
334
# Placeholder for the text compression pass; subclasses are expected to
# override it.  The base version only reports on STDERR that it was
# reached and returns nothing.
#
#   $textindex - the index to compress text from (unused here)
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    # Terminate the line, as the other "implement in subclass" stubs in
    # this file do, so the message does not run into later output.
    print STDERR "compress_text() should be implemented in subclass!!\n";
    return;
}
342
343
# Builds the collection's indexes.  When $indexname contains a word
# character only that index is built; otherwise every index listed in the
# collection configuration is.  Indexes rejected by want_built() are
# skipped.  pre_build_indexes() and post_build_indexes() are called
# before and after the loop.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes();

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            if ($self->{'verbosity'} >= 1) {
                print $outhandle "\n*** ignoring index $index\n";
            }
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory $self->{'index_mapping'}->{$index}\n";
        }
        if ($self->{'gli'}) {
            print STDERR "<Stage name='Index' source='$index'>\n";
        }
        $self->build_index($index);
    }

    $self->post_build_indexes();

}
377
# Hook run by build_indexes() before any index is built.  Does nothing in
# the base class; override in a subclass that needs to prepare something
# first.
sub pre_build_indexes {
    my $self = shift;
    my ($indexname) = @_; # optional parameter
}
384
# Hook run by build_indexes() once all the indexes have been built.  Does
# nothing in the base class; override in a subclass that needs to finish
# something off.
sub post_build_indexes {
    my $self = shift;
}
390
# Placeholder for building a single index; subclasses are expected to
# override it.  The base version only reports on STDERR that it was
# reached and returns nothing.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
398
399
400
# Creates the collection's info database and processes associated files.
#
# The "text" and "assoc" directories are created under the build
# directory, the classifiers are initialised, and the buildproc is run in
# 'infodb' mode over the source directory.  Collection metadata, the
# classification information and a "browselist" entry listing every
# document are then written to the database.
#
# For an incremental build the existing database is read first and the
# documents reconstructed from it are fed back into the classifiers.
# In debug mode the database records go to STDOUT instead of a database.
#
# Dies if the database write handle cannot be opened.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    my $textdir = &FileUtils::filenameConcatenate($self->{'build_dir'}, "text");
    my $assocdir = &FileUtils::filenameConcatenate($self->{'build_dir'}, "assoc");
    &FileUtils::makeAllDirectories ($textdir);
    &FileUtils::makeAllDirectories ($assocdir);

    # Get info database file path
    my $infodb_type = $self->{'infodbtype'};
    my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $textdir);

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    my $reconstructed_docs = undef;
    my $database_recs = undef;

    if ($self->{'incremental'}) {
        $database_recs = {};

        &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $database_recs);
    }


    # Important (for memory usage reasons) that we obtain the filehandle
    # here for writing out to the database, rather than after
    # $reconstructed_docs has been set up (assuming -incremental is on)
    #
    # This is because when we open a pipe to txt2db [using open()]
    # this triggers a fork() followed by exec().  $reconstructed_docs
    # can get very large, and so if we did the open() after this, it means
    # the fork creates a clone of the *large* process image which (admittedly)
    # is then quickly replaced in the execve() with the much smaller image for
    # 'txt2db'.  The trouble is, in that seismic second caused by
    # the fork(), the system really does need to have all that memory available
    # even though it isn't ultimately used.  The result is an out of memory
    # error.

    my ($infodb_handle);
    if ($self->{'debug'}) {
        $infodb_handle = *STDOUT;
    }
    else {
        $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
        if (!defined($infodb_handle))
        {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }

    if ($self->{'incremental'}) {
        # reconstruct doc_obj metadata from database for all docs
        $reconstructed_docs
            = &classify::reconstruct_doc_objs_metadata($infodb_type,
                                                       $infodb_file_path,
                                                       $database_recs);
    }

    # set up the document processor

    $self->{'buildproc'}->set_output_handle ($infodb_handle);
    $self->{'buildproc'}->set_mode ('infodb');
    $self->{'buildproc'}->set_assocdir ($assocdir);
    $self->{'buildproc'}->set_dontdb ($self->{'dontdb'});
    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_store_text(1);

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from the database and
    # then adding in the new ones
    $self->{'buildproc'}->zero_reset();

    $self->{'buildproc'}->{'mdprefix_fields'} = {};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});

    if ($self->{'incremental'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj ( @$reconstructed_docs ) {
            # skip any document the buildproc has flagged in its
            # 'dont_process_reconstructed' table
            if (! defined $self->{'buildproc'}->{'dont_process_reconstructed'}->{$doc_obj->get_OID()}) {
                print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
                $self->{'buildproc'}->process($doc_obj,undef);
            }
        }
    }
    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $infodb_type, $infodb_handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    ####&classify::print_reverse_lookup($infodb_handle);

    # output doclist
    my @doc_list = $self->{'buildproc'}->get_doc_list();
    my $browselist_infodb = { 'hastxt' => [ "0" ],
                              'childtype' => [ "VList" ],
                              'numleafdocs' => [ scalar(@doc_list) ],
                              'thistype' => [ "Invisible" ],
                              'contains' => [ join(";", @doc_list) ] };
    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", $browselist_infodb);

    # STDOUT (debug mode) is not ours to close
    &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle) if !$self->{'debug'};

    if ($infodb_type eq "gdbm-txtgz") {
        # Remove any plain gdbm database found beside the gdbm-txtgz one.
        # Uses FileUtils like the rest of this file; util::rm is deprecated.
        my $gdb_infodb_file_path = &dbutil::get_infodb_file_path("gdbm", $self->{'collection'}, $textdir);
        if (-e $gdb_infodb_file_path) {
            &FileUtils::removeFiles($gdb_infodb_file_path);
        }
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
528
# Writes the build configuration file for the collection: build.cfg for
# Greenstone 2, buildConfig.xml for Greenstone 3.
#
# The configuration records the build date and type, the index stem,
# document/section/byte counts from the buildproc, the index,
# subcollection and language maps, the list of indexes that were not
# built, maxnumeric, the info database type and the collection's
# earliestDatestamp (read from the archives directory, or from the export
# directory if there is no archives directory; 0 if it cannot be read).
# Subclasses can pre-populate $self->{'build_cfg'} and add further
# entries through build_cfg_extra().
sub make_auxiliary_files {
    my $self = shift (@_);
    my $build_cfg = {};
    # subclasses may have already defined stuff in here
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &FileUtils::makeAllDirectories ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    if ($self->{'separate_cjk'}) {
        $build_cfg->{'separate_cjk'} = "true";
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
        }
    }

    # store the number of indexes built to later determine whether search serviceracks get written out to buildConfig.xml
    $build_cfg->{'num_indexes'} = scalar (@indexmap);

    $build_cfg->{'indexmap'} = \@indexmap if scalar (@indexmap);

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    my @notbuilt = ();
    foreach my $nb (keys %{$self->{'notbuilt'}}) {
        push (@notbuilt, $nb);
    }
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    # write out the earliestDatestamp information needed for OAI
    my $archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "archives");
    if(!-d $archivedir) {
        $archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "export");
    }
    my $earliestDatestampFile = &FileUtils::filenameConcatenate($archivedir, "earliestDatestamp");
    my $earliestDatestamp = 0;
    # Three-argument open with a lexical handle: the file name is taken
    # literally (no mode characters or surrounding whitespace are
    # interpreted) and no package-global filehandle is left behind.
    if (open(my $datestamp_fh, '<', $earliestDatestampFile)) {
        {
            # slurp in file as a single line
            local $/ = undef;
            $earliestDatestamp = <$datestamp_fh>;
            #&unicode::ensure_utf8(\$earliestDatestamp); # turn any high bytes that aren't valid utf-8 into utf-8.
        }
        close($datestamp_fh);
    }
    else {
        print $outhandle "Warning: unable to read collection's earliestDatestamp from $earliestDatestampFile.\n";
        print $outhandle "Setting value to 0.\n";
    }
    $build_cfg->{'earliestdatestamp'} = $earliestDatestamp;

    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg(&FileUtils::filenameConcatenate($self->{'build_dir'},"build.cfg"), $build_cfg);
    }
    if ($gs_mode eq "gs3") {

        &colcfg::write_build_cfg_xml(&FileUtils::filenameConcatenate($self->{'build_dir'}, "buildConfig.xml"), $build_cfg, $self->{'collect_cfg_preserve'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
632
# Hook called by make_auxiliary_files() just before the build configuration
# is written.  Does nothing in the base class; override in a subclass to
# add extra entries to $build_cfg.
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;

}
639
640
# Empty hook in the base class; takes only the builder object.
sub collect_specific {
    my $self = shift;
}
644
# Decides whether $index should be built.  Each entry of the collection's
# 'dontbuild' list is used as a pattern that has to match the whole index
# name; on the first match the index is recorded in 'notbuilt' and 0 is
# returned.  Returns 1 when nothing matches or no list is configured.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined($dontbuild);

    foreach my $checkstr (@$dontbuild) {
        next unless ($index =~ /^$checkstr$/);

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
660
# Placeholder for creating the mapping between index descriptions and
# directory names; subclasses are expected to override it.  The base
# version reports on STDERR that it was reached and returns a reference to
# an empty hash.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
669
# Returns a two character abbreviation of a field description.
#
# If the field has two or more comma separated components, the result is
# the first letdig (\w) character of each of the first two components.
# Otherwise it is the first two letdig characters of the field.  A missing
# first character defaults to 'a' and a missing second one to '0'.  A
# result made up of two digits is replaced by 'a' followed by the first
# digit, because the Greenstone runtime doesn't like all-digit names.
#
# Returns "" if $field is undefined or contains no non-whitespace.
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\S/);

    # Named $first/$second rather than $a/$b so the globals used by sort
    # are not shadowed.
    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # pick the first letdig from the first two field names
        ($first) = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    } else {
        # pick the first two letdig chars; the second one is optional so
        # that a field with a single letdig still contributes it instead
        # of falling through to both defaults
        ($first, $second) = $field =~ /^[^\w]*(\w)(?:[^\w]*?(\w))?/;
    }
    # there may not have been any letdigs...
    $first = 'a' unless defined $first;
    $second = '0' unless defined $second;

    my $newfield = "$first$second";
    if ($newfield =~ /^\d\d$/) {
        # digits only - Greenstone runtime doesn't like this.
        $newfield = "a$first";
    }
    return $newfield;

}
704
# Advances the name referenced by $nameref to its next version, in place.
# A name ending in two digits has that number incremented; a name ending
# in one digit has the digit incremented (9 becomes 10); any other name
# has its last character replaced by 0.
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $version = $1;
        $version++;
        $$nameref =~ s/\d\d$/$version/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $digit = $1;
        my $version = ($digit == 9) ? 10 : $digit + 1;
        $$nameref =~ s/\d$/$version/;
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
720
721
722
# Copies the metadata set usage gathered by the buildproc (its
# 'mdprefix_fields' table) into $collection_infodb.  For every prefix this
# appends:
#   metadataset                      - the prefix itself
#   metadatalist-<prefix>            - each field seen under that prefix
#   metadatafreq-<prefix>-<field>    - the value stored for that field
sub get_collection_meta_sets
{
    my ($self, $collection_infodb) = @_;

    my $fields_by_prefix = $self->{'buildproc'}->{'mdprefix_fields'};

    for my $prefix (keys %$fields_by_prefix)
    {
        push(@{$collection_infodb->{"metadataset"}}, $prefix);

        for my $field (keys %{$fields_by_prefix->{$prefix}})
        {
            my $list_key = "metadatalist-$prefix";
            push(@{$collection_infodb->{$list_key}}, $field);

            my $freq_key = "metadatafreq-$prefix-$field";
            push(@{$collection_infodb->{$freq_key}},
                 $fields_by_prefix->{$prefix}->{$field});
        }
    }
}
742
743
# Writes the "collection" entry to the info database.  By default this
# holds the metadata sets (prefixes) used in the collection, as gathered by
# get_collection_meta_sets().
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);

    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle,
                                "collection", $collection_infodb);
}
754
# Reads an existing build configuration, which is needed when each stage
# of building is run separately or when building incrementally.
#
# The file is build.cfg in gs2 mode and buildConfig.xml otherwise.  It is
# looked for in the build directory first and then in
# $GSDLCOLLECTDIR/index.  Returns undef if neither exists, otherwise
# whatever colcfg::read_building_cfg returns.
sub read_build_cfg {
    my ($self) = @_;

    my $cfg_name = ($gs_mode eq "gs2") ? "build.cfg" : "buildConfig.xml";

    my $cfg_path = &FileUtils::filenameConcatenate($self->{'build_dir'}, $cfg_name);

    unless (-e $cfg_path) {
        # try the index dir - but do we know where it is?? try here
        $cfg_path = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "index", $cfg_name);

        # we can't find a config file - just ignore the field list
        return undef unless (-e $cfg_path);
    }

    return &colcfg::read_building_cfg($cfg_path, $gs_mode);

}
782
# Prints a short statistics report for the pass the buildproc has just
# completed (index creation or text compression) to the output handle.
#
# When fewer than 50 bytes were processed, and the pass was either an
# indexing pass or text is being kept, a further note is printed: a plain
# "nothing added" line for an incremental build that processed 0 bytes, or
# a boxed warning for a non-incremental build.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {

        if (!$self->{'incremental'}) {
            print $outhandle "***************\n";
            if ($indexing_text) {
                print $outhandle "WARNING: There is very little or no text to process for $index\n";
            }
            elsif (!$self->{'no_text'}) {
                print $outhandle "WARNING: There is very little or no text to compress\n";
            }
            print $outhandle " Was this your intention?\n";
            print $outhandle "***************\n";
        }
        elsif ($num_processed_bytes == 0) {
            if ($indexing_text) {
                print $outhandle "No additional text was added to $index\n";
            }
            elsif (!$self->{'no_text'}) {
                print $outhandle "No additional text was compressed\n";
            }
        }

    }

}
825
826
8271;
828
Note: See TracBrowser for help on using the repository browser.