source: main/trunk/greenstone2/perllib/basebuilder.pm@ 24193

Last change on this file since 24193 was 24070, checked in by ak19, 13 years ago

Fixed build error message about uninitialised variables, which was due to my having used uppercase characters in strings in instances where the config file parser is set to expect the same strings in all lower-case.

  • Property svn:keywords set to Author Date Id Revision
File size: 25.1 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
34use dbutil;
35use plugin;
36use util;
37
38
BEGIN {
    # Switch both standard streams to autoflush as soon as this module is
    # compiled, so that output written by mgpp does not get out of sync with
    # output written by the plugins.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}
45
END {
    # Turn autoflush back off on both standard streams when the interpreter
    # shuts down.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
50
# NOTE(review): $maxdocsize is not referenced anywhere else in this file;
# presumably subclasses or callers read it -- confirm before changing it.
our $maxdocsize = 12000;

# Which flavour of Greenstone is being built for: 'gs2' (the default) or
# 'gs3'.  new() overwrites it with the mode reported by
# colcfg::get_collect_cfg_name().
our $gs_mode = 'gs2';
55
# Constructor.  Stores the build parameters on a new object blessed into
# $class, reads the collection configuration and derives the settings that
# depend on it.
#
# Optional arguments and their defaults when undefined:
#   $outhandle, $failhandle -> STDERR
#   $no_text, $gli          -> 0
#
# Side effect: the package variable $gs_mode is set to the mode returned by
# colcfg::get_collect_cfg_name().
#
# Returns the new object.  Subclass-dependent set-up is done later by init().
sub new {
    my $class = shift;
    my ($collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # fill in defaults for the optional arguments
    if (!defined $outhandle)  { $outhandle  = *STDERR; }
    if (!defined $no_text)    { $no_text    = 0; }
    if (!defined $failhandle) { $failhandle = *STDERR; }
    if (!defined $gli)        { $gli        = 0; }

    # create a builder object
    my %fields = (
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_mode'             => $incremental_mode,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},   # indexes not built
        'gli'                          => $gli,
    );
    my $self = bless \%fields, $class;

    # Find and read the collection configuration file; finding it also
    # tells us which mode (gs2/gs3) we are running in.
    my $colcfgname;
    ($colcfgname, $gs_mode) = &colcfg::get_collect_cfg_name($outhandle);
    $self->{'collect_cfg'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);

    if ($gs_mode eq "gs3") {
        # Keep a second, untouched copy for writing out buildConfig.xml
        # later: $self->{'collect_cfg'}->{'classify'} gets modified when
        # &classify::load_classifiers is called.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);
    }

    # database type from the collection config, or the default if it is
    # not set there
    $self->{'infodbtype'} = $self->{'collect_cfg'}->{'infodbtype'}
        || &dbutil::get_default_infodb_type();

    # turn the 'dontgdbm' list (if any) into a lookup table of dontdb fields
    my %dontdb = ();
    if (defined($self->{'collect_cfg'}->{'dontgdbm'})) {
        %dontdb = map { ($_ => 1) } @{$self->{'collect_cfg'}->{'dontgdbm'}};
    }
    $self->{'dontdb'} = \%dontdb;

    $self->{'maxnumeric'} = 4;
    return $self;
}
113
# Second stage of construction, kept out of new() so that it can call
# methods which subclasses provide or override (generate_index_list(),
# default_buildproc(), is_incremental_capable(), ...; the first two are not
# defined in this base class).
#
# In order, it:
#   * expands the configured indexes with subcollection and language parts
#     and removes duplicate entries,
#   * reverts to a non-incremental build if incremental building was asked
#     for but is_incremental_capable() says the indexer cannot do it,
#   * loads the plugins (dies if none were loaded) and the classifiers,
#   * loads and instantiates the buildproc (document processor) class,
#   * unless -debug or -keepold is set, removes the old build directory and
#     recreates it together with its "text" subdirectory,
#   * calls init_for_incremental_build() for incremental builds.
sub init {
    my $self = shift(@_);

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    $self->generate_index_list();
    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    if (defined $indexes) {
        # sort out subcollection indexes: every index is built once per
        # subcollection, named "index:subcollection"
        if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
            my @expanded = ();
            foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
                push(@expanded, map { join(":", $_, $subcollection) } @$indexes);
            }
            $self->{'collect_cfg'}->{'indexes'} = \@expanded;
        }

        # sort out language subindexes: the language is the third
        # colon-separated part, so when there are no subcollections an
        # empty subcollection field is inserted ("index::language")
        if (defined $self->{'collect_cfg'}->{'languages'}) {
            my $separator = defined($self->{'collect_cfg'}->{'indexsubcollections'}) ? ":" : "::";
            my @expanded = ();
            $indexes = $self->{'collect_cfg'}->{'indexes'};
            foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
                push(@expanded, map { $_ . $separator . $language } @$indexes);
            }
            $self->{'collect_cfg'}->{'indexes'} = \@expanded;
        }
    }

    if (defined($self->{'collect_cfg'}->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        # (the first occurrence wins, order is preserved)
        my %seen = ();
        $self->{'collect_cfg'}->{'indexes'} =
            [ grep { !$seen{$_}++ } @{$self->{'collect_cfg'}->{'indexes'}} ];
    } else {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }

    # check incremental against whether builder can cope or not.
    if ($self->{'incremental'} && !$self->is_incremental_capable()) {
        print $outhandle "WARNING: The indexer used is not capable of incremental building. Reverting to -removeold\n";
        $self->{'keepold'} = 0;
        $self->{'incremental'} = 0;
        $self->{'incremental_mode'} = "none";
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $self->{'verbosity'}, $outhandle, $failhandle, \@global_opts, $self->{'incremental_mode'});

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $self->{'build_dir'}, $outhandle);

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc for the builder we are initialising
    my ($buildprocdir, $buildproctype);
    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = $self->default_buildproc();
    }
    require "$buildprocdir/$buildproctype.pm";

    # $buildproctype holds the class name as a string; calling new() on it
    # directly does what the old string eval of "new $buildproctype(...)"
    # did, and lets any exception from the constructor propagate as is.
    $self->{'buildproc'} = $buildproctype->new($self->{'collection'},
                                               $self->{'source_dir'},
                                               $self->{'build_dir'},
                                               $self->{'keepold'},
                                               $self->{'verbosity'},
                                               $self->{'outhandle'});

    # We call set_infodbtype() now so the buildproc knows the infodbtype for all phases of the build
    $self->{'buildproc'}->set_infodbtype($self->{'infodbtype'});

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

    if ($self->{'incremental'}) {
        # some classes may need to do some additional initialisation
        $self->init_for_incremental_build();
    }

}
250
# Says whether this builder can update an existing build incrementally.
# The base class always answers 0 ("no"), which is the safe assumption;
# subclasses that can build incrementally override this method.
sub is_incremental_capable {
    return 0;
}
259
# Hook called at the end of init() when an incremental build is in progress.
# It does nothing here; override it in a subclass that needs additional
# initialisation for incremental building.
sub init_for_incremental_build {
    my $self = shift;
}
265
# Shuts the plugins down by handing the loaded plugin list and the document
# processor to plugin::deinit().
sub deinit {
    my $self = shift;

    my $pluginfo  = $self->{'pluginfo'};
    my $buildproc = $self->{'buildproc'};
    &plugin::deinit($pluginfo, $buildproc);
}
271
# Works out the index options from the collection configuration.
# separate_cjk is switched on when any entry of the 'indexoptions' list
# contains the text "separate_cjk"; the resulting flag (1 or 0) is passed to
# the buildproc and also remembered on the builder for build.cfg.
sub generate_index_options {
    my $self = shift;

    my $separate_cjk = 0;
    if (defined($self->{'collect_cfg'}->{'indexoptions'})) {
        my @wanted = grep { /separate_cjk/ } @{$self->{'collect_cfg'}->{'indexoptions'}};
        $separate_cjk = 1 if scalar(@wanted);
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
    # record it for build.cfg
    $self->{'separate_cjk'} = $separate_cjk;
}
289
# Forwards the sections_index_document_metadata setting to the buildproc.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    return $self->{'buildproc'}->set_sections_index_document_metadata($index);
}
296
# Stores the maxnumeric setting on the builder; make_auxiliary_files()
# later writes it into the build configuration.
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and passes it on to the
# buildproc.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    return $self->{'buildproc'}->set_strip_html($strip);
}
310
# Placeholder for text compression: a concrete builder subclass is expected
# to override this.  The base version only reports on STDERR that it is
# missing and returns nothing.
#
# $textindex is accepted for interface compatibility and is not used here.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    # End the message with a newline, like the other "implement in
    # subclass" stubs in this file, so that it does not run into whatever
    # is printed to STDERR next.
    print STDERR "compress_text() should be implemented in subclass!!\n";
    return;
}
318
319
# Builds the indexes of the collection.
#
# When $indexname contains at least one word character only that index is
# considered; otherwise every index listed in the collection configuration
# is.  The index-to-directory mapping is (re)created first, then each index
# that want_built() accepts is passed to build_index(); finally
# build_indexes_extra() is called.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n";
        }
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }

    $self->build_indexes_extra();
}
351
# Hook called by build_indexes() once all the indexes have been dealt with.
# It does nothing here; override it in a subclass that has extra work to do
# at that point.
sub build_indexes_extra {
    my $self = shift;
}
358
# Placeholder for building a single index: a concrete builder subclass is
# expected to override this.  The base version only reports on STDERR that
# it is missing and returns nothing.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
366
367
368
# Creates the collection's info database and processes associated files.
#
# Runs every document through the buildproc in 'infodb' mode (text is stored
# but not indexed), then writes the collection metadata, the classifier
# information and a "browselist" entry listing every document.  With -debug
# the records go to STDOUT instead of a database write handle.  For an
# incremental build the existing database is read first and the
# reconstructed documents are fed back through the buildproc after the
# plugins have run.
#
# Dies if the database write handle cannot be opened.
sub make_infodatabase {
    my $self = shift;
    my $outhandle = $self->{'outhandle'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    # make sure the text and assoc directories exist
    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir($textdir);
    &util::mk_all_dir($assocdir);

    # work out where the info database lives
    my $infodb_type = $self->{'infodbtype'};
    my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $textdir);

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # init all the classifiers
    &classify::init_classifiers($self->{'classifiers'});

    my ($reconstructed_docs, $database_recs);

    if ($self->{'incremental'}) {
        $database_recs = {};
        &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $database_recs);
    }

    # The write handle must be obtained HERE, before $reconstructed_docs is
    # built (when -incremental is on).  Opening a pipe to txt2db forks and
    # then execs.  $reconstructed_docs can get very large, so forking after
    # it exists would clone the *large* process image; although exec
    # replaces it almost at once with the much smaller txt2db image, for
    # that moment the system needs all of that memory to be available, and
    # the result is an out of memory error.
    my $infodb_handle;
    if (!$self->{'debug'}) {
        $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
        if (!defined($infodb_handle)) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }
    else {
        $infodb_handle = *STDOUT;
    }

    if ($self->{'incremental'}) {
        # reconstruct doc_obj metadata from database for all docs
        $reconstructed_docs = &classify::reconstruct_doc_objs_metadata($infodb_type,
                                                                       $infodb_file_path,
                                                                       $database_recs);
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle($infodb_handle);
    $buildproc->set_mode('infodb');
    $buildproc->set_assocdir($assocdir);
    $buildproc->set_dontdb($self->{'dontdb'});
    $buildproc->set_classifiers($self->{'classifiers'});
    $buildproc->set_indexing_text(0);
    $buildproc->set_store_text(1);
    $buildproc->set_store_metadata_coverage($self->{'collect_cfg'}->{'store_metadata_coverage'});

    # A full reset is needed even for an incremental build, because
    # incremental building works by reconstructing all docs from the
    # database and then adding in the new ones.
    $buildproc->zero_reset();

    $buildproc->{'mdprefix_fields'} = {};

    &plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    if ($self->{'incremental'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj (@$reconstructed_docs) {
            next if defined $buildproc->{'dont_process_reconstructed'}->{$doc_obj->get_OID()};

            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $buildproc->process($doc_obj, undef);
        }
    }

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info($self->{'classifiers'}, $infodb_type, $infodb_handle,
                                    $self->{'remove_empty_classifications'},
                                    $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    ####&classify::print_reverse_lookup($infodb_handle);

    # output doclist
    my @doc_list = $buildproc->get_doc_list();
    my %browselist_infodb = (
        'hastxt'      => [ "0" ],
        'childtype'   => [ "VList" ],
        'numleafdocs' => [ scalar(@doc_list) ],
        'thistype'    => [ "Invisible" ],
        'contains'    => [ join(";", @doc_list) ],
    );
    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", \%browselist_infodb);

    # STDOUT (debug mode) is not ours to close
    if (!$self->{'debug'}) {
        &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle);
    }

    if ($infodb_type eq "gdbm-txtgz") {
        # remove the plain gdbm file for this collection if one exists
        my $gdb_infodb_file_path = &dbutil::get_infodb_file_path("gdbm", $self->{'collection'}, $textdir);
        &util::rm($gdb_infodb_file_path) if (-e $gdb_infodb_file_path);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
497
# Writes the build configuration file for the collection: build.cfg in gs2
# mode, buildConfig.xml in gs3 mode.
#
# The configuration starts from $self->{'build_cfg'} if a subclass has
# already put one there, and is filled in with: build date and type, index
# stem, stem indexes, separate_cjk, document/section/byte counts, the index,
# subcollection and language maps, the list of indexes that were not built,
# maxnumeric, the info database type and the earliestDatestamp used for OAI.
# Subclasses can add more through build_cfg_extra().
sub make_auxiliary_files {
    my $self = shift (@_);

    my $build_cfg = {};
    # subclasses may have already defined stuff in here
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    if ($self->{'separate_cjk'}) {
        $build_cfg->{'separate_cjk'} = "true";
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, $index . "->" . $self->{'index_mapping'}->{'indexmap'}->{$index});
        }
    }
    $build_cfg->{'indexmap'} = \@indexmap if scalar (@indexmap);

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    # indexes that want_built() rejected
    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    # write out the earliestDatestamp information needed for OAI
    # (looked for in the archives directory, or in export if there is no
    # archives directory)
    my $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
    if(!-d $archivedir) {
        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
    }
    my $earliestDatestampFile = &util::filename_cat ($archivedir, "earliestDatestamp");
    my $earliestDatestamp = 0;
    # Three-argument open with a lexical handle: the file name is never
    # interpreted as an open mode, and no global FIN handle is left behind.
    if (open(my $fin, '<', $earliestDatestampFile)) {
        {
            # slurp in file as a single line
            local $/ = undef;
            $earliestDatestamp = <$fin>;
            #&unicode::ensure_utf8(\$earliestDatestamp); # turn any high bytes that aren't valid utf-8 into utf-8.
        }
        close($fin);
    }
    else {
        print $outhandle "Warning: unable to read collection's earliestDatestamp from $earliestDatestampFile.\n";
        print $outhandle "Setting value to 0.\n";
    }
    $build_cfg->{'earliestdatestamp'} = $earliestDatestamp;

    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg(&util::filename_cat($self->{'build_dir'},"build.cfg"), $build_cfg);
    }
    if ($gs_mode eq "gs3") {
        # the preserved, unmodified collection config is passed along too
        &colcfg::write_build_cfg_xml(&util::filename_cat($self->{'build_dir'}, "buildConfig.xml"), $build_cfg, $self->{'collect_cfg_preserve'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
597
# Hook called by make_auxiliary_files() just before the build configuration
# is written.  It does nothing here; override it in a subclass to add extra
# entries to the $build_cfg hash.
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;
}
604
605
# Empty hook; does nothing in the base class.
# NOTE(review): presumably meant for collection-specific processing in
# subclasses -- nothing in this file calls it, so confirm against callers.
sub collect_specific {
    my $self = shift;
}
609
# Decides whether $index should be built.
#
# Each entry of the collection configuration's 'dontbuild' list is used as a
# regular expression that must match the whole index name.  On the first
# match the index is recorded in $self->{'notbuilt'} and 0 is returned;
# otherwise (or when there is no 'dontbuild' list) 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined($dontbuild);

    foreach my $checkstr (@$dontbuild) {
        next unless ($index =~ /^$checkstr$/);

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
625
# Placeholder for creating the index-name-to-directory mapping: a concrete
# builder subclass is expected to override this.  The base version reports
# on STDERR that it is missing and returns a reference to an empty hash.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
634
# Returns a two-character code derived from a field description.
#
#  * An undefined or all-whitespace field gives "".
#  * If the field has two or more comma-separated components, the code is
#    the first word (\w) character of each of the first two components.
#  * Otherwise it is the first two word characters of the field.  If the
#    field does not contain two word characters that pattern does not match
#    at all, so neither character is taken from the field.
#  * A character that could not be found defaults to 'a' (first) or '0'
#    (second).
#  * A code made of two digits is replaced by "a" plus its first digit,
#    because the Greenstone runtime doesn't like digit-only codes.
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\S/);

    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar @components < 2) {
        # a single component: take its first two word characters
        ($first, $second) = $field =~ /^[^\w]*(\w)[^\w]*?(\w)/i;
    } else {
        # several components: first word character of each of the first two
        ($first)  = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    }

    # there may not have been any word characters...
    $first  = 'a' unless defined $first;
    $second = '0' unless defined $second;

    my $code = $first . $second;
    # digits only - Greenstone runtime doesn't like this.
    return ($code =~ /^\d\d$/) ? "a$first" : $code;
}
669
# Advances the name referred to by $nameref to its next version, in place.
#
#  * A name ending in two digits has them incremented as a string, so a
#    leading zero is kept ("01" -> "02") and "99" becomes "100".
#  * A name ending in a single digit has it incremented ("9" -> "10").
#  * Any other name has its last character replaced by "0".
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        # $suffix is only ever used as a string, so ++ does a string
        # increment here -- do not turn this into "$1 + 1"
        my $suffix = $1;
        $suffix++;
        $$nameref =~ s/\d\d$/$suffix/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $next = $1 + 1;
        $$nameref =~ s/\d$/$next/;
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
685
686
687
# Adds the metadata sets used in the collection to the $collection_infodb
# hash (passed by reference), reading them from the buildproc's
# 'mdprefix_fields' table.  For every prefix it appends:
#   "metadataset"                   -> the prefix
#   "metadatalist-<prefix>"         -> each field name under that prefix
#   "metadatafreq-<prefix>-<field>" -> the value stored for that field
sub get_collection_meta_sets {
    my ($self, $collection_infodb) = @_;

    my $mdprefix_fields = $self->{'buildproc'}->{'mdprefix_fields'};
    for my $prefix (keys %$mdprefix_fields) {
        push @{ $collection_infodb->{"metadataset"} }, $prefix;

        for my $field (keys %{ $mdprefix_fields->{$prefix} }) {
            my $freq = $mdprefix_fields->{$prefix}->{$field};
            push @{ $collection_infodb->{"metadatalist-$prefix"} }, $field;
            push @{ $collection_infodb->{"metadatafreq-$prefix-$field"} }, $freq;
        }
    }
}
707
708
# Writes the "collection" entry to the info database.  By default this entry
# holds the metadata sets (prefixes) used in the collection, as gathered by
# get_collection_meta_sets().
sub output_collection_meta {
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
719
# Reads an existing build configuration file, which is needed when the
# stages of a build are run separately or when building incrementally.
#
# The file is build.cfg in gs2 mode and buildConfig.xml otherwise.  It is
# looked for in the build directory first and then in the collection's
# "index" directory.  Returns what colcfg::read_building_cfg() returns for
# the file found, or undef when neither location has one.
sub read_build_cfg {
    my $self = shift;

    my $buildconfigfilename = ($gs_mode eq "gs2") ? "build.cfg" : "buildConfig.xml";

    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, $buildconfigfilename);
    unless (-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", $buildconfigfilename);

        # we cant find a config file - just ignore the field list
        return undef unless (-e $buildconfigfile);
    }

    return &colcfg::read_building_cfg($buildconfigfile, $gs_mode);
}
747
# Prints statistics about the pass the buildproc has just made to the
# builder's output handle: whether an index was created or text was
# compressed, the total bytes in the collection and the bytes processed.
#
# When fewer than 50 bytes were processed (and text was expected at all), a
# non-incremental build gets a prominent warning, whereas an incremental
# build only gets a note, and only if nothing at all was processed.
sub print_stats {
    my $self = shift;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {

        if (!$self->{'incremental'}) {
            print $outhandle "***************\n";
            if ($indexing_text) {
                print $outhandle "WARNING: There is very little or no text to process for $index\n";
            } elsif (!$self->{'no_text'}) {
                print $outhandle "WARNING: There is very little or no text to compress\n";
            }
            print $outhandle " Was this your intention?\n";
            print $outhandle "***************\n";
        }
        elsif ($num_processed_bytes == 0) {
            # incremental build that found nothing new
            if ($indexing_text) {
                print $outhandle "No additional text was added to $index\n";
            } elsif (!$self->{'no_text'}) {
                print $outhandle "No additional text was compressed\n";
            }
        }

    }

}
790
791
7921;
793
Note: See TracBrowser for help on using the repository browser.