source: gsdl/trunk/perllib/basebuilder.pm@ 20647

Last change on this file since 20647 was 20647, checked in by kjdon, 15 years ago

now pass incremental_mode to new method. check keepold/incremental based on self->is_incremental_capable, and set to 0 if not incremental capable. moved load plugins and classifiers to init so that we can use subclass methods

  • Property svn:keywords set to Author Date Id Revision
File size: 24.0 KB
RevLine 
[14930]1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
[15709]34use dbutil;
[14930]35use plugin;
36use util;
37
[15709]38
BEGIN {
    # STDOUT->autoflush / STDERR->autoflush are IO::Handle methods.  Perl
    # versions before 5.14 do not load IO::Handle on demand, so load it
    # explicitly instead of relying on an earlier "use" having pulled it in.
    require IO::Handle;

    # set autoflush on for STDERR and STDOUT so that mgpp
    # doesn't get out of sync with plugins
    STDOUT->autoflush(1);
    STDERR->autoflush(1);
}

END {
    # turn autoflush back off on both handles when the interpreter exits
    STDOUT->autoflush(0);
    STDERR->autoflush(0);
}

# package-level document size limit (not read anywhere in the visible part
# of this file)
our $maxdocsize = 12000;

# used to signify "gs2"(default) or "gs3"; overwritten in new() with the
# mode reported by &colcfg::get_collect_cfg_name
our $gs_mode = "gs2";
[14930]55
# Constructor.  Stores the build parameters on a new object blessed into
# $class and reads the collection configuration file.
#
# Arguments are positional:
#   $class, $collection, $source_dir, $build_dir, $verbosity, $maxdocs,
#   $debug, $keepold, $incremental, $incremental_mode,
#   $remove_empty_classifications, $outhandle, $no_text, $failhandle,
#   $gli, $disable_OAI
# $outhandle and $failhandle default to STDERR; $no_text, $gli and
# $disable_OAI default to 0 when undefined.
#
# Side effect: sets the package variable $gs_mode from
# &colcfg::get_collect_cfg_name.
#
# Returns the new builder object.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli, $disable_OAI) = @_;

    # defaults for the optional trailing arguments
    if (!defined $outhandle)  { $outhandle  = *STDERR; }
    if (!defined $no_text)    { $no_text    = 0; }
    if (!defined $failhandle) { $failhandle = *STDERR; }
    if (!defined $gli)        { $gli        = 0; }

    # disable_OAI applies to greenstone 3 only and is only passed to
    # &colcfg::write_build_cfg_xml when writing the buildConfig.xml
    if (!defined $disable_OAI) { $disable_OAI = 0; }

    # create a builder object
    my %fields = (
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_mode'             => $incremental_mode,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},    # indexes not built
        'gli'                          => $gli,
        'disable_OAI'                  => $disable_OAI,
    );
    my $self = bless \%fields, $class;

    # Read in the collection configuration file.
    my $colcfgname;
    ($colcfgname, $gs_mode) = &colcfg::get_collect_cfg_name($outhandle);
    $self->{'collect_cfg'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);

    if ($gs_mode eq "gs3") {
        # Read it in a second time so the original form is still available
        # when buildConfig.xml is written: the 'classify' entry of
        # $self->{'collect_cfg'} gets modified by &classify::load_classifiers.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);
    }

    # get the database type for this collection from the collect.cfg file
    # (may be undefined, in which case the dbutil default is used)
    $self->{'infodbtype'} = $self->{'collect_cfg'}->{'infodbtype'}
        || &dbutil::get_default_infodb_type();

    # load up any dontdb fields (configured under the 'dontgdbm' key)
    my %dontdb = ();
    if (defined($self->{'collect_cfg'}->{'dontgdbm'})) {
        $dontdb{$_} = 1 foreach @{$self->{'collect_cfg'}->{'dontgdbm'}};
    }
    $self->{'dontdb'} = \%dontdb;

    $self->{'maxnumeric'} = 4;
    return $self;
}
117
# Second-stage initialisation.  This work was moved here from new() so that
# subclass methods (generate_index_list, is_incremental_capable,
# default_buildproc, init_for_incremental_build) can be used.
#
# In order, this:
#   - expands the configured index list by subcollection and language and
#     removes duplicate entries;
#   - turns keepold/incremental off when the builder is not incremental
#     capable;
#   - loads the plugins (dies if none were loaded) and the classifiers;
#   - locates, requires and instantiates the buildproc class;
#   - unless debugging or keeping the old build, removes and recreates the
#     build directory;
#   - calls init_for_incremental_build() for incremental builds.
sub init {
    my $self = shift(@_);

    my $outhandle  = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    $self->generate_index_list();

    my $cfg     = $self->{'collect_cfg'};
    my $indexes = $cfg->{'indexes'};
    if (defined $indexes) {
        my $has_subcollections = defined $cfg->{'indexsubcollections'};

        # sort out subcollection indexes
        if ($has_subcollections) {
            my @expanded = ();
            foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
                foreach my $index (@$indexes) {
                    push(@expanded, $index . ":" . $subcollection);
                }
            }
            $cfg->{'indexes'} = \@expanded;
        }

        # sort out language subindexes
        if (defined $cfg->{'languages'}) {
            # when there are no subcollections, add in an empty
            # subcollection field, i.e. "index::language"
            my $separator = $has_subcollections ? ":" : "::";
            my @expanded = ();
            $indexes = $cfg->{'indexes'};
            foreach my $language (@{$cfg->{'languages'}}) {
                foreach my $index (@$indexes) {
                    push(@expanded, $index . $separator . $language);
                }
            }
            $cfg->{'indexes'} = \@expanded;
        }
    }

    if (defined($cfg->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        # (first occurrence wins, order is kept)
        my %seen = ();
        $cfg->{'indexes'} = [ grep { !$seen{$_}++ } @{$cfg->{'indexes'}} ];
    }
    else {
        $cfg->{'indexes'} = [];
    }

    # check incremental against whether builder can cope or not.
    if (($self->{'keepold'} || $self->{'incremental'}) && !$self->is_incremental_capable()) {
        print $outhandle "WARNING: The indexer used is not capable of incremental building. Reverting to -removeold\n";
        $self->{'keepold'} = 0;
        $self->{'incremental'} = 0;
        $self->{'incremental_mode'} = "none";
    }

    # get the list of plugins for this collection
    my $plugins = defined $cfg->{'plugin'} ? $cfg->{'plugin'} : [];

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $cfg->{'separate_cjk'} && $cfg->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $self->{'verbosity'}, $outhandle, $failhandle, \@global_opts, $self->{'incremental_mode'});

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = defined $cfg->{'classify'} ? $cfg->{'classify'} : [];

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $self->{'build_dir'}, $outhandle);

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc for the builder we are initialising
    my $collection = $self->{'collection'};
    my @candidates = (
        [ "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib", "custombuildproc" ],
        [ "$ENV{'GSDLCOLLECTDIR'}/perllib",                      "custombuildproc" ],
        [ "$ENV{'GSDLCOLLECTDIR'}/perllib",                      "${collection}buildproc" ],
    );

    my ($buildprocdir, $buildproctype);
    foreach my $candidate (@candidates) {
        my ($dir, $type) = @$candidate;
        next unless -e "$dir/$type.pm";
        ($buildprocdir, $buildproctype) = ($dir, $type);
        last;
    }
    if (!defined $buildproctype) {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = $self->default_buildproc();
    }
    require "$buildprocdir/$buildproctype.pm";

    # Instantiate via an ordinary class-method call on the class name held
    # in $buildproctype; no string eval or indirect-object syntax needed.
    # Any failure is re-thrown in stringified form.
    eval {
        $self->{'buildproc'} = $buildproctype->new($self->{'collection'},
            $self->{'source_dir'}, $self->{'build_dir'}, $self->{'keepold'},
            $self->{'verbosity'}, $self->{'outhandle'});
    };
    die "$@" if $@;

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

    if ($self->{'incremental'}) {
        # some classes may need to do some additional initialisation
        $self->init_for_incremental_build();
    }
}
252
# Reports whether this builder can build incrementally.
# The base class always answers 0 ("no"): it is safer to assume a builder
# is non-incremental and have capable subclasses override this method.
sub is_incremental_capable
{
    my $capable = 0;
    return $capable;
}
261
# Hook called from init() when the build is incremental.
# Does nothing here; override it in a subclass that needs additional
# initialisation for an incremental build.
sub init_for_incremental_build {
    my $self = shift;
}
267
# Hands the loaded plugins and the buildproc to &plugin::deinit and
# returns whatever that call returns.
sub deinit {
    my ($self) = @_;

    my $pluginfo  = $self->{'pluginfo'};
    my $buildproc = $self->{'buildproc'};
    return &plugin::deinit($pluginfo, $buildproc);
}
273
# Works out the separate_cjk flag from the collection's 'indexoptions'
# list: 1 if any option matches /separate_cjk/, otherwise 0.  The flag is
# passed to the buildproc and stored on $self for build.cfg.
sub generate_index_options {
    my ($self) = @_;

    my $options = $self->{'collect_cfg'}->{'indexoptions'};

    my $separate_cjk = 0;
    if (defined($options)) {
        $separate_cjk = 1 if grep { $_ =~ /separate_cjk/ } @{$options};
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
    # record it for build.cfg
    $self->{'separate_cjk'} = $separate_cjk;
}
291
# Forwards $index to the buildproc's method of the same name and returns
# that method's result.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    my $buildproc = $self->{'buildproc'};
    return $buildproc->set_sections_index_document_metadata($index);
}
298
# Stores $maxnumeric on the builder (later written to build.cfg by
# make_auxiliary_files).
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and passes it on to the
# buildproc.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    return $self->{'buildproc'}->set_strip_html($strip);
}
312
# Placeholder: text compression must be provided by a subclass.  The base
# version only prints a reminder on STDERR and returns nothing.
sub compress_text {
    my ($self, $textindex) = @_;

    print STDERR "compress_text() should be implemented in subclass!!";
    return;
}
320
321
# Builds the collection's indexes.
#
# $indexname - optional; when it contains a word character only that index
#              is built, otherwise every index in collect_cfg 'indexes'.
#
# Stores the result of create_index_mapping() in $self->{'index_mapping'},
# calls build_index() for each index that want_built() accepts, and ends
# by calling build_indexes_extra().
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes;
    if (defined $indexname && $indexname =~ /\w/) {
        $indexes = [ $indexname ];
    }
    else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n";
        }
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }

    $self->build_indexes_extra();
}
353
# Hook called by build_indexes() after all the indexes have been built.
# Does nothing here; override it in a subclass to do extra work at that
# point.
sub build_indexes_extra {
    my $self = shift;
}
360
# Placeholder: building a single index must be provided by a subclass.
# The base version only prints a reminder on STDERR and returns nothing.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
368
369
370
# Creates the collection's info database.
#
# Creates the "text" and "assoc" directories under build_dir, initialises
# the classifiers, runs every document through the buildproc in 'infodb'
# mode (preceded, with keepold, by the documents reconstructed from the
# existing database), then writes the collection metadata, the classifier
# information and a "browselist" entry.
#
# With debug set the entries go to STDOUT; otherwise they go to a handle
# from &dbutil::open_infodb_write_handle, and the sub dies if that handle
# cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    my $textdir  = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir($textdir);
    &util::mk_all_dir($assocdir);

    # Get info database file path
    my $infodb_type = $self->{'infodbtype'};
    my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $textdir);

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # init all the classifiers
    &classify::init_classifiers($self->{'classifiers'});

    my $reconstructed_docs = undef;
    my $database_recs = undef;

    if ($self->{'keepold'}) {
        # pull in the records of the existing database
        $database_recs = {};
        &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $database_recs);
    }

    # ORDER MATTERS (memory usage): the write handle is opened here, before
    # $reconstructed_docs is populated.  Per the original author's note,
    # opening the pipe to txt2db does a fork() followed by exec(), and
    # $reconstructed_docs can get very large; forking afterwards would
    # briefly need memory for a copy of that large process image, which
    # resulted in out of memory errors.
    my $infodb_handle;
    if ($self->{'debug'}) {
        $infodb_handle = *STDOUT;
    }
    else {
        $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
        if (!defined($infodb_handle)) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }

    if ($self->{'keepold'}) {
        # reconstruct doc_obj metadata from database for all docs
        $reconstructed_docs = &classify::reconstruct_doc_objs_metadata($infodb_type,
                                                                       $infodb_file_path,
                                                                       $database_recs);
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_infodbtype($infodb_type);
    $buildproc->set_output_handle($infodb_handle);
    $buildproc->set_mode('infodb');
    $buildproc->set_assocdir($assocdir);
    $buildproc->set_dontdb($self->{'dontdb'});
    $buildproc->set_classifiers($self->{'classifiers'});
    $buildproc->set_indexing_text(0);
    $buildproc->set_store_text(1);
    $buildproc->set_store_metadata_coverage($self->{'collect_cfg'}->{'store_metadata_coverage'});

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from the database and
    # then adding in the new ones
    $buildproc->zero_reset();

    $buildproc->{'mdprefix_fields'} = {};

    if ($self->{'keepold'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj (@$reconstructed_docs) {
            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $buildproc->process($doc_obj, undef);
        }
    }

    # run the source documents through the plugins into the buildproc
    &plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info($self->{'classifiers'}, $infodb_type, $infodb_handle,
                                    $self->{'remove_empty_classifications'},
                                    $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    ####&classify::print_reverse_lookup($infodb_handle);

    # output doclist
    my @doc_list = $buildproc->get_doc_list();
    my %browselist_infodb = (
        'hastxt'      => [ "0" ],
        'childtype'   => [ "VList" ],
        'numleafdocs' => [ scalar(@doc_list) ],
        'thistype'    => [ "Invisible" ],
        'contains'    => [ join(";", @doc_list) ],
    );
    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", \%browselist_infodb);

    # STDOUT (debug mode) is left open
    if (!$self->{'debug'}) {
        &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle);
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
494
# Collects the build information (date, type, index stem, document counts,
# index/subcollection/language maps, unbuilt indexes, maxnumeric,
# infodbtype) and writes it into build_dir: as build.cfg when $gs_mode is
# "gs2", as buildConfig.xml when it is "gs3".
sub make_auxiliary_files {
    my ($self) = @_;

    # subclasses may have already defined stuff in here
    my $build_cfg = defined $self->{'build_cfg'} ? $self->{'build_cfg'} : {};

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'}   = time;
    $build_cfg->{'buildtype'}   = $self->{'buildtype'};
    $build_cfg->{'indexstem'}   = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    $build_cfg->{'separate_cjk'} = "true" if $self->{'separate_cjk'};

    # store the number of documents and number of bytes
    my $buildproc = $self->{'buildproc'};
    $build_cfg->{'numdocs'}     = $buildproc->get_num_docs();
    $build_cfg->{'numsections'} = $buildproc->get_num_sections();
    $build_cfg->{'numbytes'}    = $buildproc->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any
    # that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        next if defined($self->{'notbuilt'}->{$index});
        push(@indexmap, $index . "->" . $self->{'index_mapping'}->{'indexmap'}->{$index});
    }
    if (scalar(@indexmap)) {
        $build_cfg->{'indexmap'} = \@indexmap;
    }

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push(@subcollectionmap,
             $subcollection . "->" . $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    if (scalar(@subcollectionmap)) {
        $build_cfg->{'subcollectionmap'} = \@subcollectionmap;
    }

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push(@languagemap,
             $language . "->" . $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    if (scalar(@languagemap)) {
        $build_cfg->{'languagemap'} = \@languagemap;
    }

    # names of the indexes that want_built() rejected
    my @notbuilt = keys %{$self->{'notbuilt'}};
    if (scalar(@notbuilt)) {
        $build_cfg->{'notbuilt'} = \@notbuilt;
    }

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    # give subclasses the chance to add their own entries
    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        my $cfg_file = &util::filename_cat($self->{'build_dir'}, "build.cfg");
        &colcfg::write_build_cfg($cfg_file, $build_cfg);
    }
    if ($gs_mode eq "gs3") {
        my $xml_file = &util::filename_cat($self->{'build_dir'}, "buildConfig.xml");
        &colcfg::write_build_cfg_xml($xml_file, $build_cfg, $self->{'collect_cfg_preserve'}, $self->{'disable_OAI'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
572
# Hook called by make_auxiliary_files() with the build configuration hash
# just before it is written out.  Does nothing here; override it in a
# subclass to add extra entries to build.cfg.
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;
}
579
580
# Empty in the base class: nothing is done beyond unpacking $self.
sub collect_specific {
    my $self = shift;
}
584
# Decides whether $index should be built.
# Each entry of the collection's 'dontbuild' list is used as a pattern
# that must match the whole index name.  On the first match the index is
# recorded in $self->{'notbuilt'} and 0 is returned; otherwise returns 1.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    if (defined($dontbuild)) {
        foreach my $pattern (@{$dontbuild}) {
            next unless $index =~ /^$pattern$/;
            $self->{'notbuilt'}->{$index} = 1;
            return 0;
        }
    }

    return 1;
}
600
# Placeholder: the mapping from index descriptions to directory names
# must be provided by a subclass.  The base version prints a reminder on
# STDERR and returns a reference to an empty hash.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
609
# Returns a two-character processed version of a field.
# - undefined or whitespace-only field: returns "".
# - two or more comma-separated components: the first letdig (\w)
#   character of each of the first two components.
# - otherwise: the first two letdig characters of the field.
# A character that cannot be found is replaced by 'a' (first position) or
# '0' (second position); in the single-component case both defaults are
# used when the field has fewer than two letdig characters.
sub process_field {
    my ($self, $field) = @_;

    if (!defined($field) || $field !~ /\S/) {
        return "";
    }

    my ($first, $second);
    my @parts = split(/,/, $field);
    if (scalar(@parts) < 2) {
        # pick the first two letdig chars
        ($first, $second) = ($field =~ /^[^\w]*(\w)[^\w]*?(\w)/i);
    }
    else {
        # pick the first letdig from the first two field names
        ($first)  = ($parts[0] =~ /^[^\w]*(\w)/);
        ($second) = ($parts[1] =~ /^[^\w]*(\w)/);
    }

    # there may not have been any letdigs...
    $first  = 'a' if !defined($first);
    $second = '0' if !defined($second);

    return $first . $second;
}
639
# Advances the version suffix of the name referenced by $nameref, in place:
# - name ends in two digits: those two digits are replaced by their
#   incremented value;
# - name ends in one digit: 9 becomes 10, any other digit is incremented;
# - otherwise: the last character is replaced by 0.
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        # keep ++ on the captured string itself (not "$1 + 1")
        my $version = $1;
        $version++;
        $$nameref =~ s/\d\d$/$version/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $digit = $1;
        my $next = ($digit == 9) ? 10 : $digit + 1;
        $$nameref =~ s/\d$/$next/;
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
655
656
[14934]657
# Adds the metadata sets recorded by the buildproc (its 'mdprefix_fields'
# hash, keyed by prefix then field) to the hash referenced by
# $collection_infodb:
#   "metadataset"                   gets each prefix
#   "metadatalist-<prefix>"         gets each field of that prefix
#   "metadatafreq-<prefix>-<field>" gets the value stored for that field
sub get_collection_meta_sets
{
    my ($self, $collection_infodb) = @_;

    my $mdprefix_fields = $self->{'buildproc'}->{'mdprefix_fields'};
    foreach my $prefix (keys %$mdprefix_fields)
    {
        push(@{$collection_infodb->{"metadataset"}}, $prefix);

        my $list_key = "metadatalist-$prefix";
        foreach my $field (keys %{$mdprefix_fields->{$prefix}})
        {
            my $freq_key = "metadatafreq-$prefix-$field";
            push(@{$collection_infodb->{$list_key}}, $field);
            push(@{$collection_infodb->{$freq_key}}, $mdprefix_fields->{$prefix}->{$field});
        }
    }
}
[14934]677
678
# Writes the "collection" entry to the info database handle.
# By default the entry holds the metadata sets (prefixes) used in the
# collection, as gathered by get_collection_meta_sets().
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
[14934]689
# Reads an existing build configuration file - needed, for example, when
# each stage of building is run separately or for incremental building.
#
# The file is build.cfg when $gs_mode is "gs2" and buildConfig.xml
# otherwise.  It is looked for in build_dir first, then in
# $ENV{'GSDLCOLLECTDIR'}/index.
#
# Returns the result of &colcfg::read_building_cfg, or undef when no
# config file is found in either place.
sub read_build_cfg {
    my ($self) = @_;

    my $buildconfigfilename = ($gs_mode eq "gs2") ? "build.cfg" : "buildConfig.xml";

    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, $buildconfigfilename);

    unless (-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", $buildconfigfilename);
        unless (-e $buildconfigfile) {
            # we can't find a config file - just ignore the field list
            return undef;
        }
    }

    return &colcfg::read_building_cfg($buildconfigfile, $gs_mode);
}
717
# Prints statistics for the pass the buildproc has just done (index
# creation or text compression) to the output handle.
#
# When fewer than 50 bytes were processed and text was expected (indexing
# text, or no_text is off):
#   - with keepold, a note is printed only if nothing at all was processed;
#   - without keepold, a "very little or no text" warning is printed.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text       = $buildproc->get_indexing_text();
    my $index               = $buildproc->get_index();
    my $num_bytes           = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print {$outhandle} $heading;
    print {$outhandle} "Total bytes in collection: $num_bytes\n";
    print {$outhandle} "Total bytes in $index: $num_processed_bytes\n";

    my $text_expected = ($indexing_text || !$self->{'no_text'});

    if ($num_processed_bytes < 50 && $text_expected) {

        if (!$self->{'keepold'}) {
            print {$outhandle} "***************\n";
            if ($indexing_text) {
                print {$outhandle} "WARNING: There is very little or no text to process for $index\n";
            }
            elsif (!$self->{'no_text'}) {
                print {$outhandle} "WARNING: There is very little or no text to compress\n";
            }
            print {$outhandle} " Was this your intention?\n";
            print {$outhandle} "***************\n";
        }
        elsif ($num_processed_bytes == 0) {
            if ($indexing_text) {
                print {$outhandle} "No additional text was added to $index\n";
            }
            elsif (!$self->{'no_text'}) {
                print {$outhandle} "No additional text was compressed\n";
            }
        }

    }

}
760
761
7621;
763
Note: See TracBrowser for help on using the repository browser.