source: main/trunk/greenstone2/perllib/basebuilder.pm@ 23939

Last change on this file since 23939 was 23939, checked in by ak19, 13 years ago

GS3's OAIserver passes final official oaiserver validation tests: to do with earliestDatestamp. 1. Perl code (inexport, basebuilder, colcfg, buildconfigxml.pm perl files) write out the earliestDatestamp into GS3's buildconfig.xml. Whenever a full-build is performed, the archives directory is recreated. At this stage, inexport creates a new file in archives called earliestDatestamp containing the current time. Whenever an incremental build is performed, this file already exists in archive, so it is left untouched, preserving the time of the full-build (which is the earliestDatestamp). The other perl files are concerned with obtaining this value from the archives directory and writing it out to the build config file. 2. doc.pm and BasePlugout.pm write out the current date and time for each document processed under the new fields oailastmodified and oailastmodifieddate. Changes made in this commit are related to GS3 java src code changes that work in tandem.

  • Property svn:keywords set to Author Date Id Revision
File size: 25.0 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
34use dbutil;
35use plugin;
36use util;
37
38
# Switch autoflush on for STDOUT and STDERR as soon as the module is compiled,
# so that output from mgpp doesn't get out of sync with output from plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Switch autoflush off again on both streams when the interpreter exits.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
50
# Package-wide settings:
#   $maxdocsize - numeric limit, initialised to 12000; it is not referenced
#                 elsewhere in this file (presumably read by subclasses or
#                 callers -- TODO confirm)
#   $gs_mode    - signifies "gs2" (the default) or "gs3"; overwritten in new()
#                 with the mode reported by colcfg::get_collect_cfg_name
our ($maxdocsize, $gs_mode) = (12000, "gs2");
55
# Constructor: creates a builder object and reads the collection's
# configuration file.
#
# Positional arguments (after the class name):
#   $collection, $source_dir, $build_dir, $verbosity, $maxdocs, $debug,
#   $keepold, $incremental, $incremental_mode,
#   $remove_empty_classifications, $outhandle, $no_text, $failhandle, $gli
#
# $outhandle and $failhandle default to STDERR, $no_text and $gli default
# to 0.  As a side effect the package variable $gs_mode is set to the mode
# returned by colcfg::get_collect_cfg_name.
#
# Returns the new blessed object.  Index, plugin and classifier setup is
# left to init().
sub new {
    my $class = shift(@_);
    my ($collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # fill in defaults for the optional arguments
    if (!defined $outhandle)  { $outhandle = *STDERR; }
    if (!defined $no_text)    { $no_text = 0; }
    if (!defined $failhandle) { $failhandle = *STDERR; }
    if (!defined $gli)        { $gli = 0; }

    # create a builder object
    my %state = (
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_mode'             => $incremental_mode,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},    # indexes not built
        'gli'                          => $gli,
    );
    my $self = bless \%state, $class;

    # Read in the collection configuration file.
    my $colcfgname;
    ($colcfgname, $gs_mode) = &colcfg::get_collect_cfg_name($outhandle);
    $self->{'collect_cfg'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);

    if ($gs_mode eq "gs3") {
        # Read it in a second time to keep the original form for later
        # writing out of buildConfig.xml: $self->{'collect_cfg'}->{'classify'}
        # gets modified during the call to &classify::load_classifiers.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);
    }

    # database type for this collection comes from the config file when it
    # is set there, otherwise the dbutil default is used
    my $configured_infodbtype = $self->{'collect_cfg'}->{'infodbtype'};
    $self->{'infodbtype'} = $configured_infodbtype || &dbutil::get_default_infodb_type();

    # load up any dontdb fields (listed under 'dontgdbm' in the config)
    my %dontdb = ();
    my $dontgdbm = $self->{'collect_cfg'}->{'dontgdbm'};
    if (defined $dontgdbm) {
        %dontdb = map { ($_ => 1) } @$dontgdbm;
    }
    $self->{'dontdb'} = \%dontdb;

    $self->{'maxnumeric'} = 4;
    return $self;
}
113
# Second-stage initialisation.  This work has been moved here from new() so
# that subclass methods (generate_index_list, is_incremental_capable,
# default_buildproc, generate_index_options, init_for_incremental_build)
# can be used.
#
# Steps, in order:
#   - expand the configured indexes by subcollection and by language and
#     remove duplicate entries
#   - fall back to a non-incremental build if the indexer cannot cope
#   - load the plugins (dies if none could be loaded) and the classifiers
#   - load and instantiate the document processor (buildproc)
#   - unless in debug or keepold mode, wipe and recreate the build directory
#   - run the incremental-build hook if building incrementally
sub init {
    my $self = shift(@_);

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    $self->generate_index_list();
    my $collect_cfg = $self->{'collect_cfg'};
    my $indexes = $collect_cfg->{'indexes'};
    if (defined $indexes) {
        # sort out subcollection indexes
        if (defined $collect_cfg->{'indexsubcollections'}) {
            $collect_cfg->{'indexes'} = [];
            foreach my $subcollection (@{$collect_cfg->{'indexsubcollections'}}) {
                foreach my $index (@$indexes) {
                    push(@{$collect_cfg->{'indexes'}}, "$index:$subcollection");
                }
            }
        }

        # sort out language subindexes
        if (defined $collect_cfg->{'languages'}) {
            $indexes = $collect_cfg->{'indexes'};
            $collect_cfg->{'indexes'} = [];
            # when there are no subcollections an empty subcollection field
            # is added, hence the doubled separator
            my $separator = defined($collect_cfg->{'indexsubcollections'}) ? ":" : "::";
            foreach my $language (@{$collect_cfg->{'languages'}}) {
                foreach my $index (@$indexes) {
                    push(@{$collect_cfg->{'indexes'}}, $index . $separator . $language);
                }
            }
        }
    }

    if (defined($collect_cfg->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        # (the first occurrence is kept, order is preserved)
        my %seen = ();
        my @unique = grep { !$seen{$_}++ } @{$collect_cfg->{'indexes'}};
        $collect_cfg->{'indexes'} = \@unique;
    } else {
        $collect_cfg->{'indexes'} = [];
    }

    # check incremental against whether builder can cope or not.
    if ($self->{'incremental'} && !$self->is_incremental_capable()) {
        print $outhandle "WARNING: The indexer used is not capable of incremental building. Reverting to -removeold\n";
        $self->{'keepold'} = 0;
        $self->{'incremental'} = 0;
        $self->{'incremental_mode'} = "none";
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $self->{'verbosity'}, $outhandle, $failhandle, \@global_opts, $self->{'incremental_mode'});

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $self->{'build_dir'}, $outhandle);

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc for the builder we are initialising
    my ($buildprocdir, $buildproctype);
    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = $self->default_buildproc();
    }
    require "$buildprocdir/$buildproctype.pm";

    # Instantiate the buildproc with an ordinary class-method call on the
    # package name, rather than compiling a string with eval.  Any exception
    # raised by the constructor propagates to our caller.
    $self->{'buildproc'} = $buildproctype->new($self->{'collection'},
                                               $self->{'source_dir'},
                                               $self->{'build_dir'},
                                               $self->{'keepold'},
                                               $self->{'verbosity'},
                                               $self->{'outhandle'});

    # We call set_infodbtype() now so the buildproc knows the infodbtype for all phases of the build
    $self->{'buildproc'}->set_infodbtype($self->{'infodbtype'});

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

    if ($self->{'incremental'}) {
        # some classes may need to do some additional initialisation
        $self->init_for_incremental_build();
    }
}
250
# Says whether this builder can build incrementally.  The base class always
# answers 'no' (0): it is safer to assume non-incremental to start with and
# have the inherited classes that are capable override this.
sub is_incremental_capable
{
    my $capable = 0;
    return $capable;
}
259
# Hook called from init() when an incremental build is requested.  Does
# nothing in the base class; override in a subclass that needs additional
# initialisation for an incremental build.
sub init_for_incremental_build {
    my $self = $_[0];
}
265
# Counterpart to init(): hands the loaded plugins and the buildproc to
# plugin::deinit.
sub deinit {
    my ($self) = @_;

    my $pluginfo = $self->{'pluginfo'};
    my $buildproc = $self->{'buildproc'};
    &plugin::deinit($pluginfo, $buildproc);
}
271
# Works out the index options for this build.  The only option looked at
# here is separate_cjk: it is switched on when any entry of the collection
# config's 'indexoptions' list contains the text "separate_cjk".  The flag
# is passed to the buildproc and remembered on $self for build.cfg.
sub generate_index_options {
    my ($self) = @_;

    my $separate_cjk = 0;

    my $indexoptions = $self->{'collect_cfg'}->{'indexoptions'};
    if (defined($indexoptions)) {
        my @cjk_options = grep { $_ =~ /separate_cjk/ } @$indexoptions;
        $separate_cjk = 1 if scalar(@cjk_options);
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
    # record it for build.cfg
    $self->{'separate_cjk'} = $separate_cjk;
}
289
# Forwards the sections_index_document_metadata setting to the buildproc.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_sections_index_document_metadata($index);
}
296
# Stores the maxnumeric setting on the builder (new() initialises it to 4).
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and passes it on to the
# buildproc.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_strip_html($strip);
}
310
# Placeholder: compressing the text is the job of the subclass.  The base
# class only reports on STDERR that the method has not been overridden and
# returns nothing.
#
#   $textindex - the index to compress the text from (unused here)
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    # newline-terminated, like the messages of the other placeholder
    # methods, so that the next thing written to STDERR starts on a fresh line
    print STDERR "compress_text() should be implemented in subclass!!\n";
    return;
}
318
319
# Builds the indexes of the collection.
#
#   $indexname - optional; when it contains a word character only that
#                index is considered, otherwise all indexes listed in the
#                collection configuration are.
#
# Stores the index mapping returned by create_index_mapping() in
# $self->{'index_mapping'}, calls build_index() for every index that
# want_built() accepts, and finishes with the build_indexes_extra() hook.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes;
    if (defined $indexname && $indexname =~ /\w/) {
        $indexes = [ $indexname ];
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            if ($self->{'verbosity'} >= 1) {
                print $outhandle "\n*** ignoring index $index\n";
            }
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n";
        }
        if ($self->{'gli'}) {
            print STDERR "<Stage name='Index' source='$index'>\n";
        }
        $self->build_index($index);
    }

    $self->build_indexes_extra();
}
351
# Hook called by build_indexes() once all the indexes have been built.
# Empty in the base class; implement it in a subclass that wants to do extra
# work at that point.
sub build_indexes_extra {
    my $self = $_[0];
}
358
# Placeholder: building a single index is the job of the subclass.  The base
# class only reports on STDERR that the method has not been overridden and
# returns nothing.
#
#   $index - the index that was asked for (unused here)
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
366
367
368
# Creates the collection's info database and processes associated files.
#
# Makes sure the "text" and "assoc" directories exist under the build
# directory, initialises the classifiers, runs all documents through the
# buildproc in 'infodb' mode, then writes the collection metadata, the
# classification information and the "browselist" entry to the database.
# In debug mode the records go to STDOUT instead of a database write handle.
# For an incremental build the existing database is read first and the
# documents reconstructed from it are pushed back through the buildproc.
#
# Dies if the database write handle cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    foreach my $dir ($textdir, $assocdir) {
        &util::mk_all_dir ($dir);
    }

    # Get info database file path
    my $infodb_type = $self->{'infodbtype'};
    my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $textdir);

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    my $reconstructed_docs = undef;
    my $database_recs = undef;

    if ($self->{'incremental'}) {
        $database_recs = {};
        &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $database_recs);
    }

    # Important (for memory usage reasons): obtain the filehandle for
    # writing to the database here, rather than after $reconstructed_docs
    # has been set up (assuming -incremental is on).
    #
    # Opening a pipe to txt2db [using open()] triggers a fork() followed by
    # exec().  $reconstructed_docs can get very large, so doing the open()
    # afterwards would make the fork clone the *large* process image, which
    # is (admittedly) quickly replaced in the execve() by the much smaller
    # image for 'txt2db'.  The trouble is that for the moment of the fork()
    # the system really does need all that memory to be available even
    # though it isn't ultimately used, and the result is an out of memory
    # error.
    my $infodb_handle;
    if ($self->{'debug'}) {
        $infodb_handle = *STDOUT;
    }
    else {
        $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
        unless (defined($infodb_handle)) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }

    if ($self->{'incremental'}) {
        # reconstruct doc_obj metadata from database for all docs
        $reconstructed_docs = &classify::reconstruct_doc_objs_metadata($infodb_type,
                                                                        $infodb_file_path,
                                                                        $database_recs);
    }

    # set up the document processor
    $buildproc->set_output_handle ($infodb_handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontdb ($self->{'dontdb'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);
    $buildproc->set_store_metadata_coverage ($self->{'collect_cfg'}->{'store_metadata_coverage'});

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from the database and
    # then adding in the new ones
    $buildproc->zero_reset();

    $buildproc->{'mdprefix_fields'} = {};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    if ($self->{'incremental'}) {
        # create flat classify structure, ready for new docs to be added;
        # documents flagged in 'dont_process_reconstructed' are left out
        foreach my $doc_obj (@$reconstructed_docs) {
            next if defined $buildproc->{'dont_process_reconstructed'}->{$doc_obj->get_OID()};

            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $buildproc->process($doc_obj, undef);
        }
    }

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $infodb_type, $infodb_handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    ####&classify::print_reverse_lookup($infodb_handle);

    # output doclist
    my @doc_list = $buildproc->get_doc_list();
    my %browselist_infodb = (
        'hastxt'      => [ "0" ],
        'childtype'   => [ "VList" ],
        'numleafdocs' => [ scalar(@doc_list) ],
        'thistype'    => [ "Invisible" ],
        'contains'    => [ join(";", @doc_list) ],
    );
    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", \%browselist_infodb);

    unless ($self->{'debug'}) {
        &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle);
    }

    if ($infodb_type eq "gdbm-txtgz") {
        # remove a plain gdbm database file if one exists alongside
        my $gdb_infodb_file_path = &dbutil::get_infodb_file_path("gdbm", $self->{'collection'}, $textdir);
        &util::rm($gdb_infodb_file_path) if (-e $gdb_infodb_file_path);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
497
# Writes the build configuration file for the collection: build.cfg in gs2
# mode, buildConfig.xml in gs3 mode.
#
# The configuration starts from $self->{'build_cfg'} if a subclass has
# already put something there, and is filled in with the build date, build
# type, index stem, document/section/byte counts, the index, subcollection
# and language maps, the list of indexes that were not built, maxnumeric,
# the info database type and the earliestDatestamp needed for OAI.
# Subclasses can add more through the build_cfg_extra() hook.
#
# Dies if an archives (or export) directory exists but its
# earliestDatestamp file cannot be opened.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $build_cfg = {};
    # subclasses may have already defined stuff in here
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    if ($self->{'separate_cjk'}) {
        $build_cfg->{'separate_cjk'} = "true";
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, $index . "->" . $self->{'index_mapping'}->{'indexmap'}->{$index});
        }
    }
    $build_cfg->{'indexmap'} = \@indexmap if scalar (@indexmap);

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    # write out the earliestDatestamp information needed for OAI
    # (looked for in the collection's archives directory, or failing that
    # in its export directory)
    my $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
    if (!-d $archivedir) {
        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
    }
    if (-d $archivedir) {
        my $earliestDatestampFile = &util::filename_cat ($archivedir, "earliestDatestamp");
        # three-argument open with a lexical handle: the path is never
        # interpreted as part of the open mode, and no global handle is used
        open(my $fin, "<", $earliestDatestampFile)
            || die "{common.cannot_open} $earliestDatestampFile: $!\n";
        my $earliestDatestamp;
        {
            # slurp in file as a single line
            local $/ = undef;
            $earliestDatestamp = <$fin>;
        }
        close($fin);
        $build_cfg->{'earliestDatestamp'} = $earliestDatestamp;
    }

    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg(&util::filename_cat($self->{'build_dir'},"build.cfg"), $build_cfg);
    }
    if ($gs_mode eq "gs3") {
        &colcfg::write_build_cfg_xml(&util::filename_cat($self->{'build_dir'}, "buildConfig.xml"), $build_cfg, $self->{'collect_cfg_preserve'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
594
# Hook called by make_auxiliary_files() just before the build configuration
# is written out.  Empty in the base class; implement it in a subclass that
# wants to add extra entries to the configuration.
#
#   $build_cfg - hash reference holding the build configuration so far
sub build_cfg_extra {
    my ($self, @args) = @_;
    my ($build_cfg) = @args;
}
601
602
# Empty in the base class.
# NOTE(review): presumably a hook for collection-specific processing in
# subclasses -- nothing in this file calls it; confirm against callers.
sub collect_specific {
    my $self = $_[0];
}
606
# Decides whether an index should be built.
#
# Each entry of the collection config's 'dontbuild' list is used as a
# regular expression that has to match the whole index name.  On the first
# match the index is recorded in $self->{'notbuilt'} and 0 is returned;
# if nothing matches (or there is no 'dontbuild' list) 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $checkstr (@$dontbuild) {
        next unless ($index =~ /^$checkstr$/);

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
622
# Placeholder: creating the mapping between index descriptions and their
# directory names is the job of the subclass.  The base class reports on
# STDERR that the method has not been overridden and returns a reference to
# an empty hash.
#
#   $indexes - reference to the list of indexes (unused here)
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
631
# Returns a two-character processed version of a field.
#
# If the field is a comma-separated list with two or more components, the
# result is the first letdig (\w) character of each of the first two
# components.  Otherwise it is the first two letdig characters of the
# field.  Missing characters default to 'a' (first) and '0' (second); note
# that the single-component pattern needs two letdigs to match at all, so a
# field with only one letdig yields "a0".  A result made of two digits is
# replaced by 'a' followed by the first digit, because the Greenstone
# runtime doesn't like digits-only names.
#
# An undefined or all-whitespace field gives the empty string.
sub process_field {
    my ($self, $field) = @_;

    if (!defined ($field) || $field !~ /\S/) {
        return "";
    }

    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar @components < 2) {
        # pick the first two letdig chars
        ($first, $second) = $field =~ /^[^\w]*(\w)[^\w]*?(\w)/i;
    } else {
        # pick the first letdig from the first two field names
        ($first) = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    }

    # there may not have been any letdigs...
    $first = 'a' unless defined $first;
    $second = '0' unless defined $second;

    my $processed = $first . $second;
    # digits only - Greenstone runtime doesn't like this.
    return ($processed =~ /^\d\d$/) ? "a$first" : $processed;
}
666
# Moves a name on to its next version, modifying it in place.
#
#   $nameref - reference to the name string
#
# A name ending in two digits has them incremented; a name ending in one
# digit has it incremented (9 becomes 10); any other name has its last
# character replaced by 0.
sub get_next_version {
    my ($self, $nameref) = @_;

    if (my ($two_digits) = $$nameref =~ /(\d\d)$/) {
        $two_digits++;
        $$nameref =~ s/\d\d$/$two_digits/;
    } elsif (my ($digit) = $$nameref =~ /(\d)$/) {
        my $next = ($digit == 9) ? 10 : $digit + 1;
        $$nameref =~ s/\d$/$next/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}
682
683
684
# Adds the metadata sets used in the collection to a collection-level
# infodb record.
#
#   $collection_infodb - hash reference that is added to
#
# For every prefix in the buildproc's 'mdprefix_fields' hash the prefix is
# appended to the "metadataset" list; for every field under that prefix the
# field name is appended to "metadatalist-<prefix>" and the value stored for
# it to "metadatafreq-<prefix>-<field>".
sub get_collection_meta_sets
{
    my ($self, $collection_infodb) = @_;

    my $mdprefix_fields = $self->{'buildproc'}->{'mdprefix_fields'};
    foreach my $prefix (keys %$mdprefix_fields)
    {
        push(@{$collection_infodb->{"metadataset"}}, $prefix);

        my $fields_of_prefix = $mdprefix_fields->{$prefix};
        foreach my $field (keys %$fields_of_prefix)
        {
            push(@{$collection_infodb->{"metadatalist-$prefix"}}, $field);
            push(@{$collection_infodb->{"metadatafreq-$prefix-$field"}}, $fields_of_prefix->{$field});
        }
    }
}
704
705
# Writes the "collection" entry to the info database.  The default is to
# output the metadata sets (prefixes) used in the collection, as gathered by
# get_collection_meta_sets().
#
#   $infodb_handle - write handle of the info database
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
716
# Reads in an existing build configuration file.  This is sometimes needed,
# for example when doing each stage of building separately or when doing
# incremental building.
#
# The file is build.cfg in gs2 mode and buildConfig.xml otherwise.  It is
# looked for in the build directory first and then in the collection's
# "index" directory.  Returns what colcfg::read_building_cfg returns for the
# file, or undef if the file is in neither place.
sub read_build_cfg {
    my ($self) = @_;

    my $buildconfigfilename = ($gs_mode eq "gs2") ? "build.cfg" : "buildConfig.xml";

    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, $buildconfigfilename);

    unless (-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", $buildconfigfilename);

        # we cant find a config file - just ignore the field list
        return undef unless (-e $buildconfigfile);
    }

    return &colcfg::read_building_cfg( $buildconfigfile, $gs_mode);
}
744
# Prints statistics for the pass the buildproc has just made (creating an
# index, or compressing text) to the output handle: the total number of
# bytes in the collection and the number of bytes processed.
#
# When fewer than 50 bytes were processed although text was expected, a
# further message is printed: for an incremental build a note that nothing
# was added (only when exactly 0 bytes were processed), otherwise a warning
# asking whether this was intended.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    if ($indexing_text) {
        print $outhandle "Stats (Creating index $index)\n";
    } else {
        print $outhandle "Stats (Compressing text from $index)\n";
    }
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    my $very_little_text = ($num_processed_bytes < 50);
    my $text_expected = ($indexing_text || !$self->{'no_text'});

    if ($very_little_text && $text_expected) {

        if (!$self->{'incremental'}) {
            print $outhandle "***************\n";
            if ($indexing_text) {
                print $outhandle "WARNING: There is very little or no text to process for $index\n";
            } elsif (!$self->{'no_text'}) {
                print $outhandle "WARNING: There is very little or no text to compress\n";
            }
            print $outhandle " Was this your intention?\n";
            print $outhandle "***************\n";
        }
        elsif ($num_processed_bytes == 0) {
            if ($indexing_text) {
                print $outhandle "No additional text was added to $index\n";
            } elsif (!$self->{'no_text'}) {
                print $outhandle "No additional text was compressed\n";
            }
        }

    }
}
787
788
7891;
790
Note: See TracBrowser for help on using the repository browser.