source: main/trunk/greenstone2/perllib/basebuilder.pm@ 31957

Last change on this file since 31957 was 31409, checked in by ak19, 7 years ago

Adding verbosity setting to oaiinfo.pm to reduce debug output on regular runs of the building process. The perl files that use oaiinfo now pass in the verbosity setting to the oaiinfo constructor.

  • Property svn:keywords set to Author Date Id Revision
File size: 27.1 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use arcinfo;
32use classify;
33use cfgread;
34use colcfg;
35use dbutil;
36use oaiinfo;
37use plugin;
38use util;
39use FileUtils;
40
41
BEGIN {
    # Turn autoflush on for both standard streams so that output from
    # mgpp doesn't get out of sync with output from the plugins.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

END {
    # Switch autoflush back off for both standard streams on exit.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

our $maxdocsize = 12000;

# Signifies which Greenstone flavour is in use: "gs2" (the default) or "gs3".
# new() switches this to "gs3" when it is given a non-empty site name.
our $gs_mode = "gs2";
58
# Constructor for a collection builder.
#
# Positional arguments (after the class name): $site, $collection,
# $source_dir, $build_dir, $verbosity, $maxdocs, $debug, $keepold,
# $incremental, $incremental_mode, $remove_empty_classifications,
# $outhandle, $no_text, $failhandle, $gli.
#
# $outhandle and $failhandle default to STDERR, $no_text and $gli default
# to 0.  A defined, non-empty $site switches the package-wide $gs_mode to
# "gs3".  The collection configuration file is read here; the remaining
# set-up lives in init() so that subclass methods can be used.
#
# Returns the blessed builder object.
sub new {
    my ($class, $site, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # fill in defaults for the optional arguments
    if (!defined $outhandle)  { $outhandle  = *STDERR; }
    if (!defined $no_text)    { $no_text    = 0; }
    if (!defined $failhandle) { $failhandle = *STDERR; }
    if (!defined $gli)        { $gli        = 0; }

    # create a builder object
    my $self = {};
    $self->{'site'}             = $site;    # will be undef for Greenstone 2
    $self->{'collection'}       = $collection;
    $self->{'source_dir'}       = $source_dir;
    $self->{'build_dir'}        = $build_dir;
    $self->{'verbosity'}        = $verbosity;
    $self->{'maxdocs'}          = $maxdocs;
    $self->{'debug'}            = $debug;
    $self->{'keepold'}          = $keepold;
    $self->{'incremental'}      = $incremental;
    $self->{'incremental_mode'} = $incremental_mode;
    $self->{'remove_empty_classifications'} = $remove_empty_classifications;
    $self->{'outhandle'}        = $outhandle;
    $self->{'no_text'}          = $no_text;
    $self->{'failhandle'}       = $failhandle;
    $self->{'notbuilt'}         = {};       # indexes not built
    $self->{'gli'}              = $gli;
    bless $self, $class;

    # A site name means we are running under Greenstone 3.
    if (defined($site) && $site ne "") {
        $gs_mode = "gs3";
    }

    # Read in the collection configuration file.
    my $colcfgname = &colcfg::get_collect_cfg_name($outhandle, $gs_mode);
    $self->{'colcfgname'}  = $colcfgname;
    $self->{'collect_cfg'} = &colcfg::read_collection_cfg ($colcfgname, $gs_mode);

    if ($gs_mode eq "gs3") {
        # Read it in a second time to keep the original form for later
        # writing out of buildConfig.xml.  The preserved copy is needed
        # because $self->{'collect_cfg'}->{'classify'} gets modified during
        # the call to &classify::load_classifiers.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg ($colcfgname, $gs_mode);
    }

    # get the database type for this collection from the collect.cfg file
    # (may be undefined, in which case the default type is used)
    $self->{'infodbtype'} = $self->{'collect_cfg'}->{'infodbtype'} || &dbutil::get_default_infodb_type();

    # load up any dontdb fields
    my $dontgdbm = $self->{'collect_cfg'}->{'dontgdbm'};
    if (defined $dontgdbm) {
        $self->{'dontdb'} = { map { $_ => 1 } @$dontgdbm };
    }
    else {
        $self->{'dontdb'} = {};
    }

    $self->{'maxnumeric'} = 4;
    return $self;
}
121
122# stuff has been moved here from new, so we can use subclass methods
# Second-stage initialisation, kept out of new() so that subclass methods
# (generate_index_list, is_incremental_capable, default_buildproc,
# init_for_incremental_build) can be used.
#
# Works out the final list of indexes (expanded by subcollection and
# language, with duplicates removed), prepares the OAI info database,
# loads plugins and classifiers, creates the buildproc object and, unless
# debugging or keeping the old build, clears out the build directory.
#
# Dies if no plugins could be loaded or if the buildproc cannot be created.
sub init {
    my $self = shift(@_);

    my $outhandle  = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    $self->generate_index_list();
    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    if (defined $indexes) {
        # sort out subcollection indexes
        if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
                foreach my $index (@$indexes) {
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
                }
            }
        }

        # sort out language subindexes
        if (defined $self->{'collect_cfg'}->{'languages'}) {
            $indexes = $self->{'collect_cfg'}->{'indexes'};
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
                foreach my $index (@$indexes) {
                    if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
                    }
                    else { # add in an empty subcollection field
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
                    }
                }
            }
        }
    }

    if (defined($self->{'collect_cfg'}->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        # (first occurrence wins, original order is kept)
        my %tmphash = ();
        my @tmparray = @{$self->{'collect_cfg'}->{'indexes'}};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $i (@tmparray) {
            if (!defined ($tmphash{$i})) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, $i);
                $tmphash{$i} = 1;
            }
        }
    } else {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }


    # Prepare to work with the <collection>/etc/oai-inf.<db> that keeps track
    # of the OAI identifiers with their time stamps and deleted status.
    #
    # At this stage of working with the oai info db, we don't care whether we have a
    # manifest or are otherwise incremental, or whether we're doing removeold (full rebuild).
    # Because we've already dealt with that during the import stage. From here on, we pretend
    # we're incremental, since the oai info db should just do what archiveinfo contains.
    # This is because "building is always incremental" where oai info db is concerned.

    my $archivedir = $self->{'source_dir'};
    # Explicit Class->method call: this package defines its own new(), so
    # the indirect-object form ("new oaiinfo(...)") depends on parser
    # heuristics to be read as a method call.
    my $oai_info = oaiinfo->new($self->{'colcfgname'}, $self->{'collect_cfg'}->{'infodbtype'}, $self->{'verbosity'});
    $oai_info->building_stage_before_indexing($archivedir);


    # check incremental against whether builder can cope or not.
    if ($self->{'incremental'} && !$self->is_incremental_capable()) {
        print $outhandle "WARNING: The indexer used is not capable of incremental building. Reverting to -removeold\n";
        $self->{'keepold'} = 0;
        $self->{'incremental'} = 0;
        $self->{'incremental_mode'} = "none";
    }

    # gs_version for plugins
    my $gs_version = "2";
    if ($gs_mode eq "gs3") {
        $gs_version = "3";
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $self->{'verbosity'}, $outhandle, $failhandle, \@global_opts, $self->{'incremental_mode'}, $gs_version);

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $self->{'build_dir'}, $outhandle);

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc for the builder we are initialising
    my $buildprocdir = undef;
    my $buildproctype;

    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildproctype = $self->default_buildproc();
    }
    if (defined $buildprocdir) {
        require "$buildprocdir/$buildproctype.pm";
    }
    else {
        require "$buildproctype.pm";
    }

    # Instantiate the buildproc class chosen above.  A method call on the
    # class-name string does the same job as building a string for eval,
    # without the runtime compilation; any error thrown by the constructor
    # propagates to our caller as before.
    $self->{'buildproc'} = $buildproctype->new($self->{'collection'},
                                               $self->{'source_dir'},
                                               $self->{'build_dir'},
                                               $self->{'keepold'},
                                               $self->{'verbosity'},
                                               $self->{'outhandle'});

    # We call set_infodbtype() now so the buildproc knows the infodbtype for all phases of the build
    $self->{'buildproc'}->set_infodbtype($self->{'infodbtype'});

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &FileUtils::removeFilesRecursive($self->{'build_dir'});
        &FileUtils::makeAllDirectories($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &FileUtils::makeAllDirectories($textdir);
    }

    if ($self->{'incremental'}) {
        # some classes may need to do some additional initialisation
        $self->init_for_incremental_build();
    }

}
283
# Reports whether this builder can do incremental building.
# The base class answers 'no' (0): it is safer to assume non-incremental
# and let the inherited classes that are capable override this.
sub is_incremental_capable
{
    my $capable = 0;
    return $capable;
}
292
293# implement this in subclass if want to do additional initialisation for an
294# incremental build
# Hook called by init() when building incrementally.  Does nothing here;
# a subclass overrides it to do its own additional initialisation.
sub init_for_incremental_build {
    my ($self) = @_;
    # no extra work in the base class; the invocant is handed back, which
    # is what the one-statement body evaluated to before
    return $self;
}
298
# Shuts the builder down by passing the loaded plugins and the buildproc
# object to plugin::deinit.
sub deinit {
    my ($self) = @_;

    my $pluginfo  = $self->{'pluginfo'};
    my $buildproc = $self->{'buildproc'};
    return &plugin::deinit($pluginfo, $buildproc);
}
304
# Looks through the collection's 'indexoptions' configuration for any
# entry containing "separate_cjk".  The resulting 0/1 flag is passed to
# the buildproc and also stored on the builder for build.cfg.
sub generate_index_options {
    my ($self) = @_;

    my $indexoptions = $self->{'collect_cfg'}->{'indexoptions'};

    my $separate_cjk = 0;
    if (defined $indexoptions) {
        $separate_cjk = 1 if grep { $_ =~ /separate_cjk/ } @$indexoptions;
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
    # record it for build.cfg
    $self->{'separate_cjk'} = $separate_cjk;
}
324
# Forwards the given setting straight to the buildproc's method of the
# same name.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    my $buildproc = $self->{'buildproc'};
    return $buildproc->set_sections_index_document_metadata($index);
}
331
# Stores the given value as the builder's 'maxnumeric' setting
# (new() initialises it to 4; it is later written into the build config).
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and passes the same
# value on to the buildproc.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;

    my $buildproc = $self->{'buildproc'};
    return $buildproc->set_strip_html($strip);
}
345
# Forwards the store_metadata_coverage setting to the buildproc.
sub set_store_metadata_coverage {
    my ($self, $store_metadata_coverage) = @_;

    my $buildproc = $self->{'buildproc'};
    return $buildproc->set_store_metadata_coverage($store_metadata_coverage);
}
352
# Placeholder: text compression is the subclass's job.  The base class
# only complains on STDERR and returns nothing.
sub compress_text {
    my ($self, $textindex) = @_;

    my $complaint = "compress_text() should be implemented in subclass!!";
    print STDERR $complaint;
    return;
}
360
361
# Builds either the single named index or, when no usable name is given,
# every index listed in the collection configuration.
#
# pre_build_indexes() and post_build_indexes() are called around the work,
# the index-to-directory mapping is created first, and any index that
# want_built() rejects is skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes();

    # a name containing at least one word character selects just that index
    my $indexes;
    if (defined $indexname && $indexname =~ /\w/) {
        $indexes = [ $indexname ];
    }
    else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            if ($self->{'verbosity'} >= 1) {
                print $outhandle "\n*** ignoring index $index\n";
            }
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory $self->{'index_mapping'}->{$index}\n";
        }
        if ($self->{'gli'}) {
            print STDERR "<Stage name='Index' source='$index'>\n";
        }
        $self->build_index($index);
    }

    $self->post_build_indexes();
}
395
396# implement this in subclass if want to do extra stuff at before building
397# all the indexes
# Hook run by build_indexes() before any index is built.  The base class
# does nothing; a subclass overrides it to do its own preparation.
sub pre_build_indexes {
    my $self = shift @_;
    # optional parameter, unused in the base class
    my ($indexname) = @_;
}
402
403# implement this in subclass if want to do extra stuff at the end of building
404# all the indexes
# Hook run by build_indexes() once all the indexes have been built.  The
# base class does nothing; a subclass overrides it to do any finishing up.
sub post_build_indexes {
    my ($self) = @_;
    # nothing to do here; the invocant is handed back, which is what the
    # one-statement body evaluated to before
    return $self;
}
408
# Placeholder: building one index is the subclass's job.  The base class
# only complains on STDERR and returns nothing.
sub build_index {
    my ($self, $index) = @_;

    my $complaint = "build_index should be implemented in subclass\n";
    print STDERR $complaint;
    return;
}
416
417# By default, builders do support make_infodatabase()
# Says whether this builder supports make_infodatabase().
# The base class answers yes (1).
sub supports_make_infodatabase {
    my $supported = 1;
    return $supported;
}
421
422
# Creates the collection's info database and processes associated files.
#
# Makes sure the build directory has "text" and "assoc" subdirectories,
# runs every document through the buildproc in 'infodb' mode, then writes
# the collection metadata, the classification information and the
# "browselist" entry.  In debug mode the output goes to STDOUT rather than
# to a database write handle.  For an incremental build the existing
# database is read first and the documents reconstructed from it are fed
# back in after the newly read ones.
#
# Dies if the database write handle cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    # make sure both output directories are there
    my $textdir  = &FileUtils::filenameConcatenate($self->{'build_dir'}, "text");
    my $assocdir = &FileUtils::filenameConcatenate($self->{'build_dir'}, "assoc");
    foreach my $dir ($textdir, $assocdir) {
        &FileUtils::makeAllDirectories ($dir);
    }

    # Get info database file path
    my $infodb_type = $self->{'infodbtype'};
    my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $textdir);

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    my $reconstructed_docs = undef;
    my $database_recs = undef;

    if ($self->{'incremental'}) {
        # pull in what the existing database holds
        $database_recs = {};
        &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $database_recs);
    }

    # Important (for memory usage reasons) that we obtain the filehandle
    # here for writing out to the database, rather than after
    # $reconstructed_docs has been set up (assuming -incremental is on).
    #
    # Opening a pipe to txt2db [using open()] triggers a fork() followed
    # by exec().  $reconstructed_docs can get very large, so doing the
    # open() after it has been built would make the fork clone the *large*
    # process image, which (admittedly) is then quickly replaced in the
    # execve() by the much smaller image for 'txt2db'.  The trouble is
    # that, for the moment of the fork(), the system really does need to
    # have all that memory available even though it isn't ultimately
    # used.  The result is an out of memory error.

    my $infodb_handle;
    if ($self->{'debug'}) {
        $infodb_handle = *STDOUT;
    }
    else {
        $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
        if (!defined($infodb_handle)) {
            if ($self->{'gli'}) {
                print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n";
            }
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }

    if ($self->{'incremental'}) {
        # reconstruct doc_obj metadata from database for all docs
        $reconstructed_docs = &classify::reconstruct_doc_objs_metadata($infodb_type,
                                                                       $infodb_file_path,
                                                                       $database_recs);
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($infodb_handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontdb ($self->{'dontdb'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from the database and
    # then adding in the new ones
    $buildproc->zero_reset();

    $buildproc->{'mdprefix_fields'} = {};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    if ($self->{'incremental'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj ( @$reconstructed_docs ) {
            # skip any document the buildproc has marked as not to be reprocessed
            next if defined $self->{'buildproc'}->{'dont_process_reconstructed'}->{$doc_obj->get_OID()};

            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $self->{'buildproc'}->process($doc_obj,undef);
        }
    }

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $infodb_type, $infodb_handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    ####&classify::print_reverse_lookup($infodb_handle);

    # output doclist
    my @doc_list = $self->{'buildproc'}->get_doc_list();
    my $browselist_infodb = {};
    $browselist_infodb->{'hastxt'}      = [ "0" ];
    $browselist_infodb->{'childtype'}   = [ "VList" ];
    $browselist_infodb->{'numleafdocs'} = [ scalar(@doc_list) ];
    $browselist_infodb->{'thistype'}    = [ "Invisible" ];
    $browselist_infodb->{'contains'}    = [ join(";", @doc_list) ];
    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", $browselist_infodb);

    # STDOUT (debug mode) is left open
    if (!$self->{'debug'}) {
        &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle);
    }

    if ($infodb_type eq "gdbm-txtgz") {
        # remove any plain gdbm database file found in the text directory
        my $gdb_infodb_file_path = &dbutil::get_infodb_file_path("gdbm", $self->{'collection'}, $textdir);
        if (-e $gdb_infodb_file_path) {
            &FileUtils::removeFiles($gdb_infodb_file_path);
        }
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
550
# Collects the build information (build date and type, document counts,
# index / subcollection / language maps, indexes not built, maxnumeric,
# infodbtype, earliest OAI datestamp) and writes it to the build
# directory: build.cfg in gs2 mode, buildConfig.xml in gs3 mode.
#
# Anything a subclass has already put in $self->{'build_cfg'} is kept and
# added to, and build_cfg_extra() is called just before writing so that
# subclasses can add more.
sub make_auxiliary_files {
    my $self = shift (@_);

    # subclasses may have already defined stuff in here
    my $build_cfg = {};
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &FileUtils::makeAllDirectories ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    if ($self->{'separate_cjk'}) {
        $build_cfg->{'separate_cjk'} = "true";
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any
    # that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, $index . "->" . $self->{'index_mapping'}->{'indexmap'}->{$index});
        }
    }

    # store the number of indexes built to later determine whether search
    # serviceracks get written out to buildConfig.xml
    $build_cfg->{'num_indexes'} = scalar (@indexmap);

    $build_cfg->{'indexmap'} = \@indexmap if scalar (@indexmap);

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    # write out the earliestDatestamp information needed for OAI
    # (looked for in the collection's archives directory, or in its
    # export directory when there is no archives directory)
    my $archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "archives");
    if(!-d $archivedir) {
        $archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "export");
    }
    my $earliestDatestampFile = &FileUtils::filenameConcatenate($archivedir, "earliestDatestamp");
    my $earliestDatestamp = 0;
    # Three-argument open with a lexical handle: the path is never parsed
    # for mode characters, and no package-global FIN handle is left behind.
    if (open(my $fin, '<', $earliestDatestampFile)) {
        {
            # slurp in file as a single line
            local $/ = undef;
            $earliestDatestamp = <$fin>;
            #&unicode::ensure_utf8(\$earliestDatestamp); # turn any high bytes that aren't valid utf-8 into utf-8.
        }
        close($fin);
    }
    else {
        print $outhandle "Warning: unable to read collection's earliestDatestamp from $earliestDatestampFile.\n";
        print $outhandle "Setting value to 0.\n";
    }
    $build_cfg->{'earliestdatestamp'} = $earliestDatestamp;

    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg(&FileUtils::filenameConcatenate($self->{'build_dir'},"build.cfg"), $build_cfg);
    }
    if ($gs_mode eq "gs3") {
        &colcfg::write_build_cfg_xml(&FileUtils::filenameConcatenate($self->{'build_dir'}, "buildConfig.xml"), $build_cfg, $self->{'collect_cfg_preserve'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
654
655# implement this in subclass if want to add extra stuff to build.cfg
# Hook called by make_auxiliary_files() just before the build config is
# written.  The base class adds nothing; a subclass overrides it to put
# extra entries into the $build_cfg hash it is given.
sub build_cfg_extra {
    my $self = shift @_;
    # the build configuration hash reference, untouched in the base class
    my ($build_cfg) = @_;
}
661
662
# Empty in the base class: takes the invocant and does nothing else.
sub collect_specific {
    my ($self) = @_;
    # the invocant is handed back, which is what the one-statement body
    # evaluated to before
    return $self;
}
666
# Decides whether the given index should be built.
#
# Each entry of the collection's 'dontbuild' configuration is used as a
# pattern that must match the whole index name.  On the first match the
# index is recorded in $self->{'notbuilt'} and 0 is returned; with no
# match (or no 'dontbuild' list at all) the answer is 1.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined $dontbuild;

    foreach my $checkstr (@$dontbuild) {
        next unless $index =~ /^$checkstr$/;

        # remember that this index was deliberately left out
        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
682
# Placeholder: creating the mapping from index descriptions to directory
# names is the subclass's job.  The base class complains on STDERR and
# returns a reference to an empty hash.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";

    my $empty_mapping = {};
    return $empty_mapping;
}
691
692# returns a processed version of a field.
693# if the field has only one component the processed
694# version will contain the first character and next consonant
695# of that componant - otherwise it will contain the first
696# character of the first two components
697# only uses letdig (\w) characters now
# Reduces a field specification to a two-character name.
#
# An undefined or all-whitespace field gives "".  When the field has two
# or more comma-separated components, the name is the first word (\w)
# character of each of the first two components.  Otherwise it is the
# first two word characters of the field, and if the field does not
# contain two of them neither character is taken.  A character that
# could not be found is replaced by 'a' (first) or '0' (second).  A
# result made only of digits becomes "a" followed by the first
# character, since the Greenstone runtime doesn't like digits-only names.
sub process_field {
    my ($self, $field) = @_;

    return "" if !defined($field) || $field !~ /\S/;

    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar(@components) < 2) {
        # single component: pick the first two letdig chars
        ($first, $second) = $field =~ /^[^\w]*(\w)[^\w]*?(\w)/i;
    }
    else {
        # pick the first letdig from the first two field names
        ($first)  = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    }

    # there may not have been any letdigs...
    $first  = 'a' if !defined $first;
    $second = '0' if !defined $second;

    my $newfield = $first . $second;
    # digits only - Greenstone runtime doesn't like this.
    return ($newfield =~ /^\d\d$/) ? "a$first" : $newfield;
}
726
# Moves the name behind $nameref on to its next version, in place.
#
# A name ending in two digits has that pair incremented, a name ending in
# a single digit has the digit incremented (9 becomes 10), and any other
# name has its last character replaced by 0.
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        # ++ on the captured string keeps a leading zero ("05" -> "06")
        my $version = $1;
        $version++;
        $$nameref =~ s/\d\d$/$version/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $digit = $1;
        if ($digit == 9) {
            $$nameref =~ s/\d$/10/;
        }
        else {
            $digit++;
            $$nameref =~ s/\d$/$digit/;
        }
    }
    else {
        # no version digit yet: the last character makes way for a 0
        $$nameref =~ s/.$/0/;
    }
}
742
743
744
# Adds the metadata sets recorded by the buildproc (its 'mdprefix_fields'
# hash) to the given collection infodb hash:
#   "metadataset"                    gets each prefix,
#   "metadatalist-<prefix>"          gets each field stored under that prefix,
#   "metadatafreq-<prefix>-<field>"  gets the value stored for that field.
sub get_collection_meta_sets
{
    my ($self, $collection_infodb) = @_;

    my $mdprefix_fields = $self->{'buildproc'}->{'mdprefix_fields'};
    foreach my $prefix (keys %$mdprefix_fields)
    {
        push(@{$collection_infodb->{"metadataset"}}, $prefix);

        my $list_key = "metadatalist-$prefix";
        foreach my $field (keys %{$mdprefix_fields->{$prefix}})
        {
            push(@{$collection_infodb->{$list_key}}, $field);

            my $freq_key = "metadatafreq-$prefix-$field";
            push(@{$collection_infodb->{$freq_key}}, $mdprefix_fields->{$prefix}->{$field});
        }
    }
}
764
765
766# default is to output the metadata sets (prefixes) used in collection
# Writes the "collection" entry to the info database.  By default the
# entry holds the metadata sets (prefixes) used in the collection, as
# gathered by get_collection_meta_sets().
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);

    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
776
777# sometimes we need to read in an existing build.cfg - for example,
778# if doing each stage of building separately, or when doing incremental
779# building
# Reads in an existing build configuration file: build.cfg in gs2 mode,
# buildConfig.xml otherwise.  The build directory is tried first, then
# the "index" directory under GSDLCOLLECTDIR.
#
# Returns whatever colcfg::read_building_cfg returns for the file found,
# or undef when neither location has the file.
sub read_build_cfg {
    my ($self) = @_;

    my $buildconfigfilename = ($gs_mode eq "gs2") ? "build.cfg" : "buildConfig.xml";

    my $buildconfigfile = &FileUtils::filenameConcatenate($self->{'build_dir'}, $buildconfigfilename);

    unless (-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "index", $buildconfigfilename);

        # we can't find a config file - just ignore the field list
        return undef unless -e $buildconfigfile;
    }

    return &colcfg::read_building_cfg( $buildconfigfile, $gs_mode);
}
804
# Prints the byte counts for the pass just completed to the output
# handle, taking the figures from the buildproc.
#
# When fewer than 50 bytes were processed (and the pass was either
# indexing text or text is being stored), an extra note is added: for an
# incremental build only when nothing at all was processed, otherwise a
# boxed warning asking whether this was intended.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text       = $buildproc->get_indexing_text();
    my $index               = $buildproc->get_index();
    my $num_bytes           = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # nothing more to say unless suspiciously little text was handled
    return unless $num_processed_bytes < 50;
    return unless $indexing_text || !$self->{'no_text'};

    if ($self->{'incremental'}) {
        # an incremental build only remarks on a completely empty pass
        return unless $num_processed_bytes == 0;

        if ($indexing_text) {
            print $outhandle "No additional text was added to $index\n";
        }
        elsif (!$self->{'no_text'}) {
            print $outhandle "No additional text was compressed\n";
        }
        return;
    }

    print $outhandle "***************\n";
    if ($indexing_text) {
        print $outhandle "WARNING: There is very little or no text to process for $index\n";
    }
    elsif (!$self->{'no_text'}) {
        print $outhandle "WARNING: There is very little or no text to compress\n";
    }
    print $outhandle " Was this your intention?\n";
    print $outhandle "***************\n";
}
847
848
8491;
850
Note: See TracBrowser for help on using the repository browser.