root/gs2-extensions/parallel-building/trunk/src/perllib/basebuilder.pm @ 25115

Revision 25115, 26.2 KB (checked in by jmt12, 8 years ago)

This is sometimes called without the site argument... detect and match up remaining arguments appropriately

Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
34use dbutil;
35use plugin;
36use util;
37
38
# autoflush() is an IO::Handle method.  Load the module explicitly so the
# handle method calls below do not depend on some other module having
# loaded it first (perls older than 5.14 do not load it on demand).
use IO::Handle;

BEGIN {
    # set autoflush on for STDERR and STDOUT so that mgpp
    # doesn't get out of sync with plugins
    STDOUT->autoflush(1);
    STDERR->autoflush(1);
}

END {
    # switch autoflush back off when the interpreter shuts down
    STDOUT->autoflush(0);
    STDERR->autoflush(0);
}

# NOTE(review): not referenced anywhere else in this file; presumably read
# by subclasses or callers -- confirm before changing.
our $maxdocsize = 12000;

# used to signify "gs2"(default) or "gs3"; new() overwrites this with the
# mode reported by colcfg::get_collect_cfg_name()
our $gs_mode = "gs2";
55
# Constructor.
#
# Arguments (after the class name), in order:
#   site (optional, see below), collection, source_dir, build_dir, verbosity,
#   maxdocs, debug, keepold, incremental, incremental_mode,
#   remove_empty_classifications, outhandle, no_text, failhandle, gli
#
# Somehow this is sometimes called without the site argument, so the site is
# only taken from the argument list when exactly 16 arguments (class name
# included) were passed; otherwise it is left undefined.
#
# Reads the collection configuration file and returns the blessed builder.
sub new {
    my @args = @_;
    my $arg_count = scalar(@args);

    my $class = shift(@args);
    my $site = ($arg_count == 16) ? shift(@args) : undef;
    my ($collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @args;

    # defaults for the optional trailing arguments
    $outhandle = *STDERR unless defined $outhandle;
    $no_text = 0 unless defined $no_text;
    $failhandle = *STDERR unless defined $failhandle;
    $gli = 0 unless defined $gli;

    # create a builder object
    my %fields = (
        'site'                         => $site, # will be undef for Greenstone 2
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_mode'             => $incremental_mode,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},    # indexes not built
        'gli'                          => $gli,
    );
    my $self = bless \%fields, $class;

    # Read in the collection configuration file.  Note that this also
    # updates the package-wide $gs_mode.
    my $colcfgname;
    ($colcfgname, $gs_mode) = &colcfg::get_collect_cfg_name($outhandle);
    $self->{'collect_cfg'} = &colcfg::read_collection_cfg ($colcfgname, $gs_mode);

    if ($gs_mode eq "gs3") {
        # read it in again to save the original form for later writing out
        # of buildConfig.xml
        # we use this preserve object because
        # $self->{'collect_cfg'}->{'classify'} somewhat gets modified during
        # the calling of &classify::load_classifiers.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg ($colcfgname, $gs_mode);
    }

    # get the database type for this collection from the collect.cfg file
    # (may be undefined, in which case the default type is used)
    $self->{'infodbtype'} = $self->{'collect_cfg'}->{'infodbtype'} || &dbutil::get_default_infodb_type();

    # load up any dontdb fields (configured under the 'dontgdbm' key)
    $self->{'dontdb'} = {};
    my $dontgdbm = $self->{'collect_cfg'}->{'dontgdbm'};
    if (defined ($dontgdbm)) {
        $self->{'dontdb'}->{$_} = 1 foreach (@{$dontgdbm});
    }

    $self->{'maxnumeric'} = 4;
    return $self;
}
131
# stuff has been moved here from new, so we can use subclass methods
#
# In order, this:
#   - expands the configured index list with subcollection and language
#     suffixes and removes duplicate entries,
#   - falls back to a non-incremental build if the builder cannot cope,
#   - loads the plugins (dies if none could be loaded) and the classifiers,
#   - locates and instantiates the buildproc (document processor),
#   - clears and recreates the build directory unless -debug or -keepold,
#   - gives the subclass a chance to prepare for an incremental build.
sub init {
    my $self = shift(@_);

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    $self->generate_index_list();
    my $collect_cfg = $self->{'collect_cfg'};

    my $indexes = $collect_cfg->{'indexes'};
    if (defined $indexes) {
        # sort out subcollection indexes
        if (defined $collect_cfg->{'indexsubcollections'}) {
            $collect_cfg->{'indexes'} = [];
            foreach my $subcollection (@{$collect_cfg->{'indexsubcollections'}}) {
                foreach my $index (@$indexes) {
                    push (@{$collect_cfg->{'indexes'}}, "$index:$subcollection");
                }
            }
        }

        # sort out language subindexes
        if (defined $collect_cfg->{'languages'}) {
            $indexes = $collect_cfg->{'indexes'};
            $collect_cfg->{'indexes'} = [];
            foreach my $language (@{$collect_cfg->{'languages'}}) {
                foreach my $index (@$indexes) {
                    if (defined ($collect_cfg->{'indexsubcollections'})) {
                        push (@{$collect_cfg->{'indexes'}}, "$index:$language");
                    }
                    else { # add in an empty subcollection field
                        push (@{$collect_cfg->{'indexes'}}, "${index}::$language");
                    }
                }
            }
        }
    }

    if (defined($collect_cfg->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        # (the first occurrence wins, order is preserved)
        my %seen = ();
        my @unique = ();
        foreach my $index (@{$collect_cfg->{'indexes'}}) {
            next if defined ($seen{$index});
            push (@unique, $index);
            $seen{$index} = 1;
        }
        $collect_cfg->{'indexes'} = \@unique;
    } else {
        $collect_cfg->{'indexes'} = [];
    }

    # check incremental against whether builder can cope or not.
    if ($self->{'incremental'} && !$self->is_incremental_capable()) {
        print $outhandle "WARNING: The indexer used is not capable of incremental building. Reverting to -removeold\n";
        $self->{'keepold'} = 0;
        $self->{'incremental'} = 0;
        $self->{'incremental_mode'} = "none";
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $collect_cfg->{'plugin'}) {
        $plugins = $collect_cfg->{'plugin'};
    }

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $collect_cfg->{'separate_cjk'} && $collect_cfg->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $self->{'verbosity'}, $outhandle, $failhandle, \@global_opts, $self->{'incremental_mode'});

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $collect_cfg->{'classify'}) {
        $classifiers = $collect_cfg->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $self->{'build_dir'}, $outhandle);

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc for the builder we are initialising
    my $buildprocdir = undef;
    my $buildproctype;

    my $collection = $self->{'collection'};
    my $collectdir = $ENV{'GSDLCOLLECTDIR'};
    # candidate locations, most specific first: [ directory, class name ]
    my @candidates = (
        [ "$collectdir/custom/${collection}/perllib", "custombuildproc" ],
        [ "$collectdir/perllib",                      "custombuildproc" ],
        [ "$collectdir/perllib",                      "${collection}buildproc" ],
    );
    foreach my $candidate (@candidates) {
        my ($dir, $type) = @$candidate;
        if (-e "$dir/$type.pm") {
            ($buildprocdir, $buildproctype) = ($dir, $type);
            last;
        }
    }

    if (defined $buildprocdir) {
        require "$buildprocdir/$buildproctype.pm";
    }
    else {
        $buildproctype = $self->default_buildproc();
        require "$buildproctype.pm";
    }

    # Instantiate the buildproc with an ordinary class-method call.  This
    # used to be a string eval around the indirect-object form
    # "new CLASS(...)", which depends on parser heuristics (this package
    # defines its own new()); any exception from the constructor still
    # propagates to our caller.
    $self->{'buildproc'} = $buildproctype->new($self->{'collection'},
                                               $self->{'source_dir'},
                                               $self->{'build_dir'},
                                               $self->{'keepold'},
                                               $self->{'verbosity'},
                                               $self->{'outhandle'});

    # We call set_infodbtype() now so the buildproc knows the infodbtype for
    # all phases of the build
    $self->{'buildproc'}->set_infodbtype($self->{'infodbtype'});

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

    if ($self->{'incremental'}) {
        # some classes may need to do some additional initialisation
        $self->init_for_incremental_build();
    }
}
274
# Reports whether this builder can do incremental builds.
# The base class always answers 'no' (0): it is safer to assume
# non-incremental, and let the inherited classes that can cope override this.
sub is_incremental_capable
{
    return 0;
}
283
# Hook called at the end of init() when an incremental build was requested.
# Does nothing here; override in a subclass that needs additional
# initialisation for an incremental build.
sub init_for_incremental_build {
    my $self = shift(@_);
}
289
# Hands the loaded plugins and the buildproc to plugin::deinit().
sub deinit {
    my ($self) = @_;

    my $pluginfo = $self->{'pluginfo'};
    my $buildproc = $self->{'buildproc'};
    &plugin::deinit($pluginfo, $buildproc);
}
295
# Works out whether any configured 'indexoptions' entry mentions
# separate_cjk, passes the resulting flag (1 or 0) to the buildproc and
# records it on the builder.
sub generate_index_options {
    my ($self) = @_;

    my $indexoptions = $self->{'collect_cfg'}->{'indexoptions'};
    my @options = defined($indexoptions) ? @{$indexoptions} : ();

    my $separate_cjk = 0;
    foreach my $option (@options) {
        $separate_cjk = 1 if ($option =~ /separate_cjk/);
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
    # record it for build.cfg
    $self->{'separate_cjk'} = $separate_cjk;
}
313 
# Passes the given value straight through to the buildproc's method of the
# same name.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_sections_index_document_metadata($index);
}
320
# Stores the given value as the builder's 'maxnumeric' setting (which is
# later written to the build configuration).
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and passes it on to the
# buildproc.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html($strip);
}
334
# Placeholder: the base class only complains on STDERR and returns nothing.
# Subclasses are expected to provide the real implementation.
sub compress_text {
    my ($self, $textindex) = @_;

    print STDERR "compress_text() should be implemented in subclass!!";
    return;
}
342
343
# Builds either the single index named by $indexname (when it contains a
# word character) or every index listed in the collection configuration.
# The pre/post hooks run before and after, and indexes rejected by
# want_built() are skipped.  $indexlevel is accepted but not used here.
sub build_indexes {
    my ($self, $indexname, $indexlevel) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes();

    my $indexes;
    if (defined $indexname && $indexname =~ /\w/) {
        $indexes = [ $indexname ];
    }
    else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            if ($self->{'verbosity'} >= 1) {
                print $outhandle "\n*** ignoring index $index\n";
            }
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n";
        }
        if ($self->{'gli'}) {
            print STDERR "<Stage name='Index' source='$index'>\n";
        }
        $self->build_index($index);
    }

    $self->post_build_indexes();
}
377
# Hook run by build_indexes() before any index is built.  Does nothing
# here; implement in a subclass to do extra work at that point.
sub pre_build_indexes {
    my $self = shift(@_);
    my ($indexname) = @_; # optional parameter
}
384
# Hook run by build_indexes() after all the indexes have been built.  Does
# nothing here; implement in a subclass to do extra work at that point.
sub post_build_indexes {
    my $self = shift(@_);
}
390
# Placeholder: the base class only complains on STDERR and returns nothing.
# Subclasses are expected to build the given index.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
398
399
400
# Creates the collection's info database: runs every document through the
# buildproc in 'infodb' mode, then writes the collection metadata, the
# classification information and the "browselist" entry.
#
# With -debug the records go to STDOUT instead of a database write handle.
# With -incremental the existing database is read first and its documents
# are fed back through the buildproc after the plugins have run.
# Dies if the database write handle cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # Get info database file path
    my $infodb_type = $self->{'infodbtype'};
    my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $textdir);

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    my $reconstructed_docs = undef;
    my $database_recs = undef;

    if ($self->{'incremental'}) {
        $database_recs = {};
        &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $database_recs);
    }

    # Important (for memory usage reasons) that we obtain the filehandle
    # here for writing out to the database, rather than after
    # $reconstructed_docs has been set up (assuming -incremental is on)
    #
    # This is because when we open a pipe to txt2db [using open()]
    # this triggers a fork() followed by exec().  $reconstructed_docs
    # can get very large, and so if we did the open() after this, it means
    # the fork creates a clone of the *large* process image which (admittedly)
    # is then quickly replaced in the execve() with the much smaller image for
    # 'txt2db'.  The trouble is, in that seismic second caused by
    # the fork(), the system really does need to have all that memory available
    # even though it isn't ultimately used.  The result is an out of memory
    # error.
    my $infodb_handle;
    if ($self->{'debug'}) {
        $infodb_handle = *STDOUT;
    }
    else {
        $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
        unless (defined($infodb_handle)) {
            if ($self->{'gli'}) {
                print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n";
            }
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }

    if ($self->{'incremental'}) {
        # reconstruct doc_obj metadata from database for all docs
        $reconstructed_docs
            = &classify::reconstruct_doc_objs_metadata($infodb_type,
                                                       $infodb_file_path,
                                                       $database_recs);
    }

    # set up the document processor
    $buildproc->set_output_handle ($infodb_handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontdb ($self->{'dontdb'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);
    $buildproc->set_store_metadata_coverage ($self->{'collect_cfg'}->{'store_metadata_coverage'});

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from the database and
    # then adding in the new ones
    $buildproc->zero_reset();

    $buildproc->{'mdprefix_fields'} = {};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    if ($self->{'incremental'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj ( @$reconstructed_docs ) {
            # skip documents the buildproc has flagged as not to be re-processed
            next if defined $buildproc->{'dont_process_reconstructed'}->{$doc_obj->get_OID()};

            print $outhandle "  Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $buildproc->process($doc_obj, undef);
        }
    }

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $infodb_type, $infodb_handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    ####&classify::print_reverse_lookup($infodb_handle);

    # output doclist
    my @doc_list = $buildproc->get_doc_list();
    my %browselist_infodb = (
        'hastxt'      => [ "0" ],
        'childtype'   => [ "VList" ],
        'numleafdocs' => [ scalar(@doc_list) ],
        'thistype'    => [ "Invisible" ],
        'contains'    => [ join(";", @doc_list) ],
    );
    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", \%browselist_infodb);

    # STDOUT (debug mode) is not ours to close
    unless ($self->{'debug'}) {
        &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle);
    }

    if ($infodb_type eq "gdbm-txtgz") {
        # remove any existing plain "gdbm" database file for this collection
        my $gdb_infodb_file_path = &dbutil::get_infodb_file_path("gdbm", $self->{'collection'}, $textdir);
        if (-e $gdb_infodb_file_path) {
            &util::rm($gdb_infodb_file_path);
        }
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
529
# Assembles the build configuration (build date, build type, index stem,
# document/section/byte counts, index/subcollection/language maps, indexes
# not built, maxnumeric, infodbtype, earliest datestamp) and writes it to
# build.cfg (gs2) or buildConfig.xml (gs3) in the build directory.
#
# Starts from $self->{'build_cfg'} when a subclass has already set one up,
# and gives subclasses a final say through build_cfg_extra().
sub make_auxiliary_files {
    my $self = shift (@_);

    my $build_cfg = {};
    # subclasses may have already defined stuff in here
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    if ($self->{'separate_cjk'}) {
        $build_cfg->{'separate_cjk'} = "true";
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any that
    # are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, $index . "->" . $self->{'index_mapping'}->{'indexmap'}->{$index});
        }
    }
    $build_cfg->{'indexmap'} = \@indexmap if scalar (@indexmap);

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    # write out the earliestDatestamp information needed for OAI
    # (looked for in the archives directory, else in the export directory)
    my $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
    if (!-d $archivedir) {
        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
    }
    my $earliestDatestampFile = &util::filename_cat ($archivedir, "earliestDatestamp");
    my $earliestDatestamp = 0;
    # Three-argument open with a lexical handle: the file name is never
    # interpreted as part of the open mode, and no global FIN handle is
    # left behind.
    if (open(my $datestamp_fh, '<', $earliestDatestampFile)) {
        {
            # slurp in file as a single line
            local $/ = undef;
            $earliestDatestamp = <$datestamp_fh>;
            #&unicode::ensure_utf8(\$earliestDatestamp); # turn any high bytes that aren't valid utf-8 into utf-8.
        }
        close($datestamp_fh);
    }
    else {
        print $outhandle "Warning: unable to read collection's earliestDatestamp from $earliestDatestampFile.\n";
        print $outhandle "Setting value to 0.\n";
    }
    $build_cfg->{'earliestdatestamp'} = $earliestDatestamp;

    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg(&util::filename_cat($self->{'build_dir'},"build.cfg"), $build_cfg);
    }
    elsif ($gs_mode eq "gs3") {
        &colcfg::write_build_cfg_xml(&util::filename_cat($self->{'build_dir'}, "buildConfig.xml"), $build_cfg, $self->{'collect_cfg_preserve'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
629
# Hook called by make_auxiliary_files() with the build configuration hash
# just before it is written out.  Does nothing here; implement in a
# subclass to add extra stuff to build.cfg.
sub build_cfg_extra {
    my $self = shift(@_);
    my ($build_cfg) = @_;
}
636
637
# Empty hook; does nothing in the base class.
sub collect_specific {
    my $self = shift(@_);
}
641
# Decides whether the given index should be built.
# Each entry of the collection's 'dontbuild' list is used as a pattern that
# must match the whole index name; on the first match the index is recorded
# in $self->{'notbuilt'} and 0 is returned.  Otherwise returns 1.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $checkstr (@{$dontbuild}) {
        next unless ($index =~ /^$checkstr$/);

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
657
# Placeholder: complains on STDERR and returns a reference to an empty
# hash.  Subclasses are expected to return the real index mapping.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    my %mapping = ();
    return \%mapping;
}
666
# Returns a two character short form of a field specification.
#   - undefined or whitespace-only field: the empty string
#   - two or more comma separated components: the first letdig (\w)
#     character of each of the first two components
#   - otherwise: the first two letdig characters of the field
# A character that cannot be found defaults to 'a' (first) or '0' (second).
# If the result would consist of two digits, it becomes 'a' followed by
# the first character instead.
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\S/);

    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # pick the first letdig from the first two field names
        ($first) = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    }
    else {
        # pick the first two letdig chars
        ($first, $second) = $field =~ /^[^\w]*(\w)[^\w]*?(\w)/i;
    }

    # there may not have been any letdigs...
    $first = 'a' unless defined $first;
    $second = '0' unless defined $second;

    my $newfield = "$first$second";
    # digits only - Greenstone runtime doesn't like this.
    return "a$first" if ($newfield =~ /^\d\d$/);

    return $newfield;
}
701
# Advances the name behind $nameref, in place, to its next version:
#   - a name ending in two digits has that number incremented,
#   - a name ending in one digit has it incremented (9 becomes 10),
#   - any other name has its last character replaced by 0.
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $counter = $1;
        $counter++;
        $$nameref =~ s/\d\d$/$counter/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $digit = $1;
        if ($digit == 9) {
            $$nameref =~ s/\d$/10/;
        }
        else {
            $digit++;
            $$nameref =~ s/\d$/$digit/;
        }
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
717
718
719
# Adds the metadata sets recorded in the buildproc's 'mdprefix_fields' to
# the given collection info hash.  For every prefix, the prefix is appended
# to "metadataset"; for every field of that prefix, the field name is
# appended to "metadatalist-<prefix>" and its stored value to
# "metadatafreq-<prefix>-<field>".
sub get_collection_meta_sets
{
    my ($self, $collection_infodb) = @_;

    my $mdprefix_fields = $self->{'buildproc'}->{'mdprefix_fields'};
    foreach my $prefix (keys %$mdprefix_fields)
    {
        push(@{$collection_infodb->{"metadataset"}}, $prefix);

        my $fields = $mdprefix_fields->{$prefix};
        foreach my $field (keys %{$fields})
        {
            push(@{$collection_infodb->{"metadatalist-$prefix"}}, $field);
            push(@{$collection_infodb->{"metadatafreq-$prefix-$field"}}, $fields->{$field});
        }
    }
}
739
740
# default is to output the metadata sets (prefixes) used in collection:
# they are gathered with get_collection_meta_sets() and written to the
# given handle as the "collection" entry of the info database
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
751
# sometimes we need to read in an existing build.cfg - for example,
# if doing each stage of building separately, or when doing incremental
# building
#
# Looks for build.cfg (gs2) or buildConfig.xml (any other mode) in the
# build directory first and then in the collection's "index" directory.
# Returns what colcfg::read_building_cfg() makes of the file, or undef if
# neither location has one.
sub read_build_cfg {
    my ($self) = @_;

    my $buildconfigfilename = ($gs_mode eq "gs2") ? "build.cfg" : "buildConfig.xml";

    my @candidates = (
        sub { &util::filename_cat($self->{'build_dir'}, $buildconfigfilename) },
        # try the index dir - but do we know where it is?? try here
        sub { &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", $buildconfigfilename) },
    );

    foreach my $candidate (@candidates) {
        my $buildconfigfile = $candidate->();
        if (-e $buildconfigfile) {
            return &colcfg::read_building_cfg( $buildconfigfile, $gs_mode);
        }
    }

    # we cant find a config file - just ignore the field list
    return undef;
}
779
# Prints byte statistics for the pass the buildproc has just completed
# (creating an index, or compressing text) to the output handle.
#
# When fewer than 50 bytes were processed - and the pass was either an
# indexing pass or text is being kept - a note follows: for incremental
# builds only when nothing at all was processed, otherwise a boxed warning.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # nothing more to say unless suspiciously little text was processed
    return unless ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'}));

    # From here on, a pass that is not indexing text implies no_text is
    # off, so the two messages below cover every case.
    if ($self->{'incremental'}) {
        return unless ($num_processed_bytes == 0);

        if ($indexing_text) {
            print $outhandle "No additional text was added to $index\n";
        }
        else {
            print $outhandle "No additional text was compressed\n";
        }
        return;
    }

    print $outhandle "***************\n";
    if ($indexing_text) {
        print $outhandle "WARNING: There is very little or no text to process for $index\n";
    }
    else {
        print $outhandle "WARNING: There is very little or no text to compress\n";
    }
    print $outhandle "         Was this your intention?\n";
    print $outhandle "***************\n";
}
822
# Placeholder: the base class only prints a warning to the output handle.
# Subclasses are expected to provide the real implementation.
sub prepare_build_recipe
{
    my $self = $_[0];

    my $outhandle = $self->{'outhandle'};
    print $outhandle "WARNING: prepare_build_recipe() should be implemented in subclass!!";
}
829 
8301;
831
Note: See TracBrowser for help on using the browser.