root/main/trunk/greenstone2/perllib/basebuilder.pm @ 25958

Revision 25958, 25.6 KB (checked in by kjdon, 8 years ago)

pass gs_version to loading plugins

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
34use dbutil;
35use plugin;
36use util;
37
38
BEGIN {
    # autoflush() is an IO::Handle method.  Perl only loads IO::Handle on
    # demand from 5.14 onwards, so load it explicitly rather than relying on
    # one of the modules used above having pulled it in already.
    require IO::Handle;

    # set autoflush on for STDERR and STDOUT so that mgpp
    # doesn't get out of sync with plugins
    STDOUT->autoflush(1);
    STDERR->autoflush(1);
}

END {
    # switch autoflush back off again when the program exits
    STDOUT->autoflush(0);
    STDERR->autoflush(0);
}

our $maxdocsize = 12000;

# used to signify "gs2"(default) or "gs3"
# (overwritten in new() with the mode reported by colcfg::get_collect_cfg_name)
our $gs_mode = "gs2";
55
# Constructor.
#
# Positional arguments: site, collection, source_dir, build_dir, verbosity,
# maxdocs, debug, keepold, incremental, incremental_mode,
# remove_empty_classifications, outhandle, no_text, failhandle, gli.
# outhandle and failhandle default to STDERR; no_text and gli default to 0.
#
# Reads the collection configuration file (recording the gs2/gs3 mode in the
# package variable $gs_mode) and returns the blessed builder object.
sub new {
    my $class = shift(@_);
    my ($site, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # fill in defaults for the optional arguments
    if (!defined $outhandle)  { $outhandle  = *STDERR; }
    if (!defined $no_text)    { $no_text    = 0; }
    if (!defined $failhandle) { $failhandle = *STDERR; }
    if (!defined $gli)        { $gli        = 0; }

    # create a builder object
    my %fields = (
        'site'             => $site,       # will be undef for Greenstone 2
        'collection'       => $collection,
        'source_dir'       => $source_dir,
        'build_dir'        => $build_dir,
        'verbosity'        => $verbosity,
        'maxdocs'          => $maxdocs,
        'debug'            => $debug,
        'keepold'          => $keepold,
        'incremental'      => $incremental,
        'incremental_mode' => $incremental_mode,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'        => $outhandle,
        'no_text'          => $no_text,
        'failhandle'       => $failhandle,
        'notbuilt'         => {},          # indexes not built
        'gli'              => $gli,
    );
    my $self = bless \%fields, $class;

    # Read in the collection configuration file.
    my ($colcfgname);
    ($colcfgname, $gs_mode) = &colcfg::get_collect_cfg_name($outhandle);
    $self->{'collect_cfg'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);

    if ($gs_mode eq "gs3") {
        # Read it in a second time to keep the original form for later
        # writing out of buildConfig.xml.  The copy is needed because
        # $self->{'collect_cfg'}->{'classify'} gets modified during the call
        # to &classify::load_classifiers.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);
    }

    # database type for this collection comes from the collect.cfg file, or
    # the dbutil default when the config does not give one
    my $infodbtype = $self->{'collect_cfg'}->{'infodbtype'};
    $infodbtype = &dbutil::get_default_infodb_type() unless $infodbtype;
    $self->{'infodbtype'} = $infodbtype;

    # load up any dontdb fields
    my %dontdb = ();
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        %dontdb = map { $_ => 1 } @{$self->{'collect_cfg'}->{'dontgdbm'}};
    }
    $self->{'dontdb'} = \%dontdb;

    $self->{'maxnumeric'} = 4;
    return $self;
}
114
# Second-stage initialisation.  This work was moved out of new() so that it
# can call methods supplied by the subclass (generate_index_list,
# is_incremental_capable, default_buildproc, init_for_incremental_build).
#
# In order, it:
#   - expands the configured indexes by subcollection and language and
#     removes duplicates
#   - falls back to a non-incremental build if the builder cannot cope
#   - loads the plugins (dies if none were loaded) and the classifiers
#   - locates, loads and instantiates the buildproc class
#   - unless debugging or keeping the old build, wipes and recreates build_dir
sub init {
    my $self = shift(@_);

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    $self->generate_index_list();
    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    if (defined $indexes) {
        # sort out subcollection indexes
        if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
                foreach my $index (@$indexes) {
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
                }
            }
        }

        # sort out language subindexes
        if (defined $self->{'collect_cfg'}->{'languages'}) {
            $indexes = $self->{'collect_cfg'}->{'indexes'};
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
                foreach my $index (@$indexes) {
                    if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
                    }
                    else { # add in an empty subcollection field
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
                    }
                }
            }
        }
    }

    if (defined($self->{'collect_cfg'}->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        # (first occurrence wins, original order is kept)
        my %seen = ();
        my @unique = grep { !$seen{$_}++ } @{$self->{'collect_cfg'}->{'indexes'}};
        $self->{'collect_cfg'}->{'indexes'} = \@unique;
    } else {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }

    # check incremental against whether builder can cope or not.
    if ($self->{'incremental'} && !$self->is_incremental_capable()) {
        print $outhandle "WARNING: The indexer used is not capable of incremental building. Reverting to -removeold\n";
        $self->{'keepold'} = 0;
        $self->{'incremental'} = 0;
        $self->{'incremental_mode'} = "none";
    }

    # gs_version for plugins
    my $gs_version = "2";
    if ($gs_mode eq "gs3") {
        $gs_version = "3";
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $self->{'verbosity'}, $outhandle, $failhandle, \@global_opts, $self->{'incremental_mode'}, $gs_version);

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $self->{'build_dir'}, $outhandle);

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc for the builder we are initialising
    my $buildprocdir = undef;
    my $buildproctype;

    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildproctype = $self->default_buildproc();
    }
    if (defined $buildprocdir) {
        require "$buildprocdir/$buildproctype.pm";
    }
    else {
        require "$buildproctype.pm";
    }

    # Instantiate the buildproc with an ordinary class-method call.  The class
    # name is held in a string (and may be derived from the collection name),
    # so calling ->new on it avoids pasting that name into code for a string
    # eval and avoids the ambiguous indirect-object "new Class(...)" syntax.
    # Any exception raised by the constructor propagates to our caller.
    $self->{'buildproc'} = $buildproctype->new($self->{'collection'},
                                               $self->{'source_dir'},
                                               $self->{'build_dir'},
                                               $self->{'keepold'},
                                               $self->{'verbosity'},
                                               $self->{'outhandle'});

    # We call set_infodbtype() now so the buildproc knows the infodbtype for all phases of the build
    $self->{'buildproc'}->set_infodbtype($self->{'infodbtype'});

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

    if ($self->{'incremental'}) {
        # some classes may need to do some additional initialisation
        $self->init_for_incremental_build();
    }

}
261
# Reports whether this builder can do incremental builds.
# The base class answers 'no' (0): it is safer to assume a builder is
# non-incremental, and let the inherited classes that can cope override this.
sub is_incremental_capable
{
    return 0;
}
270
# Hook called at the end of init() when an incremental build is being done.
# Does nothing here; override it in a subclass that needs additional
# initialisation for an incremental build.
sub init_for_incremental_build {
    my $self = shift;
}
276
# Counterpart to init(): passes the loaded plugins and the buildproc to
# plugin::deinit.
sub deinit {
    my $self = shift;

    my $pluginfo  = $self->{'pluginfo'};
    my $buildproc = $self->{'buildproc'};
    &plugin::deinit($pluginfo, $buildproc);
}
282
# Works out the separate_cjk setting from the 'indexoptions' list of the
# collection configuration: it is 1 if any option contains "separate_cjk",
# otherwise 0.  The value is handed to the buildproc and also stored on the
# builder.
sub generate_index_options {
    my $self = shift (@_);

    my $indexoptions = $self->{'collect_cfg'}->{'indexoptions'};

    my $separate_cjk = 0;
    if (defined($indexoptions)) {
        $separate_cjk = 1 if grep { $_ =~ /separate_cjk/ } @$indexoptions;
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
    # record it for build.cfg
    $self->{'separate_cjk'} = $separate_cjk;
}
300 
# Forwards the sections_index_document_metadata setting to the buildproc.
sub set_sections_index_document_metadata {
    my $self = shift;
    my $index = $_[0];

    $self->{'buildproc'}->set_sections_index_document_metadata($index);
}
307
# Stores the maxnumeric setting on the builder (new() initialises it to 4;
# make_auxiliary_files writes it into the build configuration).
sub set_maxnumeric {
    my $self = shift;
    my $maxnumeric = $_[0];

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and forwards it to the
# buildproc.
sub set_strip_html {
    my $self = shift;
    my $strip = $_[0];

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html($strip);
}
321
# Forwards the store_metadata_coverage setting to the buildproc.
sub set_store_metadata_coverage {
    my $self = shift;
    my $store_metadata_coverage = $_[0];

    $self->{'buildproc'}->set_store_metadata_coverage($store_metadata_coverage);
}
328
# Placeholder: text compression is builder specific, so subclasses are
# expected to override this.  The base version only complains on STDERR and
# returns nothing.
sub compress_text {
    my $self = shift;
    my $textindex = $_[0];

    print STDERR "compress_text() should be implemented in subclass!!";
    return;
}
336
337
# Builds either the single index named by $indexname (when it is given and
# contains a word character) or every index listed in the collection
# configuration.  pre_build_indexes() runs first and post_build_indexes()
# last; indexes rejected by want_built() are skipped.
sub build_indexes {
    my $self = shift;
    my $indexname = $_[0];
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes();

    my $indexes;
    if (defined $indexname && $indexname =~ /\w/) {
        $indexes = [ $indexname ];
    }
    else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            if ($self->{'verbosity'} >= 1) {
                print $outhandle "\n*** ignoring index $index\n";
            }
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n";
        }
        if ($self->{'gli'}) {
            print STDERR "<Stage name='Index' source='$index'>\n";
        }
        $self->build_index($index);
    }

    $self->post_build_indexes();

}
371
# Hook called by build_indexes() before any index is built.  Does nothing
# here; override it in a subclass that needs to do extra work first.
sub pre_build_indexes {
    my $self = shift;
    my ($indexname) = @_; # optional parameter
}
378
# Hook called by build_indexes() once all the indexes have been built.  Does
# nothing here; override it in a subclass that needs to do extra work at the
# end.
sub post_build_indexes {
    my $self = shift;
}
384
# Placeholder: building a single index is builder specific, so subclasses
# are expected to override this.  The base version only complains on STDERR
# and returns nothing.
sub build_index {
    my $self = shift;
    my $index = $_[0];

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
392
393
394
# Creates the collection's info database.
#
# Steps, in order: make the text and assoc directories, initialise the
# classifiers, (incremental only) read the existing database records, open
# the database write handle (STDOUT when debugging), (incremental only)
# reconstruct the document objects, run every document through the buildproc
# in 'infodb' mode, then write the collection metadata, the classification
# information and the "browselist" entry before closing the handle.
#
# Dies if the info database write handle cannot be opened.
sub make_infodatabase {
    my $self = shift;
    my $outhandle   = $self->{'outhandle'};
    my $build_dir   = $self->{'build_dir'};
    my $incremental = $self->{'incremental'};
    my $debug       = $self->{'debug'};
    my $gli         = $self->{'gli'};

    print STDERR "BuildDir: $build_dir\n";

    my $textdir  = &util::filename_cat($build_dir, "text");
    my $assocdir = &util::filename_cat($build_dir, "assoc");
    &util::mk_all_dir($textdir);
    &util::mk_all_dir($assocdir);

    # Get info database file path
    my $infodb_type = $self->{'infodbtype'};
    my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $textdir);

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }
    if ($gli) {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # init all the classifiers
    &classify::init_classifiers($self->{'classifiers'});

    my $reconstructed_docs = undef;
    my $database_recs = undef;

    if ($incremental) {
        $database_recs = {};
        &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $database_recs);
    }

    # Important (for memory usage reasons) that the filehandle for writing
    # out to the database is obtained here, rather than after
    # $reconstructed_docs has been set up (assuming -incremental is on).
    #
    # Opening a pipe to txt2db [using open()] triggers a fork() followed by
    # exec().  $reconstructed_docs can get very large, so if the open() came
    # after it, the fork would clone the *large* process image, which
    # (admittedly) is then quickly replaced in the execve() by the much
    # smaller image for 'txt2db'.  The trouble is that, for the moment of the
    # fork(), the system really does need all that memory to be available
    # even though it isn't ultimately used.  The result is an out of memory
    # error.
    my $infodb_handle;
    if ($debug) {
        $infodb_handle = *STDOUT;
    }
    else {
        $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
        unless (defined($infodb_handle)) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $gli;
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }

    if ($incremental) {
        # reconstruct doc_obj metadata from database for all docs
        $reconstructed_docs = &classify::reconstruct_doc_objs_metadata($infodb_type,
                                                                      $infodb_file_path,
                                                                      $database_recs);
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle($infodb_handle);
    $buildproc->set_mode('infodb');
    $buildproc->set_assocdir($assocdir);
    $buildproc->set_dontdb($self->{'dontdb'});
    $buildproc->set_classifiers($self->{'classifiers'});
    $buildproc->set_indexing_text(0);
    $buildproc->set_store_text(1);

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from the database and
    # then adding in the new ones
    $buildproc->zero_reset();

    $buildproc->{'mdprefix_fields'} = {};

    &plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    if ($incremental) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj (@$reconstructed_docs) {
            next if defined $self->{'buildproc'}->{'dont_process_reconstructed'}->{$doc_obj->get_OID()};

            print $outhandle "  Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $self->{'buildproc'}->process($doc_obj, undef);
        }
    }

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info($self->{'classifiers'}, $infodb_type, $infodb_handle,
                                    $self->{'remove_empty_classifications'},
                                    $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    ####&classify::print_reverse_lookup($infodb_handle);

    # output doclist
    my @doc_list = $self->{'buildproc'}->get_doc_list();
    my %browselist_infodb = (
        'hastxt'      => [ "0" ],
        'childtype'   => [ "VList" ],
        'numleafdocs' => [ scalar(@doc_list) ],
        'thistype'    => [ "Invisible" ],
        'contains'    => [ join(";", @doc_list) ],
    );
    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", \%browselist_infodb);

    unless ($self->{'debug'}) {
        &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle);
    }

    if ($infodb_type eq "gdbm-txtgz") {
        # remove the plain "gdbm" database file if one exists
        my $gdb_infodb_file_path = &dbutil::get_infodb_file_path("gdbm", $self->{'collection'}, $textdir);
        &util::rm($gdb_infodb_file_path) if (-e $gdb_infodb_file_path);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
522
# Writes the build configuration file for the collection: build.cfg in gs2
# mode, buildConfig.xml in gs3 mode.
#
# The configuration starts from $self->{'build_cfg'} when a subclass has
# already set one up, and is filled in with the build date and type, the
# index stem, document/section/byte counts from the buildproc, the index,
# subcollection and language maps, the list of indexes not built, maxnumeric,
# infodbtype and the earliestDatestamp value read from the archives (or
# export) directory.  build_cfg_extra() is called just before writing so a
# subclass can add more.
sub make_auxiliary_files {
    my $self = shift (@_);

    my $build_cfg = {};
    # subclasses may have already defined stuff in here
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    if ($self->{'separate_cjk'}) {
        $build_cfg->{'separate_cjk'} = "true";
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
        }
    }
    $build_cfg->{'indexmap'} = \@indexmap if scalar (@indexmap);

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    my @notbuilt = ();
    foreach my $nb (keys %{$self->{'notbuilt'}}) {
        push (@notbuilt, $nb);
    }
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    # write out the earliestDatestamp information needed for OAI
    my $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
    if(!-d $archivedir) {
        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
    }
    my $earliestDatestampFile = &util::filename_cat ($archivedir, "earliestDatestamp");
    my $earliestDatestamp = 0;
    # Three-argument open with a lexical handle: the file name is never
    # interpreted as an open mode, and no global bareword handle is used.
    if (open(my $datestamp_fh, '<', $earliestDatestampFile)) {
        {
            # slurp in file as a single line
            local $/ = undef;
            $earliestDatestamp = <$datestamp_fh>;
            #&unicode::ensure_utf8(\$earliestDatestamp); # turn any high bytes that aren't valid utf-8 into utf-8.
        }
        close($datestamp_fh);
    }
    else {
        print $outhandle "Warning: unable to read collection's earliestDatestamp from $earliestDatestampFile.\n";
        print $outhandle "Setting value to 0.\n";
    }
    $build_cfg->{'earliestdatestamp'} = $earliestDatestamp;

    # give subclasses the chance to add their own entries
    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg(&util::filename_cat($self->{'build_dir'},"build.cfg"), $build_cfg);
    }
    if ($gs_mode eq "gs3") {

        &colcfg::write_build_cfg_xml(&util::filename_cat($self->{'build_dir'}, "buildConfig.xml"), $build_cfg, $self->{'collect_cfg_preserve'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
622
# Hook called by make_auxiliary_files() just before the build configuration
# is written, with the configuration hash as its argument.  Does nothing
# here; override it in a subclass that wants to add extra stuff to build.cfg.
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;

}
629
630
# Empty in the base class.
# NOTE(review): presumably a hook for collection-specific processing in
# subclasses - nothing in this file calls it; confirm against callers.
sub collect_specific {
    my $self = shift;
}
634
# Decides whether $index should be built.  Each entry of the 'dontbuild'
# list in the collection configuration is used as a regular expression that
# must match the whole index name; on the first match the index is recorded
# in $self->{'notbuilt'} and 0 is returned.  Returns 1 otherwise.
sub want_built {
    my $self = shift;
    my $index = $_[0];

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $checkstr (@$dontbuild) {
        next unless $index =~ /^$checkstr$/;

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
650
# Placeholder: the mapping from index descriptions to directory names is
# builder specific, so subclasses are expected to override this.  The base
# version complains on STDERR and returns a reference to an empty hash.
sub create_index_mapping {
    my $self = shift;
    my $indexes = $_[0];

    print STDERR "create_index_mapping should be implemented in subclass\n";

    my %mapping = ();
    return \%mapping;
}
659
# Returns a two character short form of a field description.
#   - undefined or whitespace-only field: the empty string
#   - two or more comma separated components: the first letdig (\w)
#     character of each of the first two components
#   - otherwise: the first two letdig characters of the field
# A missing first character becomes 'a' and a missing second one '0'.  If the
# result would be two digits it is replaced by 'a' followed by the first
# digit, because the Greenstone runtime doesn't like digits-only names.
sub process_field {
    my $self = shift;
    my $field = $_[0];

    if (!defined ($field) || $field !~ /\S/) {
        return "";
    }

    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # pick the first letdig from the first two field names
        ($first)  = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    }
    else {
        # pick the first two letdig chars
        ($first, $second) = $field =~ /^[^\w]*(\w)[^\w]*?(\w)/i;
    }

    # there may not have been any letdigs...
    $first  = 'a' unless defined $first;
    $second = '0' unless defined $second;

    my $newfield = $first . $second;
    if ($newfield =~ /^\d\d$/) {
        # digits only - Greenstone runtime doesn't like this.
        $newfield = "a" . $first;
    }
    return $newfield;

}
694
# Advances, in place, the version suffix of the name referenced by $nameref:
#   - a name ending in two digits has those two digits incremented
#   - a name ending in one digit has it incremented (9 becomes 10)
#   - any other name has its last character replaced by 0
sub get_next_version {
    my $self = shift;
    my $nameref = $_[0];

    if ($$nameref =~ /(\d\d)$/) {
        my $version = $1;
        $version++;
        $$nameref =~ s/\d\d$/$version/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $version = $1;
        if ($version == 9) {
            $$nameref =~ s/\d$/10/;
        }
        else {
            $version++;
            $$nameref =~ s/\d$/$version/;
        }
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
710
711
712
# Copies the metadata prefixes recorded by the buildproc (its
# 'mdprefix_fields' hash: prefix -> field -> value) into $collection_infodb:
#   metadataset                    gets each prefix
#   metadatalist-<prefix>          gets each field seen for that prefix
#   metadatafreq-<prefix>-<field>  gets the value stored for that field
sub get_collection_meta_sets
{
    my ($self, $collection_infodb) = @_;

    my $mdprefix_fields = $self->{'buildproc'}->{'mdprefix_fields'};
    foreach my $prefix (keys %$mdprefix_fields)
    {
        push(@{$collection_infodb->{"metadataset"}}, $prefix);

        my $list_key = "metadatalist-$prefix";
        foreach my $field (keys %{$mdprefix_fields->{$prefix}})
        {
            push(@{$collection_infodb->{$list_key}}, $field);

            my $freq_key = "metadatafreq-$prefix-$field";
            push(@{$collection_infodb->{$freq_key}}, $mdprefix_fields->{$prefix}->{$field});
        }
    }
}
732
733
# Writes the "collection" entry to the info database.  By default the entry
# holds the metadata sets (prefixes) used in the collection, as gathered by
# get_collection_meta_sets().
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
744
# Reads in an existing build configuration - needed, for example, when each
# stage of building is run separately, or for incremental building.
#
# The file is build.cfg in gs2 mode and buildConfig.xml otherwise.  It is
# looked for in the build directory first, then in the collection's "index"
# directory.  Returns undef if neither exists, otherwise whatever
# colcfg::read_building_cfg returns for the file.
sub read_build_cfg {
    my $self = shift;

    my $buildconfigfilename = ($gs_mode eq "gs2") ? "build.cfg" : "buildConfig.xml";

    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, $buildconfigfilename);

    unless (-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", $buildconfigfilename);

        # we cant find a config file - just ignore the field list
        return undef unless (-e $buildconfigfile);
    }

    return &colcfg::read_building_cfg( $buildconfigfile, $gs_mode);

}
772
# Prints statistics for the pass the buildproc has just completed (either
# creating an index or compressing text) to the builder's output handle.
# When fewer than 50 bytes were processed and text was expected, a
# non-incremental build gets a prominent warning; an incremental build only
# gets a note, and only when nothing at all was processed.
sub print_stats {
    my $self = shift;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # text is expected when indexing, or when compressing and text is wanted
    my $text_expected = ($indexing_text || !$self->{'no_text'});

    if ($num_processed_bytes < 50 && $text_expected) {

        if ($self->{'incremental'}) {
            if ($num_processed_bytes == 0) {
                my $note = $indexing_text
                    ? "No additional text was added to $index\n"
                    : "No additional text was compressed\n";
                print $outhandle $note;
            }
        }
        else {
            my $warning = $indexing_text
                ? "WARNING: There is very little or no text to process for $index\n"
                : "WARNING: There is very little or no text to compress\n";
            print $outhandle "***************\n";
            print $outhandle $warning;
            print $outhandle "         Was this your intention?\n";
            print $outhandle "***************\n";
        }

    }

}
815
816 
8171;
818
Note: See TracBrowser for help on using the browser.