root/main/trunk/greenstone2/perllib/basebuilder.pm @ 24460

Revision 24460, 25.4 KB (checked in by davidb, 8 years ago)

Code changes to support indexers that are provided through the extension mechanism

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
34use dbutil;
35use plugin;
36use util;
37
38
# Switch autoflush on for STDOUT and STDERR as soon as this module is
# compiled, so that mgpp doesn't get out of sync with plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Switch autoflush off again for both streams when the interpreter exits.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

our $maxdocsize = 12000;

# used to signify "gs2" (default) or "gs3"; overwritten by new() with the
# mode reported for the collection configuration file
our $gs_mode = "gs2";
55
# Constructor: creates a builder object and reads in the collection
# configuration file.
#
# Positional arguments (after the class name):
#   $collection, $source_dir, $build_dir, $verbosity, $maxdocs, $debug,
#   $keepold, $incremental, $incremental_mode,
#   $remove_empty_classifications, $outhandle, $no_text, $failhandle, $gli
#
# $outhandle and $failhandle default to STDERR, $no_text and $gli default
# to 0.  As a side effect the package variable $gs_mode is set to the mode
# returned by colcfg::get_collect_cfg_name().
#
# Returns the new blessed object.
sub new {
    my $class = shift(@_);
    my ($collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # fill in defaults for the optional arguments
    if (!defined $outhandle)  { $outhandle  = *STDERR; }
    if (!defined $no_text)    { $no_text    = 0; }
    if (!defined $failhandle) { $failhandle = *STDERR; }
    if (!defined $gli)        { $gli        = 0; }

    # create a builder object
    my %fields = (
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_mode'             => $incremental_mode,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},    # indexes not built
        'gli'                          => $gli,
    );
    my $self = bless \%fields, $class;

    # Read in the collection configuration file.
    my $colcfgname;
    ($colcfgname, $gs_mode) = &colcfg::get_collect_cfg_name($outhandle);
    $self->{'collect_cfg'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);

    if ($gs_mode eq "gs3") {
        # Read it in a second time to keep the original form for later
        # writing out of buildConfig.xml: $self->{'collect_cfg'}->{'classify'}
        # gets modified during the call to &classify::load_classifiers.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);
    }

    # get the database type for this collection from the collect.cfg file
    # (may be undefined, in which case the dbutil default is used)
    my $infodbtype = $self->{'collect_cfg'}->{'infodbtype'};
    if (!$infodbtype) {
        $infodbtype = &dbutil::get_default_infodb_type();
    }
    $self->{'infodbtype'} = $infodbtype;

    # load up any dontdb fields
    my %dontdb = ();
    if (defined($self->{'collect_cfg'}->{'dontgdbm'})) {
        %dontdb = map { ($_ => 1) } @{$self->{'collect_cfg'}->{'dontgdbm'}};
    }
    $self->{'dontdb'} = \%dontdb;

    $self->{'maxnumeric'} = 4;
    return $self;
}
113
# Second-stage initialisation.  This has been moved here from new() so that
# subclass methods (generate_index_list, default_buildproc,
# is_incremental_capable, ...) can be used.
#
# Steps, in order:
#   - expands the configured indexes with subcollection and language
#     suffixes and removes duplicate index specifications
#   - falls back to a non-incremental build when the indexer cannot cope
#   - loads the plugins (dies if none were loaded) and the classifiers
#   - loads and constructs the buildproc, preferring a custom buildproc
#     found under $ENV{'GSDLCOLLECTDIR'} over the builder's default one
#   - unless debugging or keeping the old build, removes and recreates
#     the build directory and its "text" subdirectory
#   - runs init_for_incremental_build() for incremental builds
sub init {
    my $self = shift(@_);

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    $self->generate_index_list();
    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    if (defined $indexes) {
        # sort out subcollection indexes
        if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
                foreach my $index (@$indexes) {
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
                }
            }
        }

        # sort out language subindexes
        if (defined $self->{'collect_cfg'}->{'languages'}) {
            $indexes = $self->{'collect_cfg'}->{'indexes'};
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
                foreach my $index (@$indexes) {
                    if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
                    }
                    else { # add in an empty subcollection field
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
                    }
                }
            }
        }
    }

    if (defined($self->{'collect_cfg'}->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        # (the first occurrence wins, order is preserved)
        my %tmphash = ();
        my @tmparray = @{$self->{'collect_cfg'}->{'indexes'}};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $i (@tmparray) {
            if (!defined ($tmphash{$i})) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, $i);
                $tmphash{$i} = 1;
            }
        }
    } else {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }

    # check incremental against whether builder can cope or not.
    if ($self->{'incremental'} && !$self->is_incremental_capable()) {
        print $outhandle "WARNING: The indexer used is not capable of incremental building. Reverting to -removeold\n";
        $self->{'keepold'} = 0;
        $self->{'incremental'} = 0;
        $self->{'incremental_mode'} = "none";
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $self->{'verbosity'}, $outhandle, $failhandle, \@global_opts, $self->{'incremental_mode'});

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $self->{'build_dir'}, $outhandle);

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc for the builder we are initialising
    my $buildprocdir = undef;
    my $buildproctype;

    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildproctype = $self->default_buildproc();
    }
    if (defined $buildprocdir) {
        require "$buildprocdir/$buildproctype.pm";
    }
    else {
        require "$buildproctype.pm";
    }

    # Construct the buildproc with an ordinary class-method call on the
    # class name held in $buildproctype.  This needs neither a string eval
    # nor indirect-object syntax; any error raised by the constructor
    # propagates to our caller.
    $self->{'buildproc'} = $buildproctype->new($self->{'collection'},
                                               $self->{'source_dir'},
                                               $self->{'build_dir'},
                                               $self->{'keepold'},
                                               $self->{'verbosity'},
                                               $self->{'outhandle'});

    # We call set_infodbtype() now so the buildproc knows the infodbtype for all phases of the build
    $self->{'buildproc'}->set_infodbtype($self->{'infodbtype'});

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

    if ($self->{'incremental'}) {
        # some classes may need to do some additional initialisation
        $self->init_for_incremental_build();
    }

}
256
# Reports whether this builder can do incremental building.
# The base class always answers 'no' (0): it is safer to assume
# non-incremental to start with and to override this in the inherited
# classes that are capable of it.
sub is_incremental_capable {
    my $capable = 0;
    return $capable;
}
265
# Hook called by init() when an incremental build has been requested.
# Does nothing here; implement this in a subclass that needs additional
# initialisation for an incremental build.
sub init_for_incremental_build {
    my $self = $_[0];
}
271
# Hands the loaded plugins and the buildproc over to plugin::deinit().
sub deinit {
    my ($self) = @_;

    &plugin::deinit($self->{'pluginfo'}, $self->{'buildproc'});
}
277
# Works out the index options from the collection configuration.
# The only option looked at here is separate_cjk: it is switched on when
# any entry of the 'indexoptions' list matches /separate_cjk/.  The result
# is passed to the buildproc and stored in $self->{'separate_cjk'}.
sub generate_index_options {
    my ($self) = @_;

    my $separate_cjk = 0;

    if (defined($self->{'collect_cfg'}->{'indexoptions'})) {
        my @cjk_options = grep { /separate_cjk/ } @{$self->{'collect_cfg'}->{'indexoptions'}};
        $separate_cjk = 1 if scalar(@cjk_options) > 0;
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
    # record it for build.cfg
    $self->{'separate_cjk'} = $separate_cjk;
}
295 
# Forwards the sections_index_document_metadata setting to the buildproc.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    $self->{'buildproc'}->set_sections_index_document_metadata($index);
}
302
# Stores the given value as this builder's 'maxnumeric' setting.
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and passes it on to the
# buildproc.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html($strip);
}
316
# Placeholder for text compression: subclasses are expected to override
# this.  The base implementation only reports on STDERR that it has not
# been overridden and returns nothing.
#
#   $textindex - the index whose text should be compressed (unused here)
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    # The message is newline-terminated, like the other "should be
    # implemented in subclass" diagnostics in this file, so that it does
    # not run into whatever is printed to STDERR next.
    print STDERR "compress_text() should be implemented in subclass!!\n";
    return;
}
324
325
# Builds either the single index named by $indexname (when it contains at
# least one word character) or every index listed in the collection
# configuration.
#
# pre_build_indexes() is called first and post_build_indexes() last.  The
# index-to-directory mapping from create_index_mapping() is stored in
# $self->{'index_mapping'}; each index accepted by want_built() is then
# handed to build_index().
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes();

    my $build_single = (defined $indexname && $indexname =~ /\w/);
    my $indexes = $build_single ? [ $indexname ] : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            if ($self->{'verbosity'} >= 1) {
                print $outhandle "\n*** ignoring index $index\n";
            }
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n";
        }
        if ($self->{'gli'}) {
            print STDERR "<Stage name='Index' source='$index'>\n";
        }
        $self->build_index($index);
    }

    $self->post_build_indexes();

}
359
# Hook called by build_indexes() before any index is built.
# Does nothing here; implement this in a subclass that wants to do extra
# work before building all the indexes.
sub pre_build_indexes {
    my ($self, @optional) = @_;
    my ($indexname) = @optional; # optional parameter
}
366
# Hook called by build_indexes() after all the indexes have been built.
# Does nothing here; implement this in a subclass that wants to do extra
# work at the end of building all the indexes.
sub post_build_indexes {
    my $self = $_[0];
}
372
# Placeholder for building one index: subclasses are expected to override
# this.  The base implementation only reports on STDERR that it has not
# been overridden and returns nothing.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
380
381
382
# Creates the collection's info database and processes associated files.
#
# The "text" and "assoc" directories are created under the build directory,
# the classifiers are initialised, and the source documents are run through
# the plugins with the buildproc in 'infodb' mode.  After that the
# collection metadata, the classification information and a "browselist"
# entry listing the documents are written out.
#
# With 'debug' set the records go to STDOUT instead of a database write
# handle.  Dies if the database write handle cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # Get info database file path
    my $infodb_type = $self->{'infodbtype'};
    my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $textdir);

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    my $reconstructed_docs = undef;
    my $database_recs = undef;

    if ($self->{'incremental'}) {
        # read in the records of the existing database
        $database_recs = {};

        &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $database_recs);
    }

    # Important (for memory usage reasons) that we obtain the filehandle
    # here for writing out to the database, rather than after
    # $reconstructed_docs has been set up (assuming -incremental is on)
    #
    # This is because when we open a pipe to txt2db [using open()]
    # this triggers a fork() followed by exec().  $reconstructed_docs
    # can get very large, and so if we did the open() after this, it means
    # the fork creates a clone of the *large* process image which (admittedly)
    # is then quickly replaced in the execve() with the much smaller image for
    # 'txt2db'.  The trouble is, in that seismic second caused by
    # the fork(), the system really does need to have all that memory available
    # even though it isn't ultimately used.  The result is an out of memory
    # error.

    my $infodb_handle;
    if (!$self->{'debug'}) {
        $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
        unless (defined($infodb_handle)) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }
    else {
        $infodb_handle = *STDOUT;
    }

    if ($self->{'incremental'}) {
        # reconstruct doc_obj metadata from database for all docs
        $reconstructed_docs = &classify::reconstruct_doc_objs_metadata($infodb_type,
                                                                       $infodb_file_path,
                                                                       $database_recs);
    }

    # set up the document processor

    $buildproc->set_output_handle ($infodb_handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontdb ($self->{'dontdb'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);
    $buildproc->set_store_metadata_coverage ($self->{'collect_cfg'}->{'store_metadata_coverage'});

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from the database and
    # then adding in the new ones
    $buildproc->zero_reset();

    $buildproc->{'mdprefix_fields'} = {};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    if ($self->{'incremental'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj (@$reconstructed_docs) {
            # skip the docs the buildproc has marked as not to be processed
            next if defined $buildproc->{'dont_process_reconstructed'}->{$doc_obj->get_OID()};

            print $outhandle "  Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $buildproc->process($doc_obj, undef);
        }
    }

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $infodb_type, $infodb_handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    ####&classify::print_reverse_lookup($infodb_handle);

    # output doclist
    my @doc_list = $buildproc->get_doc_list();
    my %browselist = (
        'hastxt'      => [ "0" ],
        'childtype'   => [ "VList" ],
        'numleafdocs' => [ scalar(@doc_list) ],
        'thistype'    => [ "Invisible" ],
        'contains'    => [ join(";", @doc_list) ],
    );
    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", \%browselist);

    unless ($self->{'debug'}) {
        &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle);
    }

    if ($infodb_type eq "gdbm-txtgz") {
        # remove the database file at the plain "gdbm" path, if one exists
        my $gdb_infodb_file_path = &dbutil::get_infodb_file_path("gdbm", $self->{'collection'}, $textdir);
        &util::rm($gdb_infodb_file_path) if (-e $gdb_infodb_file_path);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
511
# Writes out the build configuration file for the collection.
#
# Starts from $self->{'build_cfg'} when a subclass has already put entries
# in there, then adds: build date and type, index stem, stem indexes,
# separate_cjk, document/section/byte counts from the buildproc, the
# index/subcollection/language maps, the list of indexes that were not
# built, maxnumeric, infodbtype and the earliest datestamp needed for OAI.
# build_cfg_extra() is called so that subclasses can add more entries.
#
# The result is written to build.cfg when $gs_mode is "gs2" and to
# buildConfig.xml when it is "gs3", inside the build directory.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $build_cfg = {};
    # subclasses may have already defined stuff in here
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    if ($self->{'separate_cjk'}) {
        $build_cfg->{'separate_cjk'} = "true";
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
        }
    }
    $build_cfg->{'indexmap'} = \@indexmap if scalar (@indexmap);

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    # write out the earliestDatestamp information needed for OAI
    # (looked for in the archives directory, or in the export directory
    # when there is no archives directory)
    my $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
    if(!-d $archivedir) {
        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
    }
    my $earliestDatestampFile = &util::filename_cat ($archivedir, "earliestDatestamp");
    my $earliestDatestamp = 0;
    # Three-argument open with a lexical handle: the file name is never
    # interpreted as part of the open mode, and no global bareword handle
    # is left behind.
    if (open(my $fin, '<', $earliestDatestampFile)) {
        {
            # slurp in file as a single line
            local $/ = undef;
            $earliestDatestamp = <$fin>;
            #&unicode::ensure_utf8(\$earliestDatestamp); # turn any high bytes that aren't valid utf-8 into utf-8.
        }
        close($fin);
    }
    else {
        print $outhandle "Warning: unable to read collection's earliestDatestamp from $earliestDatestampFile.\n";
        print $outhandle "Setting value to 0.\n";
    }
    $build_cfg->{'earliestdatestamp'} = $earliestDatestamp;

    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg(&util::filename_cat($self->{'build_dir'},"build.cfg"), $build_cfg);
    }
    if ($gs_mode eq "gs3") {
        &colcfg::write_build_cfg_xml(&util::filename_cat($self->{'build_dir'}, "buildConfig.xml"), $build_cfg, $self->{'collect_cfg_preserve'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
611
# Hook called by make_auxiliary_files() with the build configuration hash
# just before it is written out.  Does nothing here; implement this in a
# subclass that wants to add extra stuff to build.cfg.
sub build_cfg_extra {
    my ($self, @rest) = @_;
    my ($build_cfg) = @rest;

}
618
619
# Does nothing in the base class.
sub collect_specific {
    my $self = $_[0];
}
623
# Decides whether the given index should be built.
#
# Each entry of the 'dontbuild' list in the collection configuration is
# used as a pattern that has to match the whole index name.  On the first
# match the index is recorded in $self->{'notbuilt'} and 0 is returned;
# when nothing matches (or there is no 'dontbuild' list) 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $checkstr (@$dontbuild) {
        next unless ($index =~ /^$checkstr$/);

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
639
# Placeholder for creating the mapping between index descriptions and
# their directory names: subclasses are expected to override this.  The
# base implementation reports on STDERR that it has not been overridden
# and returns a reference to an empty hash.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
648
# Returns a two-character abbreviation of a field specification.
#
#  - undefined or whitespace-only field: the empty string
#  - two or more comma-separated components: the first letdig (\w)
#    character of each of the first two components
#  - otherwise: the first two letdig characters of the field
# A character that cannot be found is replaced by a default ('a' for the
# first, '0' for the second); note that a field containing fewer than two
# letdig characters therefore comes out as "a0".  A result made up of two
# digits is turned into 'a' followed by the first digit.
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\S/);

    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar @components < 2) {
        # pick the first two letdig chars
        ($first, $second) = $field =~ /^[^\w]*(\w)[^\w]*?(\w)/i;
    } else {
        # pick the first letdig from the first two field names
        ($first) = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    }

    # there may not have been any letdigs...
    if (!defined $first)  { $first  = 'a'; }
    if (!defined $second) { $second = '0'; }

    my $newfield = $first . $second;
    # digits only - Greenstone runtime doesn't like this.
    return "a$first" if ($newfield =~ /^\d\d$/);

    return $newfield;
}
683
# Moves the name referred to by $nameref on to its next version, modifying
# the string in place:
#  - a name ending in two digits has those two digits incremented
#  - a name ending in a single digit has that digit incremented
#    (a trailing 9 becomes 10)
#  - any other name has its last character replaced by 0
sub get_next_version {
    my ($self, $nameref) = @_;
    my $num = 0;
    if ($$nameref =~ /(\d\d)$/) {
        $num = $1;
        $num++;
        $$nameref =~ s/\d\d$/$num/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        $num = $1;
        my $next = ($num == 9) ? 10 : $num + 1;
        $$nameref =~ s/\d$/$next/;
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
699
700
701
# Adds the metadata sets (prefixes) recorded by the buildproc to the given
# collection record.  For every prefix in the buildproc's 'mdprefix_fields':
#   "metadataset"                    gets the prefix appended
#   "metadatalist-<prefix>"          gets every field of that prefix appended
#   "metadatafreq-<prefix>-<field>"  gets the value stored for that field
sub get_collection_meta_sets {
    my ($self, $collection_infodb) = @_;

    my $mdprefix_fields = $self->{'buildproc'}->{'mdprefix_fields'};
    foreach my $prefix (keys %$mdprefix_fields) {
        push(@{$collection_infodb->{"metadataset"}}, $prefix);

        foreach my $field (keys %{$mdprefix_fields->{$prefix}}) {
            my $val = $mdprefix_fields->{$prefix}->{$field};

            push(@{$collection_infodb->{"metadatalist-$prefix"}}, $field);
            push(@{$collection_infodb->{"metadatafreq-$prefix-$field"}}, $val);
        }
    }
}
721
722
# Writes the "collection" entry to the info database.
# default is to output the metadata sets (prefixes) used in collection
sub output_collection_meta {
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
733
# Reads in an existing build configuration file.  Sometimes this is
# needed - for example, if doing each stage of building separately, or
# when doing incremental building.
#
# The file is build.cfg when $gs_mode is "gs2" and buildConfig.xml
# otherwise.  It is looked for in the build directory first and then in
# the "index" directory under $ENV{'GSDLCOLLECTDIR'}.
#
# Returns whatever colcfg::read_building_cfg() returns for the file, or
# undef when the file is in neither place.
sub read_build_cfg {
    my ($self) = @_;

    my $buildconfigfilename = ($gs_mode eq "gs2") ? "build.cfg" : "buildConfig.xml";

    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, $buildconfigfilename);

    unless (-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", $buildconfigfilename);
        unless (-e $buildconfigfile) {
            # we can't find a config file - just ignore the field list
            return undef;
        }
    }

    return &colcfg::read_building_cfg( $buildconfigfile, $gs_mode);
}
761
# Prints statistics about the pass the buildproc has just made (creating
# an index, or compressing the text) to the output handle.
#
# When fewer than 50 bytes were processed - and the pass was either an
# indexing pass or text is being stored - an extra note is printed: for an
# incremental build only when nothing at all was processed, otherwise a
# warning banner asking whether this was intended.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    if ($indexing_text) {
        print $outhandle "Stats (Creating index $index)\n";
    } else {
        print $outhandle "Stats (Compressing text from $index)\n";
    }
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # a pass is only worth a note if it was supposed to handle some text
    my $text_expected = ($indexing_text || !$self->{'no_text'});

    if ($num_processed_bytes < 50 && $text_expected) {

        if (!$self->{'incremental'}) {
            print $outhandle "***************\n";
            if ($indexing_text) {
                print $outhandle "WARNING: There is very little or no text to process for $index\n";
            } else {
                # not indexing, so text is being stored (see $text_expected)
                print $outhandle "WARNING: There is very little or no text to compress\n";
            }
            print $outhandle "         Was this your intention?\n";
            print $outhandle "***************\n";
        }
        elsif ($num_processed_bytes == 0) {
            if ($indexing_text) {
                print $outhandle "No additional text was added to $index\n";
            } else {
                # not indexing, so text is being stored (see $text_expected)
                print $outhandle "No additional text was compressed\n";
            }
        }

    }

}
804
805 
8061;
807
Note: See TracBrowser for help on using the browser.