source: gs2-extensions/parallel-building/trunk/src/perllib/basebuilder.pm@ 24626

Last change on this file since 24626 was 24626, checked in by jmt12, 13 years ago

An (almost) complete copy of the perllib directory from a head checkout (circa September 2011) of the Greenstone 2 trunk, made to try to ease merging this extension in later on (there have been some major changes to buildcol.pl committed in the main trunk but not in the x64 branch).

File size: 25.6 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
34use dbutil;
35use plugin;
36use util;
37
38
BEGIN {
    # autoflush() is an IO::Handle method. Perl versions before 5.14 do not
    # load IO::Handle on demand when a method is called on a filehandle, so
    # load it explicitly rather than relying on another module having done so.
    require IO::Handle;

    # set autoflush on for STDERR and STDOUT so that mgpp
    # doesn't get out of sync with plugins
    STDOUT->autoflush(1);
    STDERR->autoflush(1);
}
45
END {
    # switch autoflush back off on both standard streams (STDOUT first,
    # then STDERR) when the interpreter shuts down
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
50
51our $maxdocsize = 12000;
52
53# Greenstone mode for this build: "gs2" (the default) or "gs3"
54our $gs_mode = "gs2";
55
# Constructor.
#
# Positional arguments (after the class name): $site, $collection,
# $source_dir, $build_dir, $verbosity, $maxdocs, $debug, $keepold,
# $incremental, $incremental_mode, $remove_empty_classifications,
# $outhandle, $no_text, $failhandle, $gli.
#
# Defaults applied here: $outhandle and $failhandle fall back to STDERR,
# $no_text and $gli fall back to 0.
#
# Side effect: sets the package variable $gs_mode from the value returned
# by colcfg::get_collect_cfg_name().
#
# Returns the new blessed builder object. Further set-up is done by init().
sub new {
    my $class = shift(@_);
    my ($site, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # fill in defaults for the optional trailing arguments
    if (!defined $outhandle)  { $outhandle  = *STDERR; }
    if (!defined $no_text)    { $no_text    = 0; }
    if (!defined $failhandle) { $failhandle = *STDERR; }
    if (!defined $gli)        { $gli        = 0; }

    # create a builder object
    my %fields = (
        'site'                         => $site,   # will be undef for Greenstone 2
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_mode'             => $incremental_mode,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},      # indexes not built
        'gli'                          => $gli,
    );
    my $self = bless \%fields, $class;

    # Read in the collection configuration file.
    (my $colcfgname, $gs_mode) = &colcfg::get_collect_cfg_name($outhandle);
    $self->{'collect_cfg'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);

    if ($gs_mode eq "gs3") {
        # Read it in a second time to keep the original form for writing out
        # buildConfig.xml later: $self->{'collect_cfg'}->{'classify'} gets
        # modified during the call to &classify::load_classifiers.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);
    }

    # database type comes from the collection configuration when it is set
    # there, otherwise from the dbutil default
    my $infodbtype = $self->{'collect_cfg'}->{'infodbtype'};
    if (!$infodbtype) {
        $infodbtype = &dbutil::get_default_infodb_type();
    }
    $self->{'infodbtype'} = $infodbtype;

    # load up any dontdb fields (configured under the 'dontgdbm' key)
    my %dontdb = ();
    if (defined($self->{'collect_cfg'}->{'dontgdbm'})) {
        %dontdb = map { ($_ => 1) } @{$self->{'collect_cfg'}->{'dontgdbm'}};
    }
    $self->{'dontdb'} = \%dontdb;

    $self->{'maxnumeric'} = 4;
    return $self;
}
114
# Second-stage initialisation. This work was moved here from new() so that
# methods provided by subclasses (generate_index_list, default_buildproc,
# generate_index_options, ...) can be used.
#
# In order, this:
#   - expands the configured index list with subcollections and languages
#     and removes duplicate entries,
#   - falls back to a full rebuild if the builder is not incremental capable,
#   - loads the plugins (dies if none were loaded) and the classifiers,
#   - loads and constructs the document processor (buildproc),
#   - clears and recreates the build directory unless debugging or keeping
#     the old build,
#   - runs init_for_incremental_build() for incremental builds.
sub init {
    my $self = shift(@_);

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    $self->generate_index_list();
    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    if (defined $indexes) {
        # sort out subcollection indexes
        if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
                foreach my $index (@$indexes) {
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
                }
            }
        }

        # sort out language subindexes
        if (defined $self->{'collect_cfg'}->{'languages'}) {
            $indexes = $self->{'collect_cfg'}->{'indexes'};
            $self->{'collect_cfg'}->{'indexes'} = [];
            foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
                foreach my $index (@$indexes) {
                    if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
                    }
                    else { # add in an empty subcollection field
                        push (@{$self->{'collect_cfg'}->{'indexes'}}, "${index}::$language");
                    }
                }
            }
        }
    }

    if (defined($self->{'collect_cfg'}->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        # (the first occurrence wins, order is preserved)
        my %seen = ();
        my @unique = grep { !$seen{$_}++ } @{$self->{'collect_cfg'}->{'indexes'}};
        $self->{'collect_cfg'}->{'indexes'} = \@unique;
    } else {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }

    # check incremental against whether builder can cope or not.
    if ($self->{'incremental'} && !$self->is_incremental_capable()) {
        print $outhandle "WARNING: The indexer used is not capable of incremental building. Reverting to -removeold\n";
        $self->{'keepold'} = 0;
        $self->{'incremental'} = 0;
        $self->{'incremental_mode'} = "none";
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $self->{'verbosity'}, $outhandle, $failhandle, \@global_opts, $self->{'incremental_mode'});

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $self->{'build_dir'}, $outhandle);

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc for the builder we are initialising
    my $buildprocdir = undef;
    my $buildproctype;

    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildproctype = $self->default_buildproc();
    }
    if (defined $buildprocdir) {
        require "$buildprocdir/$buildproctype.pm";
    }
    else {
        require "$buildproctype.pm";
    }

    # Construct the buildproc with an ordinary class-method call. This used
    # to be a string eval of "new <class>(...)"; indirect-object syntax is
    # parsed heuristically (see perlobj) and this package defines its own
    # new(), so the explicit arrow form is the unambiguous spelling. Any
    # exception thrown by the constructor simply propagates to our caller.
    $self->{'buildproc'} = $buildproctype->new($self->{'collection'},
                                               $self->{'source_dir'},
                                               $self->{'build_dir'},
                                               $self->{'keepold'},
                                               $self->{'verbosity'},
                                               $self->{'outhandle'});

    # We call set_infodbtype() now so the buildproc knows the infodbtype for all phases of the build
    $self->{'buildproc'}->set_infodbtype($self->{'infodbtype'});

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

    if ($self->{'incremental'}) {
        # some classes may need to do some additional initialisation
        $self->init_for_incremental_build();
    }
}
257
# Says whether this builder can build incrementally.
# The base class answers 'no' (0): it is safer to assume a builder is
# non-incremental, and let the inherited classes that are capable override
# this method.
sub is_incremental_capable {
    return 0;
}
266
# Hook called by init() when an incremental build is requested.
# Does nothing here; override it in a subclass that needs additional
# initialisation for an incremental build.
sub init_for_incremental_build {
    my $self = shift;
}
272
# Hands the loaded plugins and the document processor to plugin::deinit()
# and returns whatever that call returns.
sub deinit {
    my ($self) = @_;

    return &plugin::deinit($self->{'pluginfo'}, $self->{'buildproc'});
}
278
# Works out the separate_cjk setting from the collection's 'indexoptions'
# list: it is 1 when any configured option matches /separate_cjk/ and 0
# otherwise. The value is passed to the buildproc and also stored on the
# builder so it can be recorded in build.cfg.
sub generate_index_options {
    my ($self) = @_;

    my $options = $self->{'collect_cfg'}->{'indexoptions'};
    my $separate_cjk = 0;
    if (defined($options) && grep { /separate_cjk/ } @$options) {
        $separate_cjk = 1;
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
    # record it for build.cfg
    $self->{'separate_cjk'} = $separate_cjk;
}
296
# Forwards the sections_index_document_metadata setting ($index) to the
# document processor and returns the result of that call.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    return $self->{'buildproc'}->set_sections_index_document_metadata($index);
}
303
# Stores $maxnumeric on the builder; make_auxiliary_files() later copies
# the stored value into the build configuration.
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and forwards it to the
# document processor. Returns the result of the buildproc call.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    return $self->{'buildproc'}->set_strip_html($strip);
}
317
# Placeholder: text compression has to be provided by a subclass.
# This base version only prints a reminder on STDERR and returns nothing.
sub compress_text {
    my ($self, $textindex) = @_;

    print STDERR "compress_text() should be implemented in subclass!!";
    return;
}
325
326
# Builds the indexes for the collection.
#
# $indexname  - optional; when it contains a word character only this
#               index is built, otherwise every index listed in the
#               collection configuration is considered.
# $indexlevel - accepted but not used by this base implementation.
#
# Calls pre_build_indexes() first and post_build_indexes() last, stores the
# mapping returned by create_index_mapping() in $self->{'index_mapping'},
# and calls build_index() for each index that want_built() accepts.
sub build_indexes {
    my ($self, $indexname, $indexlevel) = @_;
    my $out = $self->{'outhandle'};

    $self->pre_build_indexes();

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            print $out "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $out "\n*** building index $index in subdirectory $self->{'index_mapping'}->{$index}\n";
        }
        if ($self->{'gli'}) {
            print STDERR "<Stage name='Index' source='$index'>\n";
        }
        $self->build_index($index);
    }

    $self->post_build_indexes();
}
360
# Hook called by build_indexes() before any index is built.
# Does nothing here; override it in a subclass to do extra work before
# building all the indexes.
sub pre_build_indexes {
    my $self = shift;
    my ($indexname) = @_;    # optional parameter
}
367
# Hook called by build_indexes() after all the indexes have been handled.
# Does nothing here; override it in a subclass to do extra work at the end
# of building all the indexes.
sub post_build_indexes {
    my $self = shift;
}
373
# Placeholder: building a single index ($index) has to be provided by a
# subclass. This base version only prints a reminder on STDERR and returns
# nothing.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
381
382
383
# Creates the collection's info database and processes associated files.
#
# Outline:
#   1. make the "text" and "assoc" directories under the build directory,
#   2. initialise the classifiers,
#   3. for incremental builds, read the existing database records,
#   4. open the database write handle (STDOUT when debugging),
#   5. for incremental builds, reconstruct document metadata from the
#      records read in step 3,
#   6. configure the buildproc for 'infodb' mode and run the plugins over
#      the source directory,
#   7. write collection metadata, classification information and the
#      "browselist" entry, then close the handle.
#
# Dies if the database write handle cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $out = $self->{'outhandle'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    my $textdir  = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir($textdir);
    &util::mk_all_dir($assocdir);

    # Get info database file path
    my $infodb_type = $self->{'infodbtype'};
    my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $textdir);

    if ($self->{'verbosity'} >= 1) {
        print $out "\n*** creating the info database and processing associated files\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # init all the classifiers
    &classify::init_classifiers($self->{'classifiers'});

    my $reconstructed_docs = undef;
    my $database_recs = undef;

    if ($self->{'incremental'}) {
        $database_recs = {};
        &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $database_recs);
    }

    # Important (for memory usage reasons): obtain the filehandle for
    # writing to the database here, rather than after $reconstructed_docs
    # has been set up (assuming -incremental is on).
    #
    # Opening a pipe to txt2db [using open()] triggers a fork() followed by
    # exec(). $reconstructed_docs can get very large, so doing the open()
    # afterwards would make the fork clone the *large* process image, which
    # is then (admittedly) quickly replaced in the execve() by the much
    # smaller image for 'txt2db'. During the fork() the system really does
    # need all that memory to be available even though it isn't ultimately
    # used, and the result is an out of memory error.
    my $infodb_handle;
    if (!$self->{'debug'}) {
        $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
        if (!defined($infodb_handle)) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }
    else {
        # debugging: dump the database entries to standard output instead
        $infodb_handle = *STDOUT;
    }

    if ($self->{'incremental'}) {
        # reconstruct doc_obj metadata from database for all docs
        $reconstructed_docs = &classify::reconstruct_doc_objs_metadata($infodb_type,
                                                                       $infodb_file_path,
                                                                       $database_recs);
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle($infodb_handle);
    $buildproc->set_mode('infodb');
    $buildproc->set_assocdir($assocdir);
    $buildproc->set_dontdb($self->{'dontdb'});
    $buildproc->set_classifiers($self->{'classifiers'});
    $buildproc->set_indexing_text(0);
    $buildproc->set_store_text(1);
    $buildproc->set_store_metadata_coverage($self->{'collect_cfg'}->{'store_metadata_coverage'});

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from the database and
    # then adding in the new ones
    $buildproc->zero_reset();

    $buildproc->{'mdprefix_fields'} = {};

    &plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    if ($self->{'incremental'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj (@$reconstructed_docs) {
            # skip documents the buildproc has marked as not to be
            # processed again from their reconstructed form
            next if defined $self->{'buildproc'}->{'dont_process_reconstructed'}->{$doc_obj->get_OID()};

            print $out " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $self->{'buildproc'}->process($doc_obj, undef);
        }
    }

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info($self->{'classifiers'}, $infodb_type, $infodb_handle,
                                    $self->{'remove_empty_classifications'},
                                    $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    ####&classify::print_reverse_lookup($infodb_handle);

    # output doclist
    my @doc_list = $self->{'buildproc'}->get_doc_list();
    my %browselist_entry = (
        'hastxt'      => [ "0" ],
        'childtype'   => [ "VList" ],
        'numleafdocs' => [ scalar(@doc_list) ],
        'thistype'    => [ "Invisible" ],
        'contains'    => [ join(";", @doc_list) ],
    );
    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", \%browselist_entry);

    # STDOUT (debug mode) is left open
    unless ($self->{'debug'}) {
        &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle);
    }

    if ($infodb_type eq "gdbm-txtgz") {
        # remove the plain "gdbm" database file for this collection if one exists
        my $gdb_infodb_file_path = &dbutil::get_infodb_file_path("gdbm", $self->{'collection'}, $textdir);
        &util::rm($gdb_infodb_file_path) if (-e $gdb_infodb_file_path);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
512
# Gathers the build information and writes the build configuration file
# into the build directory: build.cfg when $gs_mode is "gs2",
# buildConfig.xml when it is "gs3".
#
# Recorded values include the build date and type, the index stem, document,
# section and byte counts from the buildproc, the index / subcollection /
# language maps, the list of indexes not built, maxnumeric, the info
# database type and the collection's earliestDatestamp (read from the
# archives directory, or the export directory if there is no archives
# directory; 0 if the file cannot be read).
#
# Subclasses may pre-populate $self->{'build_cfg'} and may add further
# entries through build_cfg_extra().
sub make_auxiliary_files {
    my $self = shift (@_);

    # subclasses may have already defined stuff in here
    my $build_cfg = {};
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    if ($self->{'separate_cjk'}) {
        $build_cfg->{'separate_cjk'} = "true";
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, $index . "->" . $self->{'index_mapping'}->{'indexmap'}->{$index});
        }
    }
    $build_cfg->{'indexmap'} = \@indexmap if scalar (@indexmap);

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    # write out the earliestDatestamp information needed for OAI
    my $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
    if (!-d $archivedir) {
        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
    }
    my $earliestDatestampFile = &util::filename_cat ($archivedir, "earliestDatestamp");
    my $earliestDatestamp = 0;
    # Three-argument open with a lexical handle: the path is taken literally
    # (no mode characters or surrounding whitespace are interpreted) and no
    # package-global bareword handle is left behind.
    if (open(my $fin, '<', $earliestDatestampFile)) {
        {
            # slurp in file as a single line
            local $/ = undef;
            $earliestDatestamp = <$fin>;
            #&unicode::ensure_utf8(\$earliestDatestamp); # turn any high bytes that aren't valid utf-8 into utf-8.
        }
        close($fin);
    }
    else {
        print $outhandle "Warning: unable to read collection's earliestDatestamp from $earliestDatestampFile.\n";
        print $outhandle "Setting value to 0.\n";
    }
    $build_cfg->{'earliestdatestamp'} = $earliestDatestamp;

    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg(&util::filename_cat($self->{'build_dir'},"build.cfg"), $build_cfg);
    }
    if ($gs_mode eq "gs3") {
        &colcfg::write_build_cfg_xml(&util::filename_cat($self->{'build_dir'}, "buildConfig.xml"), $build_cfg, $self->{'collect_cfg_preserve'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
612
# Hook called by make_auxiliary_files() with the build configuration hash
# ($build_cfg) just before it is written out.
# Does nothing here; override it in a subclass to add extra entries to
# build.cfg.
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;
}
619
620
# Hook that does nothing in the base class.
sub collect_specific {
    my $self = shift;
}
624
# Decides whether $index should be built.
# Each entry of the collection's 'dontbuild' list is used as a regular
# expression anchored at both ends; if one matches $index, the index is
# recorded in $self->{'notbuilt'} and 0 is returned. Otherwise returns 1.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
640
# Placeholder: the mapping between index descriptions ($indexes) and
# directory names has to be provided by a subclass. This base version
# prints a reminder on STDERR and returns a reference to an empty hash.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
649
# Returns a two-character code derived from a field description.
#   - undefined or whitespace-only field: returns "".
#   - two or more comma-separated components: the first letdig (\w)
#     character of each of the first two components.
#   - otherwise: the first two letdig characters of the field.
# A missing first character defaults to 'a' and a missing second one to
# '0'. If the result would consist of two digits it becomes 'a' followed
# by the first digit, because the Greenstone runtime doesn't like
# digits-only names.
sub process_field {
    my ($self, $field) = @_;

    if (!defined($field) || $field !~ /\S/) {
        return "";
    }

    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar(@components) < 2) {
        # pick the first two letdig chars
        ($first, $second) = $field =~ /^[^\w]*(\w)[^\w]*?(\w)/i;
    }
    else {
        # pick the first letdig from the first two field names
        ($first)  = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    }

    # there may not have been any letdigs...
    $first  = 'a' if !defined($first);
    $second = '0' if !defined($second);

    my $code = $first . $second;
    return ($code =~ /^\d\d$/) ? "a$first" : $code;
}
684
# Advances, in place, the version suffix of the name that $nameref refers to.
#   - name ends in two digits: those two digits are incremented
#     ("01" -> "02", "99" -> "100");
#   - name ends in a single digit: that digit is incremented (9 -> 10);
#   - otherwise: the last character of the name is replaced by "0".
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        # increment the captured text itself so that Perl's string
        # increment keeps a leading zero ("01" becomes "02", not "2")
        my $version = $1;
        $version++;
        $$nameref =~ s/\d\d$/$version/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $next = $1 + 1;
        $$nameref =~ s/\d$/$next/;
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
700
701
702
# Adds the metadata set information gathered by the buildproc (its
# 'mdprefix_fields' hash: prefix => { field => value }) to the hash that
# $collection_infodb refers to. For every prefix this appends
#   "metadataset"                   <- the prefix
#   "metadatalist-<prefix>"         <- each field name
#   "metadatafreq-<prefix>-<field>" <- the value stored for that field
sub get_collection_meta_sets {
    my ($self, $collection_infodb) = @_;

    my $fields_by_prefix = $self->{'buildproc'}->{'mdprefix_fields'};
    for my $prefix (keys %$fields_by_prefix) {
        push @{ $collection_infodb->{"metadataset"} }, $prefix;

        for my $field (keys %{ $fields_by_prefix->{$prefix} }) {
            push @{ $collection_infodb->{"metadatalist-$prefix"} }, $field;
            push @{ $collection_infodb->{"metadatafreq-$prefix-$field"} },
                $fields_by_prefix->{$prefix}->{$field};
        }
    }
}
722
723
# Writes the "collection" entry to the info database through
# $infodb_handle. By default the entry holds the metadata sets (prefixes)
# used in the collection, as collected by get_collection_meta_sets().
# Returns the result of dbutil::write_infodb_entry().
sub output_collection_meta {
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    return &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
734
# Reads an existing build configuration. This is needed, for example, when
# each stage of building is run separately, or for incremental building.
#
# The file is "build.cfg" when $gs_mode is "gs2" and "buildConfig.xml"
# otherwise. It is looked for in the build directory first and then in
# $GSDLCOLLECTDIR/index.
#
# Returns the result of colcfg::read_building_cfg(), or undef when neither
# file exists.
sub read_build_cfg {
    my ($self) = @_;

    my $cfg_name = ($gs_mode eq "gs2") ? "build.cfg" : "buildConfig.xml";

    my $cfg_path = &util::filename_cat($self->{'build_dir'}, $cfg_name);
    unless (-e $cfg_path) {
        # try the index dir - but do we know where it is?? try here
        $cfg_path = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", $cfg_name);
        unless (-e $cfg_path) {
            # we can't find a config file - just ignore the field list
            return undef;
        }
    }

    return &colcfg::read_building_cfg($cfg_path, $gs_mode);
}
762
# Prints a short statistics report for the pass the buildproc has just
# completed (creating an index, or compressing text) to the builder's
# output handle.
#
# When fewer than 50 bytes were processed and text was expected (an index
# was being created, or text storage is not disabled through 'no_text'):
#   - incremental builds: a note is printed, but only if nothing at all
#     was processed;
#   - other builds: a warning banner is printed.
sub print_stats {
    my ($self) = @_;

    my $out  = $self->{'outhandle'};
    my $proc = $self->{'buildproc'};

    my $indexing_text       = $proc->get_indexing_text();
    my $index               = $proc->get_index();
    my $num_bytes           = $proc->get_num_bytes();
    my $num_processed_bytes = $proc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print {$out} $heading;
    print {$out} "Total bytes in collection: $num_bytes\n";
    print {$out} "Total bytes in $index: $num_processed_bytes\n";

    my $text_expected = ($indexing_text || !$self->{'no_text'});
    if ($num_processed_bytes < 50 && $text_expected) {

        if ($self->{'incremental'}) {
            if ($num_processed_bytes == 0) {
                my $note = $indexing_text
                    ? "No additional text was added to $index\n"
                    : "No additional text was compressed\n";
                print {$out} $note;
            }
        }
        else {
            my $warning = $indexing_text
                ? "WARNING: There is very little or no text to process for $index\n"
                : "WARNING: There is very little or no text to compress\n";
            print {$out} "***************\n";
            print {$out} $warning;
            print {$out} " Was this your intention?\n";
            print {$out} "***************\n";
        }

    }
}
805
# Base implementation: this builder has no parallel building recipe to
# offer, so it only prints a warning on the builder's output handle.
sub prepare_build_recipe {
    my $self = shift;

    print { $self->{'outhandle'} } "WARNING: This indexer cannot generate parallel building recipe\n";
}
812
8131;
814
Note: See TracBrowser for help on using the repository browser.