source: gsdl/trunk/perllib/basebuilder.pm@ 14245

Last change on this file since 14245 was 14245, checked in by xiao, 15 years ago

pass the flag disable_OAI to the write_build_cfg_xml sub instead of hacking into the structure which causes regular expression compilation errors when writing the build.cfg file (since the flag has nothing to do with Greenstone 2 hence the build.cfg file). Plus restore the file back to sjboddie's revision 14112.

  • Property svn:keywords set to Author Date Id Revision
File size: 19.6 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
34use plugin;
35use util;
36use FileHandle;
37
BEGIN {
    # Switch both standard streams to autoflush as soon as this module is
    # compiled, so that output from mgpp does not get out of sync with
    # output from the plugins.
    for my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

END {
    # Turn autoflush back off on both standard streams at interpreter exit.
    for my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
49
# Package-level document size limit.  It is not read anywhere in the part
# of this module visible here -- presumably consulted by other code; verify
# before changing.
our $maxdocsize = 12000;

# Which configuration flavour is in use: "gs2" (the default) or "gs3".
# new() overwrites this with whatever colcfg::get_collect_cfg_name reports.
my $gs_mode = 'gs2';
54
# Constructor.
#
# Positional arguments (after the class name):
#   $collection, $source_dir, $build_dir, $verbosity, $maxdocs, $debug,
#   $keepold, $incremental, $incremental_dlc, $remove_empty_classifications,
#   $outhandle, $no_text, $failhandle, $gli, $disable_OAI
#
# Defaults applied when an argument is undefined:
#   $outhandle and $failhandle -> STDERR; $no_text, $gli, $disable_OAI -> 0.
#
# Besides storing the arguments, the constructor reads the collection
# configuration, loads the plugins and classifiers it names, and records
# the 'dontgdbm' fields as a lookup hash.
#
# Dies (after printing a message to $outhandle) if no plugins were loaded.
sub new {
    my $class = shift;
    my ($collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_dlc,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli, $disable_OAI) = @_;

    # fill in defaults for the optional arguments
    $outhandle  = *STDERR unless defined $outhandle;
    $no_text    = 0       unless defined $no_text;
    $failhandle = *STDERR unless defined $failhandle;
    $gli        = 0       unless defined $gli;

    # disable_OAI applies to Greenstone 3 only; in this file it is only
    # handed on to &colcfg::write_build_cfg_xml when buildConfig.xml is
    # written (see make_auxiliary_files)
    $disable_OAI = 0 unless defined $disable_OAI;

    # create the builder object
    my %fields = (
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_dlc'              => $incremental_dlc,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},    # indexes not built
        'gli'                          => $gli,
        'disable_OAI'                  => $disable_OAI,
    );
    my $self = bless \%fields, $class;

    # Read in the collection configuration file; the reader used depends
    # on which flavour of configuration file was found.
    my $colcfgname;
    ($colcfgname, $gs_mode) = &colcfg::get_collect_cfg_name($outhandle);
    if ($gs_mode eq "gs2") {
        $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
    }
    elsif ($gs_mode eq "gs3") {
        $self->{'collect_cfg'} = &colcfg::read_collection_cfg_xml ($colcfgname);
    }

    # the list of plugins for this collection (empty if none configured)
    my $plugins = defined $self->{'collect_cfg'}->{'plugin'}
        ? $self->{'collect_cfg'}->{'plugin'}
        : [];

    # extra global options handed to every plugin
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'}
        && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins; a collection without plugins cannot be built
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, $failhandle, \@global_opts, $keepold);
    if (!scalar(@{$self->{'pluginfo'}})) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # the list of classifiers for this collection (empty if none configured)
    my $classifiers = defined $self->{'collect_cfg'}->{'classify'}
        ? $self->{'collect_cfg'}->{'classify'}
        : [];

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # turn the list of dontgdbm fields into a lookup hash
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        $self->{'dontgdbm'}->{$_} = 1
            foreach @{$self->{'collect_cfg'}->{'dontgdbm'}};
    }

    $self->{'maxnumeric'} = 4;
    return $self;
}
138
# Second-stage initialisation.  This work lives here rather than in new()
# so that the methods it calls can be provided by subclasses.
#
# Steps, in order:
#   1. calls generate_index_list() and generate_index_options() on the
#      object (neither is defined in this file);
#   2. expands collect_cfg 'indexes' by every configured subcollection and
#      then by every configured language, and removes duplicate entries
#      while keeping first-seen order;
#   3. locates, loads and instantiates the buildproc class, storing the
#      object in $self->{'buildproc'};
#   4. unless 'debug' or 'keepold' is set, removes the old build directory
#      and recreates it together with its "text" subdirectory.
#
# Dies if the buildproc module cannot be loaded or its constructor fails.
sub init {
    my $self = shift(@_);

    $self->generate_index_list();
    $self->generate_index_options();

    # sort out subcollection indexes: each index is paired with each
    # subcollection as "index:subcollection"
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes
    if (defined $self->{'collect_cfg'}->{'languages'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];

        # With subcollections the entries already read "index:subcollection",
        # so one colon is enough; without them a doubled colon leaves an
        # empty subcollection field.  The choice does not vary inside the
        # loops, so it is made once here.
        my $separator =
            defined ($self->{'collect_cfg'}->{'indexsubcollections'}) ? ":" : "::";

        foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}},
                      $index . $separator . $language);
            }
        }
    }

    if (defined($self->{'collect_cfg'}->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        my %seen = ();
        my @unique = grep { !$seen{$_}++ } @{$self->{'collect_cfg'}->{'indexes'}};
        $self->{'collect_cfg'}->{'indexes'} = \@unique;
    } else {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc named by the (sub)class
    my ($buildprocdir, $buildproctype);
    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = $self->default_buildproc();
    }
    require "$buildprocdir/$buildproctype.pm";

    # Construct the buildproc with an explicit class-method call.  The
    # class name is only known at run time, but a string holding a class
    # name can be used as the invocant directly, so no string eval is
    # needed.  This also avoids indirect-object syntax ("new Foo(...)"),
    # which is ambiguous inside this package because it defines its own
    # new().  Any exception from the constructor propagates to the caller.
    $self->{'buildproc'} = $buildproctype->new(
        $self->{'collection'},
        $self->{'source_dir'},
        $self->{'build_dir'},
        $self->{'keepold'},
        $self->{'verbosity'},
        $self->{'outhandle'});

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

}
223
# Tear-down counterpart of init(): hands the loaded plugins and the
# buildproc object to plugin::deinit and returns whatever that returns.
sub deinit {
    my ($self) = @_;

    my ($pluginfo, $buildproc) = @{$self}{'pluginfo', 'buildproc'};
    &plugin::deinit($pluginfo, $buildproc);
}
229
# Forwards $index unchanged to the buildproc method of the same name and
# returns that method's result.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_sections_index_document_metadata($index);
}
236
# Stores $maxnumeric on the object (new() initialises it to 4); the value
# is later copied into the build configuration by make_auxiliary_files.
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and passes the same value
# on to the buildproc; returns the buildproc call's result.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_strip_html($strip);
}
250
# Placeholder for text compression: concrete builders are expected to
# override this.  The base version only reports on STDERR that it was
# reached and returns nothing.
#
# $textindex is accepted for interface compatibility but is not used here.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    # The message ends with a newline, like the other "should be
    # implemented in subclass" stubs in this file, so that it does not run
    # into whatever is printed next.
    print STDERR "compress_text() should be implemented in subclass!!\n";
    return;
}
258
259
# Builds either the single index named by $indexname (when it is defined
# and contains a word character) or every index listed in the collection
# configuration.
#
# The index-to-directory mapping from create_index_mapping() is stored in
# $self->{'index_mapping'} first.  Each index is then passed to
# build_index() unless want_built() rejects it.  build_indexes_extra() is
# called last.  Progress lines go to the output handle when verbosity
# is >= 1, and a <Stage> tag goes to STDERR when running under GLI.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }

    $self->build_indexes_extra();

}
291
# Hook called at the end of build_indexes().  The base version does
# nothing; a subclass can override it to do extra work once the indexes
# have been built.
sub build_indexes_extra {
    my $self = shift;

}
296
# Placeholder for building one index: concrete builders are expected to
# override this.  The base version only reports on STDERR that it was
# reached and returns nothing.  $index is not used here.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
304
305
306
# Creates the collection's info database and processes associated files.
#
# Everything that belongs in the database is written as text to a handle:
# normally a pipe into the txt2db program, or STDOUT when 'debug' is set.
# The output consists of the collection metadata entry, whatever the
# buildproc emits while the plugins read the source directory, the
# classifier information, and finally a "[browselist]" entry naming every
# document.
#
# With 'keepold' set, the documents already in the existing database are
# reconstructed first and fed through the buildproc so that the
# classifiers see them as well as the new ones.
#
# Dies if the txt2db executable is missing or the pipe cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    # make sure the text and assoc directories exist inside the build dir
    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name; the extension depends on the machine's byte order
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulldbname =~ s/\//\\/g;
    }

    # Full path of txt2db.  It is only used for the existence check and the
    # error message below; the command itself is started by bare name.
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # when keeping the old build, reconstruct doc_obj metadata from the
    # existing database for all docs
    my $reconstructed_docs = undef;
    if ($self->{'keepold'}) {
        $reconstructed_docs = &classify::reconstruct_doc_objs_metadata($fulldbname);
    }

    # choose where the database text goes
    my $handle;
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = *PIPEOUT;
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from GDBM and
    # then adding in the new ones
    $buildproc->zero_reset();

    if ($self->{'keepold'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj ( @$reconstructed_docs ) {
            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $buildproc->process($doc_obj,undef);
        }
    }

    # collection metadata entry (subclasses decide what goes in it)
    $self->output_collection_meta($handle);

    # run every source document through the plugins and the buildproc
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'},0, $self->{'gli'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    #&classify::print_reverse_lookup($handle);

    # output doclist
    my @doclist = $buildproc->get_doc_list();
    my $docs = join (";", @doclist);
    my $numleafdocs = scalar(@doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . $numleafdocs . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";

    # STDOUT is left open in debug mode; only the pipe is closed
    close ($handle) unless $self->{'debug'};

    print STDERR "</Stage>\n" if $self->{'gli'};
}
405
# Assembles the build configuration and writes it to the build directory:
# build.cfg in "gs2" mode, buildConfig.xml in "gs3" mode.
#
# Starts from $self->{'build_cfg'} if a subclass has already put one
# there, then adds the build date, build type, index stem, stem indexes,
# document/section/byte counts from the buildproc, the index,
# subcollection and language maps, the list of indexes that were not
# built, and maxnumeric.  build_cfg_extra() is given a chance to add more
# before the file is written.
sub make_auxiliary_files {
    my ($self) = @_;

    # subclasses may have already defined stuff in here
    my $build_cfg = defined $self->{'build_cfg'} ? $self->{'build_cfg'} : {};

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date and the general build settings
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = $self->{'collection'};
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};

    # store the number of documents, sections and bytes
    my $buildproc = $self->{'buildproc'};
    $build_cfg->{'numdocs'} = $buildproc->get_num_docs();
    $build_cfg->{'numsections'} = $buildproc->get_num_sections();
    $build_cfg->{'numbytes'} = $buildproc->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any
    # that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        next if defined ($self->{'notbuilt'}->{$index});
        push (@indexmap, $index . '->' . $self->{'index_mapping'}->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # the subcollection and language maps are only stored when non-empty
    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap,
              $subcollection . '->' . $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    if (scalar (@subcollectionmap)) {
        $build_cfg->{'subcollectionmap'} = \@subcollectionmap;
    }

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap,
              $language . '->' . $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    if (scalar (@languagemap)) {
        $build_cfg->{'languagemap'} = \@languagemap;
    }

    # indexes skipped by want_built(), again only stored when non-empty
    my @notbuilt = keys %{$self->{'notbuilt'}};
    if (scalar (@notbuilt)) {
        $build_cfg->{'notbuilt'} = \@notbuilt;
    }

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $self->build_cfg_extra($build_cfg);

    # write the configuration in the format matching the collection's mode
    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg("$self->{'build_dir'}/build.cfg", $build_cfg);
    }
    elsif ($gs_mode eq "gs3") {
        &colcfg::write_build_cfg_xml("$self->{'build_dir'}/buildConfig.xml", $build_cfg, $self->{'collect_cfg'}, $self->{'disable_OAI'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
477
# Hook with an empty body in this base class.  It is not called from
# anywhere in this file -- presumably an extension point for
# collection-specific work; verify against callers.
sub collect_specific {
    my $self = shift;
}
481
# Decides whether $index should be built.
#
# Each entry of the collection's 'dontbuild' list is used as a regular
# expression that must match the whole index name.  On the first match the
# index is recorded in $self->{'notbuilt'} and 0 is returned.  Returns 1
# when nothing matches or when no 'dontbuild' list is configured.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $checkstr (@$dontbuild) {
        next unless $index =~ /^$checkstr$/;

        # remember the skipped index so it can be left out of the index map
        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
497
# Placeholder for computing the index-to-directory mapping: concrete
# builders are expected to override this.  The base version reports on
# STDERR that it was reached and returns a reference to an empty hash.
# $indexes is not used here.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
506
# Returns a two-character abbreviation of a field specification.
#
# - A field that is undefined or holds only whitespace gives "".
# - When the field splits on commas into two or more components, the
#   result is the first word character (\w) of each of the first two
#   components.
# - Otherwise the result is the first two word characters of the field.
# - A character that cannot be found is replaced by 'a' (first position)
#   or '0' (second position).  In the single-component case both
#   characters come from one match, so a field with fewer than two word
#   characters yields "a0".
sub process_field {
    my ($self, $field) = @_;

    return "" if !defined ($field) || $field !~ /\S/;

    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar @components < 2) {
        # pick the first two letdig chars
        ($first, $second) = $field =~ /^[^\w]*(\w)[^\w]*?(\w)/i;
    }
    else {
        # pick the first letdig from the first two field names
        ($first) = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    }

    # there may not have been any letdigs...
    $first = 'a' unless defined $first;
    $second = '0' unless defined $second;

    return $first . $second;

}
536
# Advances, in place, the numeric suffix of the name referenced by $nameref.
#
# - A name ending in two digits has those two digits incremented.
# - A name ending in a single digit has it incremented (9 becomes 10).
# - Any other name has its last character replaced by 0.
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        # The captured digits are a string, so ++ keeps their width
        # ("09" -> "10", "00" -> "01") and only widens on "99" -> "100".
        my $next = $1;
        $next++;
        $$nameref =~ s/\d\d$/$next/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $next = ($1 == 9) ? 10 : $1 + 1;
        $$nameref =~ s/\d$/$next/;
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
552
# Hook called by make_auxiliary_files() just before the build
# configuration is written.  The base version leaves $build_cfg untouched;
# a subclass can override it to add its own entries.
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;

}
559
# Writes the collection-level metadata entry to $handle.  The base version
# emits an empty "[collection]" entry followed by a separator line of 70
# dashes.
sub output_collection_meta {
    my ($self, $handle) = @_;

    my $separator = '-' x 70;
    print $handle "[collection]\n" . $separator . "\n";
}
568
# Prints a short statistics report for the most recent buildproc pass to
# the output handle: which index was created (or compressed from), the
# total bytes in the collection and the bytes processed for this index.
#
# When fewer than 50 bytes were processed although text was expected
# (an indexing pass, or a compression pass with 'no_text' unset), an extra
# note follows:
#   - with 'keepold' set, and only when exactly 0 bytes were processed,
#     a remark that no additional text was added/compressed;
#   - otherwise a boxed warning asking whether this was intended.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $headline = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $headline;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # Nothing more to report unless suspiciously little text was processed
    # on a pass where text was expected.  Beyond this point a false
    # $indexing_text therefore implies that 'no_text' is unset.
    return unless ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'}));

    if ($self->{'keepold'}) {
        # an incremental build that added nothing is normal, just say so
        return unless $num_processed_bytes == 0;

        my $remark = $indexing_text
            ? "No additional text was added to $index\n"
            : "No additional text was compressed\n";
        print $outhandle $remark;
        return;
    }

    my $warning = $indexing_text
        ? "WARNING: There is very little or no text to process for $index\n"
        : "WARNING: There is very little or no text to compress\n";
    print $outhandle "***************\n";
    print $outhandle $warning;
    print $outhandle " Was this your intention?\n";
    print $outhandle "***************\n";

}
611
612
6131;
614
Note: See TracBrowser for help on using the repository browser.