source: gsdl/trunk/perllib/basebuilder.pm@ 14212

Last change on this file since 14212 was 14212, checked in by xiao, 17 years ago

change the test version back to normal

  • Property svn:keywords set to Author Date Id Revision
File size: 19.9 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
# Turn autoflush on for STDOUT and STDERR while the module is loaded, so
# that output from mgpp does not get out of sync with output from plugins.
BEGIN {
    for my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Switch autoflush off again when the interpreter shuts down.
END {
    for my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

our $maxdocsize = 12000;

# Which flavour of collection is being built: "gs2" (the default) or "gs3".
# Set by new() according to the configuration file it finds.
my $gs_mode = "gs2";
51
# Create a builder object for a collection.
#
# Arguments (positional, after the class name):
#   $collection, $source_dir, $build_dir, $verbosity, $maxdocs, $debug,
#   $keepold, $incremental, $incremental_dlc, $remove_empty_classifications
#                - stored unchanged under hash keys of the same names
#   $outhandle   - handle for progress messages (defaults to STDERR)
#   $no_text     - defaults to 0
#   $failhandle  - handle given to the plugin loader (defaults to STDERR)
#   $gli         - defaults to 0
#   $disable_OAI - defaults to 0
#
# The collection configuration is read from etc/collect.cfg below
# $ENV{GSDLCOLLECTDIR} (build mode "gs2"); if that file does not exist,
# etc/collectionConfig.xml is read instead (build mode "gs3").  The plugins
# and classifiers named in the configuration are then loaded.
#
# Returns the blessed builder.  Dies if neither configuration file exists or
# if no plugins were loaded.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_dlc,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli, $disable_OAI) = @_;

    # A bareword STDERR evaluates to the string 'STDERR' when strict is off;
    # spelling it as a string keeps the same value and also compiles under
    # "use strict".
    $outhandle  = 'STDERR' unless defined $outhandle;
    $no_text    = 0        unless defined $no_text;
    $failhandle = 'STDERR' unless defined $failhandle;

    # create a builder object
    my $self = bless {
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_dlc'              => $incremental_dlc,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},    # indexes not built
        'gli'                          => $gli,
        'disable_OAI'                  => $disable_OAI
    }, $class;

    $self->{'gli'} = 0 unless defined $self->{'gli'};
    $self->{'disable_OAI'} = 0 unless defined $self->{'disable_OAI'};

    # read in the collection configuration file; collect.cfg takes
    # precedence over collectionConfig.xml when both are present
    my $gs2_cfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    my $gs3_cfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collectionConfig.xml";
    if (-e $gs2_cfgname) {
        $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($gs2_cfgname);
        $gs_mode = "gs2";
    }
    elsif (-e $gs3_cfgname) {
        $self->{'collect_cfg'} = &colcfg::read_collection_cfg_xml ($gs3_cfgname);
        $gs_mode = "gs3";
    }
    else {
        die "basebuilder::new - couldn't find collect.cfg or collectionConfig.xml for collection $collection\n";
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'}
        && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, $failhandle, \@global_opts, $keepold);

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields as a lookup set
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    $self->{'maxnumeric'} = 4;
    return $self;
}
143
# Second-stage initialisation.  This work lives here rather than in new() so
# that methods provided by subclasses (generate_index_list,
# generate_index_options, default_buildproc) can be used.
#
# Steps:
#   - expand the configured index list by subcollection and by language
#   - remove duplicate index names, keeping the first occurrence
#   - load the document processor: <collection>buildproc.pm from the
#     collection's perllib directory if it exists, otherwise the class
#     named by default_buildproc() from $GSDLHOME/perllib
#   - unless debugging or keeping the old build, delete the build directory
#     and recreate it together with its "text" subdirectory
#
# Dies if the document processor module cannot be loaded or constructed.
sub init {
    my $self = shift(@_);

    $self->generate_index_list();
    $self->generate_index_options();

    # sort out subcollection indexes
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes
    if (defined $self->{'collect_cfg'}->{'languages'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];

        # when no subcollections are configured an empty subcollection
        # field is added, hence the doubled colon
        my $separator = defined ($self->{'collect_cfg'}->{'indexsubcollections'}) ? ':' : '::';

        foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, $index . $separator . $language);
            }
        }
    }

    if (defined($self->{'collect_cfg'}->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        my %seen = ();
        my @unique = grep { !$seen{$_}++ } @{$self->{'collect_cfg'}->{'indexes'}};
        $self->{'collect_cfg'}->{'indexes'} = \@unique;
    } else {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc of the builder subclass
    my ($buildprocdir, $buildproctype);
    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = $self->default_buildproc();
    }
    require "$buildprocdir/$buildproctype.pm";

    # A class-name string can be used directly as a method invocant, so no
    # string eval is needed; a failing constructor simply propagates its
    # exception to the caller.
    $self->{'buildproc'} = $buildproctype->new(
        $self->{'collection'},
        $self->{'source_dir'},
        $self->{'build_dir'},
        $self->{'keepold'},
        $self->{'verbosity'},
        $self->{'outhandle'});

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

}
222
# Hand the loaded plugins and the document processor to plugin::deinit.
sub deinit {
    my ($self) = @_;

    my $pluginfo  = $self->{'pluginfo'};
    my $buildproc = $self->{'buildproc'};

    &plugin::deinit($pluginfo, $buildproc);
}
228
# Forward the given setting to the document processor's method of the same
# name and return whatever that call returns.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    return $self->{'buildproc'}->set_sections_index_document_metadata($index);
}
235
# Store $maxnumeric on the builder (new() initialises this setting to 4).
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    return $self->{'maxnumeric'} = $maxnumeric;
}
# This setter appears to be unused; it is kept here, commented out, for reference.
#sub set_disable_OAI {
#    my $self = shift (@_);
#    my ($disable_OAI) = @_;
#
#    $self->{'disable_OAI'} = $disable_OAI;
#}
# Record the strip_html setting on the builder and pass it on to the
# document processor; returns whatever the document processor returns.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    return $self->{'buildproc'}->set_strip_html($strip);
}
256
# Placeholder: subclasses are expected to override this.  The base version
# only reports on STDERR that it was reached and returns nothing.
sub compress_text {
    my ($self, $textindex) = @_;

    print STDERR "compress_text() should be implemented in subclass!!";
    return;
}
264
265
# Build the indexes of the collection.
#
# If $indexname contains at least one word character only that index is
# considered; otherwise every index listed in the collection configuration
# is.  The index mapping is (re)created for the chosen list, each index that
# want_built() accepts is passed to build_index(), and finally
# build_indexes_extra() is called.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n";
        }
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }

    $self->build_indexes_extra();

}
299
# Called at the end of build_indexes().  The base version does nothing;
# a subclass can override it to add further steps.
sub build_indexes_extra {
    my $self = shift;

}
304
# Placeholder: subclasses are expected to override this.  The base version
# only reports on STDERR that it was reached and returns nothing.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
312
313
314
# Create the collection's info database and process associated files.
#
# The "text" and "assoc" directories are created below the build directory.
# Unless the builder is in debug mode, the database records are piped into
# the txt2db program, which writes <collection>.ldb (little-endian
# machines) or <collection>.bdb in the text directory; in debug mode the
# records are printed to STDOUT instead.
#
# With 'keepold' set, document metadata is first reconstructed from the
# existing database and fed through the document processor before the
# source directory is read.
#
# Dies if txt2db is not found or cannot be started.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    my $reconstructed_docs = undef;
    if ($self->{'keepold'}) {
        # reconstruct doc_obj metadata from gdbm for all docs
        $reconstructed_docs = &classify::reconstruct_doc_objs_metadata($fulldbname);
    }

    # set up the document processor
    my $handle;
    if ($self->{'debug'}) {
        # same value the bareword STDOUT yields when strict is off
        $handle = 'STDOUT';
    } else {
        # Existence is checked on the full path, but the program is started
        # by its bare name and so must also be reachable through PATH.
        # A lexical handle with three-argument open replaces the former
        # package-global bareword handle.
        my $pipe;
        if (!-e $txt2db_exe || !open ($pipe, '|-', "txt2db$exe \"$fulldbname\"")) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = $pipe;
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('infodb');
    $self->{'buildproc'}->set_assocdir ($assocdir);
    $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_store_text(1);

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from GDBM and
    # then adding in the new ones
    $self->{'buildproc'}->zero_reset();

    if ($self->{'keepold'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj ( @$reconstructed_docs ) {
            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $self->{'buildproc'}->process($doc_obj,undef);
        }
    }

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($handle);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    #&classify::print_reverse_lookup($handle);

    # output doclist
    my @doclist = $self->{'buildproc'}->get_doc_list();
    my $docs = join (";", @doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . scalar(@doclist) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";

    if (!$self->{'debug'}) {
        # Closing a pipe waits for the child process and reports failure if
        # it exited with a non-zero status, so do not ignore the result.
        if (!close ($handle)) {
            print $outhandle "WARNING: txt2db did not finish cleanly (wait status $?)\n";
        }
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
413
# Assemble the build configuration and write it into the build directory.
#
# Starts from $self->{'build_cfg'} when a subclass has already put settings
# there, adds the build date, build type, index stem, document statistics,
# the OAI flag, the index / subcollection / language maps, the list of
# indexes that were not built and maxnumeric, then gives build_cfg_extra()
# a chance to add more.  The result is written as build.cfg in "gs2" mode
# and as buildConfig.xml in "gs3" mode.
sub make_auxiliary_files {
    my ($self) = @_;

    # subclasses may have already defined stuff in here
    my $build_cfg = defined $self->{'build_cfg'} ? $self->{'build_cfg'} : {};

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = $self->{'collection'};
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store whether to disable OAI service
    $build_cfg->{'disable_OAI'} = $self->{'disable_OAI'};

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any
    # that are not built should not be put into the map.
    my @index_entries = ();
    foreach my $name (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        next if defined ($self->{'notbuilt'}->{$name});
        push (@index_entries, "$name\-\>$self->{'index_mapping'}->{'indexmap'}->{$name}");
    }
    $build_cfg->{'indexmap'} = \@index_entries;

    my @subcollection_entries = ();
    foreach my $name (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollection_entries,
              "$name\-\>" . $self->{'index_mapping'}->{'subcollectionmap'}->{$name});
    }
    if (scalar (@subcollection_entries)) {
        $build_cfg->{'subcollectionmap'} = \@subcollection_entries;
    }

    my @language_entries = ();
    foreach my $name (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@language_entries,
              "$name\-\>" . $self->{'index_mapping'}->{'languagemap'}->{$name});
    }
    if (scalar (@language_entries)) {
        $build_cfg->{'languagemap'} = \@language_entries;
    }

    my @skipped = keys %{$self->{'notbuilt'}};
    if (scalar (@skipped)) {
        $build_cfg->{'notbuilt'} = \@skipped;
    }

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg("$self->{'build_dir'}/build.cfg", $build_cfg);
    }
    elsif ($gs_mode eq "gs3") {
        &colcfg::write_build_cfg_xml("$self->{'build_dir'}/buildConfig.xml", $build_cfg, $self->{'collect_cfg'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
490
# Does nothing in the base class; a subclass can override it.
sub collect_specific {
    my $self = shift;
}
494
# Decide whether $index should be built.
#
# Each entry of the collection's 'dontbuild' setting is used as a pattern
# that must match the whole index name.  On the first match the index is
# recorded in $self->{'notbuilt'} and 0 is returned; otherwise 1.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
510
# Placeholder: subclasses are expected to override this.  The base version
# reports on STDERR that it was reached and returns a reference to an
# empty hash.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
519
# Returns a two-character abbreviation of a field specification.
# - an undefined or all-whitespace field gives the empty string
# - if the field has two or more comma-separated components, the result is
#   the first letdig (\w) character of each of the first two components
# - otherwise it is the first two letdig characters of the field
# Where a character cannot be found, 'a' stands in for the first and '0'
# for the second.  Note that a single-component field containing fewer than
# two letdig characters therefore always gives "a0".
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\S/);

    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar @components < 2) {
        # pick the first two letdig chars
        ($first, $second) = $field =~ /^[^\w]*(\w)[^\w]*?(\w)/i;
    } else {
        # pick the first letdig from the first two field names
        ($first)  = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    }

    # there may not have been any letdigs...
    $first  = 'a' unless defined $first;
    $second = '0' unless defined $second;

    return $first . $second;

}
549
# Advance the version suffix of the name that $nameref refers to, in place:
# - a name ending in two digits has those two digits incremented
# - a name ending in a single digit has it incremented ("9" becomes "10")
# - any other name has its last character replaced by "0"
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        # string increment, so a leading zero survives ("05" -> "06")
        my $version = $1;
        $version++;
        $$nameref =~ s/\d\d$/$version/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $digit = $1;
        my $next = ($digit == 9) ? 10 : $digit + 1;
        $$nameref =~ s/\d$/$next/;
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
565
# Called by make_auxiliary_files() with the build configuration hash just
# before it is written out.  Override in a subclass to add extra entries to
# build.cfg; the base version adds nothing.
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;

}
572
# Write the collection-level record to $handle.  By default this is an
# empty [collection] entry followed by a separator line of 70 dashes.
sub output_collection_meta {
    my ($self, $handle) = @_;

    my $rule = '-' x 70;
    print $handle "[collection]\n" . $rule . "\n";

}
581
# Print a short statistics report for the pass the document processor has
# just completed (indexing or text compression) to the builder's output
# handle.
#
# When fewer than 50 bytes were processed, and text was actually meant to be
# processed, an extra note is printed: for a 'keepold' build only when
# nothing at all was added, otherwise as a prominent warning.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {

        # inside this branch, "not indexing" implies that text was wanted,
        # so a two-way choice covers both messages
        if ($self->{'keepold'}) {
            if ($num_processed_bytes == 0) {
                my $note = $indexing_text
                    ? "No additional text was added to $index\n"
                    : "No additional text was compressed\n";
                print $outhandle $note;
            }
        }
        else {
            my $warning = $indexing_text
                ? "WARNING: There is very little or no text to process for $index\n"
                : "WARNING: There is very little or no text to compress\n";
            print $outhandle "***************\n";
            print $outhandle $warning;
            print $outhandle " Was this your intention?\n";
            print $outhandle "***************\n";
        }

    }

}
624
625
6261;
627
Note: See TracBrowser for help on using the repository browser.