source: trunk/gsdl/perllib/basebuilder.pm@ 11334

Last change on this file since 11334 was 11296, checked in by kjdon, 18 years ago

now only use a-z1-9 chars in the index names (\w) in case the names have unicode chars in them

  • Property svn:keywords set to Author Date Id Revision
File size: 19.1 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
# Turn autoflush on for STDOUT and STDERR while the module is loaded, so
# that mgpp doesn't get out of sync with plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Switch autoflush back off for both streams when the program ends.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

# NOTE(review): not read anywhere in this file; presumably consulted by
# subclasses or build processors -- confirm before changing.
our $maxdocsize = 12000;
48
# Constructor.
#
# Stores the build settings on a new object, reads
# $GSDLCOLLECTDIR/etc/collect.cfg, then loads the plugins and classifiers
# that the configuration names and records the 'dontgdbm' fields.
#
# Defaults: $outhandle and $failhandle fall back to STDERR, $no_text and
# $gli fall back to 0.
#
# Dies if collect.cfg does not exist or if no plugins were loaded.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    if (!defined $outhandle)  { $outhandle  = STDERR; }
    if (!defined $no_text)    { $no_text    = 0; }
    if (!defined $failhandle) { $failhandle = STDERR; }

    # create a builder object
    my %fields = (
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},    # indexes not built
        'gli'                          => (defined $gli ? $gli : 0),
    );
    my $self = bless \%fields, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n"
        unless -e $colcfgname;
    $self->{'collect_cfg'} = colcfg::read_collect_cfg($colcfgname);

    # the plugins listed for this collection (none if the option is absent)
    my $plugins = defined $self->{'collect_cfg'}->{'plugin'}
        ? $self->{'collect_cfg'}->{'plugin'}
        : [];

    # extra options handed to every plugin
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'}
        && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins; a collection without plugins cannot be built
    $self->{'pluginfo'} = plugin::load_plugins($plugins, $verbosity, $outhandle,
                                               $failhandle, \@global_opts, $keepold);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # the classifiers listed for this collection (none if the option is absent)
    my $classifiers = defined $self->{'collect_cfg'}->{'classify'}
        ? $self->{'collect_cfg'}->{'classify'}
        : [];

    # load all the classifiers
    $self->{'classifiers'} = classify::load_classifiers($classifiers, $build_dir, $outhandle);

    # turn the dontgdbm list into a lookup hash
    $self->{'dontgdbm'} = {};
    if (defined $self->{'collect_cfg'}->{'dontgdbm'}) {
        $self->{'dontgdbm'}->{$_} = 1
            foreach @{$self->{'collect_cfg'}->{'dontgdbm'}};
    }

    return $self;
}
122
# Second-stage initialisation. This was moved out of new() so that methods
# supplied by a subclass (generate_index_list, default_buildproc) can be
# called on the blessed object.
#
# Steps:
#   1. ask the subclass to generate the index list;
#   2. expand the configured indexes by subcollection and by language;
#   3. remove duplicate index specifications (first occurrence wins);
#   4. load and construct the document processor ("buildproc");
#   5. unless debugging or keeping the old build, wipe and recreate the
#      build directory together with its text subdirectory.
#
# Dies if the buildproc module cannot be loaded or its constructor fails.
sub init {
    my $self = shift(@_);

    $self->generate_index_list();

    # sort out subcollection indexes
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes
    if (defined $self->{'collect_cfg'}->{'languages'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
            foreach my $index (@$indexes) {
                if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
                }
                else { # add in an empty subcollection field
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
                }
            }
        }
    }

    if (defined($self->{'collect_cfg'}->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        my %seen = ();
        my @specified = @{$self->{'collect_cfg'}->{'indexes'}};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $i (@specified) {
            if (!defined ($seen{$i})) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, $i);
                $seen{$i} = 1;
            }
        }
    } else {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc named by the subclass
    my ($buildprocdir, $buildproctype);
    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = $self->default_buildproc();
    }
    require "$buildprocdir/$buildproctype.pm";

    # The constructor arguments must come from the object: the lexicals
    # $source_dir, $build_dir, $keepold, $verbosity and $outhandle exist only
    # in new(), so naming them here would pass undef for every one of them.
    eval {
        $self->{'buildproc'} = $buildproctype->new(
            $collection,
            $self->{'source_dir'},
            $self->{'build_dir'},
            $self->{'keepold'},
            $self->{'verbosity'},
            $self->{'outhandle'});
    };
    die "$@" if $@;

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

}
200
# Hand the loaded plugins and the document processor to plugin::deinit.
# Returns whatever plugin::deinit returns.
sub deinit {
    my ($self) = @_;

    return plugin::deinit($self->{'pluginfo'}, $self->{'buildproc'});
}
206
# Forward the given setting to the document processor's method of the
# same name and return its result.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    my $buildproc = $self->{'buildproc'};
    return $buildproc->set_sections_index_document_metadata($index);
}
213
# Remember the strip_html setting on this builder and pass it on to the
# document processor. Returns the document processor's result.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;

    my $buildproc = $self->{'buildproc'};
    return $buildproc->set_strip_html($strip);
}
221
# Placeholder: text compression must be provided by a subclass.
#
# This base version only reports on STDERR that it is not implemented and
# returns nothing. The message ends with a newline, like the other
# "should be implemented in subclass" messages in this file, so that
# whatever is printed next starts on a fresh line.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    print STDERR "compress_text() should be implemented in subclass!!\n";
    return;
}
229
230
# Build the indexes of the collection.
#
# $indexname - optional; when it contains at least one word character only
#              that index is built, otherwise every index listed in the
#              collection configuration is built.
#
# The index mapping is (re)created first and stored on the object. Indexes
# that want_built() rejects are skipped. build_indexes_extra() is called at
# the end and its result is returned.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            if ($self->{'verbosity'} >= 1) {
                print $outhandle "\n*** ignoring index $index\n";
            }
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory "
                . "$self->{'index_mapping'}->{$index}\n";
        }
        if ($self->{'gli'}) {
            print STDERR "<Stage name='Index' source='$index'>\n";
        }
        $self->build_index($index);
    }

    return $self->build_indexes_extra();
}
262
# Hook called at the end of build_indexes(). The base class does nothing
# here.
sub build_indexes_extra {
    my $self = $_[0];
}
267
# Placeholder: building a single index must be provided by a subclass.
# This base version only reports on STDERR that it is not implemented and
# returns nothing.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
275
# Create the collection's info database and process associated files.
#
# Everything the document processor and the classifiers emit is written to
# one handle: STDOUT in debug mode, otherwise a pipe into the txt2db
# program, which is given the database file name inside the build's text
# directory. After the collection metadata, the documents and the
# classification information, a final [browselist] record listing every
# document is written.
#
# Dies if the txt2db executable is missing or the pipe cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};

    # make sure the text and assoc directories exist
    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name; the extension depends on the machine's byte order
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulldbname =~ s/\//\\/g;
    }

    # full path of the txt2db program, used for the existence check below
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # choose where the database records go
    my $handle;
    if ($self->{'debug'}) {
        $handle = STDOUT;
    }
    else {
        # the program is started by its bare name, so it has to be on the PATH
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = basebuilder::PIPEOUT;
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);
    # make_infodatabase does not support incremental build
    # => full reset needed
    $buildproc->zero_reset();

    # do we need this anymore if we get coll meta from config file??
    $self->output_collection_meta($handle);

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    # output doclist
    my @doclist = $self->{'buildproc'}->get_doc_list();
    my $docs = join (";", @doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . scalar(@doclist) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";

    # only the pipe is closed; STDOUT stays open in debug mode
    close ($handle) if !$self->{'debug'};

    print STDERR "</Stage>\n" if $self->{'gli'};
}
352
# Collect the build information and have it written out through
# write_cfg_file().
#
# Recorded: build date, build type, index stem (the collection name), the
# document and byte counts reported by the document processor, the index /
# subcollection / language maps, the indexes that were not built, and
# maxnumeric (4 unless the collection configuration gives a number).
# build_cfg_extra() is called just before writing so that a subclass can add
# more entries.
sub make_auxiliary_files {
    my ($self) = @_;

    # subclasses may have already defined stuff in here
    my $build_cfg = defined $self->{'build_cfg'} ? $self->{'build_cfg'} : {};

    my $outhandle = $self->{'outhandle'};

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating auxiliary files \n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreatingAuxilary'>\n";
    }

    # make sure the build directory exists
    util::mk_all_dir($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = $self->{'collection'};
    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any
    # that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $name (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        next if defined $self->{'notbuilt'}->{$name};
        push @indexmap, $name . "->" . $self->{'index_mapping'}->{'indexmap'}->{$name};
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # subcollection map, stored only when there is at least one entry
    my @subcollectionmap = ();
    foreach my $name (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push @subcollectionmap,
            $name . "->" . $self->{'index_mapping'}->{'subcollectionmap'}->{$name};
    }
    if (scalar(@subcollectionmap)) {
        $build_cfg->{'subcollectionmap'} = \@subcollectionmap;
    }

    # language map, stored only when there is at least one entry
    my @languagemap = ();
    foreach my $name (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push @languagemap,
            $name . "->" . $self->{'index_mapping'}->{'languagemap'}->{$name};
    }
    if (scalar(@languagemap)) {
        $build_cfg->{'languagemap'} = \@languagemap;
    }

    # indexes that were skipped, stored only when there is at least one
    my @notbuilt = keys %{$self->{'notbuilt'}};
    if (scalar(@notbuilt)) {
        $build_cfg->{'notbuilt'} = \@notbuilt;
    }

    # maxnumeric: default 4, overridden by an all-digit configured value
    my $maxnumeric = $self->{'collect_cfg'}->{'maxnumeric'};
    $build_cfg->{'maxnumeric'} = 4;
    if (defined($maxnumeric) && $maxnumeric =~ /^\d+$/) {
        $build_cfg->{'maxnumeric'} = $self->{'collect_cfg'}->{'maxnumeric'};
    }

    $self->build_cfg_extra($build_cfg);

    $self->write_cfg_file($build_cfg);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
420
# Empty hook; the base class does nothing here.
sub collect_specific {
    my $self = $_[0];
}
424
# Decide whether an index should be built.
#
# Each entry of the 'dontbuild' configuration option is used as a regular
# expression that has to match the whole index name. On the first match the
# index is recorded in $self->{'notbuilt'} and 0 is returned; otherwise 1
# is returned.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 if !defined($dontbuild);

    foreach my $checkstr (@$dontbuild) {
        next unless $index =~ /^$checkstr$/;

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
440
# Placeholder: the mapping from index descriptions to directory names must
# be provided by a subclass. This base version reports on STDERR that it is
# not implemented and returns a reference to an empty hash.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
449
# Returns a two-character processed version of a field specification.
#
# - If the field has two or more comma-separated components, the result is
#   the first letter/digit (\w) character of each of the first two
#   components.
# - Otherwise the result is the first two \w characters of the field.
#
# A character that cannot be found is replaced by a filler: 'a' for the
# first position, '0' for the second. The two positions are looked up
# independently, so a field with a single \w character keeps it
# ("x" gives "x0").
#
# An undefined or all-whitespace field gives the empty string.
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\S/);

    # not named $a/$b: those are the package variables used by sort
    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # pick the first letdig from the first two field names
        ($first)  = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    } else {
        # pick the first two letdig chars, each with its own match so that
        # a lone letdig is not thrown away when there is no second one
        ($first)  = $field =~ /^[^\w]*(\w)/;
        ($second) = $field =~ /^[^\w]*\w[^\w]*(\w)/;
    }
    # there may not have been any letdigs...
    $first  = 'a' unless defined $first;
    $second = '0' unless defined $second;

    return "$first$second";
}
479
# Advance the name behind $nameref to its next version, in place.
#
# - a name ending in two digits has those digits incremented
#   (string increment, so "09" becomes "10");
# - a name ending in one digit has it incremented, 9 becoming 10;
# - any other name has its last character replaced by 0.
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $next = $1;
        $next++;
        $$nameref =~ s/\d\d$/$next/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $next = ($1 == 9) ? 10 : $1 + 1;
        $$nameref =~ s/\d$/$next/;
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
495
# Hook called by make_auxiliary_files() just before build.cfg is written.
# Implement it in a subclass to add extra entries to $build_cfg; the base
# class adds nothing.
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;
}
502
# Write the build information to build.cfg in the build directory by
# calling cfgread::write_cfg_file with two key patterns.
# NOTE(review): presumably the first pattern names the single-valued keys
# and the second the list-valued keys -- confirm against cfgread.
sub write_cfg_file {
    my ($self, $build_cfg) = @_;

    my $cfgfile = "$self->{'build_dir'}/build.cfg";
    my $first_pattern  = '^(builddate|buildtype|numdocs|numbytes|numwords|numsections|maxnumeric|indexstem)$';
    my $second_pattern = '^(indexmap|subcollectionmap|languagemap|notbuilt)$';

    # write out the build information
    cfgread::write_cfg_file($cfgfile, $build_cfg, $first_pattern, $second_pattern);
}
513
514
# Write the [collection] record, built from the 'collectionmeta' option of
# the collection configuration, to $handle. Nothing is written when the
# option is absent.
#
# For every metadata name one "<name>value" line carries the default value,
# followed by one "<name:lang>value" line per "[l=lang]" entry. A name that
# starts with '.' refers to an index: it is replaced by that index's entry
# in the index mapping, or skipped with a warning on the builder's output
# handle if the mapping has no such entry.
#
# When no 'default' entry exists, the English value is used as the default
# if there is one, otherwise the first entry.
#
# Names and languages are visited in sorted order. Perl's hash order is not
# stable between runs, so without sorting both the order of the lines and
# the choice of "first entry" would vary from build to build.
sub output_collection_meta {
    my $self = shift(@_);
    my ($handle) = @_;
    my $outhandle = $self->{'outhandle'};

    my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};
    return unless defined $collectionmeta;

    # the index mapping is needed to translate the '.index' names
    if (!defined $self->{'index_mapping'}) {
        $self->{'index_mapping'} =
            $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
    }

    print $handle "[collection]\n";

    foreach my $cmeta (sort keys %$collectionmeta) {
        # name written to the database: the key itself, or for '.index'
        # keys the mapped name of that index
        my $cmetamap = $cmeta;
        if ($cmeta =~ /^\./) {
            (my $indexname = $cmeta) =~ s/^\.//;
            if (!defined $self->{'index_mapping'}->{$indexname}) {
                print $outhandle "mgbuilder: warning bad collectionmeta option '$indexname' - ignored\n";
                next; #ignore this one
            }
            $cmetamap = $self->{'index_mapping'}->{$indexname};
        }

        my $values = $collectionmeta->{$cmeta};
        my $defaultfound = 0;
        my $first = 1;
        my $metadata_entry = "";
        my $default = "";

        #iterate through the languages
        foreach my $lang (sort keys %$values) {
            if ($first) {
                $first = 0;
                #set the default default to the first entry
                $default = $values->{$lang};
            }
            if ($lang =~ /default/) {
                $defaultfound = 1;
                #the default entry goes first
                $metadata_entry = "<$cmetamap>" . $values->{'default'} . "\n" . $metadata_entry;
            }
            else {
                my ($langcode) = $lang =~ /^\[l=(\w*)\]$/;
                if ($langcode) {
                    $metadata_entry .= "<${cmetamap}:$langcode>" . $values->{$lang} . "\n";

                    # Use the English value as the default if no default is specified
                    if ($langcode =~ /en/i) {
                        $default = $values->{$lang};
                    }
                }
            }
        }
        #if we haven't found a default, put one in
        if (!$defaultfound) {
            $metadata_entry = "<$cmetamap>$default\n" . $metadata_entry;
        }
        #write the entry to the file
        print $handle $metadata_entry;
    }

    print $handle "\n" . ('-' x 70) . "\n";
    return;
}
586
# Print the statistics of the pass that just finished to the builder's
# output handle: what was being done, the total number of bytes in the
# collection and the number of bytes processed for the current index.
#
# When fewer than 50 bytes were processed (and the pass was indexing, or
# text is being stored) an additional note is printed: in keepold mode a
# remark that nothing was added, if the count is exactly 0; otherwise a
# warning that there was little or no text.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # nothing more to say when enough text was processed, or when this was
    # a text pass of a build that stores no text
    return if $num_processed_bytes >= 50;
    return unless $indexing_text || !$self->{'no_text'};

    if ($self->{'keepold'}) {
        # an incremental build with nothing new gets a remark, not a warning
        return unless $num_processed_bytes == 0;

        if ($indexing_text) {
            print $outhandle "No additional text was added to $index\n";
        }
        elsif (!$self->{'no_text'}) {
            print $outhandle "No additional text was compressed\n";
        }
        return;
    }

    print $outhandle "***************\n";
    if ($indexing_text) {
        print $outhandle "WARNING: There is very little or no text to process for $index\n";
    }
    elsif (!$self->{'no_text'}) {
        print $outhandle "WARNING: There is very little or no text to compress\n";
    }
    print $outhandle " Was this your intention?\n";
    print $outhandle "***************\n";
    return;
}
629
630
6311;
632
Note: See TracBrowser for help on using the repository browser.