source: trunk/gsdl/perllib/mgppbuilder.pm@ 6544

Last change on this file since 6544 was 6544, checked in by kjdon, 20 years ago

The bit where you put language items into the indexexparr was comparing them unnecessarily to the entries in collect.cfg. I don't know why, but it wasn't working, so I deleted that bit. Also added the lang stuff to the gdbm db, so language subcollections now work with mgpp.

  • Property svn:keywords set to Author Date Id Revision
File size: 40.7 KB
Line 
###########################################################################
#
# mgppbuilder.pm -- MGBuilder object
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package mgppbuilder;

use classify;
use cfgread;
use colcfg;
use plugin;
use util;
use FileHandle;


BEGIN {
    # Turn autoflush on for STDOUT and STDERR so that output written by
    # mgpp does not get out of sync with output written by the plugins.
    STDOUT->autoflush(1);
    STDERR->autoflush(1);
}

END {
    # Switch autoflush back off when the interpreter exits.
    STDOUT->autoflush(0);
    STDERR->autoflush(0);
}

# Maximum document size setting.
# NOTE(review): not referenced by any code visible in this part of the
# file; a comment in compress_text mentions "-b $maxdocsize" -- confirm
# whether it is still used.
$maxdocsize = 12000;

# Two lookups share this table:
#   * level name as written in collect.cfg (lower-cased) => level tag
#     handed to mgpp_passes via -J / -K
#   * level tag => default display macro written to the info database
%level_map = ('document'  => 'Doc',
              'section'   => 'Sec',
              'paragraph' => 'Para',
              'Doc'       => '_textdocument_',
              'Sec'       => '_textsection_',
              'Para'      => '_textparagraph_');

#$doc_level = "Doc";
#$sec_level = "Sec";
#$para_level = "Para";

# File suffixes that build_index treats as wanted when it scans the
# index directory after building; anything else is reported as unwanted.
%wanted_index_files = ('td'  => 1,
                       't'   => 1,
                       'tl'  => 1,
                       'ti'  => 1,
                       'idb' => 1,
                       'ib1' => 1,
                       'ib2' => 1,
                       'ib3' => 1,
                       'i'   => 1,
                       'il'  => 1,
                       'w'   => 1,
                       'wa'  => 1);

# Mapping from long index field names to their short field codes.
# Every short code also appears as a key with the value 1 (so the code is
# marked as taken), as do the query operators AND, OR, NOT and NEAR and
# the level tags Doc, Sec and Para, none of which can be used as field
# names.
# TODO: change this so that users can add their own entries via a file
# or the collection configuration.
%static_indexfield_map = ('Title'        => 'TI',
                          'TI'           => 1,
                          'Subject'      => 'SU',
                          'SU'           => 1,
                          'Creator'      => 'CR',
                          'CR'           => 1,
                          'Organization' => 'ORG',
                          'ORG'          => 1,
                          'Source'       => 'SO',
                          'SO'           => 1,
                          'Howto'        => 'HT',
                          'HT'           => 1,
                          'ItemTitle'    => 'IT',
                          'IT'           => 1,
                          'ProgNumber'   => 'PN',
                          'PN'           => 1,
                          'People'       => 'PE',
                          'PE'           => 1,
                          'Coverage'     => 'CO',
                          'CO'           => 1,
                          'allfields'    => 'ZZ',
                          'ZZ'           => 1,
                          'text'         => 'TX',
                          'TX'           => 1,
                          'AND'          => 1,
                          'OR'           => 1,
                          'NOT'          => 1,
                          'NEAR'         => 1,
                          'Doc'          => 1,
                          'Sec'          => 1,
                          'Para'         => 1);

# Constructor.
#
# Arguments (positional): collection name, source (archives) directory,
# build directory, verbosity, maxdocs, debug flag, keepold flag,
# allclassifications flag, output handle (defaults to STDERR), no_text
# flag (defaults to 0) and gli flag (defaults to 0).
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, works out the list of indexes
# (including subcollection and language partitions) and the indexing
# levels, loads the plugins and classifiers, and creates the build
# document processor.  Dies if collect.cfg is missing, if neither the
# document nor the section level is configured, or if no plugins load.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text, $gli) = @_;

    # The handle is kept as a *name* (a plain string): build_index compares
    # it against the string "STDERR", and printing through it relies on
    # symbolic filehandle references.
    $outhandle = 'STDERR' unless defined $outhandle;
    $no_text = 0 unless defined $no_text;

    # create an mgppbuilder object
    my $self = bless {'collection'         => $collection,
                      'source_dir'         => $source_dir,
                      'build_dir'          => $build_dir,
                      'verbosity'          => $verbosity,
                      'maxdocs'            => $maxdocs,
                      'debug'              => $debug,
                      'keepold'            => $keepold,
                      'allclassifications' => $allclassifications,
                      'outhandle'          => $outhandle,
                      'no_text'            => $no_text,
                      'notbuilt'           => {},    # indexes not built
                      'indexfieldmap'      => \%static_indexfield_map,
                      'gli'                => $gli
                      }, $class;

    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
    my $cfg = $self->{'collect_cfg'};

    # sort out the indexes: they are specified separated by spaces, but
    # mgpp puts them all into one physical index, so join them with commas
    my $configured_indexes = $cfg->{'indexes'} || [];
    $cfg->{'indexes'} = [ join(',', @$configured_indexes) ];

    # sort out subcollection indexes: one index per subcollection
    if (defined $cfg->{'indexsubcollections'}) {
        my $indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$cfg->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes: one index per language
    if (defined $cfg->{'languages'}) {
        my $indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        foreach my $language (@{$cfg->{'languages'}}) {
            foreach my $index (@$indexes) {
                if (defined ($cfg->{'indexsubcollections'})) {
                    push (@{$cfg->{'indexes'}}, "$index:$language");
                } else {
                    # add in an empty subcollection field
                    push (@{$cfg->{'indexes'}}, $index . '::' . $language);
                }
            }
        }
    }

    # make sure that the same index isn't specified more than once
    my %seen = ();
    my @all_indexes = @{$cfg->{'indexes'}};
    $cfg->{'indexes'} = [];
    foreach my $i (@all_indexes) {
        if (!defined ($seen{$i})) {
            push (@{$cfg->{'indexes'}}, $i);
            $seen{$i} = 1;
        }
    }

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    $self->{'levelorder'} = [];
    if (defined $cfg->{'levels'}) {
        # $level aliases the configuration entry, so the level names stored
        # in collect_cfg are lower-cased in place as well
        foreach my $level (@{$cfg->{'levels'}}) {
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    } else {
        # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    # the document level is "document" if configured, otherwise "section"
    $self->{'doc_level'} = "document";
    if (! $self->{'levels'}->{'document'}) {
        if ($self->{'levels'}->{'section'}) {
            $self->{'doc_level'} = "section";
        } else {
            die "you must have either document or section level specified!!\n";
        }
    }
    print $outhandle "doclevel = ". $self->{'doc_level'}."\n";

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $cfg->{'plugin'}) {
        $plugins = $cfg->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $cfg->{'classify'}) {
        $classifiers = $cfg->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $dg (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mgpp buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgppbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # the class name is only known at run time; a method call on the name
    # does the same job as a string eval and lets any error propagate
    $self->{'buildproc'} = $buildproctype->new($collection, $source_dir,
                                               $build_dir, $verbosity,
                                               $outhandle);

    return $self;
}

# Prepares the build directory.  Unless the builder is in debug or
# keepold mode, any previous build is removed and the build directory
# and its "text" subdirectory are created afresh.
sub init {
    my $self = shift (@_);

    return if $self->{'debug'} || $self->{'keepold'};

    # remove any old builds
    &util::rm_r($self->{'build_dir'});
    &util::mk_all_dir($self->{'build_dir'});

    # make the text directory
    my $textdir = "$self->{'build_dir'}/text";
    &util::mk_all_dir($textdir);
}

# Records the strip_html setting on the builder and passes it on to the
# build document processor.
sub set_strip_html {
    my $self = shift (@_);
    my ($strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html($strip);
}

# Creates the compressed text for the collection in <build_dir>/text.
#
# Three steps: the documents are piped through "mgpp_passes -T1" to
# collect text statistics, mgpp_compression_dict builds the compression
# dictionary, and the documents are piped through "mgpp_passes -T2" to
# compress the text.  In debug mode no external program is run and the
# text is written to STDOUT instead.
#
# $textindex is handed to the build document processor via set_index.
# Dies if a required executable is missing or a pipe cannot be opened.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them
    my $doc_level = $self->{'doc_level'};
    my $mgpp_passes_sections = "-J " . $level_map{$doc_level} . " ";
    foreach my $level (keys %{$self->{'levels'}}) {
        if ($level ne $doc_level && $level ne "paragraph") {
            $mgpp_passes_sections .= "-K " . $level_map{$level} . " ";
        }
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    print $outhandle "\n    collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    # $handle holds the *name* of the filehandle the document processor
    # writes to: STDOUT when debugging, otherwise the pipe to mgpp_passes
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text ($self->{'no_text'} ? 0 : 1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $buildproc, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # closing the pipe waits for the first mgpp_passes run to finish
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n    creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # reopen the pipe for the second (compression) pass
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    } else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $buildproc->reset();

    # compress the text
    print $outhandle "\n    compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}

# Decides whether an index should be built.
#
# Each entry of the "dontbuild" list in collect.cfg is used as a regular
# expression that must match the whole index name.  On a match the index
# is recorded in $self->{'notbuilt'} and 0 is returned; otherwise 1.
sub want_built {
    my $self = shift (@_);
    my ($index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined $dontbuild;

    foreach my $checkstr (@$dontbuild) {
        if ($index =~ /^$checkstr$/) {
            $self->{'notbuilt'}->{$index} = 1;
            return 0;
        }
    }

    return 1;
}

# Builds the indexes for the collection.
#
# If $indexname contains a word character only that index is built;
# otherwise every index listed in the collection configuration is built.
# Indexes rejected by want_built are skipped.  Afterwards the final field
# list is produced with make_final_field_list.
sub build_indexes {
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes;
    if (defined $indexname && $indexname =~ /\w/) {
        $indexes = [ $indexname ];
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }
        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }

    # define the final field lists
    $self->make_final_field_list();
}

# Creates directory names for each of the index descriptions.
#
# $indexes is a reference to a list of index descriptions of the form
# "fields", "fields:subcollection" or "fields:subcollection:language".
#
# Returns a reference to a hash containing:
#   <index description>      => directory name
#   <fields>                 => processed index name (always 'idx')
#   <subcollection>          => processed subcollection name
#   <language>               => processed language name
#   'indexmap', 'subcollectionmap', 'languagemap'
#                            => hashes holding the same three mappings
#   'indexmaporder', 'subcollectionmaporder', 'languagemaporder'
#                            => lists giving the order of first appearance
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();

    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'  => 'text',
                    'extra' => 'extra');

    # For each kind of name, remembers which original name a processed name
    # stands for.  Each kind needs its own hash: sharing one table lets a
    # subcollection and a language with the same processed name overwrite
    # each other, which confuses make_unique.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # now we only ever have one index, and its called 'idx'
        my $pindex = 'idx';

        # next comes a processed version of the subcollection if there is one.
        my $psub = lc ($self->process_field ($subcollection));

        # next comes a processed version of the language if there is one.
        my $plang = lc ($self->process_field ($languages));

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg text:subcol:lang) is not used on
        # the query page) - these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            if (!defined $mapping{"$fields"}) {
                $mapping{"$fields"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$fields";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}

# Returns a processed (abbreviated) version of a field.
# If the field has only one component the processed version contains the
# first character and the next consonant of that component (or simply the
# first two characters when no consonant follows) - otherwise it contains
# the first character of each of the first two comma-separated components.
# An undefined field, or one without any word character, gives "".
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # keep the first two components and reduce each to its first character
        splice (@components, 2);
        foreach my $component (@components) {
            $component =~ s/^(.).*$/$1/;
        }
        return join("", @components);
    }

    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    ($first, $second) = $field =~ /^(.)(.)/ unless defined $first && defined $second;

    # a one-character field matches neither pattern and yields ""
    $first = "" unless defined $first;
    $second = "" unless defined $second;
    return "$first$second";
}

# Alters one component of a clashing directory name and returns the new
# candidate name.
#
# $namehash maps, for each of 'index', 'subcollection' and 'languages',
# processed names to the original names they already stand for.  $index
# is the full index description; $indexref, $subref and $langref are
# references to its current processed components.  The first component
# (in that order) whose processed name is already taken by a different
# original name is advanced with get_next_version, modifying the
# referenced scalar.  Returns the three components concatenated.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    # a missing value compares as the empty string, with or without "defined"
    my $as_string = sub { defined $_[0] ? $_[0] : "" };

    my $index_owner = &$as_string ($namehash->{'index'}->{$$indexref});
    my $sub_owner   = &$as_string ($namehash->{'subcollection'}->{$$subref});
    my $lang_owner  = &$as_string ($namehash->{'languages'}->{$$langref});

    if ($index_owner ne &$as_string ($fields)) {
        $self->get_next_version ($indexref);
    } elsif ($sub_owner ne &$as_string ($subcollection)) {
        $self->get_next_version ($subref);
    } elsif ($lang_owner ne &$as_string ($languages)) {
        $self->get_next_version ($langref);
    }

    return "$$indexref$$subref$$langref";
}

# Advances the name referenced by $nameref to its next version, in place.
#
# A name ending in one or two digits has that number incremented
# ("t0" -> "t1", "t9" -> "t10", "t10" -> "t11"); any other name has its
# last character replaced by "0" ("ti" -> "t0").
#
# Callers loop until the name is unique, so every call has to change the
# name; a name ending in a single 9 must therefore roll over to 10.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d?)$/) {
        my $num = $1;
        $num ++;
        $$nameref =~ s/\d\d?$/$num/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}

# Builds one index.
#
# $index is an index description ("fields[:subcollection[:language]]")
# whose directory name must already be in $self->{'index_mapping'}.
# The documents are piped through "mgpp_passes -I1" to create the index
# dictionary and, after mgpp_perf_hash_build, through "mgpp_passes -I2"
# to invert the text.  mgpp_weights_build, mgpp_invf_dict and
# mgpp_stem_idx are then run.  In debug mode no external program is run
# and the output goes to STDOUT.
#
# If the index dictionary file (.id) is not produced the index is
# recorded in $self->{'notbuilt'} and the method returns early.  Dies if
# a required executable is missing or a pipe cannot be opened.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
                                               $indexdir,
                                               $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");

    # define the section names and possibly the doc name for mgpasses
    my $doc_level = $self->{'doc_level'};
    my $mgpp_passes_sections = "-J " . $level_map{$doc_level} . " ";
    foreach my $level (keys %{$self->{'levels'}}) {
        if ($level ne $doc_level) {
            $mgpp_passes_sections .= "-K " . $level_map{$level} . " ";
        }
    }

    my $mgpp_perf_hash_build_exe =
        &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);
    foreach my $subcoll_name (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name};
        push (@$indexexparr, $expression) if defined $expression;
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index; a leading "!" negates the expression
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    foreach my $lang (@languages) {
        if ($lang =~ s/^\!//) {
            push (@$indexexparr, "!Language/$lang/");
        } else {
            push (@$indexexparr, "Language/$lang/");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};

    # $handle holds the *name* of the filehandle the document processor
    # writes to: STDOUT when debugging, otherwise the pipe to mgpp_passes
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I1 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_indexing_text (1);
    $buildproc->set_store_text(1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fall over.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        # reopen the pipe for the second (inversion) pass
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n    inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};
    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n    create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }
        system ("mgpp_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
        system ("mgpp_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
        system ("mgpp_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");

        # look for unwanted files
        # NOTE(review): the removal itself is commented out, so files are
        # only reported (at verbosity > 2), never deleted.
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}

# Creates the collection information database and processes associated
# files.
#
# The database text is piped to txt2db (or written to STDOUT in debug
# mode).  It starts with a [collection] entry holding the collection
# metadata and the display names for index fields, levels,
# subcollections and languages.  The documents are then processed in
# 'infodb' mode, and the classification information and a [browselist]
# entry listing all documents are appended.
#
# Dies if txt2db is missing or cannot be run.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    # define the indexed field mapping if not already done so
    # (ie if infodb called separately from build_index)
    if (!defined $self->{'build_cfg'}) {
        $self->read_final_field_list();
    }
    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    # $handle holds the *name* of the filehandle written to: STDOUT when
    # debugging, otherwise the pipe to txt2db
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);
    #$buildproc->set_indexfieldmap ($self->{'indexfieldmap'});

    $buildproc->reset();

    # do the collection info
    print $handle "[collection]\n";

    # first do the collection meta stuff - everything without a dot
    my $collmetadefined = 0;
    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
        $collmetadefined = 1;
        foreach my $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
            next if ($cmeta =~ /^\./);    # for now, ignore ones with dots
            my ($metadata_entry) = $self->create_language_db_map($cmeta, $cmeta);
            # write the entry to the file
            print $handle $metadata_entry;
        }
    }

    # add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined,
    # uses the metadata name
    my $field_entry = "";
    foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}) {
        my $shortfield = $buildproc->{'indexfieldmap'}->{$longfield};
        # a value of 1 marks a short code or reserved word, not a long name
        next if $shortfield eq 1;

        # we need to check if some coll meta has been defined
        my $collmeta = ".$longfield";
        if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
            $field_entry .= $self->create_language_db_map($collmeta, $shortfield);
        } elsif ($longfield eq "allfields") {
            # use the text macros for allfields and textonly
            $field_entry .= "<$shortfield>_query:textallfields_\n";
        } elsif ($longfield eq "text") {
            $field_entry .= "<$shortfield>_query:texttextonly_\n";
        } else {
            # use the metadata name
            $field_entry .= "<$shortfield>$longfield\n";
        }
    }
    print $handle $field_entry;

    # now add the level names
    my $level_entry = "";
    foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
        my $collmeta = ".$level";    # based on the level name as stored in the configuration
        $level =~ tr/A-Z/a-z/;       # make it lower case (in place)
        my $levelid = $level_map{$level};    # find the actual value we used in the index
        if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
            $level_entry .= $self->create_language_db_map($collmeta, $levelid);
        } else {
            # use the default macro
            $level_entry .= "<$levelid>" . $level_map{$levelid} . "\n";
        }
    }
    print $handle $level_entry;

    # now add subcoll meta
    my $subcoll_entry = "";
    foreach my $subcoll (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        # looked up before the test because both branches need it
        my $shortname = $self->{'index_mapping'}->{$subcoll};
        if (defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$subcoll"}) {
            $subcoll_entry .= $self->create_language_db_map(".$subcoll", $shortname);
        } else {
            $subcoll_entry .= "<$shortname>$subcoll\n";
        }
    }
    print $handle $subcoll_entry;

    # now add language meta
    my $lang_entry = "";
    foreach my $lang (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        my $shortname = $self->{'index_mapping'}->{$lang};
        if (defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$lang"}) {
            $lang_entry .= $self->create_language_db_map(".$lang", $shortname);
        } else {
            $lang_entry .= "<$shortname>$lang\n";
        }
    }
    print $handle $lang_entry;

    # end the collection entry
    print $handle "\n" . ('-' x 70) . "\n";

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'},
                                     $self->{'gli'});

    # output doclist
    my @doclist = $buildproc->get_doc_list();
    my $docs = join (";",@doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . ($#doclist+1) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";
    close ($handle) if !$self->{'debug'};

    print STDERR "</Stage>\n" if $self->{'gli'};
}

# Build the database lines for one collectionmeta item, covering every
# language it has been defined for.
#
#   $metaname - key into $self->{'collect_cfg'}->{'collectionmeta'}
#   $mapname  - tag name to use in the generated "<tag>value" lines
#
# Returns a string of newline-terminated lines: first the untagged default
# "<mapname>value", then one "<mapname:xx>value" line for each "[l=xx]" key.
# Keys are visited in sorted order so that the output, and the fallback
# default chosen when there is no 'default' key, are the same on every run.
sub create_language_db_map {
    my $self = shift (@_);
    my ($metaname, $mapname) = @_;

    # use an empty map if nothing is defined, rather than autovivifying
    # an entry inside the collection configuration
    my $translations = $self->{'collect_cfg'}->{'collectionmeta'}->{$metaname} || {};
    my @keys = sort keys %$translations;

    # the "default default" is the first entry, used when no explicit
    # 'default' key is present
    my $default = "";
    $default = $translations->{$keys[0]} if scalar(@keys);

    my $language_lines = "";
    foreach my $key (@keys) {
        if ($key eq 'default') {
            # an explicit default always wins over the first-entry fallback
            $default = $translations->{'default'};
            next;
        }

        # language specific entries are keyed as [l=xx]; anything else is ignored
        my ($code) = $key =~ /^\[l=(\w*)\]$/;
        next unless $code;
        $language_lines .= "<$mapname:$code>" . $translations->{$key} . "\n";
    }

    # the default entry goes first
    return "<$mapname>$default\n" . $language_lines;
}
# Hook with no behaviour of its own in this builder: it only takes the
# object off the argument list.
sub collect_specific {
    my $self = shift;
}
984
# At the end of building, the buildproc holds an indexfieldmap with all the
# field mappings (plus some extras) and a list of the fields that were
# actually indexed. From those and the index definition in collect.cfg this
# makes an ordered list of the indexed fields and a list of the
# "field->shortname" mappings that are used.
#
# The results are stored in $self->{'build_cfg'} as 'indexfields' and
# 'indexfieldmap'; they are used for the build.cfg file and for the
# collection meta definition.
sub make_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    # Go through the index definition and note each field, so we can easily
    # check whether it is already specified: when expanding 'metadata' we
    # list all the individual fields, but some may already be named in the
    # index definition and we don't want to add those again.
    my $specifiedfields = {};
    my @specifiedfieldorder = ();
    foreach my $index (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        my $parts = $index;
        $parts =~ s/:.*$//;
        foreach my $name (split(',', $parts)) {
            next if defined $specifiedfields->{$name};
            $specifiedfields->{$name} = 1;
            push (@specifiedfieldorder, $name);
        }
    }

    # store the indexfieldmap information, in the order the fields were specified
    my @indexfieldmap = ();
    my @indexfields = ();
    foreach my $field (@specifiedfieldorder) {
        if ($field eq "metadata") {
            # expand to every indexed field that was not named explicitly;
            # sorted so that the generated lists are the same on every build
            foreach my $newfield (sort keys %{$self->{'buildproc'}->{'indexfields'}}) {
                next if defined $specifiedfields->{$newfield};
                push (@indexfieldmap, $newfield . "->" . $self->{'buildproc'}->{'indexfieldmap'}->{$newfield});
                push (@indexfields, $newfield);
            }
        } elsif ($field eq 'text') {
            push (@indexfieldmap, "text->TX");
            push (@indexfields, "text");
        } elsif ($field eq 'allfields') {
            push (@indexfieldmap, "allfields->ZZ");
            push (@indexfields, "allfields");
        } else {
            push (@indexfieldmap, $field . "->" . $self->{'buildproc'}->{'indexfieldmap'}->{$field});
            push (@indexfields, $field);
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
1040
1041
# Recreate the field list from the build.cfg file. Looks first in the build
# directory, then in the collection's index directory. If there is no
# build.cfg we can't do the field list (there is unlikely to be any index
# anyway), so $self->{'build_cfg'} is left empty.
#
# On success 'indexfields' and 'indexfieldmap' are stored in
# $self->{'build_cfg'}, and each "field->shortname" mapping is also copied
# into the buildproc's indexfieldmap.
sub read_final_field_list {
    my $self = shift (@_);
    $self->{'build_cfg'} = {};
    my @indexfieldmap = ();
    my @indexfields = ();

    if (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) {
        # set the default mapping
        $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    }

    # we read the stuff in from the build.cfg file - if it's there
    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, "build.cfg");
    if (!-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg");
        if (!-e $buildconfigfile) {
            # we can't find a config file - just ignore the field list
            return;
        }
    }

    my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);
    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            push (@indexfields, "$field");
        }
    }
    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $mapping (@{$buildcfg->{'indexfieldmap'}}) {
            push (@indexfieldmap, "$mapping");
            # only well-formed "field->shortname" entries go into the
            # buildproc map; a malformed line would otherwise create an
            # entry under an undefined key
            if ($mapping =~ /^(.*)\-\>(.*)$/) {
                $self->{'buildproc'}->{'indexfieldmap'}->{$1} = $2;
            }
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
# Write the build.cfg file for the collection into the build directory.
#
# Starts from $self->{'build_cfg'} when it is defined (this already includes
# indexfieldmap and indexfields) and adds the build date and type, the index
# levels, the text level, document and byte counts, and the index,
# subcollection and language mappings, plus the list of indexes that were
# not built.
sub make_auxiliary_files {
    my $self = shift (@_);

    my $build_cfg = {};
    # this already includes indexfieldmap and indexfields
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};
    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = "mgpp"; #do we need this??

    # store the level info.  %level_map is the package-level table; it is
    # indexed with $level_map{...} because using a hash as a reference
    # (%level_map->{...}) is a fatal error in current perls
    my @indexlevels = ();
    foreach my $level (@{$self->{'levelorder'}}) {
        push (@indexlevels, $level_map{$level});
    }
    $build_cfg->{'indexlevels'} = \@indexlevels;

    if ($self->{'levels'}->{'section'}) {
        $build_cfg->{'textlevel'} = $level_map{'section'};
    } else {
        $build_cfg->{'textlevel'} = $level_map{'document'};
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names,
    # leaving out any index that was not built
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, $index . "->" . $self->{'index_mapping'}->{'indexmap'}->{$index});
        }
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    # list the indexes that were not built; sorted so that the file
    # contents do not change from one run to the next
    my @notbuilt = sort keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|buildtype|numdocs|numbytes|textlevel)$',
                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields|indexlevels)$');

    print STDERR "</Stage>\n" if $self->{'gli'};
}
1157
# Hook with no behaviour of its own in this builder: it only takes the
# object off the argument list.
sub deinit {
    my $self = shift;
}
1161
# Print a short summary of the pass that has just finished to the builder's
# output handle: which index was created (or which text was compressed), the
# total bytes in the collection and the bytes processed for this pass.
# A warning banner follows when fewer than 50 bytes were processed and text
# was expected (an indexing pass, or a compression pass without 'no_text').
sub print_stats {
    my $self = shift (@_);

    my $out = $self->{'outhandle'};
    my $proc = $self->{'buildproc'};

    my $is_indexing = $proc->get_indexing_text();
    my $index_name = $proc->get_index();
    my $total_bytes = $proc->get_num_bytes();
    my $processed_bytes = $proc->get_num_processed_bytes();

    my $heading = $is_indexing
        ? "Stats (Creating index $index_name)\n"
        : "Stats (Compressing text from $index_name)\n";
    print $out $heading;
    print $out "Total bytes in collection: $total_bytes\n";
    print $out "Total bytes in $index_name: $processed_bytes\n";

    my $text_expected = ($is_indexing || !$self->{'no_text'});
    if ($processed_bytes < 50 && $text_expected) {
        # inside this branch either we are indexing, or text compression
        # was wanted, so exactly one of the two messages applies
        my $warning = $is_indexing
            ? "WARNING: There is very little or no text to process for $index_name\n"
            : "WARNING: There is very little or no text to compress\n";
        print $out "***************\n";
        print $out $warning;
        print $out " Was this your intention?\n";
        print $out "***************\n";
    }

}
1191
11921;
1193
1194
Note: See TracBrowser for help on using the repository browser.