source: main/tags/2.52/gsdl/perllib/mgppbuilder.pm@ 25422

Last change on this file since 25422 was 8361, checked in by kjdon, 20 years ago

renamed build option 'allclassifications' to 'remove_empty_classifications' - this means that empty classifications (classifiers and internal nodes) are displayed by default now. Note, if a collection has been built previously by the GLI, and allclassifications options used, then this will crap out building until that old option is deleted from collname.col file

  • Property svn:keywords set to Author Date Id Revision
File size: 41.3 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
35
# Turn autoflush on for STDOUT and STDERR as soon as the module is
# compiled, so that output from mgpp doesn't get out of sync with output
# from the plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Turn autoflush back off for both streams when the interpreter exits.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
47
# Maximum document size value (see the "-b $maxdocsize" note in
# compress_text).
$maxdocsize = 12000;

# Two mappings share this hash:
#   collect.cfg level name  -> level tag used in the index
#   level tag               -> default display macro for that level
%level_map = (
    'document'  => 'Doc',
    'section'   => 'Sec',
    'paragraph' => 'Para',
    'Doc'       => '_textdocument_',
    'Sec'       => '_textsection_',
    'Para'      => '_textparagraph_',
);

#$doc_level = "Doc";
#$sec_level = "Sec";
#$para_level = "Para";

# File suffixes of the index files that are wanted after a build.
%wanted_index_files = map { $_ => 1 }
    qw(td t tl ti idb ib1 ib2 ib3 i il w wa);

# TODO: change this so a user can add their own mappings via a file or cfg.
#
# Long metadata/field names map to their short index field names.
my %short_name_for = (
    'Title'        => 'TI',
    'Subject'      => 'SU',
    'Creator'      => 'CR',
    'Organization' => 'ORG',
    'Source'       => 'SO',
    'Howto'        => 'HT',
    'ItemTitle'    => 'IT',
    'ProgNumber'   => 'PN',
    'People'       => 'PE',
    'Coverage'     => 'CO',
    'allfields'    => 'ZZ',
    'text'         => 'TX',
);

# Every short name is also entered with the value 1, as are AND, OR, NOT,
# NEAR and the level names (Doc, Sec, Para) - these cannot be used as
# field names.
%static_indexfield_map = (
    %short_name_for,
    map { $_ => 1 } (values %short_name_for,
                     qw(AND OR NOT NEAR Doc Sec Para)),
);
108
# Constructor for an mgppbuilder object.
#
# Positional arguments: $collection, $source_dir, $build_dir, $verbosity,
# $maxdocs, $debug, $keepold, $remove_empty_classifications, $outhandle
# (defaults to STDERR), $no_text (defaults to 0) and $gli (defaults to 0).
#
# Reads the collection's collect.cfg, expands the index list with any
# subcollection and language partitions, works out the indexing levels,
# loads the plugins and classifiers and creates the build processor.
#
# Dies if collect.cfg cannot be found, if neither the document nor the
# section level is configured, if no plugins were loaded, or if the build
# processor cannot be created.
sub new {
    my $class = shift(@_);

    my ($collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $remove_empty_classifications,
        $outhandle, $no_text, $gli) = @_;

    # defaults for the optional trailing arguments; the handle default is
    # left as the bare name STDERR because build_index compares the handle
    # against the string "STDERR"
    $outhandle = STDERR unless defined $outhandle;
    $no_text = 0 unless defined $no_text;
    $gli = 0 unless defined $gli;

    # create an mgppbuilder object
    my %object = (
        'collection'  => $collection,
        'source_dir'  => $source_dir,
        'build_dir'   => $build_dir,
        'verbosity'   => $verbosity,
        'maxdocs'     => $maxdocs,
        'debug'       => $debug,
        'keepold'     => $keepold,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'   => $outhandle,
        'no_text'     => $no_text,
        'notbuilt'    => {},    # indexes not built
        'indexfieldmap' => \%static_indexfield_map,
        'gli'         => $gli,
    );
    my $self = bless \%object, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n"
        unless -e $colcfgname;
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # the indexes are specified as separate entries in collect.cfg, but
    # they all go into one index, so collapse them into a single
    # comma-separated entry
    my $configured = $self->{'collect_cfg'}->{'indexes'};
    $self->{'collect_cfg'}->{'indexes'} = [ join(',', @$configured) ];

    # one copy of every index per subcollection partition
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my @base = @{$self->{'collect_cfg'}->{'indexes'}};
        my @expanded = ();
        foreach my $subcoll (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            push (@expanded, map { "${_}:${subcoll}" } @base);
        }
        $self->{'collect_cfg'}->{'indexes'} = \@expanded;
    }

    # one copy of every index per language partition; when there is no
    # subcollection part an empty subcollection field is added ("::")
    if (defined $self->{'collect_cfg'}->{'languages'}) {
        my @base = @{$self->{'collect_cfg'}->{'indexes'}};
        my $separator =
            defined ($self->{'collect_cfg'}->{'indexsubcollections'}) ? ":" : "::";
        my @expanded = ();
        foreach my $lang (@{$self->{'collect_cfg'}->{'languages'}}) {
            push (@expanded, map { "${_}${separator}${lang}" } @base);
        }
        $self->{'collect_cfg'}->{'indexes'} = \@expanded;
    }

    # make sure that the same index isn't specified more than once
    # (the first occurrence is the one that is kept)
    my %seen = ();
    my @unique = grep { !$seen{$_}++ } @{$self->{'collect_cfg'}->{'indexes'}};
    $self->{'collect_cfg'}->{'indexes'} = \@unique;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    $self->{'levelorder'} = undef;    # created by the pushes below
    if (defined $self->{'collect_cfg'}->{'levels'}) {
        foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
            # $level aliases the config entry, so this lower-cases the
            # entry in collect_cfg as well
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    }
    else {
        # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    # the document level is "document" if it was asked for, otherwise
    # "section"; one of the two must be present
    if ($self->{'levels'}->{'document'}) {
        $self->{'doc_level'} = "document";
    }
    elsif ($self->{'levels'}->{'section'}) {
        $self->{'doc_level'} = "section";
    }
    else {
        die "you must have either document or section level specified!!\n";
    }
    print $outhandle "doclevel = ". $self->{'doc_level'}."\n";

    # build up the extra global options for the plugins
    my @global_opts = ();
    my $separate_cjk = $self->{'collect_cfg'}->{'separate_cjk'};
    if (defined $separate_cjk && $separate_cjk =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # get the list of plugins for this collection and load them all
    my $plugins = defined $self->{'collect_cfg'}->{'plugin'}
        ? $self->{'collect_cfg'}->{'plugin'}
        : [];
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, \@global_opts);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection and load them all
    my $classifiers = defined $self->{'collect_cfg'}->{'classify'}
        ? $self->{'collect_cfg'}->{'classify'}
        : [];
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $field (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$field} = 1;
        }
    }

    # load up the document processor for building:
    # if a buildproc class has been created for this collection, use it,
    # otherwise use the mgpp buildproc
    my $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
    my $buildproctype = "mgppbuildproc";
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # the class name is only known at run time, so the constructor call is
    # assembled as a string and evaluated
    my $constructor_call = "\$self->{'buildproc'} = new $buildproctype(\$collection, "
        . "\$source_dir, \$build_dir, \$verbosity, \$outhandle)";
    eval($constructor_call);
    die "$@" if $@;

    $self->{'buildtype'} = "mgpp";

    return $self;
}
272
# Prepares the build directory.  Unless the builder is in debug mode or has
# been asked to keep the old build, the existing build directory is removed
# and recreated together with its "text" subdirectory.
sub init {
    my ($self) = @_;

    my $start_afresh = !$self->{'debug'} && !$self->{'keepold'};
    if ($start_afresh) {
        my $build_dir = $self->{'build_dir'};

        # remove any old builds
        &util::rm_r($build_dir);
        &util::mk_all_dir($build_dir);

        # make the text directory
        &util::mk_all_dir("$build_dir/text");
    }
}
286
# Records the strip_html setting on the builder and passes the same value
# on to the build processor.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html($strip);
}
294
# Creates the compressed text for the collection.
#
# $textindex is handed to the build processor as the index to use while
# the text is passed through.  The work is done in three steps:
#   1. mgpp_passes -T1        collects the text statistics
#   2. mgpp_compression_dict  creates the compression dictionary
#   3. mgpp_passes -T2        compresses the text
# In debug mode nothing is run; the data that would have been piped to
# mgpp_passes is written to STDOUT instead and step 2 is skipped.
#
# When the 'gli' flag is set, <Stage>/<Phase>/<FatalError> elements are
# written to STDERR.  Dies if one of the mgpp programs is missing or cannot
# be started.
sub compress_text {

    my $self = shift (@_);
    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them
    my $mgpp_passes_sections = "";
    my ($doc_level) = $self->{'doc_level'};
    $mgpp_passes_sections .= "-J " . $level_map{$doc_level} . " ";
    foreach my $level (keys %{$self->{'levels'}}) {
        if ($level ne $doc_level && $level ne "paragraph") {
            $mgpp_passes_sections .= "-K " . $level_map{$level} . " ";
        }
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size to be 12 meg
    print $outhandle "\n collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    # NOTE: the existence test below uses the full path in $exedir, but the
    # command itself is started by its bare name
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    }
    else {
        #print $outhandle "trying to run (compress 1) mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra\n";
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) {
            # the error element is self-closing, like every other
            # FatalError written by this module
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
        # fully qualified handle name, stored as a string
        $handle = mgppbuilder::PIPEOUT;
    }
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    if ($self->{'no_text'}) {
        $self->{'buildproc'}->set_store_text(0);
    }
    else {
        $self->{'buildproc'}->set_store_text(1);
    }
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # close the pipe to the first pass; the handle is closed exactly once
    # and STDOUT is left alone in debug mode
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # reopen the same named handle for the second pass; the build
        # processor keeps writing to the handle it was given above
        #print $outhandle "trying to run (compress 2) mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra\n";
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
415
# Decides whether the index $index should be built.
#
# Each entry of the collection's 'dontbuild' setting is used as a pattern
# that must match the whole index name.  On the first match the index is
# recorded in $self->{'notbuilt'} and 0 is returned; otherwise 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    return 1 unless defined ($self->{'collect_cfg'}->{'dontbuild'});

    foreach my $pattern (@{$self->{'collect_cfg'}->{'dontbuild'}}) {
        next unless $index =~ /^$pattern$/;

        #push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
432
# Builds the indexes of the collection.
#
# If $indexname is given (and contains a word character) only that index
# is considered, otherwise every index from collect_cfg is.  The mapping
# from index descriptions to directory names is stored in
# $self->{'index_mapping'}; indexes rejected by want_built are skipped.
# Afterwards the final field lists are defined.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $single_index = (defined $indexname && $indexname =~ /\w/);
    my $indexes = $single_index
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }

    # define the final field lists
    $self->make_final_field_list();

}
465
# Creates directory names for each of the index descriptions.
#
# $indexes is a reference to a list of index descriptions of the form
# "fields:subcollection:languages" (the last two parts are optional).
#
# Returns a reference to a hash holding
#   - index description           -> directory name
#   - 'indexmap', 'subcollectionmap', 'languagemap'
#                                 -> hashes from each part to its short name
#   - 'indexmaporder', 'subcollectionmaporder', 'languagemaporder'
#                                 -> the parts in the order first seen
#   - each fields / subcollection / languages part -> its short name
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();

    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # short names already handed out, per kind of part.  Each kind needs
    # its own hash: make_unique looks the three kinds up separately, and
    # plain strings here would make all three resolve to one and the same
    # symbolic reference (and die under "use strict")
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);
        # the directory name starts with a processed version of index fields
        #my ($pindex) = $self->process_field($fields);
        #$pindex = lc ($pindex);
        # now we only ever have one index, and its called 'idx'
        my $pindex = 'idx';

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            if (!defined $mapping{"$fields"}) {
                $mapping{"$fields"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember what has been used, for the collision checks
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$fields";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
537
# Returns a processed (shortened) version of a field.
#
# $field is a comma separated list of components.
#   - If the field is undefined or contains no word character, the empty
#     string is returned.
#   - If the field has two or more components, the result is the first
#     character of each of the first two components.
#   - Otherwise the result is the first character of the field followed by
#     the next consonant; if there is no such consonant, the first two
#     characters are used (and a field too short for that gives "").
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # work on copies of the first two components only
        my @initials = @components[0, 1];
        foreach my $initial (@initials) {
            $initial =~ s/^(.).*$/$1/;
        }
        return join("", @initials);
    }

    # single component: first character plus the next consonant ...
    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    # ... falling back to the first two characters
    ($first, $second) = $field =~ /^(.)(.)/
        unless defined $first && defined $second;

    # neither pattern matched (e.g. a one character field)
    $first = "" unless defined $first;
    $second = "" unless defined $second;
    return "$first$second";
}
560
# Changes one part of a colliding directory name and returns the new name.
#
# $namehash holds, per kind ('index', 'subcollection', 'languages'), the
# value each short name has already been used for.  $indexref, $subref and
# $langref are references to the three short names making up the directory
# name for $index.  The first short name that is recorded for a different
# value than the one in $index is advanced with get_next_version; at most
# one part is changed per call.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @parts = (
        [ 'index',         $indexref, "$fields" ],
        [ 'subcollection', $subref,   $subcollection ],
        [ 'languages',     $langref,  $languages ],
    );

    foreach my $part (@parts) {
        my ($kind, $nameref, $wanted) = @$part;
        if ($namehash->{$kind}->{$$nameref} ne $wanted) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
575
# Advances the short name referenced by $nameref to its next version, in
# place:
#   - a name ending in two digits has that number incremented
#   - a name ending in one digit has that digit incremented, 9 becoming 10
#   - any other name has its last character replaced by 0
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1;
        $num++;
        $$nameref =~ s/\d\d$/$num/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $num = $1;
        if ($num == 9) {
            # only one trailing digit can be present in this branch, so
            # that single digit is what has to be replaced; a two digit
            # pattern never matches here and would leave the name
            # unchanged, which the caller's uniqueness loop cannot recover
            # from
            $$nameref =~ s/\d$/10/;
        }
        else {
            $num++;
            $$nameref =~ s/\d$/$num/;
        }
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
591
# Builds one index.
#
# $index is an index description of the form
# "fields:subcollections:languages"; its directory name is looked up in
# $self->{'index_mapping'}.  The steps are:
#   1. mgpp_passes -I1       creates the index dictionary
#   2. mgpp_perf_hash_build  creates the perfect hash function
#   3. mgpp_passes -I2       inverts the text
#   4. mgpp_weights_build, mgpp_invf_dict and mgpp_stem_idx (-s1 to -s3)
# In debug mode no programs are run and the data for the passes goes to
# STDOUT.  If the index dictionary (.id file) is not there after step 1 the
# index is recorded in $self->{'notbuilt'} and the sub returns early.
#
# Dies if one of the mgpp programs is missing or cannot be started, or if
# the index directory cannot be read.
sub build_index {
    my ($self, $index) = @_;
    my $outhandle = $self->{'outhandle'};
    my $chatty = ($self->{'verbosity'} >= 1);
    my $gli = $self->{'gli'};

    # reports a program that could not be run and gives up
    my $cannot_run = sub {
        my ($error_name, $program) = @_;
        print STDERR "<FatalError name='$error_name'/>\n</Stage>\n" if $gli;
        die "mgppbuilder::build_index - couldn't run $program\n";
    };

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
                                               $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");

    # define the section names and possibly the doc name for mgpasses:
    # -J for the document level, -K for every other level
    my ($doc_level) = $self->{'doc_level'};
    my $mgpp_passes_sections = "-J " . $level_map{$doc_level} ." ";
    foreach my $level (keys %{$self->{'levels'}}) {
        next if $level eq $doc_level;
        $mgpp_passes_sections .= "-K " . $level_map{$level}. " ";
    }

    my $mgpp_perf_hash_build_exe =
        &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # collect the index expressions: there may be subcollection info,
    # and language info
    my $indexexparr = [];
    my ($fields, $subcollection_part, $language_part) = split (":", $index);

    # the expression of every subcollection this index belongs to
    my @subcollections = ();
    @subcollections = split /,/, $subcollection_part if (defined $subcollection_part);
    foreach my $subcoll (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index

    # this puts a separate Language/en entry in for each language in the list
    # is this what we want?
    # should we just have one entry with Language/en,es/ ??
    my @languages = ();
    @languages = split /,/, $language_part if (defined $language_part);
    foreach my $lang (@languages) {
        # a leading "!" negates the language and is kept in front of the
        # expression
        my $negated = ($lang =~ s/^\!//);
        push (@$indexexparr, ($negated ? "!" : "") . "Language/$lang/");
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (mgpp_passes -I1)\n" if $chatty;
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $gli;
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    }
    else {
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I1 $osextra")) {
            $cannot_run->('NoRunMGPasses', $mgpp_passes_exe);
        }
        $handle = mgppbuilder::PIPEOUT;
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_indexing_text (1);
    $buildproc->set_store_text(1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fall over.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $gli;
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        $cannot_run->('NoRunMGHash', $mgpp_perf_hash_build_exe)
            if (!-e "$mgpp_perf_hash_build_exe");
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        # reopen the same named handle for the second pass
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I2 $osextra")) {
            $cannot_run->('NoRunMGPasses', $mgpp_passes_exe);
        }
    }

    # invert the text
    print $outhandle "\n inverting the text (mgpp_passes -I2)\n" if $chatty;
    print STDERR "<Phase name='InvertingText'/>\n" if $gli;
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if $chatty;
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $gli;
        $cannot_run->('NoRunMGWeights', $mgpp_weights_build_exe)
            if (!-e "$mgpp_weights_build_exe");
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if $chatty;
        $cannot_run->('NoRunMGInvf', $mgpp_invf_dict_exe)
            if (!-e "$mgpp_invf_dict_exe");
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if $chatty;
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $gli;
        $cannot_run->('NoRunMGStem', $mgpp_stem_idx_exe)
            if (!-e "$mgpp_stem_idx_exe");
        foreach my $stem_method (1 .. 3) {
            system ("mgpp_stem_idx$exe -b 4096 -s$stem_method -f \"$fullindexprefix\" $osextra");
        }

        # report unwanted files; the actual removal is commented out, so
        # nothing is deleted here
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            next unless defined $suffix;
            next if defined $wanted_index_files{$suffix};

            print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
            #&util::rm (&util::filename_cat ($tmpdir, $file));
        }
        closedir (DIR);
    }
    print STDERR "</Stage>\n" if $gli;
}
794
795sub make_infodatabase {
796 my $self = shift (@_);
797 my $outhandle = $self->{'outhandle'};
798
799
800 my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
801 my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
802 &util::mk_all_dir ($textdir);
803 &util::mk_all_dir ($assocdir);
804
805 # get db name
806 my $dbext = ".bdb";
807 $dbext = ".ldb" if &util::is_little_endian();
808 my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
809 $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);
810
811 my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
812 my $exe = &util::get_os_exe ();
813 my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");
814
815 # define the indexed field mapping if not already done so (ie if infodb called separately from build_index)
816 if (!defined $self->{'build_cfg'}) {
817 $self->read_final_field_list();
818 }
819 print $outhandle "\n*** creating the info database and processing associated files\n"
820 if ($self->{'verbosity'} >= 1);
821 print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};
822
823 # init all the classifiers
824 &classify::init_classifiers ($self->{'classifiers'});
825
826 # set up the document processor
827 my ($handle);
828 if ($self->{'debug'}) {
829 $handle = STDOUT;
830 } else {
831 if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
832 print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
833 die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
834 }
835 $handle = mgppbuilder::PIPEOUT;
836 }
837
838 $self->{'buildproc'}->set_output_handle ($handle);
839 $self->{'buildproc'}->set_mode ('infodb');
840 $self->{'buildproc'}->set_assocdir ($assocdir);
841 $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
842 $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
843 $self->{'buildproc'}->set_indexing_text (0);
844 $self->{'buildproc'}->set_store_text(1);
845 #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
846
847 $self->{'buildproc'}->reset();
848
849 # do the collection info
850 print $handle "[collection]\n";
851
852 # first do the collection meta stuff - everything without a dot
853 my $collmetadefined = 0;
854 if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
855 $collmetadefined = 1;
856 foreach $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
857 next if ($cmeta =~ /^\./); # for now, ignore ones with dots
858 my ($metadata_entry) = $self->create_language_db_map($cmeta, $cmeta);
859 #write the entry to the file
860 print $handle $metadata_entry;
861
862 } # foreach collmeta key
863 }
864 #add the index field macros to [collection]
865 # eg <TI>Title
866 # <SU>Subject
867 # these now come from collection meta. if that is not defined, usses the metadata name
868 $field_entry="";
869 foreach $longfield (@{$self->{'build_cfg'}->{'indexfields'}}){
870 $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
871 next if $shortfield eq 1;
872
873 # we need to check if some coll meta has been defined
874 my $collmeta = ".$longfield";
875 if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
876 $metadata_entry = $self->create_language_db_map($collmeta, $shortfield);
877 $field_entry .= $metadata_entry;
878 } else { #use the metadata names, or the text macros for allfields and textonly
879 if ($longfield eq "allfields") {
880 $field_entry .= "<$shortfield>_query:textallfields_\n";
881 } elsif ($longfield eq "text") {
882 $field_entry .= "<$shortfield>_query:texttextonly_\n";
883 } else {
884 $field_entry .= "<$shortfield>$longfield\n";
885 }
886 }
887 }
888 print $handle $field_entry;
889
890 # now add the level names
891 $level_entry = "";
892 foreach $level (@{$self->{'collect_cfg'}->{'levels'}}) {
893 my $collmeta = ".$level"; # based on the original specification
894 $level =~ tr/A-Z/a-z/; # make it lower case
895 my $levelid = $level_map{$level}; # find the actual value we used in the index
896 if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
897 $metadata_entry = $self->create_language_db_map($collmeta, $levelid);
898 $level_entry .= $metadata_entry;
899 } else {
900 # use the default macro
901 $level_entry .= "<$levelid>" . $level_map{$levelid} . "\n";
902 }
903 }
904 print $handle $level_entry;
905
906 # now add subcoll meta
907 $subcoll_entry = "";
908 foreach $subcoll (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
909 if (defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$subcoll"}) {
910 my $shortname = $self->{'index_mapping'}->{$subcoll};
911 $one_entry = $self->create_language_db_map(".$subcoll", $shortname);
912 $subcoll_entry .= $one_entry;
913 } else {
914 $subcoll_entry .= "<$shortname>$subcoll\n";
915 }
916 }
917 print $handle $subcoll_entry;
918 # now add language meta
919 $lang_entry = "";
920 foreach $lang (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
921 if (defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$lang"}) {
922 my $shortname = $self->{'index_mapping'}->{$lang};
923 $one_entry = $self->create_language_db_map(".$lang", $shortname);
924 $lang_entry .= $one_entry;
925 } else {
926 $lang_entry .= "<$shortname>$lang\n";
927 }
928 }
929 print $handle $lang_entry;
930 #end the collection entry
931 print $handle "\n" . ('-' x 70) . "\n";
932
933
934
935 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
936 "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
937
938 # output classification information
939 &classify::output_classify_info ($self->{'classifiers'}, $handle,
940 $self->{'remove_empty_classifications'},
941 $self->{'gli'});
942
943 #output doclist
944 my @doclist = $self->{'buildproc'}->get_doc_list();
945 my $docs = join (";",@doclist);
946 print $handle "[browselist]\n";
947 print $handle "<hastxt>0\n";
948 print $handle "<childtype>VList\n";
949 print $handle "<numleafdocs>" . ($#doclist+1) . "\n";
950 print $handle "<thistype>Invisible\n";
951 print $handle "<contains>$docs";
952 print $handle "\n" . ('-' x 70) . "\n";
953 close ($handle) if !$self->{'debug'};
954
955 print STDERR "</Stage>\n" if $self->{'gli'};
956}
957
# create_language_db_map -- build the info database lines for one piece of
# collection metadata, one line per language.
#
# Arguments:
#   $metaname - key into $self->{'collect_cfg'}->{'collectionmeta'}; the value
#               stored there is a hash keyed by 'default' and/or "[l=xx]"
#   $mapname  - tag name the values are written under
#
# Returns a string made of "<mapname>value\n" (the default, always first)
# followed by one "<mapname:xx>value\n" line per "[l=xx]" key.
#
# Keys are visited in sorted order so that the output, and the choice of
# fallback default, do not depend on Perl's hash ordering.
sub create_language_db_map {
    my $self = shift (@_);
    my ($metaname, $mapname) = @_;

    my $values = $self->{'collect_cfg'}->{'collectionmeta'}->{$metaname} || {};

    my $defaultfound = 0;
    my $englishfound = 0;
    my $first = 1;
    my $metadata_entry = "";
    my $default = "";

    # iterate through the languages
    foreach my $lang (sort keys %{$values}) {
        if ($first) {
            $first = 0;
            # set the default default to the first entry
            $default = $values->{$lang};
        }
        if ($lang =~ /default/) {
            $defaultfound = 1;
            # the default entry goes first; read the value of the key that
            # actually matched rather than assuming it is spelt 'default'
            $metadata_entry = "<$mapname>" . $values->{$lang} . "\n" . $metadata_entry;
        }
        else {
            my ($code) = $lang =~ /^\[l=(\w*)\]$/;
            if ($code) {
                $metadata_entry .= "<$mapname:$code>" . $values->{$lang} . "\n";

                # Use the English value as the default if no default is
                # specified. The code has to start with "en"; merely
                # containing it (eg "ben") is not enough. The first English
                # entry in sorted order wins.
                if (!$englishfound && $code =~ /^en/i) {
                    $englishfound = 1;
                    $default = $values->{$lang};
                }
            }
        }
    } # foreach lang

    # if we haven't found a default, put one in
    if (!$defaultfound) {
        $metadata_entry = "<$mapname>$default\n" . $metadata_entry;
    }
    return $metadata_entry;
}
# collect_specific -- does nothing beyond taking the object off the argument
# list. Presumably a hook for collection-specific builders to override
# (TODO confirm against subclasses).
sub collect_specific {
    my $self = shift;
}
1002
# make_final_field_list -- at the end of building, we have an indexfieldmap
# with all the mappings, plus some extras. We want to make an ordered list of
# the fields that are indexed, and a list of the mappings that are used. These
# are used for the build.cfg file and for the collection meta definition.
#
# Reads:  $self->{'collect_cfg'}->{'indexes'}
#         $self->{'buildproc'}->{'indexfields'}
#         $self->{'buildproc'}->{'indexfieldmap'}
# Writes: $self->{'build_cfg'} (reset to a fresh hash), with
#         'indexfields'   => ordered list of field names
#         'indexfieldmap' => matching list of "field->shortname" strings
sub make_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    # store the indexfieldmap information
    my @indexfieldmap = ();
    my @indexfields = ();
    my %specified = ();
    my @specifiedfieldorder = ();

    # go through the index definition and note each field, so we can easily
    # check whether it is already specified - when expanding "metadata" we
    # add all the individual fields, but some may already be named in the
    # index definition, and we don't want to add those again.
    foreach my $indexdef (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff (work on a copy, the config is left alone)
        my $parts = $indexdef;
        $parts =~ s/:.*$//;
        foreach my $f (split(',', $parts)) {
            next if defined $specified{$f};
            $specified{$f} = 1;
            push (@specifiedfieldorder, "$f");
        }
    }

    my $fieldmap = $self->{'buildproc'}->{'indexfieldmap'};

    # add all fields bit
    foreach my $field (@specifiedfieldorder) {
        if ($field eq "metadata") {
            # expand to every field seen during the build that was not named
            # explicitly; sorted so build.cfg does not depend on hash order
            foreach my $newfield (sort keys %{$self->{'buildproc'}->{'indexfields'}}) {
                next if defined $specified{$newfield};
                push (@indexfieldmap, "$newfield\-\>$fieldmap->{$newfield}");
                push (@indexfields, "$newfield");
            }
        } elsif ($field eq 'text') {
            push (@indexfieldmap, "text\-\>TX");
            push (@indexfields, "text");
        } elsif ($field eq 'allfields') {
            push (@indexfieldmap, "allfields\-\>ZZ");
            push (@indexfields, "allfields");
        } else {
            push (@indexfieldmap, "$field\-\>$fieldmap->{$field}");
            push (@indexfields, "$field");
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
1058
1059
# read_final_field_list -- recreate the field list from the build.cfg file.
# Looks first in the build directory, then in the collection's index
# directory. If there is no build.cfg we can't do the field list (there is
# unlikely to be any index anyway), so $self->{'build_cfg'} is left empty.
#
# Writes: $self->{'build_cfg'}->{'indexfields'} and ->{'indexfieldmap'}
#         (copied from build.cfg), and adds every "field->shortname" pair to
#         $self->{'buildproc'}->{'indexfieldmap'}.
sub read_final_field_list {
    my $self = shift (@_);
    $self->{'build_cfg'} = {};
    my @indexfieldmap = ();
    my @indexfields = ();

    if (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) {
        # set the default mapping
        $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    }

    # we read the stuff in from the build.cfg file - if it's there
    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, "build.cfg");

    if (!-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg");
        if (!-e $buildconfigfile) {
            # we can't find a config file - just ignore the field list
            return;
        }
    }

    my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            push (@indexfields, "$field");
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $mapping (@{$buildcfg->{'indexfieldmap'}}) {
            push (@indexfieldmap, "$mapping");
            my ($f, $v) = $mapping =~ /^(.*)\-\>(.*)$/;
            # an entry that is not of the form "field->shortname" is kept in
            # the list but must not end up as an empty key in the map
            next unless defined $f;
            $self->{'buildproc'}->{'indexfieldmap'}->{$f} = $v;
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
# make_auxiliary_files -- collect the build information (date, type, levels,
# document/byte counts and the index, subcollection and language mappings)
# and write it out as build.cfg in the build directory.
sub make_auxiliary_files {
    my $self = shift (@_);

    # carry on from the existing build_cfg, which already includes
    # indexfieldmap and indexfields, or start with an empty one
    my $build_cfg = defined $self->{'build_cfg'} ? $self->{'build_cfg'} : {};

    my $outhandle = $self->{'outhandle'};
    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating auxiliary files \n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreatingAuxilary'>\n";
    }

    # build.cfg goes into the build directory (mk_all_dir presumably
    # creates it when it is missing)
    &util::mk_all_dir ($self->{'build_dir'});

    # when and how the collection was built
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};

    # the level info, translated through level_map
    my @indexlevels = ();
    for my $levelname (@{$self->{'levelorder'}}) {
        push (@indexlevels, $level_map{$levelname});
    }
    $build_cfg->{'indexlevels'} = \@indexlevels;

    $build_cfg->{'textlevel'} = $self->{'levels'}->{'section'}
        ? $level_map{'section'}
        : $level_map{'document'};

    # the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # the mapping between the index names and the directory names,
    # leaving out any index recorded as not built
    my @indexmap = ();
    for my $indexname (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        next if defined ($self->{'notbuilt'}->{$indexname});
        push (@indexmap,
              $indexname . '->' . $self->{'index_mapping'}->{'indexmap'}->{$indexname});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # subcollection mapping - only stored when there is something in it
    my @subcollectionmap = ();
    for my $subcollname (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap,
              $subcollname . '->' . $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollname});
    }
    if (scalar (@subcollectionmap)) {
        $build_cfg->{'subcollectionmap'} = \@subcollectionmap;
    }

    # language mapping - only stored when there is something in it
    my @languagemap = ();
    for my $langname (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap,
              $langname . '->' . $self->{'index_mapping'}->{'languagemap'}->{$langname});
    }
    if (scalar (@languagemap)) {
        $build_cfg->{'languagemap'} = \@languagemap;
    }

    # names of the indexes that were not built
    my @notbuilt = keys %{$self->{'notbuilt'}};
    if (scalar (@notbuilt)) {
        $build_cfg->{'notbuilt'} = \@notbuilt;
    }

    # write out the build information; the two patterns name the
    # single-valued and the list-valued entries respectively
    &cfgread::write_cfg_file($self->{'build_dir'} . "/build.cfg", $build_cfg,
                             '^(builddate|buildtype|numdocs|numbytes|textlevel)$',
                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields|indexlevels)$');

    print STDERR "</Stage>\n" if $self->{'gli'};
}
1175
# deinit -- does nothing beyond taking the object off the argument list;
# there is no clean-up work in this builder.
sub deinit {
    my $self = shift;
}
1179
# print_stats -- report on the pass just completed by the build processor.
# Writes the pass type, the collection size and the processed size to
# $self->{'outhandle'}, and adds a warning when fewer than 50 bytes were
# processed (unless text was deliberately left out via 'no_text').
sub print_stats {
    my $self = shift (@_);

    my $out = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $building_index = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $activity = $building_index
        ? "Creating index $index"
        : "Compressing text from $index";
    print $out "Stats ($activity)\n";
    print $out "Total bytes in collection: $num_bytes\n";
    print $out "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50 && ($building_index || !$self->{'no_text'})) {
        # when we get here without indexing, no_text must be unset, so the
        # only other possibility is the text compression pass
        my $warning = $building_index
            ? "WARNING: There is very little or no text to process for $index\n"
            : "WARNING: There is very little or no text to compress\n";

        print $out "***************\n";
        print $out $warning;
        print $out " Was this your intention?\n";
        print $out "***************\n";
    }
}
1209
12101;
1211
1212
Note: See TracBrowser for help on using the repository browser.