source: main/tags/2.51/gsdl/perllib/mgppbuilder.pm@ 32629

Last change on this file since 32629 was 7150, checked in by mdewsnip, 20 years ago

Now chooses the English collectionmeta value (if it exists) to be the default, in the absence of a specified default value. If there is no default value and no English value, a random value will be used as the default.

  • Property svn:keywords set to Author Date Id Revision
File size: 41.2 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
35
# Switch autoflush on for STDOUT and STDERR as soon as this module is
# compiled, so that output from mgpp and from the plugins does not get
# out of sync.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Switch autoflush back off for both streams when the program exits.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
47
# NOTE(review): presumably the maximum document size passed to the mgpp
# tools; it is not referenced in the visible part of this file - confirm.
$maxdocsize = 12000;

# Two mappings share this table:
#   level name as written in collect.cfg => level tag used in the index
#   level tag used in the index          => default display macro
%level_map = ();
@level_map{qw(document section paragraph)} = qw(Doc Sec Para);
@level_map{qw(Doc Sec Para)} = qw(_textdocument_ _textsection_ _textparagraph_);

#$doc_level = "Doc";
#$sec_level = "Sec";
#$para_level = "Para";

# suffixes of the index files that are wanted in a finished index directory
%wanted_index_files = map { $_ => 1 }
    qw(td t tl ti idb ib1 ib2 ib3 i il w wa);

# change this so a user can add their own ones in via a file or cfg
# add AND, OR, NOT, NEAR to this list - these cannot be used as field names
# also add the level names (Doc, Sec, Para)
#
# Each long field name maps to its short code; every short code, query
# operator and level tag maps to 1 so that it is marked as taken.
%static_indexfield_map = ();
{
    my @field_codes = (
        'Title'        => 'TI',
        'Subject'      => 'SU',
        'Creator'      => 'CR',
        'Organization' => 'ORG',
        'Source'       => 'SO',
        'Howto'        => 'HT',
        'ItemTitle'    => 'IT',
        'ProgNumber'   => 'PN',
        'People'       => 'PE',
        'Coverage'     => 'CO',
        'allfields'    => 'ZZ',
        'text'         => 'TX',
    );
    while (my ($longname, $shortcode) = splice(@field_codes, 0, 2)) {
        $static_indexfield_map{$longname} = $shortcode;
        $static_indexfield_map{$shortcode} = 1;
    }
    foreach my $reserved (qw(AND OR NOT NEAR Doc Sec Para)) {
        $static_indexfield_map{$reserved} = 1;
    }
}
108
# Creates an mgppbuilder object for one collection.
#
# Positional arguments: $collection, $source_dir, $build_dir, $verbosity,
# $maxdocs, $debug, $keepold, $allclassifications, $outhandle (defaults to
# STDERR), $no_text (defaults to 0) and $gli (defaults to 0).
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, works out the indexes and levels to
# build, loads the plugins and classifiers and creates the document
# processor (buildproc).  Dies if collect.cfg is missing, if no plugins were
# loaded, or if neither the document nor the section level is configured.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text, $gli) = @_;

    # the default is deliberately the plain handle name: build_index
    # compares the handle against the string "STDERR"
    $outhandle = STDERR unless defined $outhandle;
    $no_text = 0 unless defined $no_text;

    # create an mgppbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'outhandle'=>$outhandle,
                      'no_text'=>$no_text,
                      'notbuilt'=>{},    # indexes not built
                      'indexfieldmap'=>\%static_indexfield_map,
                      'gli'=>$gli
                      }, $class;

    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
    my $cfg = $self->{'collect_cfg'};

    # sort out the indexes
    # indexes are specified with spaces, but we put them into one index
    my $configured_indexes = $cfg->{'indexes'};
    # a config without an "indexes" entry is treated as an empty list
    $configured_indexes = [] unless defined $configured_indexes;
    $cfg->{'indexes'} = [ join(',', @$configured_indexes) ];

    # sort out subcollection indexes
    if (defined $cfg->{'indexsubcollections'}) {
        my $base_indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            foreach my $index (@$base_indexes) {
                push (@{$cfg->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes
    if (defined $cfg->{'languages'}) {
        my $base_indexes = $cfg->{'indexes'};
        # when there are no subcollections an empty subcollection field
        # is added in front of the language
        my $separator = defined ($cfg->{'indexsubcollections'}) ? ":" : "::";
        $cfg->{'indexes'} = [];
        foreach my $language (@{$cfg->{'languages'}}) {
            foreach my $index (@$base_indexes) {
                push (@{$cfg->{'indexes'}}, "$index$separator$language");
            }
        }
    }

    # make sure that the same index isn't specified more than once
    my %seen = ();
    my @all_indexes = @{$cfg->{'indexes'}};
    $cfg->{'indexes'} = [];
    foreach my $index (@all_indexes) {
        if (!defined ($seen{$index})) {
            push (@{$cfg->{'indexes'}}, $index);
            $seen{$index} = 1;
        }
    }

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    $self->{'levelorder'} = [];
    if (defined $cfg->{'levels'}) {
        foreach my $level (@{$cfg->{'levels'}}) {
            # $level aliases the config entry, so this lower-cases the
            # value stored in collect_cfg as well
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    } else { # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    # the document level is "document" if configured, otherwise "section"
    $self->{'doc_level'} = "document";
    if (! $self->{'levels'}->{'document'}) {
        if ($self->{'levels'}->{'section'}) {
            $self->{'doc_level'} = "section";
        } else {
            die "you must have either document or section level specified!!\n";
        }
    }
    print $outhandle "doclevel = ". $self->{'doc_level'}."\n";

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $cfg->{'separate_cjk'} && $cfg->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $cfg->{'plugin'}) {
        $plugins = $cfg->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, \@global_opts);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $cfg->{'classify'}) {
        $classifiers = $cfg->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $field (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$field} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mgpp buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgppbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # call the constructor on the class name directly; any failure in it
    # propagates to the caller
    $self->{'buildproc'} = $buildproctype->new($collection, $source_dir,
                                               $build_dir, $verbosity,
                                               $outhandle);

    return $self;
}
269
# Prepares the build directory for a fresh build.  Nothing is touched when
# the builder is in debug mode or was asked to keep the old build.
sub init {
    my ($self) = @_;

    unless ($self->{'debug'} || $self->{'keepold'}) {
        my $build_dir = $self->{'build_dir'};

        # remove any old builds and start again with an empty directory
        &util::rm_r($build_dir);
        &util::mk_all_dir($build_dir);

        # make the text directory
        &util::mk_all_dir("$build_dir/text");
    }
}
283
# Records the strip_html setting on the builder and hands the same value
# on to the document processor.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    return $self->{'buildproc'}->set_strip_html($strip);
}
291
# Creates the compressed text for the collection.
#
# $textindex is passed on to the document processor with set_index.
#
# Three steps are run: a first mgpp_passes run (-T1) that collects the text
# statistics, mgpp_compression_dict to build the compression dictionary,
# and a second mgpp_passes run (-T2) that compresses the text.  In debug
# mode no external program is started and the documents are written to
# STDOUT instead.
#
# Note that the executables are checked for existence by their full path
# under $GSDLHOME/bin/$GSDLOS but are started by their bare name.
#
# Dies if one of the executables is missing or a pipe cannot be opened.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them
    my $mgpp_passes_sections = "";
    my ($doc_level) = $self->{'doc_level'};
    $mgpp_passes_sections .= "-J " . $level_map{$doc_level} . " ";
    foreach my $level (keys %{$self->{'levels'}}) {
        if ($level ne $doc_level && $level ne "paragraph") {
            $mgpp_passes_sections .= "-K " . $level_map{$level} . " ";
        }
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    print $outhandle "\n collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    # $handle holds the *name* of the output handle.  The second pass
    # reopens the pipe under the same name, so the handle given to the
    # document processor below stays valid for both passes.
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
        $handle = mgppbuilder::PIPEOUT;
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    if ($self->{'no_text'}) {
        $self->{'buildproc'}->set_store_text(0);
    } else {
        $self->{'buildproc'}->set_store_text(1);
    }
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # finish the first pass (in debug mode the handle is STDOUT, which is
    # left open)
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # open the pipe for the second pass under the same handle name
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();

    # compress the text
    print $outhandle "\n compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
412
# Decides whether an index should be built.
#
# Each entry of the "dontbuild" list in the collection configuration is
# used as a pattern that has to match the whole index name.  On the first
# match the index is recorded in $self->{'notbuilt'} and 0 is returned;
# otherwise 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        #push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
429
# Builds the indexes of the collection.
#
# If $indexname contains a word character only that index is built,
# otherwise every index listed in the collection configuration is built.
# The index-to-directory mapping is stored in $self->{'index_mapping'}
# first, and the final field list is made once all indexes are done.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }

    # define the final field lists
    $self->make_final_field_list();
}
462
# Creates directory names for each of the index descriptions.
#
# $indexes is a reference to a list of index descriptions of the form
# "fields[:subcollection[:languages]]".
#
# Returns a reference to a hash that contains
#   - for every index description, its directory name,
#   - 'indexmap', 'subcollectionmap' and 'languagemap': the short name of
#     each fields / subcollection / languages part,
#   - 'indexmaporder', 'subcollectionmaporder' and 'languagemaporder': the
#     parts in the order they were first seen,
#   - each fields, subcollection and languages part itself as a key that
#     maps to its short name.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();

    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # short name => full name, kept separately for each kind of part so
    # that make_unique can tell which part of a directory name clashes
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # the directory name starts with a processed version of index fields
        #my ($pindex) = $self->process_field($fields);
        #$pindex = lc ($pindex);
        # now we only ever have one index, and its called 'idx'
        my $pindex = 'idx';

        # next comes a processed version of the subcollection if there is one.
        my $psub = lc ($self->process_field ($subcollection));

        # next comes a processed version of the language if there is one.
        my $plang = lc ($self->process_field ($languages));

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            if (!defined $mapping{"$fields"}) {
                $mapping{"$fields"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember which names are taken now
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$fields";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
534
# Returns a processed (shortened) version of a field.
# If the field has only one component, the processed version contains the
# first character and the next consonant of that component (or simply the
# first two characters when there is no later consonant).  Otherwise it
# contains the first character of each of the first two components.
# An undefined field, or one without any word character, gives "".
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;

    if (scalar @components < 2) {
        # single component: first character plus the next consonant
        my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
        unless (defined $first && defined $second) {
            # no consonant to be found - fall back to the first two characters
            ($first, $second) = $field =~ /^(.)(.)/;
        }
        return "$first$second";
    }

    # several components: the initial of each of the first two
    my @initials = ();
    foreach my $component (@components[0, 1]) {
        (my $initial = $component) =~ s/^(.).*$/$1/;
        push (@initials, $initial);
    }
    return join("", @initials);
}
557
# Makes a clashing directory name unique.
#
# $namehash maps, per kind of part ('index', 'subcollection', 'languages'),
# each short name already in use to the full name it stands for.  $index is
# the index description being named; $indexref, $subref and $langref are
# references to its three short names.
#
# The first short name that is already used for a different full name is
# moved on to its next version (at most one of them per call), and the
# concatenation of the three short names is returned.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    # find the part whose short name belongs to somebody else
    my $clashing;
    if ($namehash->{'index'}->{$$indexref} ne "$fields") {
        $clashing = $indexref;
    }
    elsif ($namehash->{'subcollection'}->{$$subref} ne $subcollection) {
        $clashing = $subref;
    }
    elsif ($namehash->{'languages'}->{$$langref} ne $languages) {
        $clashing = $langref;
    }

    $self->get_next_version ($clashing) if defined $clashing;

    return join ("", $$indexref, $$subref, $$langref);
}
572
# Moves the short name referenced by $nameref on to its next version, in
# place:
#   - a name ending in one or two digits has that number incremented
#     ("s0" -> "s1", "s9" -> "s10", "s10" -> "s11"),
#   - any other name has its last character replaced by "0" ("su" -> "s0"),
#   - an empty name becomes "0".
# The name is changed on every call, which the collision loop in
# create_index_mapping relies on to terminate.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        # ++ on the captured string keeps a leading zero ("05" -> "06")
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        # only one trailing digit here, so replace just that digit;
        # "9" rolls over to "10"
        my $num = $1; $num ++;
        $$nameref =~ s/\d$/$num/;
    } elsif ($$nameref eq "") {
        # there is no character to replace
        $$nameref = "0";
    } else {
        $$nameref =~ s/.$/0/;
    }
    return;
}
588
# Builds one index.
#
# $index is an index description "fields[:subcollection[:language]]"; its
# directory name is looked up in $self->{'index_mapping'}.
#
# Steps: create the index dictionary (mgpp_passes -I1), create the perfect
# hash function, invert the text (mgpp_passes -I2), create the weights
# file, the 'on-disk' stemmed dictionary and the three stem indexes.  In
# debug mode no external program is started and the documents are written
# to STDOUT instead.
#
# The executables are checked for existence by their full path under
# $GSDLHOME/bin/$GSDLOS but are started by their bare name.
#
# If the index dictionary (.id file) does not exist after the first pass
# the index is recorded in $self->{'notbuilt'} and the sub returns early.
# Dies if an executable is missing or a pipe cannot be opened.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
                                               $indexdir,
                                               $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";

    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");

    # define the section names and possibly the doc name for mgpasses
    my $mgpp_passes_sections = "";
    my ($doc_level) = $self->{'doc_level'};
    $mgpp_passes_sections .= "-J " . $level_map{$doc_level} ." ";

    foreach my $level (keys %{$self->{'levels'}}) {
        if ($level ne $doc_level) {
            $mgpp_passes_sections .= "-K " . $level_map{$level}. " ";
        }
    }

    my $mgpp_perf_hash_build_exe =
        &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcoll (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index

    # this puts a separate Language/en entry in for each language in the list
    # is this what we want?
    # should we just have one entry with Language/en,es/ ??
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    foreach my $lang (@languages) {
        # a leading "!" excludes the language; it is kept in front of the
        # expression
        my $negate = ($lang =~ s/^\!//) ? "!" : "";
        push (@$indexexparr, "${negate}Language/$lang/");
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};

    # $handle holds the *name* of the output handle.  The second pass
    # reopens the pipe under the same name, so the handle given to the
    # document processor below stays valid for both passes.
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I1 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
        $handle = mgppbuilder::PIPEOUT;
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_store_text(1);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fall over.
    # we check on the .id file - index dictionary
    # NOTE(review): in debug mode no tool has run at this point, so unless
    # an earlier build left the .id file behind this returns here - confirm
    # that this is intended.
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        # open the pipe for the second pass under the same handle name
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }
        system ("mgpp_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
        system ("mgpp_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
        system ("mgpp_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");

        # remove unwanted files
        # (the removal itself is commented out, so the files are only
        # reported at verbosity above 2)
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
791
792sub make_infodatabase {
793 my $self = shift (@_);
794 my $outhandle = $self->{'outhandle'};
795
796
797 my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
798 my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
799 &util::mk_all_dir ($textdir);
800 &util::mk_all_dir ($assocdir);
801
802 # get db name
803 my $dbext = ".bdb";
804 $dbext = ".ldb" if &util::is_little_endian();
805 my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
806 $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);
807
808 my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
809 my $exe = &util::get_os_exe ();
810 my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");
811
812 # define the indexed field mapping if not already done so (ie if infodb called separately from build_index)
813 if (!defined $self->{'build_cfg'}) {
814 $self->read_final_field_list();
815 }
816 print $outhandle "\n*** creating the info database and processing associated files\n"
817 if ($self->{'verbosity'} >= 1);
818 print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};
819
820 # init all the classifiers
821 &classify::init_classifiers ($self->{'classifiers'});
822
823 # set up the document processor
824 my ($handle);
825 if ($self->{'debug'}) {
826 $handle = STDOUT;
827 } else {
828 if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
829 print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
830 die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
831 }
832 $handle = mgppbuilder::PIPEOUT;
833 }
834
835 $self->{'buildproc'}->set_output_handle ($handle);
836 $self->{'buildproc'}->set_mode ('infodb');
837 $self->{'buildproc'}->set_assocdir ($assocdir);
838 $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
839 $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
840 $self->{'buildproc'}->set_indexing_text (0);
841 $self->{'buildproc'}->set_store_text(1);
842 #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
843
844 $self->{'buildproc'}->reset();
845
846 # do the collection info
847 print $handle "[collection]\n";
848
849 # first do the collection meta stuff - everything without a dot
850 my $collmetadefined = 0;
851 if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
852 $collmetadefined = 1;
853 foreach $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
854 next if ($cmeta =~ /^\./); # for now, ignore ones with dots
855 my ($metadata_entry) = $self->create_language_db_map($cmeta, $cmeta);
856 #write the entry to the file
857 print $handle $metadata_entry;
858
859 } # foreach collmeta key
860 }
861 #add the index field macros to [collection]
862 # eg <TI>Title
863 # <SU>Subject
864 # these now come from collection meta. if that is not defined, usses the metadata name
865 $field_entry="";
866 foreach $longfield (@{$self->{'build_cfg'}->{'indexfields'}}){
867 $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
868 next if $shortfield eq 1;
869
870 # we need to check if some coll meta has been defined
871 my $collmeta = ".$longfield";
872 if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
873 $metadata_entry = $self->create_language_db_map($collmeta, $shortfield);
874 $field_entry .= $metadata_entry;
875 } else { #use the metadata names, or the text macros for allfields and textonly
876 if ($longfield eq "allfields") {
877 $field_entry .= "<$shortfield>_query:textallfields_\n";
878 } elsif ($longfield eq "text") {
879 $field_entry .= "<$shortfield>_query:texttextonly_\n";
880 } else {
881 $field_entry .= "<$shortfield>$longfield\n";
882 }
883 }
884 }
885 print $handle $field_entry;
886
887 # now add the level names
888 $level_entry = "";
889 foreach $level (@{$self->{'collect_cfg'}->{'levels'}}) {
890 my $collmeta = ".$level"; # based on the original specification
891 $level =~ tr/A-Z/a-z/; # make it lower case
892 my $levelid = $level_map{$level}; # find the actual value we used in the index
893 if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
894 $metadata_entry = $self->create_language_db_map($collmeta, $levelid);
895 $level_entry .= $metadata_entry;
896 } else {
897 # use the default macro
898 $level_entry .= "<$levelid>" . $level_map{$levelid} . "\n";
899 }
900 }
901 print $handle $level_entry;
902
903 # now add subcoll meta
904 $subcoll_entry = "";
905 foreach $subcoll (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
906 if (defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$subcoll"}) {
907 my $shortname = $self->{'index_mapping'}->{$subcoll};
908 $one_entry = $self->create_language_db_map(".$subcoll", $shortname);
909 $subcoll_entry .= $one_entry;
910 } else {
911 $subcoll_entry .= "<$shortname>$subcoll\n";
912 }
913 }
914 print $handle $subcoll_entry;
915 # now add language meta
916 $lang_entry = "";
917 foreach $lang (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
918 if (defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$lang"}) {
919 my $shortname = $self->{'index_mapping'}->{$lang};
920 $one_entry = $self->create_language_db_map(".$lang", $shortname);
921 $lang_entry .= $one_entry;
922 } else {
923 $lang_entry .= "<$shortname>$lang\n";
924 }
925 }
926 print $handle $lang_entry;
927 #end the collection entry
928 print $handle "\n" . ('-' x 70) . "\n";
929
930
931
932 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
933 "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
934
935 # output classification information
936 &classify::output_classify_info ($self->{'classifiers'}, $handle,
937 $self->{'allclassifications'},
938 $self->{'gli'});
939
940 #output doclist
941 my @doclist = $self->{'buildproc'}->get_doc_list();
942 my $docs = join (";",@doclist);
943 print $handle "[browselist]\n";
944 print $handle "<hastxt>0\n";
945 print $handle "<childtype>VList\n";
946 print $handle "<numleafdocs>" . ($#doclist+1) . "\n";
947 print $handle "<thistype>Invisible\n";
948 print $handle "<contains>$docs";
949 print $handle "\n" . ('-' x 70) . "\n";
950 close ($handle) if !$self->{'debug'};
951
952 print STDERR "</Stage>\n" if $self->{'gli'};
953}
954
# Build the info-database lines for one collectionmeta item, covering every
# language the item is defined for.
#
# Arguments:
#   $metaname - key into $self->{'collect_cfg'}->{'collectionmeta'}
#   $mapname  - tag name used in the generated lines
#
# Returns a string of newline-terminated lines.  The first line is always
# the default, "<mapname>value"; it is followed by one "<mapname:lang>value"
# line for each "[l=lang]" key, in sorted key order so that repeated builds
# produce identical output.
#
# The default value is chosen as follows:
#   1. the value stored under the 'default' key, if there is one;
#   2. otherwise the English value ("en", or failing that an "en_XX" variant);
#   3. otherwise the value of the first key in sorted order.
sub create_language_db_map {
    my $self = shift (@_);
    my ($metaname, $mapname) = @_;

    # Tolerate a missing entry: the result is then just an empty default line.
    my $values = $self->{'collect_cfg'}->{'collectionmeta'}->{$metaname} || {};

    # Hash order is arbitrary, so sort to get a stable result.
    my @keys = sort keys %{$values};
    my $fallback = @keys ? $values->{$keys[0]} : "";

    my $english;
    my $has_default = 0;
    my $language_entries = "";

    foreach my $key (@keys) {
        if ($key eq 'default') {
            $has_default = 1;
            next;
        }

        # Language-specific keys look like "[l=en]"; skip anything else.
        my ($code) = $key =~ /^\[l=(\w*)\]$/;
        next unless $code;

        my $value = $values->{$key};
        $language_entries .= "<${mapname}:${code}>" . $value . "\n";

        # Only genuine English codes qualify as the default: an exact "en"
        # always wins, a regional variant is used only when no English value
        # has been seen yet.  Codes that merely contain "en" are ignored.
        if ($code =~ /^en$/i || (!defined $english && $code =~ /^en_/i)) {
            $english = $value;
        }
    }

    my $default;
    if ($has_default) {
        $default = $values->{'default'};
    }
    elsif (defined $english) {
        $default = $english;
    }
    else {
        $default = $fallback;
    }

    # The default entry always goes first.
    return "<$mapname>$default\n" . $language_entries;
}
# Hook for collection-specific processing.  This implementation does
# nothing beyond taking the object off the argument list.
sub collect_specific {
    my $self = shift;
}
999
# At the end of building we have an indexfieldmap with all the mappings (plus
# some extras), and an indexmap containing any indexes that weren't specified
# in the index definition.
# We want to make an ordered list of the fields that are indexed, and a list
# of the mappings that are used. These will be used for the build.cfg file
# and for the collection meta definition.
# We store these in $self->{'build_cfg'}.
# Work out the final, ordered list of indexed fields and their short-name
# mappings, and store both in a fresh $self->{'build_cfg'}:
#   'indexfields'   - array ref of field names
#   'indexfieldmap' - array ref of "field->SHORT" strings
#
# Fields are taken from $self->{'collect_cfg'}->{'indexes'} in the order
# they are first mentioned.  The pseudo-field "metadata" is expanded to every
# field recorded in $self->{'buildproc'}->{'indexfields'} that was not
# named explicitly; those are added in sorted order so that the result is
# the same on every build.
sub make_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    my @indexfieldmap = ();
    my @indexfields = ();

    # Record every field named in the index definition, so that expanding
    # "metadata" below does not add a field that is already listed.
    my %specified = ();
    my @specified_order = ();
    foreach my $indexdef (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff (everything from the first ':' onwards)
        my $fieldlist = $indexdef;
        $fieldlist =~ s/:.*$//;
        foreach my $name (split(',', $fieldlist)) {
            next if defined $specified{$name};
            $specified{$name} = 1;
            push (@specified_order, $name);
        }
    }

    foreach my $field (@specified_order) {
        if ($field eq "metadata") {
            # Sorted, because hash key order is arbitrary.
            foreach my $newfield (sort keys %{$self->{'buildproc'}->{'indexfields'}}) {
                next if defined $specified{$newfield};
                push (@indexfieldmap,
                      $newfield . "->" . $self->{'buildproc'}->{'indexfieldmap'}->{$newfield});
                push (@indexfields, $newfield);
            }
        }
        elsif ($field eq 'text') {
            push (@indexfieldmap, "text->TX");
            push (@indexfields, "text");
        }
        elsif ($field eq 'allfields') {
            push (@indexfieldmap, "allfields->ZZ");
            push (@indexfields, "allfields");
        }
        else {
            push (@indexfieldmap,
                  $field . "->" . $self->{'buildproc'}->{'indexfieldmap'}->{$field});
            push (@indexfields, $field);
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
1055
1056
# Recreate the field list from the build.cfg file. Look first in the building directory, then in the index directory. If there is no build.cfg, we can't do the field list (there is unlikely to be any index anyway).
# Rebuild the field list from an existing build.cfg file.
#
# Resets $self->{'build_cfg'} to an empty hash, then looks for build.cfg
# in $self->{'build_dir'} and, failing that, in the "index" directory under
# $ENV{'GSDLCOLLECTDIR'}.  If neither file exists the sub returns early and
# 'build_cfg' stays empty.  Otherwise the 'indexfields' and 'indexfieldmap'
# lists from the file are copied into $self->{'build_cfg'}, and every
# "field->SHORT" mapping is also recorded in
# $self->{'buildproc'}->{'indexfieldmap'}.
sub read_final_field_list {
    my $self = shift (@_);
    $self->{'build_cfg'} = {};
    my @indexfieldmap = ();
    my @indexfields = ();

    if (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) {
        # set the default mapping
        $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    }

    # we read the stuff in from the build.cfg file - if it's there
    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, "build.cfg");
    if (!-e $buildconfigfile) {
        # try the index dir
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg");
        if (!-e $buildconfigfile) {
            # we can't find a config file - just ignore the field list
            return;
        }
    }

    my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            push (@indexfields, "$field");
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $entry (@{$buildcfg->{'indexfieldmap'}}) {
            # The list itself is copied verbatim ...
            push (@indexfieldmap, "$entry");

            # ... but only well-formed "field->SHORT" entries are entered in
            # the buildproc map; a malformed entry would otherwise create a
            # bogus mapping under an empty key.
            my ($name, $short) = $entry =~ /^(.*)->(.*)$/;
            next unless defined $name;
            $self->{'buildproc'}->{'indexfieldmap'}->{$name} = $short;
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
# Assemble the build configuration and write it to build.cfg inside
# $self->{'build_dir'}.
#
# Starts from $self->{'build_cfg'} when it is defined (otherwise from an
# empty hash) and adds the build date and type, the index levels, the text
# level, document and byte counts obtained from the buildproc, and the
# index, subcollection and language maps, plus the list of indexes that
# were not built.
sub make_auxiliary_files {
    my $self = shift (@_);

    # An existing build_cfg already includes indexfieldmap and indexfields.
    my $cfg = (defined $self->{'build_cfg'}) ? $self->{'build_cfg'} : {};

    my $log = $self->{'outhandle'};
    if ($self->{'verbosity'} >= 1) {
        print $log "\n*** creating auxiliary files \n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreatingAuxilary'>\n";
    }

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # build date and type
    $cfg->{'builddate'} = time;
    $cfg->{'buildtype'} = "mgpp"; #do we need this??

    # level info, translated through %level_map
    my @level_ids = ();
    foreach my $level_name (@{$self->{'levelorder'}}) {
        push (@level_ids, $level_map{$level_name});
    }
    $cfg->{'indexlevels'} = \@level_ids;

    my $text_level_name = $self->{'levels'}->{'section'} ? 'section' : 'document';
    $cfg->{'textlevel'} = $level_map{$text_level_name};

    # number of documents and number of bytes
    $cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # mapping between index names and directory names, leaving out
    # anything flagged as not built
    my @index_entries = ();
    foreach my $index_name (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        next if defined ($self->{'notbuilt'}->{$index_name});
        push (@index_entries,
              $index_name . "->" . $self->{'index_mapping'}->{'indexmap'}->{$index_name});
    }
    $cfg->{'indexmap'} = \@index_entries;

    # subcollection map - only stored when non-empty
    my @subcoll_entries = ();
    foreach my $subcoll_name (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcoll_entries,
              $subcoll_name . "->" . $self->{'index_mapping'}->{'subcollectionmap'}->{$subcoll_name});
    }
    if (scalar (@subcoll_entries)) {
        $cfg->{'subcollectionmap'} = \@subcoll_entries;
    }

    # language map - only stored when non-empty
    my @language_entries = ();
    foreach my $language_name (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@language_entries,
              $language_name . "->" . $self->{'index_mapping'}->{'languagemap'}->{$language_name});
    }
    if (scalar (@language_entries)) {
        $cfg->{'languagemap'} = \@language_entries;
    }

    # names of the indexes that were not built - only stored when non-empty
    my @skipped = ();
    foreach my $skipped_name (keys %{$self->{'notbuilt'}}) {
        push (@skipped, $skipped_name);
    }
    if (scalar (@skipped)) {
        $cfg->{'notbuilt'} = \@skipped;
    }

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $cfg,
                             '^(builddate|buildtype|numdocs|numbytes|textlevel)$',
                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields|indexlevels)$');

    if ($self->{'gli'}) {
        print STDERR "</Stage>\n";
    }
}
1172
# Clean-up hook.  This implementation does nothing beyond taking the
# object off the argument list.
sub deinit {
    my $self = shift;
}
1176
# Print a short statistics report for the current pass to
# $self->{'outhandle'}: which index was created (or which text was
# compressed), the total bytes in the collection and the bytes processed.
# A warning banner is added when fewer than 50 bytes were processed and
# the pass was either an indexing pass or a text pass with 'no_text' unset.
sub print_stats {
    my $self = shift (@_);

    my $out = $self->{'outhandle'};
    my $is_indexing = $self->{'buildproc'}->get_indexing_text();
    my $index_name = $self->{'buildproc'}->get_index();
    my $total_bytes = $self->{'buildproc'}->get_num_bytes();
    my $processed_bytes = $self->{'buildproc'}->get_num_processed_bytes();

    my $heading = $is_indexing
        ? "Stats (Creating index $index_name)\n"
        : "Stats (Compressing text from $index_name)\n";
    print $out $heading;
    print $out "Total bytes in collection: $total_bytes\n";
    print $out "Total bytes in $index_name: $processed_bytes\n";

    # Text was expected on an indexing pass, or on a compression pass
    # unless 'no_text' is set.
    my $text_expected = ($is_indexing || !$self->{'no_text'});
    if ($processed_bytes < 50 && $text_expected) {
        my $warning = $is_indexing
            ? "WARNING: There is very little or no text to process for $index_name\n"
            : "WARNING: There is very little or no text to compress\n";
        print $out "***************\n";
        print $out $warning;
        print $out " Was this your intention?\n";
        print $out "***************\n";
    }

}
1206
12071;
1208
1209
Note: See TracBrowser for help on using the repository browser.