source: trunk/gsdl/perllib/mgppbuilder.pm@ 10113

Last change on this file since 10113 was 9938, checked in by kjdon, 19 years ago

Added a new field to build.cfg: indexstem. It specifies the root of the index/gdbm filenames, so a collection can now be renamed and will still work without rebuilding.

  • Property svn:keywords set to Author Date Id Revision
File size: 42.3 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
35
# While this module is loaded, keep STDOUT and STDERR unbuffered so that
# output from mgpp and from the plugins does not get out of sync.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Switch autoflush back off for both streams at interpreter shutdown.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
47
# maximum document size
our $maxdocsize = 12000;

# Two lookups share this table:
#   lower-case level name -> short level tag
#   short level tag       -> default display macro for that level
our %level_map = (
    'document'  => 'Doc',
    'section'   => 'Sec',
    'paragraph' => 'Para',
    'Doc'       => '_textdocument_',
    'Sec'       => '_textsection_',
    'Para'      => '_textparagraph_',
);

#$doc_level = "Doc";
#$sec_level = "Sec";
#$para_level = "Para";

# file suffixes in an index directory that are considered wanted
our %wanted_index_files =
    map { $_ => 1 } qw(td t tl ti idb ib1 ib2 ib3 i il w wa);

# change this so a user can add their own ones in via a file or cfg
# Long field names map to their short codes.  Every short code, the query
# operators (AND, OR, NOT, NEAR) and the level names (Doc, Sec, Para) are
# also present with the value 1 - these cannot be used as field names.
our %static_indexfield_map = do {
    my %short_for = (
        'Title'        => 'TI',
        'Subject'      => 'SU',
        'Creator'      => 'CR',
        'Organization' => 'ORG',
        'Source'       => 'SO',
        'Howto'        => 'HT',
        'ItemTitle'    => 'IT',
        'ProgNumber'   => 'PN',
        'People'       => 'PE',
        'Coverage'     => 'CO',
        'allfields'    => 'ZZ',
        'text'         => 'TX',
    );
    my @reserved = qw(AND OR NOT NEAR Doc Sec Para);

    (%short_for, map { $_ => 1 } (values(%short_for), @reserved));
};
108
# Constructor for an mgppbuilder object.
#
# Positional arguments: collection, source_dir, build_dir, verbosity,
# maxdocs, debug, keepold, remove_empty_classifications, outhandle,
# no_text, failhandle, gli.
#   - outhandle defaults to STDERR, no_text and gli default to 0.
#   - failhandle is unpacked but not referenced again in this sub.
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg (dies if it does not exist), rewrites
# the 'indexes' entry of the parsed config, works out the indexing levels,
# loads plugins and classifiers, and instantiates the build processor.
# Dies if no plugins load, if neither the document nor the section level is
# configured, or if creating the build processor fails.
# Returns the blessed object.
109sub new {
110 my $class = shift(@_);
111
112 my ($collection, $source_dir, $build_dir, $verbosity,
113 $maxdocs, $debug, $keepold, $remove_empty_classifications,
114 $outhandle, $no_text, $failhandle, $gli) = @_;
115
# NOTE(review): STDERR is a bareword here; no `use strict` is visible in this
# file, so it is presumably stored as the string "STDERR" - confirm before
# enabling strictures (build_index compares the handle against that string).
116 $outhandle = STDERR unless defined $outhandle;
117 $no_text = 0 unless defined $no_text;
118
119 # create an mgppbuilder object
120 my $self = bless {'collection'=>$collection,
121 'source_dir'=>$source_dir,
122 'build_dir'=>$build_dir,
123 'verbosity'=>$verbosity,
124 'maxdocs'=>$maxdocs,
125 'debug'=>$debug,
126 'keepold'=>$keepold,
127 'remove_empty_classifications'=>$remove_empty_classifications,
128 'outhandle'=>$outhandle,
129 'no_text'=>$no_text,
130 'notbuilt'=>{}, # indexes not built
131 'indexfieldmap'=>\%static_indexfield_map,
132 'gli'=>$gli
133 }, $class;
134
135 $self->{'gli'} = 0 unless defined $self->{'gli'};
136
137 # read in the collection configuration file
138 my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
139 if (!-e $colcfgname) {
140 die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n";
141 }
142 $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
143
144 # sort out the indexes
145 #indexes are specified with spaces, but we put them into one index
# the configured index names are joined with commas into a single entry
146 my $indexes = $self->{'collect_cfg'}->{'indexes'};
147 $self->{'collect_cfg'}->{'indexes'} = [];
148 push (@{$self->{'collect_cfg'}->{'indexes'}}, join(',', @$indexes));
149
150
151 # sort out subcollection indexes
# each existing index entry is expanded to "index:subcollection"
152 if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
153 my $indexes = $self->{'collect_cfg'}->{'indexes'};
154 $self->{'collect_cfg'}->{'indexes'} = [];
155 foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
156 foreach my $index (@$indexes) {
157 push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
158 }
159 }
160 }
161
162 # sort out language subindexes
# each entry gains ":language"; when no subcollections are configured an
# empty subcollection field is inserted so the language is always third
163 if (defined $self->{'collect_cfg'}->{'languages'}) {
164 my $indexes = $self->{'collect_cfg'}->{'indexes'};
165 $self->{'collect_cfg'}->{'indexes'} = [];
166 foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
167 foreach my $index (@$indexes) {
168 if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
169 push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
170 }
171 else { # add in an empty subcollection field
172 push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
173
174 }
175 }
176 }
177 }
178
179 # make sure that the same index isn't specified more than once
# (first occurrence wins, original order is kept)
180 my %tmphash = ();
181 my @tmparray = @{$self->{'collect_cfg'}->{'indexes'}};
182 $self->{'collect_cfg'}->{'indexes'} = [];
183 foreach my $i (@tmparray) {
184 if (!defined ($tmphash{$i})) {
185 push (@{$self->{'collect_cfg'}->{'indexes'}}, $i);
186 $tmphash{$i} = 1;
187 }
188 }
189
190
191 # get the levels (Section, Paragraph) for indexing and compression
192 $self->{'levels'} = {};
# NOTE(review): assigning () to a scalar slot stores undef; the pushes below
# turn 'levelorder' into an array reference by autovivification.
193 $self->{'levelorder'} = ();
194 if (defined $self->{'collect_cfg'}->{'levels'}) {
# $level aliases the elements of the configured list, so the tr/// below
# lower-cases the entries stored in collect_cfg->{'levels'} in place
195 foreach my $level ( @{$self->{'collect_cfg'}->{'levels'}} ){
196 $level =~ tr/A-Z/a-z/;
197 $self->{'levels'}->{$level} = 1;
198 push (@{$self->{'levelorder'}}, $level);
199 }
200 } else { # default to document
201 $self->{'levels'}->{'document'} = 1;
202 push (@{$self->{'levelorder'}}, 'document');
203 }
204
# doc_level is "document" when that level is configured, otherwise "section";
# having neither is fatal
205 $self->{'doc_level'} = "document";
206 if (! $self->{'levels'}->{'document'}) {
207 if ($self->{'levels'}->{'section'}) {
208 $self->{'doc_level'} = "section";
209 } else {
210 die "you must have either document or section level specified!!\n";
211 }
212 }
213
214 print $outhandle "doclevel = ". $self->{'doc_level'}."\n";
215 # get the list of plugins for this collection
216
217 #build up the extra global options for the plugins
218 my @global_opts = ();
219 if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
220 push @global_opts, "-separate_cjk";
221 }
222
223 my $plugins = [];
224 if (defined $self->{'collect_cfg'}->{'plugin'}) {
225 $plugins = $self->{'collect_cfg'}->{'plugin'};
226 }
227
228 # load all the plugins
229 $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, \@global_opts);
230 if (scalar(@{$self->{'pluginfo'}}) == 0) {
231 print $outhandle "No plugins were loaded.\n";
232 die "\n";
233 }
234
235 # get the list of classifiers for this collection
236 my $classifiers = [];
237 if (defined $self->{'collect_cfg'}->{'classify'}) {
238 $classifiers = $self->{'collect_cfg'}->{'classify'};
239 }
240
241 # load all the classifiers
242 $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);
243
244 # load up any dontgdbm fields
245 $self->{'dontgdbm'} = {};
246 if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
247 foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
248 $self->{'dontgdbm'}->{$dg} = 1;
249 }
250 }
251
252 # load up the document processor for building
253 # if a buildproc class has been created for this collection, use it
254 # otherwise, use the mgpp buildproc
255 my ($buildprocdir, $buildproctype);
256 if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
257 $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
258 $buildproctype = "${collection}buildproc";
259 } else {
260 $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
261 $buildproctype = "mgppbuildproc";
262 }
263 require "$buildprocdir/$buildproctype.pm";
264
# the class name is only known at run time, hence the string eval;
# NOTE(review): the collection name is interpolated into evaluated code
265 eval("\$self->{'buildproc'} = new $buildproctype(\$collection, " .
266 "\$source_dir, \$build_dir, \$verbosity, \$outhandle)");
267 die "$@" if $@;
268
269 $self->{'buildtype'} = "mgpp";
270
271 return $self;
272}
273
# Prepare the build directory.  Unless the builder is in debug mode or was
# asked to keep the old build, the existing build directory is removed and
# recreated together with its text subdirectory.
sub init {
    my ($self) = @_;

    my $start_afresh = !$self->{'debug'} && !$self->{'keepold'};
    if ($start_afresh) {
        my $build_dir = $self->{'build_dir'};

        # remove any old builds
        &util::rm_r($build_dir);
        &util::mk_all_dir($build_dir);

        # make the text directory
        &util::mk_all_dir("$build_dir/text");
    }
}
287
# Record the strip_html setting on this builder and pass the same value on
# to the build processor.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    return $self->{'buildproc'}->set_strip_html($strip);
}
295
# Create the compressed text for the collection.
#
# Argument: $textindex - passed straight to the build processor's set_index.
#
# Steps (all output files use the prefix <build_dir>/text/<collection>):
#   1. pipe every document through "mgpp_passes -T1" to collect statistics
#   2. run mgpp_compression_dict to create the compression dictionary
#   3. pipe every document through "mgpp_passes -T2" to compress the text
# In debug mode no external program is started: the documents are written to
# STDOUT instead and step 2 is skipped.
#
# Dies if a required executable is missing from $GSDLHOME/bin/$GSDLOS or if
# a pipe cannot be opened.  When 'gli' is set, progress and error tags are
# written to STDERR.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them
    my $mgpp_passes_sections = "";
    my ($doc_level) = $self->{'doc_level'};
    $mgpp_passes_sections .= "-J " . $level_map{$doc_level} . " ";
    foreach my $level (keys %{$self->{'levels'}}) {
        if ($level ne $doc_level && $level ne "paragraph") {
            $mgpp_passes_sections .= "-K " . $level_map{$level} . " ";
        }
    }
    $mgpp_passes_sections .= "-K SENT ";
    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    print $outhandle "\n collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    # $handle holds the *name* of the output handle; the later open/close
    # calls resolve that name symbolically, as in the rest of this file
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        # the existence test uses the full path; the command itself is run
        # by bare name
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) {
            # self-closing tag, matching every other FatalError this file emits
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
        $handle = mgppbuilder::PIPEOUT;
    }

    # gdbm_level
    my $gdbm_level = "document";
    if ($self->{'levels'}->{'section'}) {
        $gdbm_level = "section";
    }

    # set up the document processor for the text passes
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    if ($self->{'no_text'}) {
        $self->{'buildproc'}->set_store_text(0);
    } else {
        $self->{'buildproc'}->set_store_text(1);
    }
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_gdbm_level ($gdbm_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # end of the first pass: close the pipe exactly once (in debug mode
    # $handle is STDOUT and must stay open)
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # reopen the pipe for the second pass
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
424
# Decide whether $index should be built.
# Each 'dontbuild' entry of the collection configuration is used as a
# pattern that must match the whole index name.  On the first match the
# index is recorded in $self->{'notbuilt'} and 0 is returned; otherwise
# (or when no 'dontbuild' list is configured) 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined $dontbuild;

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
441
# Build either the single index named by $indexname (when it contains a word
# character) or every index listed in the collection configuration.
# The index-to-directory mapping is stored in $self->{'index_mapping'};
# indexes rejected by want_built are reported and skipped.  The final field
# list is produced once all indexes have been processed.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }

    #define the final field lists
    $self->make_final_field_list();
}
474
# Work out a directory name for every index description in @$indexes
# (each of the form "fields[:subcollection[:languages]]").
#
# Returns a hash reference holding
#   - each full index description      -> its directory name
#   - 'indexmap', 'subcollectionmap', 'languagemap' (original -> short name)
#   - 'indexmaporder', 'subcollectionmaporder', 'languagemaporder'
#     (original names in first-seen order)
#   - each field list, subcollection and language string -> its short name
#     (these are used for collectionmeta later on, since the full index
#     name is not used on the query page)
sub create_index_mapping {
    my ($self, $indexes) = @_;

    my %mapping = (
        'indexmaporder'         => [],
        'subcollectionmaporder' => [],
        'languagemaporder'      => [],
    );

    # directory names already taken, used to check for collisions; starts
    # off with the mandatory directory names
    my %dirnames = ('text' => 'text', 'extra' => 'extra');
    # short name -> original name, per component, as input for make_unique
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);
        my $fieldkey = "$fields";

        # there is only ever one index now, and it is called 'idx'
        my $pindex = 'idx';
        # then come processed versions of the subcollection and of the
        # language, if there are any
        my $psub  = lc ($self->process_field ($subcollection));
        my $plang = lc ($self->process_field ($languages));

        # keep adjusting the components until the directory name is unique
        my $dirname = $pindex . $psub . $plang;
        until (!defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        unless (defined $mapping{'indexmap'}{$fieldkey}) {
            $mapping{'indexmap'}{$fieldkey} = $pindex;
            push (@{$mapping{'indexmaporder'}}, $fieldkey);
            $mapping{$fieldkey} = $pindex unless defined $mapping{$fieldkey};
        }
        foreach my $part (['subcollection', $subcollection, $psub],
                          ['language', $languages, $plang]) {
            my ($kind, $original, $short) = @$part;
            next unless $short =~ /\w/;
            next if defined ($mapping{"${kind}map"}{$original});

            $mapping{"${kind}map"}{$original} = $short;
            push (@{$mapping{"${kind}maporder"}}, $original);
            $mapping{$original} = $short;
        }

        $dirnames{$dirname} = $index;
        $pnames{'index'}->{$pindex} = $fieldkey;
        $pnames{'subcollection'}->{$psub} = $subcollection;
        $pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
546
# Returns a processed (shortened) version of a field.
#   - If the field has two or more comma-separated components, the result is
#     the first character of each of the first two components.
#   - Otherwise it is the first character followed by the next consonant of
#     the field; when no consonant follows, the first two characters.
#   - An undefined field, a field without any word character, or a field
#     too short to give two characters yields the empty string.
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        splice (@components, 2);
        # cut each remaining component down to its first character
        foreach my $component (@components) {
            $component =~ s/^(.).*$/$1/;
        }
        return join("", @components);
    }

    # first character plus the nearest following consonant ...
    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    # ... or simply the first two characters when there is none
    ($first, $second) = $field =~ /^(.)(.)/ unless defined $first && defined $second;

    # neither pattern matched (a one-character field, for example)
    return "" unless defined $first && defined $second;

    return "$first$second";
}
569
# Resolve a directory-name collision for $index.
# $namehash maps, per component ('index', 'subcollection', 'languages'), the
# short names already in use to the original names they stand for.  The
# first component whose short name is recorded for a different original
# name (index, then subcollection, then language) is advanced through
# get_next_version, modifying the referenced scalar.  Returns the three
# short names concatenated.
# NOTE(review): when none of the three differ, nothing is changed and the
# same name comes back.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my $clashing;
    if ($namehash->{'index'}->{$$indexref} ne "$fields") {
        $clashing = $indexref;
    }
    elsif ($namehash->{'subcollection'}->{$$subref} ne $subcollection) {
        $clashing = $subref;
    }
    elsif ($namehash->{'languages'}->{$$langref} ne $languages) {
        $clashing = $langref;
    }
    $self->get_next_version ($clashing) if defined $clashing;

    return join("", $$indexref, $$subref, $$langref);
}
584
# Advance the short name referenced by $nameref to its next version, in
# place:
#   - a name ending in two digits has those two digits incremented
#   - a name ending in one digit has it incremented (9 becomes 10)
#   - any other name has its last character replaced by 0
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        # incremented with ++ on the captured string, as before
        my $counter = $1;
        $counter++;
        $$nameref =~ s/\d\d$/$counter/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $successor = ($1 == 9) ? 10 : $1 + 1;
        $$nameref =~ s/\d$/$successor/;
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
600
# Build one index.
#
# Argument: $index - an index description "fields[:subcollection[:language]]"
# that has an entry in $self->{'index_mapping'}.
#
# Sequence: create the index directory, pipe all documents through
# "mgpp_passes -I1", check that the .id file was produced, run
# mgpp_perf_hash_build, pipe all documents through "mgpp_passes -I2", then
# run mgpp_weights_build, mgpp_invf_dict and mgpp_stem_idx (-s1, -s2, -s3).
# In debug mode the documents go to STDOUT and no external program is run.
#
# Dies when a required executable is missing from $GSDLHOME/bin/$GSDLOS or a
# pipe cannot be opened.  If the .id file is missing after the first pass the
# index is recorded in $self->{'notbuilt'} and the sub returns early.
601sub build_index {
602 my $self = shift (@_);
603 my ($index) = @_;
604 my $outhandle = $self->{'outhandle'};
605
606 # get the full index directory path and make sure it exists
607 my $indexdir = $self->{'index_mapping'}->{$index};
608 &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
609 my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
610 $indexdir,
611 $self->{'collection'});
# NOTE(review): $fulltextprefix is not referenced again in this sub
612 my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
613 $self->{'collection'});
614
615 # get any os specific stuff
616 my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
617
618 my $exe = &util::get_os_exe ();
619 my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
620
621 # define the section names for mgpasses
622 # define the section names and possibly the doc name for mgpasses
# -J names the document level, -K every other configured level
623 my $mgpp_passes_sections = "";
624 my ($doc_level) = $self->{'doc_level'};
625 $mgpp_passes_sections .= "-J " . $level_map{$doc_level} ." ";
626
627 foreach my $level (keys %{$self->{'levels'}}) {
628 if ($level ne $doc_level) {
629 $mgpp_passes_sections .= "-K " . $level_map{$level}. " ";
630 }
631 }
632
# full paths of the helper programs; these are only used for the existence
# checks below, the commands themselves are run by bare name
633 my $mgpp_perf_hash_build_exe =
634 &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
635 my $mgpp_weights_build_exe =
636 &util::filename_cat ($exedir, "mgpp_weights_build$exe");
637 my $mgpp_invf_dict_exe =
638 &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
639 my $mgpp_stem_idx_exe =
640 &util::filename_cat ($exedir, "mgpp_stem_idx$exe");
641
642 my $osextra = "";
643 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
644 $fullindexprefix =~ s@/@\\@g;
645 } else {
646 $osextra = " -d /";
# NOTE(review): string comparison of the handle; presumably relies on the
# default handle being stored as the string "STDERR" - confirm
647 if ($outhandle ne "STDERR") {
648 # so mgpp_passes doesn't print to stderr if we redirect output
649 $osextra .= " 2>/dev/null";
650 }
651 }
652
653 # get the index expression if this index belongs
654 # to a subcollection
655 my $indexexparr = [];
656 my $langarr = [];
657 # there may be subcollection info, and language info.
658 my ($fields, $subcollection, $language) = split (":", $index);
659 my @subcollections = ();
660 @subcollections = split /,/, $subcollection if (defined $subcollection);
661
# collect the configured expression of every named subcollection
662 foreach $subcollection (@subcollections) {
663 if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
664 push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
665 }
666 }
667
668 # add expressions for languages if this index belongs to
669 # a language subcollection - only put languages expressions for the
670 # ones we want in the index
671
672 my @languages = ();
673 my $language_metadata = "Language";
674 if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
675 $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
676 }
677 @languages = split /,/, $language if (defined $language);
# a leading "!" is stripped and then put back, so each entry is pushed
# onto $langarr unchanged
678 foreach my $language (@languages) {
679 my $not=0;
680 if ($language =~ s/^\!//) {
681 $not = 1;
682 }
683 if($not) {
684 push (@$langarr, "!$language");
685 } else {
686 push (@$langarr, "$language");
687 }
688 }
689
690 # Build index dictionary. Uses verbatim stem method
691 print $outhandle "\n creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
692 print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
693 my ($handle);
694 if ($self->{'debug'}) {
695 $handle = STDOUT;
696 } else {
697 if (!-e "$mgpp_passes_exe" ||
698 !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I1 $osextra")) {
699 print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
700 die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
701 }
702 $handle = mgppbuilder::PIPEOUT;
703 }
704
705 # gdbm_level
706 my $gdbm_level = "document";
707 if ($self->{'levels'}->{'section'}) {
708 $gdbm_level = "section";
709 }
710
711 # set up the document processr
712 $self->{'buildproc'}->set_output_handle ($handle);
713 $self->{'buildproc'}->set_mode ('text');
714 $self->{'buildproc'}->set_index ($index, $indexexparr);
715 $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language);
716 $self->{'buildproc'}->set_indexing_text (1);
717 $self->{'buildproc'}->set_store_text(1);
718 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
719 $self->{'buildproc'}->set_levels ($self->{'levels'});
720 $self->{'buildproc'}->set_gdbm_level ($gdbm_level);
721
# first pass over all documents
722 $self->{'buildproc'}->reset();
723 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
724 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
725 close ($handle) unless $self->{'debug'};
726
727 $self->print_stats();
728
729 # now we check to see if the required files have been produced - if not we quit building this index so the whole process doesn't crap out.
730 # we check on the .id file - index dictionary
# NOTE(review): this check also runs in debug mode, where no mgpp_passes
# process was started above
731 my $dict_file = "$fullindexprefix.id";
732 if (!-e $dict_file) {
733 print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
734 print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
735 $self->{'notbuilt'}->{$index}=1;
736 return;
737 }
738
739 if (!$self->{'debug'}) {
740 # create the perfect hash function
741 if (!-e "$mgpp_perf_hash_build_exe") {
742 print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
743 die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
744 }
745 system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");
746
# reopen the pipe for the second pass
747 if (!-e "$mgpp_passes_exe" ||
748 !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I2 $osextra")) {
749 print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
750 die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
751 }
752 }
753
754 # invert the text
755 print $outhandle "\n inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);
756 print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};
757 $self->{'buildproc'}->reset();
758 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
759 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
760
761 $self->print_stats ();
762
763 if (!$self->{'debug'}) {
764
765 close ($handle);
766
767 # create the weights file
768 print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
769 print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
770 if (!-e "$mgpp_weights_build_exe") {
771 print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
772 die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
773 }
774 system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");
775
776 # create 'on-disk' stemmed dictionary
777 print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
778 if (!-e "$mgpp_invf_dict_exe") {
779 print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
780 die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
781 }
782 system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );
783
784
785 # creates stem index files for the various stemming methods
786 print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
787 print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
788 if (!-e "$mgpp_stem_idx_exe") {
789 print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
790 die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
791 }
792 system ("mgpp_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
793 system ("mgpp_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
794 system ("mgpp_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");
795
796 # remove unwanted files
# NOTE(review): the removal call below is commented out, so files with an
# unwanted suffix are only reported (at verbosity > 2), not deleted
797 my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
798 opendir (DIR, $tmpdir) || die
799 "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
800 foreach my $file (readdir(DIR)) {
801 next if $file =~ /^\./;
802 my ($suffix) = $file =~ /\.([^\.]+)$/;
803 if (defined $suffix && !defined $wanted_index_files{$suffix}) {
804 # delete it!
805 print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
806 #&util::rm (&util::filename_cat ($tmpdir, $file));
807 }
808 }
809 closedir (DIR);
810 }
811 print STDERR "</Stage>\n" if $self->{'gli'};
812}
813
814sub make_infodatabase {
815 my $self = shift (@_);
816 my $outhandle = $self->{'outhandle'};
817
818
819 my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
820 my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
821 &util::mk_all_dir ($textdir);
822 &util::mk_all_dir ($assocdir);
823
824 # get db name
825 my $dbext = ".bdb";
826 $dbext = ".ldb" if &util::is_little_endian();
827 my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
828 $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);
829
830 my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
831 my $exe = &util::get_os_exe ();
832 my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");
833
834 # define the indexed field mapping if not already done so (ie if infodb called separately from build_index)
835 if (!defined $self->{'build_cfg'}) {
836 $self->read_final_field_list();
837 }
838 print $outhandle "\n*** creating the info database and processing associated files\n"
839 if ($self->{'verbosity'} >= 1);
840 print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};
841
842 # init all the classifiers
843 &classify::init_classifiers ($self->{'classifiers'});
844
845 # set up the document processor
846 my ($handle);
847 if ($self->{'debug'}) {
848 $handle = STDOUT;
849 } else {
850 if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
851 print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
852 die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
853 }
854 $handle = mgppbuilder::PIPEOUT;
855 }
856
857 $self->{'buildproc'}->set_output_handle ($handle);
858 $self->{'buildproc'}->set_mode ('infodb');
859 $self->{'buildproc'}->set_assocdir ($assocdir);
860 $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
861 $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
862 $self->{'buildproc'}->set_indexing_text (0);
863 $self->{'buildproc'}->set_store_text(1);
864 #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
865
866 $self->{'buildproc'}->reset();
867
868 # do the collection info
869 print $handle "[collection]\n";
870
871 # first do the collection meta stuff - everything without a dot
872 my $collmetadefined = 0;
873 if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
874 $collmetadefined = 1;
875 foreach my $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
876 next if ($cmeta =~ /^\./); # for now, ignore ones with dots
877 my ($metadata_entry) = $self->create_language_db_map($cmeta, $cmeta);
878 #write the entry to the file
879 print $handle $metadata_entry;
880
881 } # foreach collmeta key
882 }
883 #add the index field macros to [collection]
884 # eg <TI>Title
885 # <SU>Subject
886 # these now come from collection meta. if that is not defined, usses the metadata name
887 my $field_entry="";
888 my $collmeta = "";
889 foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}){
890 my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
891 next if $shortfield eq 1;
892
893 # we need to check if some coll meta has been defined
894 $collmeta = ".$longfield";
895 if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
896 my $metadata_entry = $self->create_language_db_map($collmeta, $shortfield);
897 $field_entry .= $metadata_entry;
898 } else { #use the metadata names, or the text macros for allfields and textonly
899 if ($longfield eq "allfields") {
900 $field_entry .= "<$shortfield>_query:textallfields_\n";
901 } elsif ($longfield eq "text") {
902 $field_entry .= "<$shortfield>_query:texttextonly_\n";
903 } else {
904 $field_entry .= "<$shortfield>$longfield\n";
905 }
906 }
907 }
908 print $handle $field_entry;
909
910 # now add the level names
911 my $level_entry = "";
912 foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
913 $collmeta = ".$level"; # based on the original specification
914 $level =~ tr/A-Z/a-z/; # make it lower case
915 my $levelid = $level_map{$level}; # find the actual value we used in the index
916 if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
917 my $metadata_entry = $self->create_language_db_map($collmeta, $levelid);
918 $level_entry .= $metadata_entry;
919 } else {
920 # use the default macro
921 $level_entry .= "<$levelid>" . $level_map{$levelid} . "\n";
922 }
923 }
924 print $handle $level_entry;
925
926 # now add subcoll meta
927 my $subcoll_entry = "";
928 my $shortname = "";
929 my $one_entry = "";
930 foreach my $subcoll (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
931 if (defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$subcoll"}) {
932 $shortname = $self->{'index_mapping'}->{$subcoll};
933 $one_entry = $self->create_language_db_map(".$subcoll", $shortname);
934 $subcoll_entry .= $one_entry;
935 } else {
936 $subcoll_entry .= "<$shortname>$subcoll\n";
937 }
938 }
939 print $handle $subcoll_entry;
940 # now add language meta
941 my $lang_entry = "";
942 foreach my $lang (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
943 if (defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$lang"}) {
944 $shortname = $self->{'index_mapping'}->{$lang};
945 $one_entry = $self->create_language_db_map(".$lang", $shortname);
946 $lang_entry .= $one_entry;
947 } else {
948 $lang_entry .= "<$shortname>$lang\n";
949 }
950 }
951 print $handle $lang_entry;
952 #end the collection entry
953 print $handle "\n" . ('-' x 70) . "\n";
954
955
956
957 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
958 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
959
960 # output classification information
961 &classify::output_classify_info ($self->{'classifiers'}, $handle,
962 $self->{'remove_empty_classifications'},
963 $self->{'gli'});
964
965 #output doclist
966 my @doclist = $self->{'buildproc'}->get_doc_list();
967 my $docs = join (";",@doclist);
968 print $handle "[browselist]\n";
969 print $handle "<hastxt>0\n";
970 print $handle "<childtype>VList\n";
971 print $handle "<numleafdocs>" . ($#doclist+1) . "\n";
972 print $handle "<thistype>Invisible\n";
973 print $handle "<contains>$docs";
974 print $handle "\n" . ('-' x 70) . "\n";
975 close ($handle) if !$self->{'debug'};
976
977 print STDERR "</Stage>\n" if $self->{'gli'};
978}
979
# Build the info-database lines for one piece of collection metadata, with
# one line per language the metadata has been defined in.
#
# Arguments:
#   $metaname - key into $self->{'collect_cfg'}->{'collectionmeta'}
#   $mapname  - tag name to use in the emitted lines
#
# Returns a string of newline-terminated lines: the default value first, as
# "<mapname>value", followed by one "<mapname:lang>value" line for every key
# of the form "[l=lang]".  When no key containing "default" is present, the
# English value is used as the default; failing that, the value of the first
# key in sorted order is used.
sub create_language_db_map {
    my $self = shift (@_);
    my ($metaname, $mapname) = @_;

    my $values = $self->{'collect_cfg'}->{'collectionmeta'}->{$metaname} || {};

    my $default_entries = "";  # "<mapname>value" line(s) for explicit defaults
    my $lang_entries = "";     # "<mapname:lang>value" lines
    my $fallback = "";         # default to use if none was given explicitly
    my $seen_first = 0;
    my $defaultfound = 0;

    # Hash order is arbitrary; sort the keys so that both the output and the
    # choice of fallback are the same on every run.
    foreach my $lang (sort keys %$values) {
        my $value = $values->{$lang};

        if (!$seen_first) {
            # the "default default" is the first entry
            $seen_first = 1;
            $fallback = $value;
        }

        if ($lang =~ /default/) {
            # the default entry goes first; read the value through the key
            # that actually matched rather than a hard-coded 'default'
            $defaultfound = 1;
            $default_entries = "<${mapname}>" . $value . "\n" . $default_entries;
            next;
        }

        my ($code) = $lang =~ /^\[l=(\w*)\]$/;
        next unless $code;

        $lang_entries .= "<${mapname}:${code}>" . $value . "\n";

        # Use the English value as the default if no default is specified.
        # Anchored so that codes merely containing "en" do not qualify.
        if ($code =~ /^(?:en|eng)(?:_\w+)?$/i) {
            $fallback = $value;
        }
    }

    # if we haven't found a default, put one in
    if (!$defaultfound) {
        $default_entries = "<${mapname}>" . $fallback . "\n";
    }

    return $default_entries . $lang_entries;
}
# Hook for collection-specific processing.  Does nothing here; as before,
# the object it was invoked on is the value handed back to the caller.
sub collect_specific {
    my $self = shift;
    return $self;
}
1024
# At the end of building we have an indexfieldmap holding all the mappings
# (plus some extras), and an index definition that may name the pseudo-field
# "metadata".  From these we make an ordered list of the fields that are
# indexed and a list of the "field->shortname" mappings that are used.  Both
# are stored in $self->{'build_cfg'} (as 'indexfields' and 'indexfieldmap'),
# for use in the build.cfg file and for the collection meta definitions.
#
# Fields appear in the order they are first named in the index definition.
# "metadata" expands to every field in the build processor's 'indexfields'
# that is not named explicitly, in sorted order so the result is the same on
# every run.
sub make_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    my $built_fields = $self->{'buildproc'}->{'indexfields'} || {};
    my $field_map = $self->{'buildproc'}->{'indexfieldmap'} || {};

    # Go through the index definition and note each field once, so we can
    # easily check whether a field is already specified: when expanding
    # "metadata" we must not add such fields a second time.
    my %specified = ();
    my @specified_order = ();
    foreach my $index (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        (my $parts = $index) =~ s/:.*$//;
        foreach my $f (split(',', $parts)) {
            next if defined $specified{$f};
            $specified{$f} = 1;
            push (@specified_order, "$f");
        }
    }

    my @indexfieldmap = ();
    my @indexfields = ();
    foreach my $field (@specified_order) {
        if ($field eq "metadata") {
            # add all the fields that were not specified individually
            foreach my $newfield (sort keys %$built_fields) {
                next if defined $specified{$newfield};
                push (@indexfieldmap, $newfield . "->" . $field_map->{$newfield});
                push (@indexfields, "$newfield");
            }
        } elsif ($field eq 'text') {
            push (@indexfieldmap, "text->TX");
            push (@indexfields, "text");
        } elsif ($field eq 'allfields') {
            push (@indexfieldmap, "allfields->ZZ");
            push (@indexfields, "allfields");
        } else {
            # NOTE(review): a specified field with no entry in the field map
            # yields "field->" here - confirm whether that can occur.
            push (@indexfieldmap, $field . "->" . $field_map->{$field});
            push (@indexfields, "$field");
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
1080
1081
# Recreate the field list from an existing build.cfg file.  We look first in
# the build directory, then in $GSDLCOLLECTDIR/index.  If there is no
# build.cfg we can't do the field list (there is unlikely to be any index
# anyway), and $self->{'build_cfg'} is left as an empty hash.
#
# On success $self->{'build_cfg'} gets 'indexfields' and 'indexfieldmap'
# lists copied from the file, and every well-formed "field->shortname" entry
# is also recorded in the build processor's indexfieldmap.
sub read_final_field_list {
    my $self = shift (@_);
    $self->{'build_cfg'} = {};
    my @indexfieldmap = ();
    my @indexfields = ();

    if (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) {
        # set the default mapping
        $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    }

    # we read the stuff in from the build.cfg file - if it's there
    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, "build.cfg");
    if (!-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg");
        if (!-e $buildconfigfile) {
            # we can't find a config file - just ignore the field list
            return;
        }
    }

    my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            push (@indexfields, "$field");
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $entry (@{$buildcfg->{'indexfieldmap'}}) {
            push (@indexfieldmap, "$entry");

            # Only well-formed "field->shortname" entries go into the build
            # processor's map; a malformed one would otherwise be stored
            # under an undefined key.
            my ($f, $v) = $entry =~ /^(.*)->(.*)$/;
            next unless defined $f;
            $self->{'buildproc'}->{'indexfieldmap'}->{$f} = $v;
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
# Collect the build information and write it to build.cfg in the build
# directory.
#
# Starts from $self->{'build_cfg'} when it is defined (it then already
# includes indexfieldmap and indexfields) and adds: the build date and type,
# the index stem, the index levels and their mapping, the text level, the
# document and byte counts reported by the build processor, the index /
# subcollection / language mappings, and the list of indexes not built.
# Progress is reported on the output handle and, when 'gli' is set, as
# <Stage> markers on STDERR.
sub make_auxiliary_files {
    my $self = shift (@_);

    my $build_cfg = {};
    # this already includes indexfieldmap and indexfields
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};
    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = $self->{'collection'};

    # store the level info
    my @indexlevels = ();
    my @levelmap = ();
    foreach my $l (@{$self->{'levelorder'}}) {
        push (@indexlevels, $level_map{$l});
        push (@levelmap, $l . "->" . $level_map{$l});
    }
    $build_cfg->{'indexlevels'} = \@indexlevels;
    $build_cfg->{'levelmap'} = \@levelmap;

    if ($self->{'levels'}->{'section'}) {
        $build_cfg->{'textlevel'} = $level_map{'section'};
    } else {
        $build_cfg->{'textlevel'} = $level_map{'document'};
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    my $mapping = $self->{'index_mapping'};

    # store the mapping between the index names and the directory names,
    # leaving out any index that was not built
    my @indexmap = ();
    foreach my $index (@{$mapping->{'indexmaporder'}}) {
        next if defined ($self->{'notbuilt'}->{$index});
        push (@indexmap, $index . "->" . $mapping->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$mapping->{'subcollectionmaporder'}}) {
        push (@subcollectionmap,
              $subcollection . "->" . $mapping->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$mapping->{'languagemaporder'}}) {
        push (@languagemap,
              $language . "->" . $mapping->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    # sorted, so that build.cfg comes out the same on every run
    my @notbuilt = sort keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    # write out the build information; the two patterns name the
    # single-valued and the list-valued entries respectively
    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, "build.cfg");
    &cfgread::write_cfg_file($buildconfigfile, $build_cfg,
                             '^(builddate|buildtype|numdocs|numbytes|textlevel|indexstem)$',
                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields|indexlevels|levelmap)$');

    print STDERR "</Stage>\n" if $self->{'gli'};
}
1200
# Clean-up hook called when the builder is finished with.  Nothing needs to
# be released here; as before, the invocant is the value handed back.
sub deinit {
    my $self = shift;
    return $self;
}
1204
# Print a short summary of the pass just completed to the output handle:
# which index was created (or that text was compressed), the total bytes in
# the collection, and the bytes processed for this index.  A warning block is
# added when fewer than 50 bytes were processed and text was expected, i.e.
# when indexing, or when compressing and 'no_text' is not set.
sub print_stats {
    my $self = shift (@_);

    my $out = $self->{'outhandle'};
    my $proc = $self->{'buildproc'};

    my $is_indexing = $proc->get_indexing_text();
    my $index_name = $proc->get_index();
    my $total_bytes = $proc->get_num_bytes();
    my $processed_bytes = $proc->get_num_processed_bytes();

    my $heading = $is_indexing
        ? "Stats (Creating index $index_name)\n"
        : "Stats (Compressing text from $index_name)\n";
    print $out $heading;
    print $out "Total bytes in collection: $total_bytes\n";
    print $out "Total bytes in $index_name: $processed_bytes\n";

    my $text_expected = ($is_indexing || !$self->{'no_text'});
    if ($processed_bytes < 50 && $text_expected) {
        # when not indexing, $text_expected already implies that text is
        # being compressed, so a plain two-way choice is enough
        my $warning = $is_indexing
            ? "WARNING: There is very little or no text to process for $index_name\n"
            : "WARNING: There is very little or no text to compress\n";
        print $out "***************\n";
        print $out $warning;
        print $out " Was this your intention?\n";
        print $out "***************\n";
    }

}
1234
12351;
1236
1237
Note: See TracBrowser for help on using the repository browser.