source: main/tags/2.60/gsdl/perllib/mgppbuilder.pm@ 32313

Last change on this file since 32313 was 9669, checked in by kjdon, 19 years ago

fixed up the case where you have subcollection partitions and language partitions - was doing an OR on both, but really needed an OR for partitions ANDed with an OR for langs

  • Property svn:keywords set to Author Date Id Revision
File size: 41.7 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
35
# Switch STDOUT and STDERR to autoflush while this module is in use, so
# that output produced by mgpp does not get out of sync with the output
# produced by the plugins.
BEGIN {
    $_->autoflush(1) for (\*STDOUT, \*STDERR);
}

# Put both streams back to buffered output when the interpreter exits.
END {
    $_->autoflush(0) for (\*STDOUT, \*STDERR);
}
47
# Maximum document size.  NOTE(review): a comment in compress_text calls
# this "12 meg" (passed as -b), so the unit is presumably kilobytes --
# confirm against mgpp_passes.
our $maxdocsize = 12000;

# Two lookups share this table:
#   collect.cfg level name -> level tag handed to mgpp_passes (-J / -K)
#   level tag              -> default display macro for that level
our %level_map = (
    'document'  => 'Doc',
    'section'   => 'Sec',
    'paragraph' => 'Para',
    'Doc'       => '_textdocument_',
    'Sec'       => '_textsection_',
    'Para'      => '_textparagraph_',
);

# File suffixes that are considered part of a finished index; build_index
# consults this when it scans the index directory for unwanted files.
our %wanted_index_files =
    map { ($_ => 1) } qw(td t tl ti idb ib1 ib2 ib3 i il w wa);

# Maps a metadata name to its short index field code.  Every short code is
# also present as a key (value 1) so that it is reserved, as are the query
# operators (AND, OR, NOT, NEAR) and the level tags (Doc, Sec, Para), none
# of which can be used as field names.
# TODO: let users add their own entries via a file or the cfg.
our %static_indexfield_map = (
    (map { ($_->[0] => $_->[1], $_->[1] => 1) }
         ['Title'        => 'TI'],
         ['Subject'      => 'SU'],
         ['Creator'      => 'CR'],
         ['Organization' => 'ORG'],
         ['Source'       => 'SO'],
         ['Howto'        => 'HT'],
         ['ItemTitle'    => 'IT'],
         ['ProgNumber'   => 'PN'],
         ['People'       => 'PE'],
         ['Coverage'     => 'CO'],
         ['allfields'    => 'ZZ'],
         ['text'         => 'TX']),
    (map { ($_ => 1) } qw(AND OR NOT NEAR Doc Sec Para)),
);
108
# Constructor: creates an mgppbuilder object for one collection.
#
# Positional arguments:
#   $collection, $source_dir, $build_dir, $verbosity, $maxdocs, $debug,
#   $keepold, $remove_empty_classifications,
#   $outhandle  - where progress messages go (defaults to STDERR),
#   $no_text    - defaults to 0,
#   $failhandle - accepted for interface compatibility; not used here,
#   $gli        - defaults to 0.
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the configured indexes by
# subcollection and language, works out the indexing levels, and loads the
# plugins, the classifiers and the build processor.
#
# Dies if collect.cfg does not exist, if neither a document nor a section
# level is configured, if no plugins could be loaded, or if the build
# processor cannot be loaded or constructed.
sub new {
    my $class = shift (@_);

    my ($collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # The default is the handle *name*, not a glob: build_index compares
    # the handle against the string "STDERR".
    $outhandle = 'STDERR' unless defined $outhandle;
    $no_text = 0 unless defined $no_text;

    # create an mgppbuilder object
    my $self = bless {'collection'   => $collection,
                      'source_dir'   => $source_dir,
                      'build_dir'    => $build_dir,
                      'verbosity'    => $verbosity,
                      'maxdocs'      => $maxdocs,
                      'debug'        => $debug,
                      'keepold'      => $keepold,
                      'remove_empty_classifications' => $remove_empty_classifications,
                      'outhandle'    => $outhandle,
                      'no_text'      => $no_text,
                      'notbuilt'     => {},    # indexes not built
                      'indexfieldmap'=> \%static_indexfield_map,
                      'gli'          => $gli
                      }, $class;

    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    # $cfg is the same hash as $self->{'collect_cfg'}
    my $cfg = $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # Indexes are specified separated by spaces, but mgpp builds a single
    # index containing all of them: collapse the list into one
    # comma-separated entry.
    my $configured_indexes = $cfg->{'indexes'};
    $cfg->{'indexes'} = [ join (',', @$configured_indexes) ];

    # one copy of every index per subcollection: "index:subcollection"
    if (defined $cfg->{'indexsubcollections'}) {
        my @base = @{$cfg->{'indexes'}};
        $cfg->{'indexes'} = [];
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            push (@{$cfg->{'indexes'}},
                  map { $_ . ':' . $subcollection } @base);
        }
    }

    # one copy of every index per language; when there is no subcollection
    # part an empty one is inserted, giving "index::language"
    if (defined $cfg->{'languages'}) {
        my @base = @{$cfg->{'indexes'}};
        my $separator = defined ($cfg->{'indexsubcollections'}) ? ':' : '::';
        $cfg->{'indexes'} = [];
        foreach my $language (@{$cfg->{'languages'}}) {
            push (@{$cfg->{'indexes'}},
                  map { $_ . $separator . $language } @base);
        }
    }

    # make sure that the same index isn't specified more than once
    # (first occurrence wins, order is kept)
    my %seen = ();
    $cfg->{'indexes'} = [ grep { !$seen{$_}++ } @{$cfg->{'indexes'}} ];

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    $self->{'levelorder'} = [];
    if (defined $cfg->{'levels'}) {
        # NB: the loop variable aliases the configuration array, so the
        # level names stored in collect_cfg are lower-cased in place.
        foreach my $level (@{$cfg->{'levels'}}) {
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    } else { # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    # the retrieval level is document if present, otherwise section
    $self->{'doc_level'} = "document";
    if (! $self->{'levels'}->{'document'}) {
        if ($self->{'levels'}->{'section'}) {
            $self->{'doc_level'} = "section";
        } else {
            die "you must have either document or section level specified!!\n";
        }
    }
    print $outhandle "doclevel = ". $self->{'doc_level'}."\n";

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $cfg->{'separate_cjk'} && $cfg->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # get the list of plugins for this collection and load them all
    my $plugins = defined $cfg->{'plugin'} ? $cfg->{'plugin'} : [];
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, \@global_opts);
    if (scalar (@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection and load them all
    my $classifiers = defined $cfg->{'classify'} ? $cfg->{'classify'} : [];
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $dg (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mgpp buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgppbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # Direct class-method call on the class name held in $buildproctype.
    # This replaces a string eval of "new $buildproctype(...)": indirect
    # object syntax is ambiguous in a package that defines its own new().
    # Any exception from the constructor propagates unchanged.
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                $build_dir, $verbosity,
                                                $outhandle);

    $self->{'buildtype'} = "mgpp";

    return $self;
}
272
# Prepares the build directory for a fresh build.
# Unless running in debug mode or asked to keep the old build, the build
# directory is removed and recreated together with its "text"
# subdirectory.
sub init {
    my ($self) = @_;

    my $start_afresh = (!$self->{'debug'} && !$self->{'keepold'});
    if ($start_afresh) {
        my $build_dir = $self->{'build_dir'};

        # remove any old builds
        &util::rm_r ($build_dir);
        &util::mk_all_dir ($build_dir);

        # make the text directory
        &util::mk_all_dir ("$build_dir/text");
    }
}
286
# Records the strip_html setting on this builder and passes the same
# value on to the build processor.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html ($strip);
}
294
# Creates the compressed text for the collection.
#
# Argument: $textindex - passed to the build processor via set_index().
#
# Three stages, reported to the GLI (when enabled) as stage 'CompressText':
#   1. pipe all documents through "mgpp_passes -T1" to gather statistics;
#   2. run mgpp_compression_dict to create the compression dictionary;
#   3. pipe all documents through "mgpp_passes -T2" to compress them.
# In debug mode no external program is run; the documents are written to
# STDOUT instead and stage 2 is skipped.
#
# Dies if a required executable is missing from
# $GSDLHOME/bin/$GSDLOS or a pipe cannot be opened.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    my $outhandle = $self->{'outhandle'};
    my $gli = $self->{'gli'};
    my $debug = $self->{'debug'};
    my $verbose = ($self->{'verbosity'} >= 1);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat ($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat ($exedir, "mgpp_compression_dict$exe");

    &util::mk_all_dir (&util::filename_cat ($self->{'build_dir'}, "text"));

    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them
    my $doc_level = $self->{'doc_level'};
    my $mgpp_passes_sections = "-J " . $level_map{$doc_level} . " ";
    foreach my $level (keys %{$self->{'levels'}}) {
        next if ($level eq $doc_level || $level eq "paragraph");
        $mgpp_passes_sections .= "-K " . $level_map{$level} . " ";
    }

    # common part of both mgpp_passes pipe commands
    my $passes_pipe = "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\"";

    print $outhandle "\n*** creating the compressed text\n" if $verbose;
    print STDERR "<Stage name='CompressText'>\n" if $gli;

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size to be 12 meg
    print $outhandle "\n collecting text statistics (mgpp_passes -T1)\n" if $verbose;
    print STDERR "<Phase name='CollectTextStats'/>\n" if $gli;

    # $handle holds the *name* of the output handle; the build processor
    # and the second open() below both use it by name.
    my ($handle);
    if ($debug) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "$passes_pipe -T1 $osextra")) {
            # self-closed like every other FatalError tag sent to the GLI
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $gli;
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text ($self->{'no_text'} ? 0 : 1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->reset ();
    &plugin::begin ($self->{'pluginfo'}, $self->{'source_dir'},
                    $buildproc, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    &plugin::end ($self->{'pluginfo'});

    # end of the first pass: close the pipe exactly once
    close ($handle) unless $debug;

    $self->print_stats ();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$debug) {
        print $outhandle "\n creating the compression dictionary\n" if $verbose;
        print STDERR "<Phase name='CreatingCompress'/>\n" if $gli;
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $gli;
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # reopen the same named handle for the second pass
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "$passes_pipe -T2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $gli;
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $gli;
    }

    $buildproc->reset ();
    # compress the text
    print $outhandle "\n compressing the text (mgpp_passes -T2)\n" if $verbose;
    print STDERR "<Phase name='CompressingText'/>\n" if $gli;

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $debug;

    $self->print_stats ();
    print STDERR "</Stage>\n" if $gli;
}
415
# Decides whether an index should be built.
# Returns 0 (and records the index under 'notbuilt') when the index
# description matches, in full, one of the 'dontbuild' patterns from
# collect.cfg; returns 1 otherwise.  Each pattern is used as a regular
# expression anchored at both ends.
sub want_built {
    my ($self, $index) = @_;

    my $patterns = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($patterns);

    foreach my $pattern (@$patterns) {
        next unless $index =~ /^$pattern$/;
        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
432
# Builds either the single index named by $indexname (when it contains a
# word character) or every index listed in collect.cfg.
# The index-to-directory mapping is created first, each index that
# want_built() accepts is built with build_index(), and finally the field
# lists are produced with make_final_field_list().
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbose = ($self->{'verbosity'} >= 1);

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built ($index)) {
            print $outhandle "\n*** ignoring index $index\n" if $verbose;
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if $verbose;
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index ($index);
    }

    # define the final field lists
    $self->make_final_field_list ();
}
465
# Creates a directory name for each of the index descriptions.
#
# $indexes is a reference to a list of descriptions of the form
# "fields[:subcollection[:languages]]".  The returned hash reference holds:
#   {<index description>}    directory name ("idx" + short subcollection
#                            name + short language name, made unique)
#   {'indexmap'}, {'subcollectionmap'}, {'languagemap'}
#                            full name -> short name for each part
#   {'indexmaporder'}, {'subcollectionmaporder'}, {'languagemaporder'}
#                            the full names in order of first appearance
#   {<fields>}, {<subcollection>}, {<languages>}
#                            short name, keyed directly by the full name
# The per-part entries are used for collectionmeta later on, because the
# full index name (eg text:subcol:lang) is not used on the query page.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    my %mapping = ('indexmaporder'         => [],
                   'subcollectionmaporder' => [],
                   'languagemaporder'      => []);

    # directory names already taken; starts off with the mandatory ones
    my %taken_dirs = ('text'  => 'text',
                      'extra' => 'extra');
    # short name -> full name, per part; consulted by make_unique
    my %short_names = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);
        my $fieldkey = "$fields";

        # there is only ever one index now, and it is called 'idx';
        # subcollection and language contribute a lower-cased short name
        my $pindex = 'idx';
        my $psub   = lc ($self->process_field ($subcollection));
        my $plang  = lc ($self->process_field ($languages));

        # keep adjusting the parts until the directory name is unused
        my $dirname = $pindex . $psub . $plang;
        while (defined ($taken_dirs{$dirname})) {
            $dirname = $self->make_unique (\%short_names, $index,
                                           \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # record each part the first time it is seen
        unless (defined $mapping{'indexmap'}{$fieldkey}) {
            $mapping{'indexmap'}{$fieldkey} = $pindex;
            push (@{$mapping{'indexmaporder'}}, $fieldkey);
            $mapping{$fieldkey} = $pindex unless defined $mapping{$fieldkey};
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        $taken_dirs{$dirname} = $index;
        $short_names{'index'}->{$pindex} = $fieldkey;
        $short_names{'subcollection'}->{$psub} = $subcollection;
        $short_names{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
537
# Returns a processed (shortened) version of a field.
#   - undefined, or containing no word character: ""
#   - two or more comma-separated components: the first character of each
#     of the first two components
#   - one component: its first character followed by the next consonant,
#     or, when there is no later consonant, its first two characters
#   - a single character: that character itself (previously this case
#     produced "", which left a one-letter subcollection or language
#     without any short name)
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # work on copies of the first two components only
        my @initials = @components[0, 1];
        s/^(.).*$/$1/ foreach @initials;
        return join ("", @initials);
    }

    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    ($first, $second) = $field =~ /^(.)(.)/
        unless (defined $first && defined $second);

    unless (defined $first && defined $second) {
        # neither pattern matched: a one-character field stands for itself
        return (length ($field) == 1) ? $field : "";
    }

    return "$first$second";
}
560
# Called when a directory name collides with one already in use.
# $namehash maps short name -> full name for the 'index', 'subcollection'
# and 'languages' parts seen so far; $indexref, $subref and $langref are
# references to the current short names of this index's parts.
# The first part (in that order) whose short name is registered for a
# different full name is advanced with get_next_version(); at most one
# part is changed per call.  Returns the concatenation of the three short
# names.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @parts = (['index',         $indexref, "$fields"],
                 ['subcollection', $subref,   $subcollection],
                 ['languages',     $langref,  $languages]);

    foreach my $part (@parts) {
        my ($kind, $nameref, $fullname) = @$part;
        next if ($namehash->{$kind}->{$$nameref} eq $fullname);
        $self->get_next_version ($nameref);
        last;
    }

    return $$indexref . $$subref . $$langref;
}
575
# Advances the short name referenced by $nameref to its next version,
# modifying it in place:
#   - ends in two digits: those digits are incremented ("a09" -> "a10",
#     "a99" -> "a100")
#   - ends in one digit: the digit is incremented ("a9" -> "a10")
#   - no trailing digit: the last character is replaced by "0"
#   - empty (or undefined) name: becomes "0".  Previously an empty name was
#     left unchanged, so a caller looping until the name is unique could
#     never make progress.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if (!defined ($$nameref) || $$nameref eq "") {
        $$nameref = "0";
    } elsif ($$nameref =~ /(\d\d)$/) {
        # $num is still a string here, so ++ keeps a leading zero's width
        # ("09" becomes "10", not "1")
        my $num = $1;
        $num++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        my $num = $1 + 1;    # 9 rolls over to the two-digit 10
        $$nameref =~ s/\d$/$num/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}
591
# Builds one index.
#
# Argument: $index - an index description "fields[:subcollections
# [:languages]]"; its directory is looked up in $self->{'index_mapping'}.
#
# Steps: pipe all documents through "mgpp_passes -I1" (index dictionary),
# run mgpp_perf_hash_build, pipe all documents through "mgpp_passes -I2"
# (inversion), then run mgpp_weights_build, mgpp_invf_dict and
# mgpp_stem_idx (stem methods 1-3).  In debug mode no external program is
# run and the documents are written to STDOUT.
#
# If the index dictionary (.id file) was not produced after the first pass
# the index is recorded under 'notbuilt' and the method returns without
# building it.  Dies if a required executable is missing from
# $GSDLHOME/bin/$GSDLOS, a pipe cannot be opened, or the index directory
# cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};
    my $gli = $self->{'gli'};
    my $debug = $self->{'debug'};
    my $verbose = ($self->{'verbosity'} >= 1);

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat ($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
                                               $indexdir,
                                               $self->{'collection'});

    # locations of the executables (used for existence checks)
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe          = &util::filename_cat ($exedir, "mgpp_passes$exe");
    my $mgpp_perf_hash_build_exe = &util::filename_cat ($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe   = &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe       = &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe        = &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    # define the section names and possibly the doc name for mgpasses
    my $doc_level = $self->{'doc_level'};
    my $mgpp_passes_sections = "-J " . $level_map{$doc_level} . " ";
    foreach my $level (keys %{$self->{'levels'}}) {
        next if ($level eq $doc_level);
        $mgpp_passes_sections .= "-K " . $level_map{$level} . " ";
    }

    # get any os specific stuff
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # common part of both mgpp_passes pipe commands
    my $passes_pipe = "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\"";

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    # collect the expressions of the subcollections this index belongs to
    my $indexexparr = [];
    my @subcollections = defined ($subcollection) ? split (/,/, $subcollection) : ();
    foreach my $name (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$name};
        push (@$indexexparr, $expression) if defined ($expression);
    }

    # the languages wanted in this index, passed on exactly as written in
    # the index description (a leading "!" is kept)
    my $language_metadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }
    my $langarr = defined ($language) ? [split (/,/, $language)] : [];

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (mgpp_passes -I1)\n" if $verbose;
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $gli;

    # $handle holds the *name* of the output handle; the build processor
    # and the second open() below both use it by name.
    my ($handle);
    if ($debug) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "$passes_pipe -I1 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $gli;
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_index_languages ($language_metadata, $langarr) if (defined $language);
    $buildproc->set_indexing_text (1);
    $buildproc->set_store_text (1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->reset ();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $debug;

    $self->print_stats ();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fall over.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $gli;
        $self->{'notbuilt'}->{$index} = 1;
        return;
    }

    if (!$debug) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $gli;
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        # reopen the same named handle for the second pass
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "$passes_pipe -I2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $gli;
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text (mgpp_passes -I2)\n" if $verbose;
    print STDERR "<Phase name='InvertingText'/>\n" if $gli;
    $buildproc->reset ();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$debug) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if $verbose;
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $gli;
        if (!-e "$mgpp_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $gli;
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if $verbose;
        if (!-e "$mgpp_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $gli;
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra");

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if $verbose;
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $gli;
        if (!-e "$mgpp_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $gli;
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }
        foreach my $stem_method (1, 2, 3) {
            system ("mgpp_stem_idx$exe -b 4096 -s$stem_method -f \"$fullindexprefix\" $osextra");
        }

        # look for unwanted files (any suffix not in %wanted_index_files).
        # NOTE: the removal itself is disabled, so at verbosity > 2 the
        # files are only reported, not deleted.
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (my $dirhandle, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir ($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir ($dirhandle);
    }
    print STDERR "</Stage>\n" if $gli;
}
796
797sub make_infodatabase {
798 my $self = shift (@_);
799 my $outhandle = $self->{'outhandle'};
800
801
802 my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
803 my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
804 &util::mk_all_dir ($textdir);
805 &util::mk_all_dir ($assocdir);
806
807 # get db name
808 my $dbext = ".bdb";
809 $dbext = ".ldb" if &util::is_little_endian();
810 my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
811 $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);
812
813 my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
814 my $exe = &util::get_os_exe ();
815 my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");
816
817 # define the indexed field mapping if not already done so (ie if infodb called separately from build_index)
818 if (!defined $self->{'build_cfg'}) {
819 $self->read_final_field_list();
820 }
821 print $outhandle "\n*** creating the info database and processing associated files\n"
822 if ($self->{'verbosity'} >= 1);
823 print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};
824
825 # init all the classifiers
826 &classify::init_classifiers ($self->{'classifiers'});
827
828 # set up the document processor
829 my ($handle);
830 if ($self->{'debug'}) {
831 $handle = STDOUT;
832 } else {
833 if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
834 print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
835 die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
836 }
837 $handle = mgppbuilder::PIPEOUT;
838 }
839
840 $self->{'buildproc'}->set_output_handle ($handle);
841 $self->{'buildproc'}->set_mode ('infodb');
842 $self->{'buildproc'}->set_assocdir ($assocdir);
843 $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
844 $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
845 $self->{'buildproc'}->set_indexing_text (0);
846 $self->{'buildproc'}->set_store_text(1);
847 #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
848
849 $self->{'buildproc'}->reset();
850
851 # do the collection info
852 print $handle "[collection]\n";
853
854 # first do the collection meta stuff - everything without a dot
855 my $collmetadefined = 0;
856 if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
857 $collmetadefined = 1;
858 foreach my $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
859 next if ($cmeta =~ /^\./); # for now, ignore ones with dots
860 my ($metadata_entry) = $self->create_language_db_map($cmeta, $cmeta);
861 #write the entry to the file
862 print $handle $metadata_entry;
863
864 } # foreach collmeta key
865 }
866 #add the index field macros to [collection]
867 # eg <TI>Title
868 # <SU>Subject
869 # these now come from collection meta. if that is not defined, usses the metadata name
870 my $field_entry="";
871 my $collmeta = "";
872 foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}){
873 my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
874 next if $shortfield eq 1;
875
876 # we need to check if some coll meta has been defined
877 $collmeta = ".$longfield";
878 if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
879 my $metadata_entry = $self->create_language_db_map($collmeta, $shortfield);
880 $field_entry .= $metadata_entry;
881 } else { #use the metadata names, or the text macros for allfields and textonly
882 if ($longfield eq "allfields") {
883 $field_entry .= "<$shortfield>_query:textallfields_\n";
884 } elsif ($longfield eq "text") {
885 $field_entry .= "<$shortfield>_query:texttextonly_\n";
886 } else {
887 $field_entry .= "<$shortfield>$longfield\n";
888 }
889 }
890 }
891 print $handle $field_entry;
892
893 # now add the level names
894 my $level_entry = "";
895 foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
896 $collmeta = ".$level"; # based on the original specification
897 $level =~ tr/A-Z/a-z/; # make it lower case
898 my $levelid = $level_map{$level}; # find the actual value we used in the index
899 if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
900 my $metadata_entry = $self->create_language_db_map($collmeta, $levelid);
901 $level_entry .= $metadata_entry;
902 } else {
903 # use the default macro
904 $level_entry .= "<$levelid>" . $level_map{$levelid} . "\n";
905 }
906 }
907 print $handle $level_entry;
908
909 # now add subcoll meta
910 my $subcoll_entry = "";
911 my $shortname = "";
912 my $one_entry = "";
913 foreach my $subcoll (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
914 if (defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$subcoll"}) {
915 $shortname = $self->{'index_mapping'}->{$subcoll};
916 $one_entry = $self->create_language_db_map(".$subcoll", $shortname);
917 $subcoll_entry .= $one_entry;
918 } else {
919 $subcoll_entry .= "<$shortname>$subcoll\n";
920 }
921 }
922 print $handle $subcoll_entry;
923 # now add language meta
924 my $lang_entry = "";
925 foreach my $lang (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
926 if (defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$lang"}) {
927 $shortname = $self->{'index_mapping'}->{$lang};
928 $one_entry = $self->create_language_db_map(".$lang", $shortname);
929 $lang_entry .= $one_entry;
930 } else {
931 $lang_entry .= "<$shortname>$lang\n";
932 }
933 }
934 print $handle $lang_entry;
935 #end the collection entry
936 print $handle "\n" . ('-' x 70) . "\n";
937
938
939
940 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
941 "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
942
943 # output classification information
944 &classify::output_classify_info ($self->{'classifiers'}, $handle,
945 $self->{'remove_empty_classifications'},
946 $self->{'gli'});
947
948 #output doclist
949 my @doclist = $self->{'buildproc'}->get_doc_list();
950 my $docs = join (";",@doclist);
951 print $handle "[browselist]\n";
952 print $handle "<hastxt>0\n";
953 print $handle "<childtype>VList\n";
954 print $handle "<numleafdocs>" . ($#doclist+1) . "\n";
955 print $handle "<thistype>Invisible\n";
956 print $handle "<contains>$docs";
957 print $handle "\n" . ('-' x 70) . "\n";
958 close ($handle) if !$self->{'debug'};
959
960 print STDERR "</Stage>\n" if $self->{'gli'};
961}
962
# Build the info-database lines for one collection metadata item, in every
# language it has been defined for.
#
# Arguments:
#   $metaname - key looked up in $self->{'collect_cfg'}->{'collectionmeta'}
#   $mapname  - tag name the values are written under
#
# The hash found under $metaname is keyed by 'default' or by a language
# specifier of the form '[l=xx]'; keys of any other form produce no line.
#
# Returns a string made of "<$mapname>value\n" (the default, always first)
# followed by one "<$mapname:xx>value\n" line per language.  When there is
# no 'default' key the default value is, in order of preference, the
# English value or the value of the first key in sorted order.
#
# Keys are visited in sorted order so the generated text is the same on
# every run; plain hash order would make both the line order and the
# fallback default vary from build to build.
sub create_language_db_map {
    my $self = shift (@_);
    my ($metaname, $mapname) = @_;

    # work on a local reference so that asking about a metadata item that
    # does not exist never creates an empty entry in the configuration
    my $values = $self->{'collect_cfg'}->{'collectionmeta'}->{$metaname};
    $values = {} unless ref ($values) eq 'HASH';

    my $defaultfound = 0;
    my $first = 1;
    my $metadata_entry = "";
    my $default = "";

    foreach my $lang (sort keys %$values) {
        if ($first) {
            $first = 0;
            # the fallback default starts out as the first entry
            $default = $values->{$lang};
        }
        if ($lang =~ /default/) {
            $defaultfound = 1;
            # the default entry goes first
            $metadata_entry = "<$mapname>" . $values->{'default'} . "\n" . $metadata_entry;
        }
        else {
            my ($code) = $lang =~ /^\[l=(\w*)\]$/;
            if ($code) {
                $metadata_entry .= "<$mapname:$code>" . $values->{$lang} . "\n";

                # Use the English value as the default if no default is
                # specified.  The test is anchored so that only "en" and
                # regional forms such as "en_GB" count as English, not
                # every code that merely contains the letters "en".
                if ($code =~ /^en(?:_\w+)?$/i) {
                    $default = $values->{$lang};
                }
            }
        }
    }

    # if we haven't found a default, put one in
    if (!$defaultfound) {
        $metadata_entry = "<$mapname>$default\n" . $metadata_entry;
    }
    return $metadata_entry;
}
# Takes the object off the argument list and does nothing else.
# NOTE(review): looks like a placeholder for collection-specific work that
# other builders may supply - confirm against the callers.
sub collect_specific {
    my $self = shift;
}
1007
# At the end of building we have an indexfieldmap holding all the mappings
# (plus some extras), and a set of index fields that may include fields not
# named in the index definition.  From these we make an ordered list of the
# fields that are indexed and a list of the "field->shortname" mappings that
# are used.  Both are stored in $self->{'build_cfg'} (which is reset here)
# under 'indexfields' and 'indexfieldmap', for use when writing build.cfg
# and the collection metadata.
#
# Fields appear in the order they are first mentioned in the 'indexes' list
# of the collection configuration.  The pseudo field "metadata" expands to
# every field recorded in the build processor's 'indexfields' that was not
# named explicitly; those are added in sorted order so that the result is
# the same on every build rather than following arbitrary hash order.
sub make_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    my @indexfieldmap = ();
    my @indexfields = ();

    # Go through the index definition and note each field in a map, so we
    # can easily check whether it is already specified: when expanding
    # "metadata" we list the individual fields, and those already named in
    # the index definition must not be added a second time.
    my $specifiedfields = {};
    my @specifiedfieldorder = ();
    foreach my $index (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        my $parts = $index;
        $parts =~ s/:.*$//;
        foreach my $f (split (',', $parts)) {
            next if defined $specifiedfields->{$f};
            $specifiedfields->{$f} = 1;
            push (@specifiedfieldorder, "$f");
        }
    }

    foreach my $field (@specifiedfieldorder) {
        if ($field eq "metadata") {
            # add every remaining field, in a reproducible order
            foreach my $newfield (sort keys %{$self->{'buildproc'}->{'indexfields'}}) {
                next if defined $specifiedfields->{$newfield};
                push (@indexfieldmap, $newfield . "->" . $self->{'buildproc'}->{'indexfieldmap'}->{$newfield});
                push (@indexfields, "$newfield");
            }
        } elsif ($field eq 'text') {
            push (@indexfieldmap, "text->TX");
            push (@indexfields, "text");
        } elsif ($field eq 'allfields') {
            push (@indexfieldmap, "allfields->ZZ");
            push (@indexfields, "allfields");
        } else {
            push (@indexfieldmap, $field . "->" . $self->{'buildproc'}->{'indexfieldmap'}->{$field});
            push (@indexfields, "$field");
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
1063
1064
# Recreate the field list from an existing build.cfg file.  The file is
# looked for first in the build directory and then in "index" below
# $ENV{'GSDLCOLLECTDIR'}.  If neither exists there is nothing to read (and
# there is unlikely to be any index either), so the sub returns early and
# $self->{'build_cfg'} is left as an empty hash.
#
# On success $self->{'build_cfg'} holds 'indexfieldmap' and 'indexfields'
# lists copied from the file, and every "field->shortname" entry has been
# loaded into the build processor's indexfieldmap.
sub read_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};
    my @fieldmap_entries = ();
    my @field_names = ();

    # set the default mapping if the build processor has none yet
    my $known_mappings = keys %{$self->{'buildproc'}->{'indexfieldmap'}};
    if ($known_mappings == 0) {
        $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    }

    # locate the build.cfg file - if it's there
    my $cfg_path = &util::filename_cat($self->{'build_dir'}, "build.cfg");
    unless (-e $cfg_path) {
        # try the index dir - but do we know where it is?? try here
        $cfg_path = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg");
        # we can't find a config file - just ignore the field list
        return unless -e $cfg_path;
    }

    my $stored_cfg = &colcfg::read_build_cfg($cfg_path);

    if (defined $stored_cfg->{'indexfields'}) {
        push (@field_names, "$_") foreach @{$stored_cfg->{'indexfields'}};
    }

    if (defined $stored_cfg->{'indexfieldmap'}) {
        foreach my $entry (@{$stored_cfg->{'indexfieldmap'}}) {
            push (@fieldmap_entries, "$entry");
            # each entry has the form "longname->shortname"
            my ($longname, $shortname) = $entry =~ /^(.*)\-\>(.*)$/;
            $self->{'buildproc'}->{'indexfieldmap'}->{$longname} = $shortname;
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@fieldmap_entries;
    $self->{'build_cfg'}->{'indexfields'} = \@field_names;
}
# Gather the build information and write it to build.cfg in the build
# directory.
#
# Starts from $self->{'build_cfg'} when that is defined (it already holds
# indexfieldmap and indexfields) and adds: the build date and type, the
# index levels and text level, the document and byte counts reported by the
# build processor, the index / subcollection / language name mappings, and
# the list of indexes that were not built.
sub make_auxiliary_files {
    my $self = shift (@_);

    # this already includes indexfieldmap and indexfields
    my $build_cfg = (defined $self->{'build_cfg'}) ? $self->{'build_cfg'} : {};

    my $outhandle = $self->{'outhandle'};
    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating auxiliary files \n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreatingAuxilary'>\n";
    }

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # build date and type
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};

    # level info, translated through the level map
    my @indexlevels = ();
    push (@indexlevels, $level_map{$_}) foreach @{$self->{'levelorder'}};
    $build_cfg->{'indexlevels'} = \@indexlevels;

    # text is stored at section level when sections are in use
    my $textlevel = $self->{'levels'}->{'section'} ? 'section' : 'document';
    $build_cfg->{'textlevel'} = $level_map{$textlevel};

    # number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # mapping between the index names and the directory names,
    # leaving out any index that was not built
    my @indexmap = ();
    foreach my $name (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        next if defined ($self->{'notbuilt'}->{$name});
        push (@indexmap, $name . "->" . $self->{'index_mapping'}->{'indexmap'}->{$name});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # subcollection mapping - only recorded when there is one
    my @subcollectionmap = ();
    foreach my $name (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap,
              $name . "->" . $self->{'index_mapping'}->{'subcollectionmap'}->{$name});
    }
    if (scalar (@subcollectionmap)) {
        $build_cfg->{'subcollectionmap'} = \@subcollectionmap;
    }

    # language mapping - only recorded when there is one
    my @languagemap = ();
    foreach my $name (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap,
              $name . "->" . $self->{'index_mapping'}->{'languagemap'}->{$name});
    }
    if (scalar (@languagemap)) {
        $build_cfg->{'languagemap'} = \@languagemap;
    }

    # names of the indexes that were not built - only recorded when there are some
    my @notbuilt = keys %{$self->{'notbuilt'}};
    if (scalar (@notbuilt)) {
        $build_cfg->{'notbuilt'} = \@notbuilt;
    }

    # write out the build information
    # NOTE(review): the two patterns presumably select the single-valued and
    # the list-valued keys respectively - confirm against cfgread.
    &cfgread::write_cfg_file($self->{'build_dir'} . "/build.cfg", $build_cfg,
                             '^(builddate|buildtype|numdocs|numbytes|textlevel)$',
                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields|indexlevels)$');

    print STDERR "</Stage>\n" if $self->{'gli'};
}
1180
# Takes the object off the argument list and does nothing else; this
# builder has nothing to tear down here.
sub deinit {
    my $self = shift;
}
1184
# Print a short statistics report for the pass just completed to
# $self->{'outhandle'}: which index was created (or that text was
# compressed), the total bytes in the collection and the bytes processed.
# A warning block is added when fewer than 50 bytes were processed, unless
# this was a text pass and 'no_text' is set.
sub print_stats {
    my $self = shift (@_);

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $headline = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $headline;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {
        # when we get here without indexing text, 'no_text' is known to be
        # unset, so the only other possible message is the compression one
        my $warning = $indexing_text
            ? "WARNING: There is very little or no text to process for $index\n"
            : "WARNING: There is very little or no text to compress\n";

        print $outhandle "***************\n";
        print $outhandle $warning;
        print $outhandle " Was this your intention?\n";
        print $outhandle "***************\n";
    }
}
1214
12151;
1216
1217
Note: See TracBrowser for help on using the repository browser.