source: trunk/gsdl/perllib/mgppbuilder.pm@ 10168

Last change on this file since 10168 was 10158, checked in by davidb, 19 years ago

*builder.pm packages (principally lucenebuilder.pl which inherits from
mgppbuilder) upgraded to support incremental building.

  • Property svn:keywords set to Author Date Id Revision
File size: 42.8 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
35
BEGIN {
    # Turn autoflush on for both standard streams so that output written
    # by mgpp does not get out of sync with output from the plugins.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

END {
    # Switch autoflush back off again when the program finishes.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
47
# Maximum document size. (A comment in compress_text relates this value
# to the mgpp_passes -b option.)
our $maxdocsize = 12000;

# This table serves two lookups: a level name as written in collect.cfg
# maps to the tag used for that level on the mgpp command lines, and each
# of those tags maps to the default display macro for the level.
our %level_map = (
    'document'  => 'Doc',
    'section'   => 'Sec',
    'paragraph' => 'Para',
    'Doc'       => '_textdocument_',
    'Sec'       => '_textsection_',
    'Para'      => '_textparagraph_',
);

#$doc_level = "Doc";
#$sec_level = "Sec";
#$para_level = "Para";

# File suffixes that build_index treats as wanted when it scans an index
# directory for leftover files.
our %wanted_index_files =
    map { ($_ => 1) } qw(td t tl ti idb ib1 ib2 ib3 i il w wa);

# change this so a user can add their own ones in via a file or cfg
# add AND, OR, NOT NEAR to this list - these cannot be used as field names
# also add the level names (Doc, Sec, Para)
#
# Each long field name maps to its short code. Every short code, every
# query operator and every level tag is additionally present as a key
# with the value 1.
our %static_indexfield_map = do {
    my %code_for = (
        'Title'        => 'TI',
        'Subject'      => 'SU',
        'Creator'      => 'CR',
        'Organization' => 'ORG',
        'Source'       => 'SO',
        'Howto'        => 'HT',
        'ItemTitle'    => 'IT',
        'ProgNumber'   => 'PN',
        'People'       => 'PE',
        'Coverage'     => 'CO',
        'allfields'    => 'ZZ',
        'text'         => 'TX',
    );
    my @reserved_words = qw(AND OR NOT NEAR Doc Sec Para);

    (%code_for, map { ($_ => 1) } (values (%code_for), @reserved_words));
};
108
# Constructor for an mgppbuilder object.
#
# Positional arguments (after the class name):
#   $collection   - collection name
#   $source_dir   - directory handed to the plugins as the source
#   $build_dir    - directory the build is written to
#   $verbosity    - verbosity level
#   $maxdocs      - passed through to plugin::begin / plugin::read
#   $debug        - when true, output goes to STDOUT instead of the mgpp tools
#   $keepold      - when true, init() does not remove an existing build
#   $remove_empty_classifications - stored on the object
#   $outhandle    - handle for progress messages (defaults to STDERR)
#   $no_text      - when true, the text is not stored (defaults to 0)
#   $failhandle   - accepted for interface compatibility; not used here
#   $gli          - when true, XML status tags are printed (defaults to 0)
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, works out the index, level,
# plugin and classifier settings and loads the document processor.
# Dies if collect.cfg is missing, if neither the document nor the section
# level is configured, or if no plugins could be loaded.
sub new {
    my $class = shift(@_);

    my ($collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    $outhandle = STDERR unless defined $outhandle;
    $no_text = 0 unless defined $no_text;

    # create an mgppbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'remove_empty_classifications'=>$remove_empty_classifications,
                      'outhandle'=>$outhandle,
                      'no_text'=>$no_text,
                      'notbuilt'=>{},    # indexes not built
                      'indexfieldmap'=>\%static_indexfield_map,
                      'gli'=>$gli
                      }, $class;

    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # sort out the indexes
    # indexes are specified with spaces, but we put them into one index.
    # A collect.cfg without an "indexes" entry is treated as an empty list
    # rather than dereferencing an undefined value.
    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    $indexes = [] unless defined $indexes;
    $self->{'collect_cfg'}->{'indexes'} = [];
    push (@{$self->{'collect_cfg'}->{'indexes'}}, join(',', @$indexes));

    # sort out subcollection indexes
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes
    if (defined $self->{'collect_cfg'}->{'languages'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
            foreach my $index (@$indexes) {
                if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
                }
                else { # add in an empty subcollection field
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
                }
            }
        }
    }

    # make sure that the same index isn't specified more than once
    my %tmphash = ();
    my @tmparray = @{$self->{'collect_cfg'}->{'indexes'}};
    $self->{'collect_cfg'}->{'indexes'} = [];
    foreach my $i (@tmparray) {
        if (!defined ($tmphash{$i})) {
            push (@{$self->{'collect_cfg'}->{'indexes'}}, $i);
            $tmphash{$i} = 1;
        }
    }

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    $self->{'levelorder'} = [];
    if (defined $self->{'collect_cfg'}->{'levels'}) {
        # note: $level aliases the configuration entry, so the level names
        # stored in collect_cfg are lower-cased in place here
        foreach my $level ( @{$self->{'collect_cfg'}->{'levels'}} ){
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    } else { # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    # the document level is "document" if configured, otherwise "section"
    $self->{'doc_level'} = "document";
    if (! $self->{'levels'}->{'document'}) {
        if ($self->{'levels'}->{'section'}) {
            $self->{'doc_level'} = "section";
        } else {
            die "you must have either document or section level specified!!\n";
        }
    }

    print $outhandle "doclevel = ". $self->{'doc_level'}."\n";

    # get the list of plugins for this collection

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, \@global_opts);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mgpp buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgppbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # call the constructor directly on the class name; no string eval or
    # indirect-object syntax is needed for this
    $self->{'buildproc'} = $buildproctype->new($collection, $source_dir, $build_dir,
                                               $keepold, $verbosity, $outhandle);

    $self->{'buildtype'} = "mgpp";

    return $self;
}
273
# Prepare the build directory: unless the builder is in debug or keepold
# mode, remove the old build and recreate the build directory together
# with its "text" subdirectory.
sub init {
    my ($self) = @_;

    # an existing build is left untouched when debugging or keeping old files
    return if ($self->{'debug'} || $self->{'keepold'});

    # remove any old builds
    my $build_dir = $self->{'build_dir'};
    &util::rm_r($build_dir);
    &util::mk_all_dir($build_dir);

    # make the text directory
    &util::mk_all_dir("$build_dir/text");
}
287
# Record the strip_html setting on this builder and pass the same value
# on to the document processor.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_strip_html($strip);
}
295
# Create the compressed text for the collection.
#
# Argument: $textindex - the index description handed to the document
# processor through set_index.
#
# Outside debug mode the documents are piped through "mgpp_passes -T1" to
# collect text statistics, mgpp_compression_dict builds the compression
# dictionary, and the documents are piped through "mgpp_passes -T2" to
# compress the text. In debug mode nothing is run and the document
# processor writes to STDOUT. Dies if a required executable is missing or
# a pipe cannot be opened. When 'gli' is set, XML status tags are printed
# to STDERR.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them
    my $mgpp_passes_sections = "";
    my ($doc_level) = $self->{'doc_level'};
    $mgpp_passes_sections .= "-J " . $level_map{$doc_level} . " ";
    foreach my $level (keys %{$self->{'levels'}}) {
        if ($level ne $doc_level && $level ne "paragraph") {
            $mgpp_passes_sections .= "-K " . $level_map{$level} . " ";
        }
    }
    $mgpp_passes_sections .= "-K SENT ";
    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size to be 12 meg
    print $outhandle "\n    collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        #print $outhandle "trying to run (compress 1) mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra\n";
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) {
            # self-closing tag, matching every other FatalError message
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
        $handle = mgppbuilder::PIPEOUT;
    }

    # gdbm_level
    my $gdbm_level = "document";
    if ($self->{'levels'}->{'section'}) {
        $gdbm_level = "section";
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    if ($self->{'no_text'}) {
        $self->{'buildproc'}->set_store_text(0);
    } else {
        $self->{'buildproc'}->set_store_text(1);
    }
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_gdbm_level ($gdbm_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # close the first-pass pipe exactly once (nothing was opened in debug mode)
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n    creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # reopen the pipe for the second (compression) pass
        #print $outhandle "trying to run (compress 2) mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra\n";
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n    compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
424
# Decide whether an index should be built.
#
# Each entry of the collect.cfg "dontbuild" list is used as a pattern that
# must match the whole index description. On the first match the index is
# recorded in 'notbuilt' and 0 is returned; otherwise 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        #push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
441
# Build the indexes for the collection.
#
# Argument: $indexname (optional) - when it contains a word character only
# that index is built; otherwise every index listed in collect_cfg is
# considered. Indexes rejected by want_built are skipped. Afterwards the
# final field list is produced with make_final_field_list.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};
    my $chatty = ($self->{'verbosity'} >= 1);

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if $chatty;
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if $chatty;
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }

    # define the final field lists
    $self->make_final_field_list();
}
474
# Work out a directory name for each index description.
#
# Argument: $indexes - reference to a list of index descriptions of the
# form "fields:subcollection:languages".
#
# Returns a reference to a hash that maps every full index description to
# its directory name and also holds:
#   'indexmap', 'subcollectionmap', 'languagemap' - long name => short name
#   'indexmaporder', 'subcollectionmaporder', 'languagemaporder' - the
#       long names in the order they were first seen
#   direct long name => short name entries for the fields, subcollection
#       and language parts
sub create_index_mapping {
    my ($self, $indexes) = @_;

    my %mapping = (
        'indexmaporder'         => [],
        'subcollectionmaporder' => [],
        'languagemaporder'      => [],
    );

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text' => 'text', 'extra' => 'extra');
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);
        my $fieldkey = "$fields";

        # the directory name starts with a processed version of index fields
        #my ($pindex) = $self->process_field($fields);
        #$pindex = lc ($pindex);
        # now we only ever have one index, and its called 'idx'
        my $pindex = 'idx';

        # then the processed subcollection and language, when present
        my $psub = lc ($self->process_field ($subcollection));
        my $plang = lc ($self->process_field ($languages));

        # keep asking for a new name until the directory name is unused
        my $dirname = $pindex . $psub . $plang;
        $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang)
            while (defined ($dirnames{$dirname}));

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        unless (defined $mapping{'indexmap'}{$fieldkey}) {
            $mapping{'indexmap'}{$fieldkey} = $pindex;
            push (@{$mapping{'indexmaporder'}}, $fieldkey);
            $mapping{$fieldkey} = $pindex unless defined $mapping{$fieldkey};
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember which names are now taken
        $dirnames{$dirname} = $index;
        $pnames{'index'}->{$pindex} = $fieldkey;
        $pnames{'subcollection'}->{$psub} = $subcollection;
        $pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
546
# Returns a processed (shortened) version of a field.
# If the field has only one component, the processed version contains the
# first character and the next consonant of that component (or simply the
# first two characters when no consonant follows). Otherwise it contains
# the first character of each of the first two comma-separated components.
# An undefined field, or one without a word character, gives "".
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # only the first two components contribute, one character each
        my @initials = @components[0, 1];
        s/^(.).*$/$1/ foreach @initials;
        return join("", @initials);
    }

    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    ($first, $second) = $field =~ /^(.)(.)/ unless defined $first && defined $second;
    return "$first$second";
}
569
# Change one of the short names of an index so that the combined
# directory name differs.
#
# Arguments: $namehash - hash of { 'index', 'subcollection', 'languages' }
# tables mapping short names to the long names that own them; $index -
# the full index description; $indexref, $subref, $langref - references
# to the current short names.
#
# The first short name (index, then subcollection, then language) that is
# recorded for a different long name is advanced with get_next_version.
# Returns the concatenation of the three short names.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @candidates = (
        [ 'index',         $indexref, "$fields"      ],
        [ 'subcollection', $subref,   $subcollection ],
        [ 'languages',     $langref,  $languages     ],
    );

    foreach my $candidate (@candidates) {
        my ($kind, $nameref, $longname) = @$candidate;
        if ($namehash->{$kind}->{$$nameref} ne $longname) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
584
# Advance the name behind $nameref to its next version, in place.
# A name ending in two digits has those two digits incremented, a name
# ending in a single digit has that digit incremented (9 becomes 10), and
# any other name has its last character replaced by 0.
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $counter = $1;
        $counter++;
        $$nameref =~ s/\d\d$/$counter/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $digit = $1;
        my $replacement = ($digit == 9) ? 10 : $digit + 1;
        $$nameref =~ s/\d$/$replacement/;
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
600
# Build a single index.
#
# Argument: $index - index description of the form
# "fields:subcollection:languages"; its directory is looked up in
# 'index_mapping'.
#
# Outside debug mode the documents are piped through "mgpp_passes -I1",
# mgpp_perf_hash_build is run, the documents are piped through
# "mgpp_passes -I2", and then mgpp_weights_build, mgpp_invf_dict and
# mgpp_stem_idx are run. In debug mode the document processor writes to
# STDOUT and none of the tools are run. If the .id dictionary file does
# not exist after the first pass the index is recorded in 'notbuilt' and
# the sub returns early. Dies if a required executable is missing or a
# pipe cannot be opened.
sub build_index {
    my ($self, $index) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};
    my $collection = $self->{'collection'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));
    my $fullindexprefix = &util::filename_cat ($build_dir, $indexdir, $collection);
    my $fulltextprefix = &util::filename_cat ($build_dir, "text", $collection);

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";

    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");

    # define the section names and possibly the doc name for mgpasses
    my $doc_level = $self->{'doc_level'};
    my $mgpp_passes_sections = "-J " . $level_map{$doc_level} . " ";
    foreach my $level (keys %{$self->{'levels'}}) {
        next if ($level eq $doc_level);
        $mgpp_passes_sections .= "-K " . $level_map{$level} . " ";
    }

    my $mgpp_perf_hash_build_exe = &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe = &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe = &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe = &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # Report a missing or unrunnable program and give up.
    my $fatal = sub {
        my ($errname, $program) = @_;
        print STDERR "<FatalError name='$errname'/>\n</Stage>\n" if $self->{'gli'};
        die "mgppbuilder::build_index - couldn't run $program\n";
    };

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    # get the index expression if this index belongs
    # to a subcollection
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);
    my $indexexparr = [
        map  { $self->{'collect_cfg'}->{'subcollection'}->{$_} }
        grep { defined ($self->{'collect_cfg'}->{'subcollection'}->{$_}) }
        @subcollections
    ];

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $language_metadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    # each language is passed on as written, including any leading "!"
    my $langarr = [ map { "$_" } @languages ];

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my $handle;
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        $fatal->('NoRunMGPasses', $mgpp_passes_exe)
            unless (-e "$mgpp_passes_exe" &&
                    open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I1 $osextra"));
        $handle = mgppbuilder::PIPEOUT;
    }

    # gdbm_level
    my $gdbm_level = $self->{'levels'}->{'section'} ? "section" : "document";

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_index_languages ($language_metadata, $langarr) if (defined $language);
    $buildproc->set_indexing_text (1);
    $buildproc->set_store_text(1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->set_gdbm_level ($gdbm_level);

    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fall over.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    unless (-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    unless ($self->{'debug'}) {
        # create the perfect hash function
        $fatal->('NoRunMGHash', $mgpp_perf_hash_build_exe)
            unless (-e "$mgpp_perf_hash_build_exe");
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        # reopen the pipe for the second (inversion) pass
        $fatal->('NoRunMGPasses', $mgpp_passes_exe)
            unless (-e "$mgpp_passes_exe" &&
                    open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I2 $osextra"));
    }

    # invert the text
    print $outhandle "\n    inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};
    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    $self->print_stats ();

    unless ($self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n    create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        $fatal->('NoRunMGWeights', $mgpp_weights_build_exe)
            unless (-e "$mgpp_weights_build_exe");
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        $fatal->('NoRunMGInvf', $mgpp_invf_dict_exe)
            unless (-e "$mgpp_invf_dict_exe");
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        $fatal->('NoRunMGStem', $mgpp_stem_idx_exe)
            unless (-e "$mgpp_stem_idx_exe");
        foreach my $stem_method (1 .. 3) {
            system ("mgpp_stem_idx$exe -b 4096 -s$stem_method -f \"$fullindexprefix\" $osextra");
        }

        # look for unwanted files (the removal itself is currently
        # commented out, so files are only reported)
        my $tmpdir = &util::filename_cat ($build_dir, $indexdir);
        opendir (my $dirhandle, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            next unless defined $suffix;
            next if defined $wanted_index_files{$suffix};

            # delete it!
            print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
            #&util::rm (&util::filename_cat ($tmpdir, $file));
        }
        closedir ($dirhandle);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
813
814sub make_infodatabase {
815 my $self = shift (@_);
816 my $outhandle = $self->{'outhandle'};
817
818
819 my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
820 my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
821 &util::mk_all_dir ($textdir);
822 &util::mk_all_dir ($assocdir);
823
824 # get db name
825 my $dbext = ".bdb";
826 $dbext = ".ldb" if &util::is_little_endian();
827 my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
828 $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);
829
830 my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
831 my $exe = &util::get_os_exe ();
832 my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");
833
834 # define the indexed field mapping if not already done so (ie if infodb called separately from build_index)
835 if (!defined $self->{'build_cfg'}) {
836 $self->read_final_field_list();
837 }
838 print $outhandle "\n*** creating the info database and processing associated files\n"
839 if ($self->{'verbosity'} >= 1);
840 print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};
841
842 # init all the classifiers
843 &classify::init_classifiers ($self->{'classifiers'});
844
845 # set up the document processor
846 my ($handle);
847 if ($self->{'debug'}) {
848 $handle = STDOUT;
849 } else {
850 if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
851 print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
852 die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
853 }
854 $handle = mgppbuilder::PIPEOUT;
855 }
856
857 $self->{'buildproc'}->set_output_handle ($handle);
858 $self->{'buildproc'}->set_mode ('infodb');
859 $self->{'buildproc'}->set_assocdir ($assocdir);
860 $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
861 $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
862 $self->{'buildproc'}->set_indexing_text (0);
863 $self->{'buildproc'}->set_store_text(1);
864 #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
865
866 # make_infodatabase does not support incremental build
867 # => full reset needed
868 $self->{'buildproc'}->zero_reset();
869
870 # do the collection info
871 print $handle "[collection]\n";
872
873 # first do the collection meta stuff - everything without a dot
874 my $collmetadefined = 0;
875 if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
876 $collmetadefined = 1;
877 foreach my $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
878 next if ($cmeta =~ /^\./); # for now, ignore ones with dots
879 my ($metadata_entry) = $self->create_language_db_map($cmeta, $cmeta);
880 #write the entry to the file
881 print $handle $metadata_entry;
882
883 } # foreach collmeta key
884 }
885 #add the index field macros to [collection]
886 # eg <TI>Title
887 # <SU>Subject
888 # these now come from collection meta. if that is not defined, usses the metadata name
889 my $field_entry="";
890 my $collmeta = "";
891 foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}){
892 my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
893 next if $shortfield eq 1;
894
895 # we need to check if some coll meta has been defined
896 $collmeta = ".$longfield";
897 if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
898 my $metadata_entry = $self->create_language_db_map($collmeta, $shortfield);
899 $field_entry .= $metadata_entry;
900 } else { #use the metadata names, or the text macros for allfields and textonly
901 if ($longfield eq "allfields") {
902 $field_entry .= "<$shortfield>_query:textallfields_\n";
903 } elsif ($longfield eq "text") {
904 $field_entry .= "<$shortfield>_query:texttextonly_\n";
905 } else {
906 $field_entry .= "<$shortfield>$longfield\n";
907 }
908 }
909 }
910 print $handle $field_entry;
911
912 # now add the level names
913 my $level_entry = "";
914 foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
915 $collmeta = ".$level"; # based on the original specification
916 $level =~ tr/A-Z/a-z/; # make it lower case
917 my $levelid = $level_map{$level}; # find the actual value we used in the index
918 if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
919 my $metadata_entry = $self->create_language_db_map($collmeta, $levelid);
920 $level_entry .= $metadata_entry;
921 } else {
922 # use the default macro
923 $level_entry .= "<$levelid>" . $level_map{$levelid} . "\n";
924 }
925 }
926 print $handle $level_entry;
927
928 # now add subcoll meta
929 my $subcoll_entry = "";
930 my $shortname = "";
931 my $one_entry = "";
932 foreach my $subcoll (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
933 if (defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$subcoll"}) {
934 $shortname = $self->{'index_mapping'}->{$subcoll};
935 $one_entry = $self->create_language_db_map(".$subcoll", $shortname);
936 $subcoll_entry .= $one_entry;
937 } else {
938 $subcoll_entry .= "<$shortname>$subcoll\n";
939 }
940 }
941 print $handle $subcoll_entry;
942
943 # now add language meta
944 my $lang_entry = "";
945 foreach my $lang (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
946 if (defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$lang"}) {
947 $shortname = $self->{'index_mapping'}->{$lang};
948 $one_entry = $self->create_language_db_map(".$lang", $shortname);
949 $lang_entry .= $one_entry;
950 } else {
951 $lang_entry .= "<$shortname>$lang\n";
952 }
953 }
954 print $handle $lang_entry;
955 # end the collection entry
956 print $handle "\n" . ('-' x 70) . "\n";
957
958 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
959 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
960
961 # output classification information
962 &classify::output_classify_info ($self->{'classifiers'}, $handle,
963 $self->{'remove_empty_classifications'},
964 $self->{'gli'});
965
966 #output doclist
967 my @doclist = $self->{'buildproc'}->get_doc_list();
968 my $docs = join (";",@doclist);
969 print $handle "[browselist]\n";
970 print $handle "<hastxt>0\n";
971 print $handle "<childtype>VList\n";
972 print $handle "<numleafdocs>" . ($#doclist+1) . "\n";
973 print $handle "<thistype>Invisible\n";
974 print $handle "<contains>$docs";
975 print $handle "\n" . ('-' x 70) . "\n";
976 close ($handle) if !$self->{'debug'};
977
978 print STDERR "</Stage>\n" if $self->{'gli'};
979}
980
# Build the info-database lines for one item of collection metadata.
#
# Parameters:
#   $metaname - key looked up in $self->{'collect_cfg'}->{'collectionmeta'};
#               its value is a hash keyed by 'default' or '[l=<lang>]'
#   $mapname  - tag name to write in the generated lines
#
# Returns a string made of one "<$mapname>value\n" line (the default, always
# first) followed by one "<$mapname:<lang>>value\n" line per language.
#
# If no explicit default exists, the English value is used as the default;
# failing that, the first entry (in sorted key order) is used.
sub create_language_db_map {
    my $self = shift (@_);
    my ($metaname, $mapname) = @_;

    # fall back to an empty hash so an unknown name is not autovivified
    # into the collection configuration
    my $values = $self->{'collect_cfg'}->{'collectionmeta'}->{$metaname} || {};

    my $defaultfound   = 0;
    my $englishfound   = 0;
    my $first          = 1;
    my $metadata_entry = "";
    my $default        = "";

    # iterate through the languages; sorted so that both the output and the
    # "first entry" fallback are reproducible from one build to the next
    foreach my $lang (sort keys %$values) {
        my $value = $values->{$lang};

        if ($first) {
            $first = 0;
            # set the default default to the first entry
            $default = $value;
        }

        if ($lang =~ /default/) {
            $defaultfound = 1;
            # the default entry goes first
            $metadata_entry = "<${mapname}>" . $values->{'default'} . "\n" . $metadata_entry;
        }
        else {
            my ($code) = $lang =~ /^\[l=(\w*)\]$/;
            if ($code) {
                $metadata_entry .= "<${mapname}:${code}>" . $value . "\n";

                # Use the English value as the default if no default is
                # specified.  The match is anchored so that codes which merely
                # contain "en" (e.g. "ben") are not mistaken for English; with
                # sorted keys a plain "en" is seen before any "en_XX" variant.
                if (!$englishfound && $code =~ /^en(?:_|$)/i) {
                    $englishfound = 1;
                    $default = $value;
                }
            }
        }
    } #foreach lang

    # if we haven't found a default, put one in
    if (!$defaultfound) {
        $metadata_entry = "<${mapname}>$default\n" . $metadata_entry;
    }
    return $metadata_entry;
}
# Intentionally empty: does nothing beyond taking the invocant off @_.
# NOTE(review): presumably a hook for builders that inherit from this
# package to override - confirm against the subclasses.
sub collect_specific {
    my $self = shift;
    return $self;
}
1025
# At the end of building, the build processor holds an indexfieldmap with all
# the mappings (plus some extras) and a set of index fields, some of which were
# not named in the index definition.  This routine produces, in
# $self->{'build_cfg'}:
#   'indexfields'   - ordered list of the fields that are indexed
#   'indexfieldmap' - matching list of "longname->shortname" strings
# These are used for the build.cfg file and for the collection meta
# definition.  Any previous $self->{'build_cfg'} is discarded.
sub make_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    my $buildproc = $self->{'buildproc'};

    # store the indexfieldmap information
    my @indexfieldmap = ();
    my @indexfields = ();
    my %specified = ();
    my @specifiedorder = ();

    # go through the index definition and add each thing to a map, so we
    # can easily check if it is already specified - when doing the
    # metadata, we print out all the individual fields, but some may
    # already be specified in the index definition, so we dont want to add
    # those again.
    foreach my $index (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        (my $parts = $index) =~ s/:.*$//;
        foreach my $f (split(',', $parts)) {
            next if defined $specified{$f};
            $specified{$f} = 1;
            push (@specifiedorder, "$f");
        }
    }

    # add all fields bit
    foreach my $field (@specifiedorder) {
        if ($field eq "metadata") {
            # "metadata" stands for every field the build processor saw that
            # was not named explicitly.  Sorted so the resulting build.cfg
            # does not depend on hash iteration order.
            foreach my $newfield (sort keys %{$buildproc->{'indexfields'}}) {
                next if defined $specified{$newfield};
                push (@indexfieldmap, $newfield . '->' . $buildproc->{'indexfieldmap'}->{$newfield});
                push (@indexfields, "$newfield");
            }
        } elsif ($field eq 'text') {
            push (@indexfieldmap, 'text->TX');
            push (@indexfields, "text");
        } elsif ($field eq 'allfields') {
            push (@indexfieldmap, 'allfields->ZZ');
            push (@indexfields, "allfields");
        } else {
            push (@indexfieldmap, $field . '->' . $buildproc->{'indexfieldmap'}->{$field});
            push (@indexfields, "$field");
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
1088
1089
# Recreate the field list from an existing build.cfg file, looking first in
# the build directory and then in "index" under $ENV{'GSDLCOLLECTDIR'}.  If
# no build.cfg can be found the field list is left empty (there is unlikely
# to be any index anyway) and $self->{'build_cfg'} stays an empty hash.
#
# On success $self->{'build_cfg'} gets 'indexfields' and 'indexfieldmap'
# lists copied from the file, and each "longname->shortname" mapping is also
# stored in $self->{'buildproc'}->{'indexfieldmap'}.
sub read_final_field_list {
    my $self = shift (@_);
    $self->{'build_cfg'} = {};
    my @indexfieldmap = ();
    my @indexfields = ();

    if (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) {
        # set the default mapping
        $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    }

    # we read the stuff in from the build.cfg file - if its there
    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, "build.cfg");

    if (!-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg");
        if (!-e $buildconfigfile) {
            # we cant find a config file - just ignore the field list
            return;
        }
    }

    my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            push (@indexfields, "$field");
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $field (@{$buildcfg->{'indexfieldmap'}}) {
            # the raw entry is always carried over to the new build.cfg
            push (@indexfieldmap, "$field");

            # only a well-formed "name->short" entry updates the live map;
            # a malformed one would otherwise be stored under an empty key
            my ($f, $v) = $field =~ /^(.*)\-\>(.*)$/;
            next unless defined $f;
            $self->{'buildproc'}->{'indexfieldmap'}->{$f} = $v;
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
1134
# Assemble the build information and write it to build.cfg in the build
# directory.
#
# Starts from $self->{'build_cfg'} when defined (this already includes
# indexfieldmap and indexfields) and adds: build date and type, index stem,
# index levels and level map, text level, document/section/byte counts from
# the build processor, the index/subcollection/language maps, and the list of
# indexes that were not built.  Progress is reported on the output handle
# (verbosity >= 1) and as <Stage> markers on STDERR when 'gli' is set.
sub make_auxiliary_files {
    my $self = shift (@_);

    my $build_cfg = {};
    # this already includes indexfieldmap and indexfields
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};
    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists before writing into it
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = $self->{'collection'};

    # store the level info
    my @indexlevels = ();
    my @levelmap = ();
    foreach my $l (@{$self->{'levelorder'}}) {
        push (@indexlevels, $level_map{$l});
        push (@levelmap, $l . '->' . $level_map{$l});
    }
    $build_cfg->{'indexlevels'} = \@indexlevels;
    $build_cfg->{'levelmap'} = \@levelmap;

    if ($self->{'levels'}->{'section'}) {
        $build_cfg->{'textlevel'} = $level_map{'section'};
    } else {
        $build_cfg->{'textlevel'} = $level_map{'document'};
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names,
    # leaving out any index that was not built
    my $mapping = $self->{'index_mapping'};
    my @indexmap = ();
    foreach my $index (@{$mapping->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, $index . '->' . $mapping->{'indexmap'}->{$index});
        }
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$mapping->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . '->' .
              $mapping->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$mapping->{'languagemaporder'}}) {
        push (@languagemap, $language . '->' .
              $mapping->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    # sorted so that build.cfg does not depend on hash iteration order
    my @notbuilt = sort keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    # write out the build information; the path is built with
    # util::filename_cat, the same way read_final_field_list locates it.
    # NOTE(review): presumably the first pattern names the single-valued keys
    # and the second the list-valued keys - confirm in cfgread::write_cfg_file
    &cfgread::write_cfg_file(&util::filename_cat($self->{'build_dir'}, "build.cfg"), $build_cfg,
        '^(builddate|buildtype|numdocs|numsections|numbytes|textlevel|indexstem)$',
        '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields|indexlevels|levelmap)$');

    print STDERR "</Stage>\n" if $self->{'gli'};
}
1212
# Hand this builder's plugin info and build processor to plugin::deinit.
sub deinit {
    my $self = shift;

    my ($pluginfo, $buildproc) = @{$self}{'pluginfo', 'buildproc'};
    return &plugin::deinit($pluginfo, $buildproc);
}
1218
# Report the byte counts of the pass just completed on the output handle.
#
# The figures come from the build processor: whether text was being indexed
# or compressed, the current index, total bytes and bytes actually processed.
# When fewer than 50 bytes were processed (and text was expected), either a
# short "nothing added" note is printed (keepold, and only for exactly 0
# bytes) or a prominent warning.
sub print_stats {
    my $self = shift;

    my $out       = $self->{'outhandle'};
    my $proc      = $self->{'buildproc'};
    my $indexing  = $proc->get_indexing_text();
    my $index     = $proc->get_index();
    my $total     = $proc->get_num_bytes();
    my $processed = $proc->get_num_processed_bytes();

    my $heading = $indexing
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print {$out} $heading;
    print {$out} "Total bytes in collection: $total\n";
    print {$out} "Total bytes in $index: $processed\n";

    # nothing more to say unless suspiciously little text was processed
    # in a pass that was expected to handle text
    return unless $processed < 50;
    return unless $indexing || !$self->{'no_text'};

    if ($self->{'keepold'}) {
        # existing files are being kept: only mention a completely empty pass
        return unless $processed == 0;
        my $note = $indexing
            ? "No additional text was added to $index\n"
            : "No additional text was compressed\n";
        print {$out} $note;
        return;
    }

    my $warning = $indexing
        ? "WARNING: There is very little or no text to process for $index\n"
        : "WARNING: There is very little or no text to compress\n";
    print {$out} "***************\n";
    print {$out} $warning;
    print {$out} " Was this your intention?\n";
    print {$out} "***************\n";
    return;
}
1261
12621;
1263
1264
Note: See TracBrowser for help on using the repository browser.