source: trunk/gsdl/perllib/mgppbuilder.pm@ 6407

Last change on this file since 6407 was 6407, checked in by jmt12, 20 years ago

Added non-language specific messages useful for progress checking. These are enabled by providing a -gli argument.

  • Property svn:keywords set to Author Date Id Revision
File size: 40.3 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
35
BEGIN {
    # Turn autoflush on for both standard streams so that output from
    # mgpp doesn't get out of sync with output from the plugins.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}
42
END {
    # Switch autoflush back off on both standard streams at exit.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
47
# NOTE(review): presumably the maximum document size handed to the mgpp
# tools - it is not referenced in the visible part of this file; confirm
# where it is consumed.
$maxdocsize = 12000;

# One table serving two lookups:
#   - level name as written in collect.cfg  => tag used in the index
#   - tag used in the index                 => default display macro
%level_map = (
    document  => 'Doc',
    section   => 'Sec',
    paragraph => 'Para',

    Doc       => '_textdocument_',
    Sec       => '_textsection_',
    Para      => '_textparagraph_',
);

#$doc_level = "Doc";
#$sec_level = "Sec";
#$para_level = "Para";
60
# File suffixes that belong to a finished index.  build_index looks every
# suffix found in an index directory up in this table and reports the
# files whose suffix is not listed.
%wanted_index_files = map { $_ => 1 }
    qw(td t tl ti idb ib1 ib2 ib3 i il w wa);
73
# TODO: change this so a user can add their own mappings via a file or cfg.
#
# Long metadata names map to the short tag used for the field.  Every
# name that cannot be used as a field name maps to 1: the short tags
# themselves, the query operators AND, OR, NOT and NEAR, and the level
# names (Doc, Sec, Para).
%static_indexfield_map = (
    'Title'        => 'TI',
    'Subject'      => 'SU',
    'Creator'      => 'CR',
    'Organization' => 'ORG',
    'Source'       => 'SO',
    'Howto'        => 'HT',
    'ItemTitle'    => 'IT',
    'ProgNumber'   => 'PN',
    'People'       => 'PE',
    'Coverage'     => 'CO',
    'allfields'    => 'ZZ',
    'text'         => 'TX',

    (map { $_ => 1 } qw(TI SU CR ORG SO HT IT PN PE CO ZZ TX)),
    (map { $_ => 1 } qw(AND OR NOT NEAR)),
    (map { $_ => 1 } qw(Doc Sec Para)),
);
108
# Constructor for the mgpp builder.
#
# Reads the collection's collect.cfg, expands the configured indexes with
# any subcollections and languages, works out the indexing levels, and
# loads the plugins, the classifiers and the build document processor.
#
# Positional arguments: $collection, $source_dir, $build_dir, $verbosity,
# $maxdocs, $debug, $keepold, $allclassifications, $outhandle (defaults
# to STDERR), $no_text (defaults to 0) and $gli (defaults to 0; when true
# the other methods print XML-style progress messages to STDERR).
#
# Dies when collect.cfg cannot be found, when neither the document nor
# the section level is configured, when no plugins were loaded, or when
# the build processor cannot be loaded or constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text, $gli) = @_;

    $outhandle = 'STDERR' unless defined $outhandle;
    $no_text = 0 unless defined $no_text;
    $gli = 0 unless defined $gli;

    # create an mgppbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'outhandle'=>$outhandle,
                      'no_text'=>$no_text,
                      'notbuilt'=>{},       # indexes not built
                      'indexfieldmap'=>\%static_indexfield_map,
                      'gli'=>$gli
                      }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
    my $cfg = ($self->{'collect_cfg'} ||= {});

    # sort out the indexes
    # indexes are specified with spaces, but we put them into one index
    my $indexes = $cfg->{'indexes'};
    $cfg->{'indexes'} = [ join(',', @{ $indexes || [] }) ];

    # sort out subcollection indexes: one index per subcollection
    if (defined $cfg->{'indexsubcollections'}) {
        my @expanded = ();
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            foreach my $index (@{$cfg->{'indexes'}}) {
                push (@expanded, "$index:$subcollection");
            }
        }
        $cfg->{'indexes'} = \@expanded;
    }

    # sort out language subindexes: one index per language
    if (defined $cfg->{'languages'}) {
        # without subcollections, add in an empty subcollection field
        my $separator = defined ($cfg->{'indexsubcollections'}) ? ':' : '::';
        my @expanded = ();
        foreach my $language (@{$cfg->{'languages'}}) {
            foreach my $index (@{$cfg->{'indexes'}}) {
                push (@expanded, $index . $separator . $language);
            }
        }
        $cfg->{'indexes'} = \@expanded;
    }

    # make sure that the same index isn't specified more than once
    # (the first occurrence wins, order is preserved)
    my %seen = ();
    $cfg->{'indexes'} = [ grep { !$seen{$_}++ } @{$cfg->{'indexes'}} ];

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    $self->{'levelorder'} = [];
    if (defined $cfg->{'levels'}) {
        foreach my $level (@{$cfg->{'levels'}}) {
            # $level aliases the config entry, so this lower-cases the
            # stored configuration value as well
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    } else { # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    # the document level is "document" when configured, else "section"
    $self->{'doc_level'} = "document";
    if (! $self->{'levels'}->{'document'}) {
        if ($self->{'levels'}->{'section'}) {
            $self->{'doc_level'} = "section";
        } else {
            die "you must have either document or section level specified!!\n";
        }
    }
    print $outhandle "doclevel = ". $self->{'doc_level'}."\n";

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $cfg->{'plugin'}) {
        $plugins = $cfg->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $cfg->{'classify'}) {
        $classifiers = $cfg->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $dg (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mgpp buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgppbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # a plain class-method call: no string eval, and no indirect-object
    # syntax that has to be disambiguated from this package's own new()
    $self->{'buildproc'} = $buildproctype->new($collection, $source_dir,
                                               $build_dir, $verbosity,
                                               $outhandle);

    return $self;
}
261
# Prepares the build directory: unless this is a debug or a keepold run,
# the old build is removed and an empty build directory with a "text"
# subdirectory is created in its place.
sub init {
    my ($self) = @_;

    if (!($self->{'debug'} || $self->{'keepold'})) {
        my $build_dir = $self->{'build_dir'};

        # remove any old builds
        &util::rm_r($build_dir);
        &util::mk_all_dir($build_dir);

        # make the text directory
        &util::mk_all_dir("$build_dir/text");
    }
}
275
# Records the strip_html setting on the builder and passes the same value
# on to the build document processor.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html($strip);
}
283
# Creates the compressed text for the collection.
#
# The documents are run through the plugins twice: once piped into
# "mgpp_passes -T1" to collect the text statistics and, after
# mgpp_compression_dict has built the compression dictionary, once more
# piped into "mgpp_passes -T2" to compress the text.  In debug mode
# nothing is executed and the document processor writes to STDOUT.
#
# Argument: $textindex, passed to the document processor's set_index.
# Dies when one of the mgpp executables is missing or cannot be started.
# With 'gli' set, progress and error messages are also printed to STDERR.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them
    my $mgpp_passes_sections = "";
    my ($doc_level) = $self->{'doc_level'};
    $mgpp_passes_sections .= "-J " . $level_map{$doc_level} . " ";
    foreach my $level (keys %{$self->{'levels'}}) {
        if ($level ne $doc_level && $level ne "paragraph") {
            $mgpp_passes_sections .= "-K " . $level_map{$level} . " ";
        }
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size to be 12 meg
    print $outhandle "\n collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        #print $outhandle "trying to run (compress 1) mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra\n";
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
        # the document processor is handed the handle by name, so
        # re-opening PIPEOUT for the second pass needs no further call
        # to set_output_handle
        $handle = 'mgppbuilder::PIPEOUT';
    }
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    if ($self->{'no_text'}) {
        $self->{'buildproc'}->set_store_text(0);
    } else {
        $self->{'buildproc'}->set_store_text(1);
    }
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # close the first-pass pipe (once); STDOUT is left open in debug mode
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # start the second pass; it reads the same documents again
        #print $outhandle "trying to run (compress 2) mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra\n";
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
404
# Decides whether an index should be built.
#
# Each entry of the collection's "dontbuild" configuration is used as a
# pattern that has to match the whole index description.  On the first
# match the index is recorded in 'notbuilt' and 0 is returned; when
# nothing matches (or no "dontbuild" is configured) the result is 1.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
421
# Builds either the single index named by $indexname (when it contains a
# word character) or every index listed in the collection configuration,
# skipping the ones want_built rejects, and then makes the final field
# list.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }

    # define the final field lists
    $self->make_final_field_list();
}
454
# Creates directory names for each of the index descriptions.
#
# Argument: a reference to the list of index descriptions, each of the
# form "fields", "fields:subcollection" or "fields:subcollection:languages".
#
# Returns a reference to a hash holding
#   - index description               => directory name
#   - fields / subcollection / languages  => their processed short names
#   - 'indexmap', 'subcollectionmap', 'languagemap': the same short names
#     grouped by kind
#   - 'indexmaporder', 'subcollectionmaporder', 'languagemaporder': the
#     names of each kind in the order they were first seen
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();

    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # processed name => original name, one independent table per kind
    # (make_unique consults these to find out which part collided)
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # the directory name starts with a processed version of index fields
        #my ($pindex) = $self->process_field($fields);
        #$pindex = lc ($pindex);
        # now we only ever have one index, and its called 'idx'
        my $pindex = 'idx';

        # next comes a processed version of the subcollection if there is one.
        my $psub = lc ($self->process_field ($subcollection));

        # next comes a processed version of the language if there is one.
        my $plang = lc ($self->process_field ($languages));

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            if (!defined $mapping{"$fields"}) {
                $mapping{"$fields"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$fields";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
526
# Returns a processed (shortened) version of a field.
# If the field has only one component, the processed version contains
# the first character and the next consonant of that component (or
# simply the first two characters when there is no such consonant).
# Otherwise it contains the first character of each of the first two
# comma-separated components.
# An undefined field, or one without any word character, gives "".
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;

    if (scalar @components < 2) {
        my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
        ($first, $second) = $field =~ /^(.)(.)/ unless defined $first && defined $second;
        return "$first$second";
    }

    my $processed = "";
    foreach my $component (@components[0, 1]) {
        my $initial = $component;
        $initial =~ s/^(.).*$/$1/;
        $processed .= $initial;
    }
    return $processed;
}
549
# Resolves a directory name collision for $index.
#
# $namehash maps, per kind ('index', 'subcollection', 'languages'), the
# processed names already handed out to the original names they stand
# for.  The first processed name (index, then subcollection, then
# languages) that is already taken by a different original is advanced to
# its next version in place, through the reference passed in.
# Returns the concatenation of the three (possibly updated) names.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @parts = (
        ['index',         $indexref, "$fields"],
        ['subcollection', $subref,   $subcollection],
        ['languages',     $langref,  $languages],
    );

    foreach my $part (@parts) {
        my ($kind, $nameref, $original) = @$part;
        if ($namehash->{$kind}->{$$nameref} ne $original) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
564
# Advances the name behind $nameref to its next version, in place.
#
# A name ending in one or two digits has that number incremented
# ("s0" -> "s1", "s9" -> "s10", "s99" -> "s100"); any other name has its
# last character replaced by 0 ("su" -> "s0").  An empty name becomes
# "0".  Every call changes the name, which the collision loop in
# create_index_mapping relies on in order to terminate.
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d?)$/) {
        # increment the trailing number; the match takes two digits when
        # the name ends in at least two, otherwise the single digit
        my $num = $1;
        $num ++;
        $$nameref =~ s/\d\d?$/$num/;
    } elsif ($$nameref eq "") {
        # nothing to replace - start the numbering
        $$nameref = "0";
    } else {
        $$nameref =~ s/.$/0/;
    }
}
580
# Builds one index.
#
# $index is an index description ("fields[:subcollections[:languages]]");
# its directory name is looked up in 'index_mapping'.  The documents are
# piped through "mgpp_passes -I1" (index dictionary) and, after
# mgpp_perf_hash_build, through "mgpp_passes -I2" (inversion); then the
# weights file, the on-disk stemmed dictionary and the stem indexes are
# created.  In debug mode nothing is executed and the document processor
# writes to STDOUT.
#
# When the index dictionary (.id file) has not been produced, the index
# is recorded in 'notbuilt' and the method returns without dying.  It
# dies when an mgpp executable is missing or cannot be started, or when
# the index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
                                               $indexdir,
                                               $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";

    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");

    # define the section names and possibly the doc name for mgpasses
    my $mgpp_passes_sections = "";
    my ($doc_level) = $self->{'doc_level'};
    $mgpp_passes_sections .= "-J " . $level_map{$doc_level} ." ";

    foreach my $level (keys %{$self->{'levels'}}) {
        if ($level ne $doc_level) {
            $mgpp_passes_sections .= "-K " . $level_map{$level}. " ";
        }
    }

    my $mgpp_perf_hash_build_exe =
        &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcoll (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    foreach my $wanted (@languages) {
        # a leading "!" negates the language expression
        my $not=0;
        if ($wanted =~ s/^\!//) {
            $not = 1;
        }
        foreach my $lang (@{$self->{'collect_cfg'}->{'languages'}}) {
            if ($lang eq $wanted) {
                if ($not) {
                    push (@$indexexparr, "!Language/$wanted/");
                } else {
                    push (@$indexexparr, "Language/$wanted/");
                }
                last;
            }
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I1 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
        # the document processor is handed the handle by name, so
        # re-opening PIPEOUT for the second pass needs no further call
        # to set_output_handle
        $handle = 'mgppbuilder::PIPEOUT';
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_store_text(1);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if
    # not we quit building this index so the whole process doesn't fail.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );


        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }
        system ("mgpp_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
        system ("mgpp_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
        system ("mgpp_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");

        # remove unwanted files
        # (the removal itself is commented out; files are only reported)
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
785
# Creates the collection's info database and processes associated files.
#
# Everything is written as text to a pipe into txt2db (to STDOUT in debug
# mode): first the [collection] entry - collection metadata, the index
# field names, the level names and the subcollection names - then
# whatever the plugins and the document processor emit in 'infodb' mode,
# then the classification information and finally a [browselist] entry
# listing all documents.
#
# Dies when txt2db is missing or cannot be started.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    # define the indexed field mapping if not already done so (ie if infodb called separately from build_index)
    if (!defined $self->{'build_cfg'}) {
        $self->read_final_field_list();
    }
    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('infodb');
    $self->{'buildproc'}->set_assocdir ($assocdir);
    $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_store_text(1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});

    $self->{'buildproc'}->reset();

    # do the collection info
    print $handle "[collection]\n";

    # first do the collection meta stuff - everything without a dot
    my $collmetadefined = 0;
    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
        $collmetadefined = 1;
        foreach my $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
            next if ($cmeta =~ /^\./); # for now, ignore ones with dots
            my ($metadata_entry) = $self->create_language_db_map($cmeta, $cmeta);
            #write the entry to the file
            print $handle $metadata_entry;
        } # foreach collmeta key
    }

    # add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined, uses the metadata name
    my $field_entry = "";
    foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}) {
        my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
        # a value of 1 marks a reserved name, not a real short tag
        next if $shortfield eq 1;

        # we need to check if some coll meta has been defined
        my $collmeta = ".$longfield";
        if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
            my $metadata_entry = $self->create_language_db_map($collmeta, $shortfield);
            $field_entry .= $metadata_entry;
        } else { #use the metadata names, or the text macros for allfields and textonly
            if ($longfield eq "allfields") {
                $field_entry .= "<$shortfield>_query:textallfields_\n";
            } elsif ($longfield eq "text") {
                $field_entry .= "<$shortfield>_query:texttextonly_\n";
            } else {
                $field_entry .= "<$shortfield>$longfield\n";
            }
        }
    }
    print $handle $field_entry;

    # now add the level names
    my $level_entry = "";
    foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
        my $collmeta = ".$level"; # based on the original specification
        $level =~ tr/A-Z/a-z/; # make it lower case
        my $levelid = $level_map{$level}; # find the actual value we used in the index
        if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
            my $metadata_entry = $self->create_language_db_map($collmeta, $levelid);
            $level_entry .= $metadata_entry;
        } else {
            # use the default macro
            $level_entry .= "<$levelid>" . $level_map{$levelid} . "\n";
        }
    }
    print $handle $level_entry;

    # now add subcoll meta
    my $subcoll_entry = "";
    foreach my $subcoll (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        # needed by both branches: the short name keys the entry either way
        my $shortname = $self->{'index_mapping'}->{$subcoll};
        if (defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$subcoll"}) {
            my $one_entry = $self->create_language_db_map(".$subcoll", $shortname);
            $subcoll_entry .= $one_entry;
        } else {
            $subcoll_entry .= "<$shortname>$subcoll\n";
        }
    }
    print $handle $subcoll_entry;

    #end the collection entry
    print $handle "\n" . ('-' x 70) . "\n";

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'},
                                     $self->{'gli'});

    #output doclist
    my @doclist = $self->{'buildproc'}->get_doc_list();
    my $docs = join (";",@doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . ($#doclist+1) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";
    close ($handle) if !$self->{'debug'};

    print STDERR "</Stage>\n" if $self->{'gli'};
}
936
# Build the info-database lines for one collectionmeta item, one line per
# language.
#
# Arguments:
#   $metaname - key into $self->{'collect_cfg'}->{'collectionmeta'}
#   $mapname  - tag name to write between the angle brackets
#
# Returns a string made of "<mapname>value\n" for the default value followed
# by "<mapname:lang>value\n" for every key of the form "[l=lang]".  The
# default line always comes first.  When no key matching "default" exists,
# the value of the first key (in sorted order) is used as the default.
sub create_language_db_map {
    my $self = shift (@_);
    my ($metaname, $mapname) = @_;

    # fall back to an empty hash so an unknown $metaname is not autovivified
    # into the collection configuration
    my $values = $self->{'collect_cfg'}->{'collectionmeta'}->{$metaname} || {};

    # sorted so that the output, and the fallback default, are the same on
    # every run (hash order is not stable)
    my @langs = sort keys %$values;

    # the default default is the first entry
    my $default = @langs ? $values->{$langs[0]} : "";
    my $defaultfound = 0;
    my $metadata_entry = "";

    foreach my $lang (@langs) {
        if ($lang =~ /default/) {
            $defaultfound = 1;
            # the default entry goes first
            $metadata_entry = "<${mapname}>" . $values->{'default'} . "\n" . $metadata_entry;
        }
        else {
            # keys that are neither "default" nor "[l=xx]" are ignored
            my ($langcode) = $lang =~ /^\[l=(\w*)\]$/;
            if ($langcode) {
                $metadata_entry .= "<${mapname}:${langcode}>" . $values->{$lang} . "\n";
            }
        }
    }

    # if we haven't found a default, put one in
    if (!$defaultfound) {
        $metadata_entry = "<${mapname}>$default\n" . $metadata_entry;
    }
    return $metadata_entry;
}
# Hook for collection-specific work; this builder does nothing here.
sub collect_specific {
    my $self = shift;
    # the assignment was the last expression evaluated, so callers get $self
    return $self;
}
976
# At the end of building, the build processor holds an indexfieldmap with all
# the mappings (plus some extras) and a set of index fields, some of which may
# not have been named in the index definition.  From these we make an ordered
# list of the fields that are indexed and a list of the mappings that are
# used.  Both are used for the build.cfg file and for the collection meta
# definitions.
#
# The results are stored in $self->{'build_cfg'} (which is reset first):
#   'indexfields'   - list of field names, in index-definition order
#   'indexfieldmap' - list of "field->shortname" strings, in the same order
sub make_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    # Go through the index definition and note each field that is named in
    # it.  When expanding "metadata" below we list every individual field the
    # build processor saw, but fields already named explicitly must not be
    # added a second time.
    my %specified = ();
    my @specified_order = ();
    foreach my $index (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        my $parts = $index;
        $parts =~ s/:.*$//;
        foreach my $name (split(',', $parts)) {
            next if defined $specified{$name};
            $specified{$name} = 1;
            push (@specified_order, "$name");
        }
    }

    my $fieldmap = $self->{'buildproc'}->{'indexfieldmap'};
    my @indexfieldmap = ();
    my @indexfields = ();

    foreach my $field (@specified_order) {
        if ($field eq "metadata") {
            # every field seen while building, except those named explicitly;
            # sorted so that build.cfg comes out the same on every run
            foreach my $newfield (sort keys %{$self->{'buildproc'}->{'indexfields'}}) {
                next if defined $specified{$newfield};
                push (@indexfieldmap, $newfield . '->' . $fieldmap->{$newfield});
                push (@indexfields, "$newfield");
            }
        } elsif ($field eq 'text') {
            push (@indexfieldmap, 'text->TX');
            push (@indexfields, "text");
        } elsif ($field eq 'allfields') {
            push (@indexfieldmap, 'allfields->ZZ');
            push (@indexfields, "allfields");
        } else {
            push (@indexfieldmap, $field . '->' . $fieldmap->{$field});
            push (@indexfields, "$field");
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
1032
1033
# Recreate the field list from an existing build.cfg file.  The file is looked
# for first in the build directory and then in the collection's index
# directory.  If there is no build.cfg we can't produce the field list (there
# is unlikely to be any index anyway), so build_cfg is left empty.
#
# On success $self->{'build_cfg'} gets 'indexfields' and 'indexfieldmap'
# lists copied from the file, and every "long->short" entry of the map is
# also recorded in the build processor's indexfieldmap.
sub read_final_field_list {
    my $self = shift (@_);
    $self->{'build_cfg'} = {};
    my @indexfieldmap = ();
    my @indexfields = ();

    if (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) {
        # set the default mapping
        $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    }

    # we read the stuff in from the build.cfg file - if it's there
    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, "build.cfg");
    if (!-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg");
        if (!-e $buildconfigfile) {
            # we can't find a config file - just ignore the field list
            return;
        }
    }

    my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);
    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            push (@indexfields, "$field");
        }
    }
    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $mapping (@{$buildcfg->{'indexfieldmap'}}) {
            # the entry itself is always kept, exactly as it was read
            push (@indexfieldmap, "$mapping");
            my ($longname, $shortname) = $mapping =~ /^(.*)\-\>(.*)$/;
            # an entry without "->" gives nothing to map; skipping it avoids
            # storing a bogus empty-string key in the build processor's map
            next unless defined $longname;
            $self->{'buildproc'}->{'indexfieldmap'}->{$longname} = $shortname;
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
# Write the build.cfg file that describes the finished build.
#
# Starts from $self->{'build_cfg'} when it is defined (it then already holds
# indexfieldmap and indexfields), adds the build date, build type, index
# levels, text level, document/byte counts and the index, subcollection and
# language mappings, and hands the result to cfgread::write_cfg_file.
# Progress markers are printed to STDERR when $self->{'gli'} is set.
sub make_auxiliary_files {
    my $self = shift (@_);

    my $build_cfg = {};
    # this already includes indexfieldmap and indexfields
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};
    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = "mgpp"; #do we need this??

    # store the level info.  %level_map is read with plain element syntax:
    # the old %level_map->{...} form (a hash used as a reference) is a fatal
    # error in current perls
    my @indexlevels = ();
    foreach my $level (@{$self->{'levelorder'}}) {
        push (@indexlevels, $level_map{$level});
    }
    $build_cfg->{'indexlevels'} = \@indexlevels;

    if ($self->{'levels'}->{'section'}) {
        $build_cfg->{'textlevel'} = $level_map{'section'};
    } else {
        $build_cfg->{'textlevel'} = $level_map{'document'};
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names,
    # leaving out any index that was not built
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        next if defined ($self->{'notbuilt'}->{$index});
        push (@indexmap, $index . '->' . $self->{'index_mapping'}->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # subcollection and language maps are only written when non-empty
    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . '->' .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . '->' .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    # names of the indexes that were not built, sorted so the file is stable
    my @notbuilt = sort keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|buildtype|numdocs|numbytes|textlevel)$',
                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields|indexlevels)$');

    print STDERR "</Stage>\n" if $self->{'gli'};
}
1149
# Hook called when the builder is finished with; nothing to release here.
sub deinit {
    my $self = shift;
    # the assignment was the last expression evaluated, so callers get $self
    return $self;
}
1153
# Report the statistics of the pass that has just finished to the builder's
# output handle: which index was created (or that text was compressed), the
# byte counts, and a warning when fewer than 50 bytes were processed although
# text was expected.
sub print_stats {
    my ($self) = @_;

    my $out = $self->{'outhandle'};
    my $was_indexing = $self->{'buildproc'}->get_indexing_text();
    my $index_name = $self->{'buildproc'}->get_index();
    my $total_bytes = $self->{'buildproc'}->get_num_bytes();
    my $processed_bytes = $self->{'buildproc'}->get_num_processed_bytes();

    my $heading = $was_indexing
        ? "Stats (Creating index ${index_name})\n"
        : "Stats (Compressing text from ${index_name})\n";
    print $out $heading;
    print $out "Total bytes in collection: ${total_bytes}\n";
    print $out "Total bytes in ${index_name}: ${processed_bytes}\n";

    # text is expected when indexing, or when compressing unless no_text is set
    my $text_expected = $was_indexing || !$self->{'no_text'};

    if ($processed_bytes < 50 && $text_expected) {
        # when not indexing, getting here implies no_text is unset, so the
        # compression warning is the only other possibility
        my $warning = $was_indexing
            ? "WARNING: There is very little or no text to process for ${index_name}\n"
            : "WARNING: There is very little or no text to compress\n";
        my $rule = "***************\n";
        print $out $rule . $warning . " Was this your intention?\n" . $rule;
    }

}
1183
11841;
1185
1186
Note: See TracBrowser for help on using the repository browser.