source: trunk/gsdl/perllib/mgppbuilder.pm@ 3115

Last change on this file since 3115 was 3115, checked in by jrm21, 22 years ago

Redirect mg(pp)_passes stderr to /dev/null if the "-out xxx" option is given
to buildcol.pl, as some things (eg cron) think a program fails if there is
any output to stderr.

  • Property svn:keywords set to Author Date Id Revision
File size: 30.4 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
35
# Turn autoflush on for STDOUT and STDERR as soon as the module is compiled
# so that output from mg doesn't get out of sync with output from plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Turn autoflush back off again when the interpreter shuts down.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
47
# Maximum document size setting. Nothing in this file reads it; the comment
# in compress_text mentions it in connection with a "-b" option.
$maxdocsize = 12000;


# Suffixes of index files that build_index treats as wanted when it scans
# the index directory after a build.
%wanted_index_files = map { $_ => 1 }
    qw(td t tl ti idb ib1 ib2 ib3 i il w wa);

# change this so a user can add their own ones in via a file or cfg
# Maps metadata names to two-letter index field codes. Every code is also
# present as a key with the value 1 (presumably to mark the code as already
# in use - confirm against the buildproc that consumes this map).
%static_indexfield_map = (
    (map { $_ => 1 } qw(TI SU CR OR SO HT IT PN PE TX)),
    'Title'        => 'TI',
    'Subject'      => 'SU',
    'Creator'      => 'CR',
    'Organization' => 'OR',
    'Source'       => 'SO',
    'Howto'        => 'HT',
    'ItemTitle'    => 'IT',
    'ProgNumber'   => 'PN',
    'People'       => 'PE',
    'TextOnly'     => 'TX',
);
85
# Constructor for an mgppbuilder object.
#
# Positional arguments: $collection, $source_dir, $build_dir, $verbosity,
# $maxdocs, $debug, $keepold, $allclassifications, $outhandle (defaults to
# STDERR) and $no_text (defaults to 0).
#
# Reads etc/collect.cfg under $ENV{'GSDLCOLLECTDIR'}, expands the configured
# indexes with subcollection and language variants, removes duplicate
# indexes, records the indexing levels, loads the plugins and classifiers,
# notes the dontgdbm fields and finally creates the document processor.
#
# Dies if collect.cfg does not exist, if no plugins were loaded, or if the
# document processor could not be constructed.
sub new {
    my $class = shift (@_);
    my ($collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text) = @_;

    # the handle is kept as the plain name so that later string comparisons
    # against "STDERR" keep working
    $outhandle = 'STDERR' unless defined $outhandle;
    $no_text = 0 unless defined $no_text;

    # create an mgppbuilder object
    my $self = bless {
        'collection'         => $collection,
        'source_dir'         => $source_dir,
        'build_dir'          => $build_dir,
        'verbosity'          => $verbosity,
        'maxdocs'            => $maxdocs,
        'debug'              => $debug,
        'keepold'            => $keepold,
        'allclassifications' => $allclassifications,
        'outhandle'          => $outhandle,
        'no_text'            => $no_text,
        'notbuilt'           => [],     # indexes not built
        'indexfieldmap'      => \%static_indexfield_map,
    }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n"
        unless -e $colcfgname;
    my $cfg = &colcfg::read_collect_cfg ($colcfgname);
    $self->{'collect_cfg'} = $cfg;

    # sort out subcollection indexes: every index is repeated once for
    # each subcollection
    if (defined $cfg->{'indexsubcollections'}) {
        my $plain_indexes = $cfg->{'indexes'};
        my @expanded = ();
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            foreach my $index (@$plain_indexes) {
                push (@expanded, "$index:$subcollection");
            }
        }
        $cfg->{'indexes'} = \@expanded;
    }

    # sort out language subindexes: every index is repeated once for each
    # language; when there are no subcollections an empty subcollection
    # field is added so the language is always the third component
    if (defined $cfg->{'languages'}) {
        my $plain_indexes = $cfg->{'indexes'};
        my @expanded = ();
        foreach my $language (@{$cfg->{'languages'}}) {
            foreach my $index (@$plain_indexes) {
                my $separator =
                    defined ($cfg->{'indexsubcollections'}) ? ":" : "::";
                push (@expanded, $index . $separator . $language);
            }
        }
        $cfg->{'indexes'} = \@expanded;
    }

    # make sure that the same index isn't specified more than once
    # (the first occurrence wins, order is kept)
    my %seen = ();
    my @configured = @{$cfg->{'indexes'}};
    $cfg->{'indexes'} = [ grep { !$seen{$_}++ } @configured ];

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    if (defined $cfg->{'levels'}) {
        foreach my $level (@{$cfg->{'levels'}}) {
            $self->{'levels'}->{$level} = 1;
        }
    }

    # get the list of plugins for this collection and load them all
    my $plugins = defined $cfg->{'plugin'} ? $cfg->{'plugin'} : [];
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection and load them all
    my $classifiers = defined $cfg->{'classify'} ? $cfg->{'classify'} : [];
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $dg (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mgpp buildproc
    my $collection_perllib = "$ENV{'GSDLCOLLECTDIR'}/perllib";
    my ($buildprocdir, $buildproctype) =
        (-e "$collection_perllib/${collection}buildproc.pm")
        ? ($collection_perllib, "${collection}buildproc")
        : ("$ENV{'GSDLHOME'}/perllib", "mgppbuildproc");
    require "$buildprocdir/$buildproctype.pm";

    # the class name is only known at run time, hence the string eval
    eval("\$self->{'buildproc'} = new $buildproctype(\$collection, " .
         "\$source_dir, \$build_dir, \$verbosity, \$outhandle)");
    die "$@" if $@;

    return $self;
}
214
# Prepares the build directory. Unless the builder is in debug mode or was
# asked to keep the old build, the build directory is removed and recreated
# together with its "text" subdirectory.
sub init {
    my ($self) = @_;

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        my $build_dir = $self->{'build_dir'};

        # remove any old builds
        &util::rm_r($build_dir);
        &util::mk_all_dir($build_dir);

        # make the text directory
        &util::mk_all_dir("$build_dir/text");
    }
}
228
# Records the strip_html setting on the builder and passes the same value
# on to the document processor.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html($strip);
}
236
# Creates the compressed text for the collection.
#
# The documents are read three times in total:
#   1. piped to "mgpp_passes -T1" to collect the text statistics,
#   2. (after mgpp_compression_dict has built the dictionary)
#      piped to "mgpp_passes -T2" to compress the text.
# In debug mode no external program is run; the document stream is written
# to STDOUT instead.
#
# $textindex is handed to the document processor as the index to process.
# Dies if a required executable does not exist or a pipe cannot be opened.
sub compress_text {

    my $self = shift (@_);
    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            # (same rule as build_index applies)
            $osextra .= " 2>/dev/null";
        }
    }


    # define the section names for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them
    my $mgpp_passes_sections = "";
    if ($self->{'levels'}->{'Section'}) {
        $mgpp_passes_sections .= "-K Section ";
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size to be 12 meg
    print $outhandle "\n collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);

    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) {
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
        # the handle is passed around by name
        $handle = mgppbuilder::PIPEOUT;
    }
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    if ($self->{'no_text'}) {
        $self->{'buildproc'}->set_store_text(0);
    } else {
        $self->{'buildproc'}->set_store_text(1);
    }
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # close the pipe exactly once; this waits for the -T1 pass to finish
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_compression_dict_exe") {
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # reopen the same named handle for the second pass
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) {
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
}
339
# Decides whether an index should be built.
#
# Each entry of the "dontbuild" list in collect.cfg is used as a pattern that
# must match the whole index name. On the first match the index's directory
# name (from index_mapping) is recorded in the 'notbuilt' list and 0 is
# returned. Returns 1 when nothing matches or there is no dontbuild list.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $checkstr (@$dontbuild) {
        next unless $index =~ /^$checkstr$/;

        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
355
# Builds the indexes of the collection.
#
# When $indexname contains a word character only that index is built,
# otherwise every index listed in collect.cfg is. The mapping from index
# descriptions to directory names is (re)created first and stored in
# $self->{'index_mapping'}. Indexes rejected by want_built are skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index($index);
    }
}
383
# creates directory names for each of the index descriptions
#
# $indexes is a reference to a list of index descriptions of the form
# "fields[:subcollection[:languages]]".
#
# Returns a reference to a hash that contains
#   - each full index description => its directory name,
#   - 'indexmap', 'subcollectionmap' and 'languagemap' => hashes from the
#     individual component to its short (processed) name,
#   - 'indexmaporder', 'subcollectionmaporder' and 'languagemaporder' =>
#     the components in the order they were first seen,
#   - each component => its short name (the fields entry is only set if
#     not already present).
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # short name => full name, one separate table per component kind.
    # These must be real hash references: seeding them with '' would make
    # all three share a single symbolic-reference global hash.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # the directory name starts with a processed version of index fields
        my ($pindex) = $self->process_field($fields);
        # next comes a processed version of the index
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg document:text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            if (!defined $mapping{"$fields"}) {
                $mapping{"$fields"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # record the language component itself, in first-seen order
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$fields";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
454
# returns a processed version of a field.
# if the field has only one component the processed
# version will contain the first character and next consonant
# of that component - otherwise it will contain the first
# character of the first two components.
# An undefined field, or one without any word character, gives "".
sub process_field {
    my ($self, $field) = @_;

    unless (defined ($field) && $field =~ /\w/) {
        return "";
    }

    my @components = split /,/, $field;

    if (scalar @components < 2) {
        # single component: first character plus the next consonant ...
        my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
        # ... or simply the first two characters if there is no consonant
        unless (defined $first && defined $second) {
            ($first, $second) = $field =~ /^(.)(.)/;
        }
        return "$first$second";
    }

    # several components: only the first two contribute their initial
    my @initials = @components[0, 1];
    foreach my $component (@initials) {
        $component =~ s/^(.).*$/$1/;
    }
    return join("", @initials);
}
477
# Changes one of the short names of an index so that the combined directory
# name may become unique.
#
# $namehash holds, per component kind ('index', 'subcollection',
# 'languages'), the full name each short name already stands for. The first
# component whose short name is recorded for a different full name than the
# one in $index is advanced with get_next_version; at most one component is
# changed per call. $indexref, $subref and $langref are references to the
# short names and are modified in place.
#
# Returns the concatenation of the three (possibly updated) short names.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    # component kind, reference to its short name, full name from $index
    my @candidates = (
        ['index',         $indexref, "$fields"],
        ['subcollection', $subref,   $subcollection],
        ['languages',     $langref,  $languages],
    );

    foreach my $candidate (@candidates) {
        my ($kind, $shortref, $fullname) = @$candidate;
        if ($namehash->{$kind}->{$$shortref} ne $fullname) {
            $self->get_next_version ($shortref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
492
# Advances the name referred to by $nameref to its next version, in place.
#
#   - a name ending in two digits has that two-digit number incremented,
#   - a name ending in one digit has that digit incremented; a final 9
#     becomes 10,
#   - any other name has its last character replaced by 0.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        my $num = $1;
        # only one trailing digit exists in this branch, so a pattern of
        # two digits could never match and the name would never change
        if ($num == 9) {$$nameref =~ s/\d$/10/;}
        else {$num ++; $$nameref =~ s/\d$/$num/;}
    } else {
        $$nameref =~ s/.$/0/;
    }
}
508
# Builds a single index.
#
# $index is an index description "fields[:subcollections[:languages]]"; its
# directory name is looked up in $self->{'index_mapping'}.
#
# The documents are piped through "mgpp_passes -I1" (index dictionary), then
# mgpp_perf_hash_build is run, then the documents are piped through
# "mgpp_passes -I2" (inversion), and finally mgpp_weights_build,
# mgpp_invf_dict and mgpp_stem_idx (three stemming methods) are run.
# In debug mode no external program is run and the document stream goes to
# STDOUT. Dies if a required executable is missing or a pipe can't be opened.
sub build_index {
    my ($self, $index) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};
    my $collection = $self->{'collection'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));
    my $fullindexprefix = &util::filename_cat ($build_dir, $indexdir, $collection);
    # computed as before although nothing below reads it
    my $fulltextprefix = &util::filename_cat ($build_dir, "text", $collection);

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");

    # define the section names for mgpasses
    my $mgpp_passes_sections = "";
    foreach my $level (keys (%{$self->{'levels'}})) {
        next unless ($level eq "Section" || $level eq "Paragraph");
        $mgpp_passes_sections .= "-K $level ";
    }

    my $mgpp_perf_hash_build_exe = &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe = &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe = &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe = &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        # so mgpp_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null" if ($outhandle ne "STDERR");
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($fields, $subcollection_list, $language_list) = split (":", $index);

    my @subcollections = ();
    @subcollections = split /,/, $subcollection_list if (defined $subcollection_list);
    foreach my $subcollection (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcollection};
        push (@$indexexparr, $expression) if (defined ($expression));
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my @languages = ();
    @languages = split /,/, $language_list if (defined $language_list);
    foreach my $language (@languages) {
        # a leading "!" negates the language
        my $negation = ($language =~ s/^\!//) ? "!" : "";
        foreach my $lang (@{$self->{'collect_cfg'}->{'languages'}}) {
            next unless ($lang eq $language);
            push (@$indexexparr, $negation . "Language/$language/");
            last;
        }
    }

    # the part all mgpp_passes command lines have in common
    my $passes_cmd = "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\"";

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "$passes_cmd -I1 $osextra")) {
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
        # handles are passed around by name
        $handle = 'mgppbuilder::PIPEOUT';
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_indexing_text (1);
    $buildproc->set_store_text(1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    if (!$self->{'debug'}) {
        # create the perfect hash function
        die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n"
            unless (-e "$mgpp_perf_hash_build_exe");
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        # reopen the same named handle for the second pass
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "$passes_cmd -I2 $osextra")) {
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);

    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n"
            unless (-e "$mgpp_weights_build_exe");
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n"
            unless (-e "$mgpp_invf_dict_exe");
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n"
            unless (-e "$mgpp_stem_idx_exe");
        foreach my $stem_method (1, 2, 3) {
            system ("mgpp_stem_idx$exe -b 4096 -s$stem_method -f \"$fullindexprefix\" $osextra");
        }

        # remove unwanted files
        # (the actual removal is disabled; unwanted files are only reported)
        my $tmpdir = &util::filename_cat ($build_dir, $indexdir);
        opendir (DIR, $tmpdir)
            or die "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            next unless (defined $suffix && !defined $wanted_index_files{$suffix});
            # delete it!
            print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
            #&util::rm (&util::filename_cat ($tmpdir, $file));
        }
        closedir (DIR);
    }
}
690
# Creates the info database and processes associated files.
#
# The output is piped to txt2db, which writes <collection>.ldb or
# <collection>.bdb (chosen by util::is_little_endian) in the "text" directory
# of the build; in debug mode it is written to STDOUT instead. The stream
# consists of
#   - a [collection] entry built from the collectionmeta of collect.cfg,
#   - whatever the document processor emits in 'infodb' mode,
#   - the classification information,
#   - a [browselist] entry listing all documents.
#
# Dies if txt2db does not exist or cannot be started.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};


    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    # define the indexed field mapping if not already done so (ie if infodb called separately from build_index)
    if (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) {
        #check build.cfg to see if indexfields have been filled in
        my $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building/build.cfg");
        if (-e $buildconfigfile) {
            my $buildcfg = &colcfg::read_build_cfg( $buildconfigfile);
            if (defined $buildcfg->{'indexfields'}) {
                foreach my $field (@{$buildcfg->{'indexfields'}}) {
                    $self->{'buildproc'}->{'indexfields'}->{$field} = 1;
                }
            }
            if (defined $buildcfg->{'indexfieldmap'}) {
                foreach my $field (@{$buildcfg->{'indexfieldmap'}}) {
                    # entries have the form "name->code"
                    my ($name, $code) = $field =~ /^(.*)\-\>(.*)$/;
                    # skip anything that is not of that form rather than
                    # storing an entry under an undefined key
                    next unless defined $name;
                    $self->{'buildproc'}->{'indexfieldmap'}->{$name} = $code;
                }
            }
        }
    }

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| $txt2db_exe $fulldbname")) {
            die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        # the handle is passed around by name
        $handle = mgppbuilder::PIPEOUT;
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('infodb');
    $self->{'buildproc'}->set_assocdir ($assocdir);
    $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_store_text(1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});

    $self->{'buildproc'}->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {

        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
            my $defaultfound=0;
            my $first=1;
            my $metadata_entry = "";
            my $default="";
            my $cmetamap = "";
            # names starting with "." refer to an index, subcollection or
            # language and are written under their mapped short name
            if ($cmeta =~ s/^\.//) {
                if (defined $self->{'index_mapping'}->{$cmeta}) {
                    $cmetamap = $self->{'index_mapping'}->{$cmeta};
                    # put the dot back: the collectionmeta key includes it
                    $cmeta = ".$cmeta";
                }
                else {
                    print $outhandle "mgbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";
                    next; #ignore this one
                }
            }
            else {
                $cmetamap = $cmeta; # just using the same name
            }
            #iterate through the languages
            foreach my $lang (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}})) {
                if ($first) {
                    $first=0;
                    #set the default default to the first entry
                    $default=$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{$lang};
                }
                if ($lang =~ /default/) {
                    $defaultfound=1;
                    #the default entry goes first
                    $metadata_entry = "<$cmetamap>" .
                        $self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{'default'} . "\n" . $metadata_entry;
                }
                else {
                    # language specific entries are keyed "[l=xx]"
                    my ($langcode) = $lang =~ /^\[l=(\w*)\]$/;
                    if ($langcode) {
                        $metadata_entry .= "<$cmetamap:$langcode>" .
                            $self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{$lang} . "\n";
                    }
                }
            }
            #if we haven't found a default, put one in
            if (!$defaultfound) {
                $metadata_entry = "<$cmetamap>$default\n" . $metadata_entry;
            }
            #write the entry to the file
            print $handle $metadata_entry;

        }

        print $handle "\n" . ('-' x 70) . "\n";

    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    #output doclist
    my @doclist = $self->{'buildproc'}->get_doc_list();
    my $docs = join (";",@doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . ($#doclist+1) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";
    close ($handle) if !$self->{'debug'};

}
841
# Placeholder for collection specific processing; this implementation does
# nothing beyond picking up the object it was called on.
sub collect_specific {
    my $self = $_[0];
}
845
# Writes build.cfg into the build directory.
#
# The file records the build date and type ("mgpp"), the number of documents
# and bytes reported by the document processor, the index, subcollection and
# language maps ("name->shortname", in first-seen order), the list of
# indexes that were not built, and the indexed fields with their field map.
sub make_auxiliary_files {
    my $self = shift (@_);

    # a fresh configuration for every call; nothing may carry over from an
    # earlier build in the same process
    my $build_cfg = {};

    my $outhandle = $self->{'outhandle'};
    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # get the text directory
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = "mgpp";

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'};

    # store the indexfieldmap information
    my @indexfieldmap = ();
    #add all fields bit
    foreach my $field (keys %{$self->{'buildproc'}->{'indexfields'}}) {
        push (@indexfieldmap, "$field\-\>$self->{'buildproc'}->{'indexfieldmap'}->{$field}");
    }

    $build_cfg->{'indexfieldmap'} = \@indexfieldmap;

    #store the indexed field information
    # (the key is only created when there is at least one field)
    foreach my $field (sort keys %{$self->{'buildproc'}->{'indexfields'}}) {
        push (@{$build_cfg->{'indexfields'}}, $field);
    }

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|buildtype|numdocs|numbytes)$',
                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields)$');

}
908
# Counterpart of init; this implementation has nothing to clean up and only
# picks up the object it was called on.
sub deinit {
    my $self = $_[0];
}
912
# Prints statistics about the pass that has just finished to the builder's
# output handle: which index was processed, the total number of bytes in the
# collection and the number of bytes processed for the index.
#
# A warning block is added when fewer than 50 bytes were processed and the
# pass was either an indexing pass or a text pass with text storage enabled.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $headline = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $headline;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # nothing more to say unless suspiciously little text was seen
    my $text_expected = ($indexing_text || !$self->{'no_text'});
    if ($num_processed_bytes < 50 && $text_expected) {
        # when not indexing, text storage is known to be enabled here
        my $warning = $indexing_text
            ? "WARNING: There is very little or no text to process for $index\n"
            : "WARNING: There is very little or no text to compress\n";

        print $outhandle "***************\n";
        print $outhandle $warning;
        print $outhandle " Was this your intention?\n";
        print $outhandle "***************\n";
    }

}
942
9431;
944
945
Note: See TracBrowser for help on using the repository browser.