source: main/trunk/greenstone2/perllib/mgppbuilder.pm@ 32541

Last change on this file since 32541 was 29329, checked in by ak19, 10 years ago

Dr Bainbridge added SIG PIPE handling to mgbuilder to help him discover when pipes failed in executing cmds that were run with open(). Have yet to test that the SIG PIPE handling code still allows the mgppbuilder.pm and mgbuilder.pm to run on Windows.

  • Property svn:keywords set to Author Date Id Revision
File size: 33.8 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use basebuilder;
29use colcfg;
30use plugin;
31use strict; no strict 'refs';
32use util;
33use FileUtils;
34
35
# Set up inheritance at compile time: mgppbuilder derives from basebuilder.
BEGIN {
    @mgppbuilder::ISA = qw(basebuilder);
}
39
40
# Process-wide SIGPIPE handler, installed when this module is loaded.
# It announces the signal on the currently selected output handle and then
# dies with the current OS error, so that a failed pipe to a command that
# was started with open() does not go unnoticed.
$SIG{'PIPE'} = sub {
    print("got SIGPIPE\n");
    die($0 . ": Error: " . $!);
};
45
46
# Two lookups share this one table:
#   * level name as lower-cased from the collection config -> the short
#     level tag handed to mgpp_passes (-J / -K) and written to build.cfg
#   * short level tag -> the default display macro stored as collection
#     metadata for that level
our %level_map = (
    'document'  => 'Doc',
    'section'   => 'Sec',
    'paragraph' => 'Para',
    'Doc'       => '_textdocument_',
    'Sec'       => '_textsection_',
    'Para'      => '_textparagraph_',
);

# File suffixes that are treated as wanted when build_index scans an index
# directory for leftover files.
our %wanted_index_files = map { $_ => 1 } qw(
    td t tl ti
    idb ib1 ib2 ib3 ib4 ib5 ib6 ib7
    i il w wa
);


# Snapshot of basebuilder's maximum document size, taken at load time.
my $maxdocsize = $basebuilder::maxdocsize;
73
# Constructor.  Delegates to basebuilder's constructor with the caller's
# arguments, re-blesses the result into $class and then records:
#   levels     - hash of the index levels in use (lower-cased name => 1)
#   levelorder - array ref of the same level names in configuration order
#   buildtype  - always "mgpp"
# When the collection config names no levels, 'document' is used.
# Note: the level names in $self->{'collect_cfg'}->{'levels'} are lower-cased
# in place (the loop variable aliases the config array elements).
sub new {
    my $class = shift(@_);

    # direct method call instead of the ambiguous indirect-object form
    my $self = basebuilder->new(@_);
    $self = bless $self, $class;

    #$self->{'indexfieldmap'} = \%static_indexfield_map;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    # must be an array ref: assigning () to a scalar would store undef
    $self->{'levelorder'} = [];

    my $configured_levels = $self->{'collect_cfg'}->{'levels'};
    if (defined $configured_levels) {
        foreach my $level (@$configured_levels) {
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    }
    else {
        # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    $self->{'buildtype'} = "mgpp";

    return $self;
}
100
# Collapses the list of index specifications in the collection config into
# one combined specification, because mgpp puts everything into one index.
#
# Reads  $self->{'collect_cfg'}->{'indexes'} (array ref of spec strings).
# Writes $self->{'collect_cfg'}->{'indexes'} as a one-element array ref whose
#        entry is the cleaned specs joined with ';' plus a trailing ';'.
# Does nothing when no indexes are configured.
#
# An "ex." prefix is removed from a metadata name only when it is the sole
# namespace in that name (ex.Title -> Title; ex.dc.Title and flex.Image are
# left alone).
sub generate_index_list {
    my $self = shift (@_);

    # sort out the indexes
    # indexes are specified with spaces, but we put them into one index
    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    if (defined $indexes) {
        my @cleaned_specs = ();

        foreach my $original_spec (@$indexes) {
            # work on a copy so the caller's array entries are not modified
            my $spec = $original_spec;

            # introduce a space after every separator, so that each
            # metadata name is preceded by either start-of-string or a space
            $spec =~ s/(,|;)/$1 /g;

            # strip "ex." at the start of a metadata name when no other
            # namespace follows it
            $spec =~ s/(^| )ex\.([^.,:]+)(,|;|$)/$1$2$3/g;

            # remove the space again after the SAME separators it was added
            # to (',' and ';'); otherwise ';'-separated names keep a leading
            # space and never match an indexed field later on
            $spec =~ s/(,|;) /$1/g;

            push (@cleaned_specs, $spec);
        }

        my $single_index = join(';', @cleaned_specs) . ";";
        $self->{'collect_cfg'}->{'indexes'} = [ $single_index ];
    }
}
121
# Works out which stemming-style options are switched on for this build.
# After the parent class has done its own option handling, the 'casefold',
# 'stem' and 'accentfold' flags on $self are reset and then set from the
# 'indexoptions' entries of the collection config.  Each entry switches on at
# most one flag; the checks are made in the order stem, casefold, accentfold.
# 'stemindexes' is the bitmask recorded for the build cfg:
#   casefold = 1, stem = 2, accentfold = 4.
sub generate_index_options {
    my $self = shift (@_);

    $self->SUPER::generate_index_options();

    foreach my $flag ('casefold', 'stem', 'accentfold') {
        $self->{$flag} = 0;
    }

    my $options = $self->{'collect_cfg'}->{'indexoptions'};
    if (defined($options)) {
        foreach my $option (@$options) {
            my $flag = ($option =~ /stem/)       ? 'stem'
                     : ($option =~ /casefold/)   ? 'casefold'
                     : ($option =~ /accentfold/) ? 'accentfold'
                     :                             undef;
            $self->{$flag} = 1 if defined $flag;
        }
    }

    # now we record this for the build cfg
    my $bitmask = 0;
    $bitmask += 1 if $self->{'casefold'};
    $bitmask += 2 if $self->{'stem'};
    $bitmask += 4 if $self->{'accentfold'};
    $self->{'stemindexes'} = $bitmask;
}
156
# Name of the build processor used when none is specified.
sub default_buildproc {
    my ($self) = @_;

    return 'mgppbuildproc';
}
162
# Creates the compressed text for the collection.
#
# Parameters: $textindex - the index specification passed on to the build
#                          processor via set_index().
#
# Steps (all skipped when $self->{'no_text'} is set):
#   1. pipe every document through "mgpp_passes -T1" to collect statistics
#   2. run mgpp_compression_dict to create the compression dictionary
#   3. pipe every document through "mgpp_passes -T2" to compress the text
# In debug mode no external program is started; the documents are written
# to STDOUT instead and step 2 is skipped.
#
# Dies when a required executable is missing or a pipe cannot be opened.
# When the compression dictionary cannot be created, 'compressedtext' is
# recorded in $self->{'notbuilt'} and the method returns early.
# Progress is reported on $self->{'outhandle'}, and as XML-ish tags on STDERR
# when $self->{'gli'} is set.
sub compress_text {

    my $self = shift (@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &FileUtils::filenameConcatenate($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &FileUtils::filenameConcatenate($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    my $maxnumeric = $self->{'maxnumeric'};

    &FileUtils::makeAllDirectories (&FileUtils::filenameConcatenate($self->{'build_dir'}, "text"));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $basefilename = &FileUtils::filenameConcatenate("text",$collect_tail);
    my $fulltextprefix = &FileUtils::filenameConcatenate($self->{'build_dir'}, $basefilename);

    # native Windows wants backslashes in the prefix; everywhere else the
    # programs are told that the directory separator is '/'
    my $osextra = "";
    if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them

    # always use Doc and Sec levels
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    print $outhandle "\n    collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        my $mgpp_passes_cmd = "mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra";
        print $outhandle "\ncmd: $mgpp_passes_cmd\n" if ($self->{'verbosity'} >= 4);
        if (!-e "$mgpp_passes_exe" || !open($handle, "| $mgpp_passes_cmd")) {
            # self-closing tag, like every other FatalError reported to GLI
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    my $db_level = "section";

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    if ($self->{'debug'}) {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }
    else {
        # create the compression dictionary
        # the compression dictionary is built by assuming the stats are from a seed
        # dictionary (-S), if a novel word is encountered it is spelled out (-H),
        # and the resulting dictionary must be less than 5 meg with the most
        # frequent words being put into the dictionary first (-2 -k 5120)
        # note: these options are left over from mg version
        my $compdict_cmd = "mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra";
        print $outhandle "\n    creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print $outhandle "\ncmd: $compdict_cmd\n" if ($self->{'verbosity'} >= 4);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        my $comp_dict_status = system ($compdict_cmd);
        if ($comp_dict_status != 0) {
            print $outhandle "\nmgppbuilder::compress_text - Warning: there's no compressed text\n";
            $self->{'notbuilt'}->{'compressedtext'} = 1;
            print STDERR "<Warning name='NoCompressedText'/>\n</Stage>\n" if $self->{'gli'};
            return;
        }

        # start the second pass; $handle is reused for the new pipe
        my $mgpp_passes_cmd = "mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra";
        print $outhandle "\ncmd: $mgpp_passes_cmd\n" if ($self->{'verbosity'} >= 4);

        if (!-e "$mgpp_passes_exe" || !open ($handle, "| $mgpp_passes_cmd")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->reset();

    # compress the text
    print $outhandle "\n    compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
294
295
# Hook run once the indexes have been built: defines the final field lists.
sub post_build_indexes {
    my ($self) = @_;

    $self->make_final_field_list();
}
302
# Creates directory names for each of the index descriptions.
#
# Parameters: $indexes - array ref of index specifications of the form
#                        "fields[:subcollection[:languages]]".
# Returns a hash ref which maps each full specification to its directory
# name, and which also carries:
#   indexmap / subcollectionmap / languagemap             - part => short name
#   indexmaporder / subcollectionmaporder / languagemaporder - parts in the
#                                                  order they were first seen
#   each fields / subcollection / languages part, keyed directly, => short name
# An empty list of indexes yields an empty hash.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();

    return \%mapping unless scalar @$indexes;

    foreach my $order_key ('indexmaporder', 'subcollectionmaporder', 'languagemaporder') {
        $mapping{$order_key} = [];
    }

    # directory names already taken, used to check for collisions; seeded
    # with the mandatory directory names
    my %taken_dirs = ('text'  => 'text',
                      'extra' => 'extra');
    # short name => the part it was generated for, per kind
    my %short_names = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $spec (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $spec);
        my $field_key = "$fields";

        # we only ever have one index, and its called 'idx'
        my $pindex = 'idx';

        # processed versions of the subcollection and the language, if any
        my $psub  = lc ($self->process_field ($subcollection));
        my $plang = lc ($self->process_field ($languages));

        my $dirname = $pindex . $psub . $plang;

        # keep asking for a new name until the directory name is unique
        $dirname = $self->make_unique (\%short_names, $spec, \$pindex, \$psub, \$plang)
            while defined ($taken_dirs{$dirname});

        $mapping{$spec} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        unless (defined $mapping{'indexmap'}{$field_key}) {
            $mapping{'indexmap'}{$field_key} = $pindex;
            push (@{$mapping{'indexmaporder'}}, $field_key);
            $mapping{$field_key} = $pindex unless defined $mapping{$field_key};
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        $taken_dirs{$dirname} = $spec;
        $short_names{'index'}->{$pindex} = $field_key;
        $short_names{'subcollection'}->{$psub} = $subcollection;
        $short_names{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
374
# Tries to resolve a directory-name collision for one index specification.
#
# Parameters:
#   $namehash - hash ref with 'index', 'subcollection' and 'languages'
#               sub-hashes mapping a short name to the part it belongs to
#   $index    - the full "fields:subcollection:languages" specification
#   $indexref, $subref, $langref - scalar refs to the current short names
#
# The first short name (in the order index, subcollection, languages) that
# is already recorded for a different part is passed to get_next_version();
# at most one of the three is changed per call.
# Returns the three short names concatenated.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @candidates = (
        [ 'index',         $indexref, "$fields"      ],
        [ 'subcollection', $subref,   $subcollection ],
        [ 'languages',     $langref,  $languages     ],
    );

    foreach my $candidate (@candidates) {
        my ($kind, $short_ref, $wanted) = @$candidate;
        if ($namehash->{$kind}->{$$short_ref} ne $wanted) {
            $self->get_next_version ($short_ref);
            last;
        }
    }

    return join('', $$indexref, $$subref, $$langref);
}
389
390
# Builds one mgpp index.
#
# Parameters: $index - index specification "fields[:subcollections[:languages]]";
#                      its directory is looked up in $self->{'index_mapping'}.
#
# Steps:
#   1. pipe all documents through "mgpp_passes -I1" (index dictionary)
#   2. mgpp_perf_hash_build (perfect hash function)
#   3. pipe all documents through "mgpp_passes -I2" (invert the text)
#   4. mgpp_weights_build, mgpp_invf_dict
#   5. mgpp_stem_idx for each enabled combination of casefold (1), stem (2)
#      and accentfold (4)
#   6. report files in the index directory whose suffix is not in
#      %wanted_index_files
# In debug mode no external program is run; documents go to STDOUT.
#
# Dies when a required executable is missing or a pipe cannot be opened.
# When a step fails in a way that makes the index unusable, the index is
# recorded in $self->{'notbuilt'} and the method returns early.  When a stem
# index cannot be created, the matching flag(s) and bits in
# $self->{'stemindexes'} are cleared instead.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &FileUtils::makeAllDirectories (&FileUtils::filenameConcatenate($self->{'build_dir'}, $indexdir));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $fullindexprefix = &FileUtils::filenameConcatenate($self->{'build_dir'},
                                                          $indexdir,
                                                          $collect_tail);

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";

    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &FileUtils::filenameConcatenate($exedir, "mgpp_passes$exe");

    # define the section names for mgpasses
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";
    if ($self->{'levels'}->{'paragraph'}) {
        $mgpp_passes_sections .= "-K " . $level_map{'paragraph'}. " ";
    }

    my $mgpp_perf_hash_build_exe =
        &FileUtils::filenameConcatenate($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &FileUtils::filenameConcatenate($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &FileUtils::filenameConcatenate($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &FileUtils::filenameConcatenate($exedir, "mgpp_stem_idx$exe");

    my $maxnumeric = $self->{'maxnumeric'};

    my $osextra = "";
    if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # records that this index could not be built and tells GLI about it
    my $mark_not_built = sub {
        my ($message) = @_;
        print $outhandle $message;
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index}=1;
    };

    # aborts the build when a required helper program is not installed
    my $require_exe = sub {
        my ($exe_path, $gli_error_name) = @_;
        return if -e "$exe_path";
        print STDERR "<FatalError name='$gli_error_name'/>\n</Stage>\n" if $self->{'gli'};
        die "mgppbuilder::build_index - couldn't run $exe_path\n";
    };

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcoll_name (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    # each entry is passed on as written, including any leading '!'
    my $langarr = [ @languages ];

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        my $mgpp_passes_cmd = "mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\" -I1 $osextra";
        print $outhandle "\ncmd: $mgpp_passes_cmd\n" if ($self->{'verbosity'} >= 4);
        if (!-e "$mgpp_passes_exe" || !open($handle, "| $mgpp_passes_cmd")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # db_level is always section
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);

    $self->{'buildproc'}->reset();

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fall over.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        $mark_not_built->("mgppbuilder::build_index - Couldn't create index $index\n");
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        $require_exe->($mgpp_perf_hash_build_exe, 'NoRunMGHash');
        my $hash_cmd = "mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra";
        print $outhandle "\ncmd: $hash_cmd\n" if ($self->{'verbosity'} >= 4);

        my $hash_status = system ($hash_cmd);
        print $outhandle "\nstatus from running hash_cmd: $hash_status\n" if ($self->{'verbosity'} >= 4);
        # check that perf hash was generated - if not, don't carry on
        if ($hash_status != 0) {
            $mark_not_built->("mgppbuilder::build_index - Couldn't create index $index as there are too few words in the index.\n");
            return;
        }

        my $mgpp_passes_cmd = "mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\" -I2 $osextra";
        print $outhandle "\ncmd: $mgpp_passes_cmd\n" if ($self->{'verbosity'} >= 4);
        if (!-e "$mgpp_passes_exe" || !open ($handle, "| $mgpp_passes_cmd")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n    inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->reset();

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);
        my $passes_exit_status = $?;
        print $outhandle "\nMGPP Passes exit status $passes_exit_status\n" if ($self->{'verbosity'} >= 4);

        # create the weights file
        print $outhandle "\n    create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        $require_exe->($mgpp_weights_build_exe, 'NoRunMGWeights');
        my $weights_cmd = "mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra";
        print $outhandle "\ncmd: $weights_cmd\n" if ($self->{'verbosity'} >= 4);
        my $weights_status = system ($weights_cmd);
        # check that it worked - if not, don't carry on
        if ($weights_status != 0) {
            $mark_not_built->("mgppbuilder::build_index - No Index: couldn't create weights file, error calling mgpp_weights_build.\n");
            return;
        }

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        $require_exe->($mgpp_invf_dict_exe, 'NoRunMGInvf');
        my $invdict_cmd = "mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra";
        print $outhandle "\ncmd: $invdict_cmd\n" if ($self->{'verbosity'} >= 4);
        my $invdict_status = system ($invdict_cmd);
        # check that it worked - if not, don't carry on
        if ($invdict_status != 0) {
            $mark_not_built->("mgppbuilder::build_index - No Index: couldn't create on-disk stemmed dictionary, error calling mgpp_invf_dict.\n");
            return;
        }

        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        $require_exe->($mgpp_stem_idx_exe, 'NoRunMGStem');

        # runs mgpp_stem_idx for one stem method (-s<N>) and returns the
        # raw status from system()
        my $run_stem_idx = sub {
            my ($stem_method) = @_;
            my $stem_idx_cmd = "mgpp_stem_idx$exe -b 4096 -s$stem_method -f \"$fullindexprefix\" $osextra";
            print $outhandle "\ncmd: $stem_idx_cmd\n" if ($self->{'verbosity'} >= 4);
            return system ($stem_idx_cmd);
        };

        my $accent_folding_enabled = 1;
        if ($self->{'accentfold'}) {
            # the first time we do this, we test for accent folding enabled
            my $accent_status = $run_stem_idx->(4);
            # system() returns a wait status; the program's exit code is in
            # the high byte, so an exit code of 2 arrives here as 512
            if (($accent_status >> 8) == 2) {
                # accent folding has not been enabled in mgpp
                $accent_folding_enabled = 0;
                # clear the flag too, so a later index does not subtract
                # the accentfold bit a second time
                $self->{'accentfold'} = 0;
                $self->{'stemindexes'} -= 4;
            } elsif ($accent_status != 0) {
                print $outhandle "\nAccent folding failed: mgpp_stem_idx exit status $accent_status\n" if ($self->{'verbosity'} >= 4);
                $self->{'accentfold'} = 0;
                $self->{'stemindexes'} -= 4;
            }
        }
        if ($self->{'casefold'}) {
            my $casefold_status = $run_stem_idx->(1);
            if ($casefold_status != 0) {
                print $outhandle "\nCase folding failed: mgpp_stem_idx exit status $casefold_status\n" if ($self->{'verbosity'} >= 4);
                $self->{'casefold'} = 0;
                $self->{'stemindexes'} -= 1;
            }
            elsif ($accent_folding_enabled && $self->{'accentfold'}) {
                my $status = $run_stem_idx->(5);
                if ($status != 0) {
                    print $outhandle "\nAccent folding (with casefolding) failed: mgpp_stem_idx exit status $status\n" if ($self->{'verbosity'} >= 4);
                    $self->{'accentfold'} = 0;
                    $self->{'stemindexes'} -= 4; # casefold worked, only accentfold failed, so -= 4, not -= 5
                }
            }
        }
        if ($self->{'stem'}) {
            my $stem_status = $run_stem_idx->(2);
            if ($stem_status != 0) {
                print $outhandle "\nStemming failed: mgpp_stem_idx exit status $stem_status\n" if ($self->{'verbosity'} >= 4);
                $self->{'stem'} = 0;
                $self->{'stemindexes'} -= 2;
            }
            elsif ($accent_folding_enabled && $self->{'accentfold'}) {
                my $status = $run_stem_idx->(6);
                if ($status != 0) {
                    print $outhandle "\nAccent folding (with stemming) failed: mgpp_stem_idx exit status $status\n" if ($self->{'verbosity'} >= 4);
                    $self->{'accentfold'} = 0;
                    $self->{'stemindexes'} -= 4; # stem worked, only accentfold failed, so -= 4, not -= 6
                }
            }
        }
        if ($self->{'casefold'} && $self->{'stem'}) {
            my $case_and_stem_status = $run_stem_idx->(3);
            if ($case_and_stem_status != 0) {
                print $outhandle "\nCasefolding and stemming failed: mgpp_stem_idx exit status $case_and_stem_status\n" if ($self->{'verbosity'} >= 4);
                $self->{'stem'} = 0;
                $self->{'casefold'} = 0;
                $self->{'stemindexes'} -= 3;
            }
            elsif ($accent_folding_enabled && $self->{'accentfold'}) {
                my $status = $run_stem_idx->(7);
                if ($status != 0) {
                    print $outhandle "\nAccent folding (with stemming and casefolding) failed: mgpp_stem_idx exit status $status\n" if ($self->{'verbosity'} >= 4);
                    $self->{'accentfold'} = 0;
                    $self->{'stemindexes'} -= 4; # casefold and stem worked, only accentfold failed, so -= 4, not -= 7
                }
            }
        }

        # remove unwanted files
        # (the removal itself is disabled; unwanted files are only reported)
        my $tmpdir = &FileUtils::filenameConcatenate($self->{'build_dir'}, $indexdir);
        opendir (my $dirhandle, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&FileUtils::filenameConcatenate($tmpdir, $file));
            }
        }
        closedir ($dirhandle);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
711
712
# Adds default display names for index fields, levels, subcollections and
# languages to the collection's info database entry.
#
# Parameters: $collection_infodb - hash ref that receives short name =>
#                                  [ display value ] entries.
#
# A default is only added when the collection config does not define
# collectionmeta for ".<name>" itself.
# Note: the level names in $self->{'collect_cfg'}->{'levels'} are lower-cased
# in place while they are being processed.
sub get_collection_meta_indexes
{
    my $self = shift(@_);
    my $collection_infodb = shift(@_);

    # define the indexed field mapping if not already done so
    # (i.e. if infodb called separately from build_index)
    $self->read_final_field_list() unless defined $self->{'build_cfg'};

    # first do the collection meta stuff - everything without a dot
    my $collmetadefined = (defined $self->{'collect_cfg'}->{'collectionmeta'}) ? 1 : 0;

    # true when the collection config already supplies this collectionmeta
    my $already_defined = sub {
        my ($meta_key) = @_;
        return $collmetadefined
            && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$meta_key};
    };

    # add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined, uses the metadata name
    if (defined $self->{'build_cfg'}->{'extraindexfields'}) {
        foreach my $longfield (@{$self->{'build_cfg'}->{'extraindexfields'}}) {
            my $shortfield = $self->{'buildproc'}->{'fieldnamemap'}->{$longfield};
            next if $shortfield eq 1;

            # don't output any that have coll meta defined
            next if $already_defined->(".$longfield");

            my $display = ($longfield eq "allfields") ? "_query:textallfields_"
                        : ($longfield eq "text")      ? "_query:texttextonly_"
                        :                               $longfield;
            $collection_infodb->{$shortfield} = [ $display ];
        }
    }

    # now add the level names
    foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
        my $meta_key = ".$level";             # taken before the name is lower-cased
        $level =~ tr/A-Z/a-z/;                # make it lower case
        my $levelid = $level_map{$level};     # find the actual value we used in the index
        next if $already_defined->($meta_key);
        # use the default macro
        $collection_infodb->{$levelid} = [ $level_map{$levelid} ];
    }

    # now add subcoll meta
    foreach my $subcoll (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        my $shortname = $self->{'index_mapping'}->{$subcoll};
        next if $already_defined->(".$subcoll");
        $collection_infodb->{$shortname} = [ $subcoll ];
    }

    # now add language meta
    foreach my $lang (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        my $shortname = $self->{'index_mapping'}->{$lang};
        next if $already_defined->(".$lang");
        $collection_infodb->{$shortname} = [ $lang ];
    }
}
788
789
# Writes the "collection" entry of the info database.  The entry is filled
# from the metadata sets (prefixes) used in the collection and from the
# index-related names, then handed to dbutil for writing.
#
# Parameters: $infodb_handle - handle of the info database being written.
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    $self->get_collection_meta_indexes($collection_infodb);

    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
801
802
# Works out, at the end of building, the ordered list of fields that were
# indexed and the field -> short name mappings that were used, and stores
# them in a fresh $self->{'build_cfg'} as 'indexfields' and 'indexfieldmap'
# (each only when non-empty).  These are used for the build.cfg file and for
# the collection meta definition.
#
# The field order follows the index definition in the collection config:
#   text      -> "text->TX"
#   allfields -> "allfields->ZZ"
#   metadata  -> every extra indexed field that the index definition does
#                not already name
#   other     -> included only if the build processor actually indexed it
sub make_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    # go through the index definition and note each field once, in order,
    # so "metadata" does not add fields that are named explicitly
    my %is_specified = ();
    my @specified_order = ();
    foreach my $spec (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        (my $field_part = $spec) =~ s/:.*$//;
        foreach my $name (split(';', $field_part)) {
            next if defined $is_specified{$name};
            $is_specified{$name} = 1;
            push (@specified_order, "$name");
        }
    }

    my $buildproc = $self->{'buildproc'};
    my $short_name_of = $buildproc->{'fieldnamemap'};

    my @indexfieldmap = ();
    my @indexfields = ();
    my $add_field = sub {
        my ($long_name, $short_name) = @_;
        push (@indexfieldmap, $long_name . '->' . $short_name);
        push (@indexfields, "$long_name");
    };

    foreach my $field (@specified_order) {
        if ($field eq "metadata") {
            foreach my $extra (keys %{$buildproc->{'extraindexfields'}}) {
                $add_field->($extra, $short_name_of->{$extra})
                    unless defined $is_specified{$extra};
            }
        }
        elsif ($field eq 'text') {
            $add_field->('text', 'TX');
        }
        elsif ($field eq 'allfields') {
            $add_field->('allfields', 'ZZ');
        }
        elsif (defined $buildproc->{'allindexfields'}->{$field}) {
            # we only add in the ones that have been processed
            $add_field->($field, $short_name_of->{$field});
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap if scalar @indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields if scalar @indexfields;
}
876
877
# Recreates the field lists from a previously written build.cfg, as returned
# by read_build_cfg().  $self->{'build_cfg'} is reset first; when there is no
# build.cfg it is left empty.  Otherwise 'indexfields', 'indexfieldmap' and
# 'indexmap' are copied across (an empty list for any that are missing), and
# every "long->short" entry of the field map is also recorded in the build
# processor's 'indexfieldmap' hash.
sub read_final_field_list {
    my $self = shift (@_);
    $self->{'build_cfg'} = {};

    # we read the stuff in from the build.cfg file - if its there
    my $buildcfg = $self->read_build_cfg();
    return unless defined $buildcfg;

    my @indexfields = ();
    if (defined $buildcfg->{'indexfields'}) {
        push (@indexfields, "$_") foreach @{$buildcfg->{'indexfields'}};
    }

    my @indexfieldmap = ();
    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $entry (@{$buildcfg->{'indexfieldmap'}}) {
            push (@indexfieldmap, "$entry");
            # split "long->short" at the last "->"
            my ($long_name, $short_name) = $entry =~ /^(.*)\-\>(.*)$/;
            $self->{'buildproc'}->{'indexfieldmap'}->{$long_name} = $short_name;
        }
    }

    my @indexmap = ();
    if (defined $buildcfg->{'indexmap'}) {
        push (@indexmap, "$_") foreach @{$buildcfg->{'indexmap'}};
    }

    my $build_cfg = $self->{'build_cfg'};
    $build_cfg->{'indexfieldmap'} = \@indexfieldmap;
    $build_cfg->{'indexfields'} = \@indexfields;
    $build_cfg->{'indexmap'} = \@indexmap;
}
917
918
# Adds the mgpp-specific entries to the build configuration.
#
# Parameters: $build_cfg - hash ref that receives:
#   numsections - section count reported by the build processor
#   indexlevels - short tags of the levels, in $self->{'levelorder'} order
#   levelmap    - "level->tag" strings in the same order
#   textlevel   - tag of the section level (text and database level are
#                 always section)
sub build_cfg_extra {
    my $self = shift (@_);
    my ($build_cfg) = @_;

    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();

    # store the level info
    my (@indexlevels, @levelmap);
    foreach my $level_name (@{$self->{'levelorder'}}) {
        my $level_tag = $level_map{$level_name};
        push (@indexlevels, $level_tag);
        push (@levelmap, $level_name . '->' . $level_tag);
    }
    $build_cfg->{'indexlevels'} = \@indexlevels;
    $build_cfg->{'levelmap'} = \@levelmap;

    # text level (and database level) is always section
    $build_cfg->{'textlevel'} = $level_map{'section'};

}
939
9401;
941
942
Note: See TracBrowser for help on using the repository browser.