source: main/trunk/greenstone2/perllib/mgppbuilder.pm@ 28119

Last change on this file since 28119 was 28119, checked in by ak19, 11 years ago

First commit of better error checking of system commands in mgppbuilder. Before using bitwise operations with flags for indexes.

  • Property svn:keywords set to Author Date Id Revision
File size: 32.2 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use basebuilder;
29use colcfg;
30use plugin;
31use strict; no strict 'refs';
32use util;
33use FileUtils;
34
35
# Set up inheritance at compile time: mgppbuilder derives from basebuilder.
BEGIN {
    @mgppbuilder::ISA = qw(basebuilder);
}
39
40
41
# Two lookups share this one table:
#   * configured level name (lower case) -> short level tag used in the index
#   * short level tag                    -> default display macro for that level
our %level_map = (
    document  => 'Doc',
    section   => 'Sec',
    paragraph => 'Para',
    Doc       => '_textdocument_',
    Sec       => '_textsection_',
    Para      => '_textparagraph_',
);
48
# File suffixes that build_index treats as wanted when it scans an index
# directory; anything with another suffix is reported as unwanted.
our %wanted_index_files = map { $_ => 1 } qw(
    td t tl ti
    idb ib1 ib2 ib3 ib4 ib5 ib6 ib7
    i il
    w wa
);


# Maximum document size, taken from basebuilder.
my $maxdocsize = $basebuilder::maxdocsize;
68
# Constructor. Delegates to basebuilder's constructor with the same
# arguments, re-blesses the result into $class and then records the
# indexing levels:
#   $self->{'levels'}     - hash of level name => 1
#   $self->{'levelorder'} - array ref of level names in configured order
# Level names come from collect_cfg->{'levels'}; when that is not defined
# the single level 'document' is used. Also sets 'buildtype' to "mgpp".
sub new {
    my $class = shift(@_);

    # Explicit method call rather than the indirect-object form
    # ("new basebuilder (...)"), whose parse depends on what subs are in scope.
    my $self = basebuilder->new(@_);
    $self = bless $self, $class;

    #$self->{'indexfieldmap'} = \%static_indexfield_map;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    # always a real array ref, even when the configured level list is empty
    $self->{'levelorder'} = [];

    my $configured_levels = $self->{'collect_cfg'}->{'levels'};
    if (defined $configured_levels) {
        foreach my $level (@$configured_levels) {
            # NOTE: $level aliases the collect_cfg entry, so the configured
            # names are lower-cased in place (kept as existing behaviour).
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    } else { # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    $self->{'buildtype'} = "mgpp";

    return $self;
}
95
# Collapses the configured index specifications into one combined index.
#
# Reads collect_cfg->{'indexes'} (array ref of index specs). If it is defined,
# it is replaced by a one-element array holding all specs joined with ';' and
# terminated by ';'. A leading "ex." namespace is stripped from a metadata
# name, but only when it is the sole namespace in that name (so "ex.Title"
# becomes "Title" while "ex.dc.Title" is left alone).
sub generate_index_list {
    my $self = shift (@_);

    # sort out the indexes
    # indexes are specified with spaces, but we put them into one index
    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    if (defined $indexes) {
        $self->{'collect_cfg'}->{'indexes'} = [];

        my @cleaned_specs = ();
        foreach my $original_spec (@$indexes) {
            # work on a copy so the caller's array entries are not modified
            my $spec = $original_spec;

            # Put a space after every separator so that each metadata name
            # is preceded by either start-of-string or a space. A plain
            # /(^|,|;)ex\./ would need overlapping matches for adjacent names.
            $spec =~ s/([,;])/$1 /g;

            # strip "ex." where it is the only namespace of the name
            $spec =~ s/(^| )ex\.([^.,:]+)(,|;|$)/$1$2$3/g;

            # Remove the helper spaces again. This must cover ';' as well as
            # ',' or a stray space is left after every ';'. ':' is still
            # accepted so that anything cleaned before is cleaned now.
            $spec =~ s/([,;:]) /$1/g;

            push (@cleaned_specs, $spec);
        }

        my $single_index = join(';', @cleaned_specs).";";

        push (@{$self->{'collect_cfg'}->{'indexes'}}, $single_index);
    }
}
116
# Works out which term-folding options (casefold, stem, accentfold) apply.
#
# After calling the parent implementation, sets $self->{'casefold'},
# $self->{'stem'} and $self->{'accentfold'} to 0 or 1. With no
# collect_cfg->{'indexoptions'} all three are on; otherwise only those named
# in that list are. The result is also packed into $self->{'stemindexes'}:
# casefold contributes 1, stem 2 and accentfold 4.
sub generate_index_options {
    my $self = shift (@_);

    $self->SUPER::generate_index_options();

    my @option_names = ('casefold', 'stem', 'accentfold');
    foreach my $name (@option_names) {
        $self->{$name} = 0;
    }

    my $requested = $self->{'collect_cfg'}->{'indexoptions'};
    if (defined($requested)) {
        foreach my $option (@$requested) {
            # one flag per option; "stem" is tested first
            if ($option =~ /stem/) {
                $self->{'stem'} = 1;
            } elsif ($option =~ /casefold/) {
                $self->{'casefold'} = 1;
            } elsif ($option =~ /accentfold/) {
                $self->{'accentfold'} = 1;
            }
        }
    } else {
        # just use default options
        foreach my $name (@option_names) {
            $self->{$name} = 1;
        }
    }

    # now we record this for the build cfg
    my %weight_of = ('casefold' => 1, 'stem' => 2, 'accentfold' => 4);
    $self->{'stemindexes'} = 0;
    foreach my $name (@option_names) {
        $self->{'stemindexes'} += $weight_of{$name} if ($self->{$name});
    }

}
156
# Returns the name of the build processor class that goes with this builder.
sub default_buildproc {
    my ($self) = @_;

    return 'mgppbuildproc';
}
162
# Creates the compressed text for the collection by piping the documents
# through mgpp_passes twice (-T1 to collect statistics, -T2 to compress),
# with mgpp_compression_dict run in between to build the dictionary.
#
# Parameters:
#   $textindex - passed straight to the build processor's set_index().
#
# Does nothing when $self->{'no_text'} is set. In debug mode no external
# program is started and the build processor writes to STDOUT instead.
# Dies if a required executable is missing or a pipe cannot be opened.
# If mgpp_compression_dict returns a non-zero status, a warning is printed,
# $self->{'notbuilt'}->{'compressedtext'} is set and the sub returns early.
# When $self->{'gli'} is set, progress markers are written to STDERR.
sub compress_text {

    my $self = shift (@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = util::get_os_exe();
    my $mgpp_passes_exe = FileUtils::filenameConcatenate($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = FileUtils::filenameConcatenate($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    my $maxnumeric = $self->{'maxnumeric'};

    FileUtils::makeAllDirectories(FileUtils::filenameConcatenate($self->{'build_dir'}, "text"));

    my $collect_tail = util::get_dirsep_tail($self->{'collection'});
    my $basefilename = FileUtils::filenameConcatenate("text", $collect_tail);
    my $fulltextprefix = FileUtils::filenameConcatenate($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }


    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them

    # always use Doc and Sec levels
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size to be 12 meg
    print $outhandle "\n collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        # The existence test uses the full path, but the command itself is
        # run by bare name (so it is looked up on the PATH).
        if (!-e "$mgpp_passes_exe" ||
            !open($handle, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) {
            # self-closed tag, matching every other FatalError marker
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    my $db_level = "section";

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    #$buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->set_db_level ($db_level);
    $buildproc->reset();
    plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                  $self->{'buildproc'}, $self->{'maxdocs'});
    plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        my $comp_dict_status = system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");
        if ($comp_dict_status != 0) {
            print $outhandle "\nmgppbuilder::compress_text - Warning: there's no compressed text\n";
            $self->{'notbuilt'}->{'compressedtext'} = 1;
            print STDERR "<Warning name='NoCompressedText'/>\n</Stage>\n" if $self->{'gli'};
            return;
        }

        # open the pipe for the second (compressing) pass
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $buildproc->set_output_handle ($handle);
    $buildproc->reset();

    # compress the text
    print $outhandle "\n compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
289
290
# Hook run once the indexes have been built: works out the final lists of
# indexed fields by calling make_final_field_list().
sub post_build_indexes {
    my ($self) = @_;

    # define the final field lists
    $self->make_final_field_list();
}
297
298# creates directory names for each of the index descriptions
# Creates directory names for each of the index descriptions.
#
# Parameters:
#   $indexes - array ref of index specs of the form
#              "fields[:subcollection[:languages]]".
# Returns a hash ref. For an empty list it is empty. Otherwise it holds:
#   <index spec>             => directory name for that index
#   'indexmap'               => { fields => 'idx' }, with 'indexmaporder'
#   'subcollectionmap'       => { subcollection => processed name }, with
#                               'subcollectionmaporder'
#   'languagemap'            => { languages => processed name }, with
#                               'languagemaporder'
# and, at top level, each fields / subcollection / languages string mapped to
# its processed name (used for collectionmeta later on).
# The processed names come from $self->process_field(), lower-cased.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();

    return \%mapping if !(scalar @$indexes);

    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # we only ever have one index, and its called 'idx'
        my $pindex = 'idx';

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            my $candidate = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
            # make_unique changes nothing when this spec has the same parts as
            # the one that already owns the name (e.g. a duplicated spec).
            # Stop rather than loop forever; the duplicate shares the directory.
            last if ($candidate eq $dirname);
            $dirname = $candidate;
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            if (!defined $mapping{"$fields"}) {
                $mapping{"$fields"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember what each generated name stands for, for later collision checks
        $dirnames{$dirname} = $index;
        $pnames{'index'}->{$pindex} = "$fields";
        $pnames{'subcollection'}->{$psub} = $subcollection;
        $pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
369
# Helper for resolving a directory-name collision.
#
# Parameters:
#   $namehash - hash ref with 'index', 'subcollection' and 'languages'
#               sub-hashes, each mapping a processed name to the original
#               string it was generated from
#   $index    - the index spec "fields[:subcollection[:languages]]"
#   $indexref, $subref, $langref - scalar refs to the three processed name
#               parts; at most one is modified
# The first part (in that order) whose recorded original differs from the
# corresponding part of $index is passed to $self->get_next_version().
# Returns the three parts concatenated.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    # [ namehash section, ref to processed name, original string wanted ]
    my @parts = (
        [ 'index',         $indexref, "$fields"      ],
        [ 'subcollection', $subref,   $subcollection ],
        [ 'languages',     $langref,  $languages     ],
    );

    foreach my $part (@parts) {
        my ($section, $nameref, $wanted) = @$part;
        if ($namehash->{$section}->{$$nameref} ne $wanted) {
            $self->get_next_version ($nameref);
            last;    # only ever bump one part per call
        }
    }

    return join('', $$indexref, $$subref, $$langref);
}
384
385
# Builds one MGPP index.
#
# Parameters:
#   $index - index spec "fields[:subcollection[:languages]]"; it must be a key
#            of $self->{'index_mapping'}, which supplies the directory name.
#
# Steps: mgpp_passes -I1 (index dictionary), mgpp_perf_hash_build,
# mgpp_passes -I2 (inversion), mgpp_weights_build, mgpp_invf_dict and then
# mgpp_stem_idx once per enabled stemming method. In debug mode none of the
# external programs are run and the documents are written to STDOUT.
#
# Failure handling:
#   * a missing executable or a pipe that cannot be opened is fatal (die);
#   * a missing dictionary file, or a non-zero status from the hash, weights
#     or invf_dict step, marks the index in $self->{'notbuilt'} and returns;
#   * a failed stem index only switches that stemming method off in $self and
#     clears its bit in $self->{'stemindexes'}.
# When $self->{'gli'} is set, progress markers are written to STDERR.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    FileUtils::makeAllDirectories(FileUtils::filenameConcatenate($self->{'build_dir'}, $indexdir));

    my $collect_tail = util::get_dirsep_tail($self->{'collection'});
    my $fullindexprefix = FileUtils::filenameConcatenate($self->{'build_dir'},
                                                         $indexdir,
                                                         $collect_tail);

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";

    my $exe = util::get_os_exe();
    my $mgpp_passes_exe = FileUtils::filenameConcatenate($exedir, "mgpp_passes$exe");

    # define the section names for mgpasses
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";
    if ($self->{'levels'}->{'paragraph'}) {
        $mgpp_passes_sections .= "-K " . $level_map{'paragraph'}. " ";
    }

    # Full paths are only used for the existence checks; the commands
    # themselves are run by bare name.
    my $mgpp_perf_hash_build_exe =
        FileUtils::filenameConcatenate($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        FileUtils::filenameConcatenate($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        FileUtils::filenameConcatenate($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        FileUtils::filenameConcatenate($exedir, "mgpp_stem_idx$exe");

    my $maxnumeric = $self->{'maxnumeric'};

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcoll_name (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name};
        push (@$indexexparr, $expression) if (defined $expression);
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    # each entry is passed on exactly as written, including any leading '!'
    my $langarr = [ @languages ];

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        if (!-e "$mgpp_passes_exe" ||
            !open($handle, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\" -I1 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # db_level is always section
    my $db_level = "section";

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $buildproc->set_indexing_text (1);
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->set_db_level ($db_level);

    $buildproc->reset();

    plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fall over.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        my $hash_cmd = "mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra";
        print $outhandle "\ncmd: $hash_cmd\n" if ($self->{'verbosity'} >= 4);

        my $hash_status = system ($hash_cmd);
        print $outhandle "\nstatus from running hash_cmd: $hash_status\n" if ($self->{'verbosity'} >= 4);
        # check that perf hash was generated - if not, don't carry on
        if ($hash_status != 0) {
            print $outhandle "mgppbuilder::build_index - Couldn't create index $index as there are too few words in the index.\n";
            print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
            $self->{'notbuilt'}->{$index}=1;
            return;
        }

        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\" -I2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};

    $buildproc->set_output_handle ($handle);
    $buildproc->reset();

    plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);
        my $passes_exit_status = $?;
        print $outhandle "\nMGPP Passes exit status $passes_exit_status\n" if ($self->{'verbosity'} >= 4);

        # create the weights file
        print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        my $weights_cmd = "mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra";
        print $outhandle "\ncmd: $weights_cmd\n" if ($self->{'verbosity'} >= 4);
        my $weights_status = system ($weights_cmd);
        # check that it worked - if not, don't carry on
        if ($weights_status != 0) {
            print $outhandle "mgppbuilder::build_index - No Index: couldn't create weights file, error calling mggp_weights_build.\n";
            print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
            $self->{'notbuilt'}->{$index}=1;
            return;
        }

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        my $invdict_status = system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );
        # check that it worked - if not, don't carry on
        if ($invdict_status != 0) {
            print $outhandle "mgppbuilder::build_index - No Index: couldn't create on-disk stemmed dictionary, error calling mggp_invf_dict.\n";
            print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
            $self->{'notbuilt'}->{$index}=1;
            return;
        }

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }

        # Runs mgpp_stem_idx for one stemming method and returns system()'s
        # result. Method numbers use the same bits as 'stemindexes':
        # 1 = casefold, 2 = stem, 4 = accentfold.
        my $run_stem_idx = sub {
            my ($method) = @_;
            return system ("mgpp_stem_idx$exe -b 4096 -s$method -f \"$fullindexprefix\" $osextra");
        };

        # Flags are switched off with "&= ~bit" rather than "-= bit" so that
        # the same failure seen again on a later call cannot subtract twice.
        my $accent_folding_enabled = 1;
        if ($self->{'accentfold'}) {
            # the first time we do this, we test for accent folding enabled
            my $accent_status = $run_stem_idx->(4);
            # system() returns the wait status, so an exit code of 2 arrives
            # as 2 << 8; the raw value 2 is still accepted as before.
            if ($accent_status == 2 || ($accent_status >> 8) == 2) {
                # accent folding has not been enabled in mgpp
                $accent_folding_enabled = 0;
                $self->{'stemindexes'} &= ~4;
            } elsif ($accent_status != 0) {
                print $outhandle "\nAccent folding failed: mgpp_stem_idx exit status $accent_status\n" if ($self->{'verbosity'} >= 4);
                $self->{'accentfold'} = 0;
                #$accent_folding_enabled = 0;
                $self->{'stemindexes'} &= ~4;
            }
        }
        if ($self->{'casefold'}) {
            my $casefold_status = $run_stem_idx->(1);
            if ($casefold_status != 0) {
                print $outhandle "\nCase folding failed: mgpp_stem_idx exit status $casefold_status\n" if ($self->{'verbosity'} >= 4);
                $self->{'casefold'} = 0;
                $self->{'stemindexes'} &= ~1;
            }
            elsif ($accent_folding_enabled && $self->{'accentfold'}) {
                my $status = $run_stem_idx->(5);
                if ($status != 0) {
                    print $outhandle "\nAccent folding (with casefolding) failed: mgpp_stem_idx exit status $status\n" if ($self->{'verbosity'} >= 4);
                    $self->{'accentfold'} = 0;
                    # casefold worked, only accentfold failed, so clear 4, not 5
                    $self->{'stemindexes'} &= ~4;
                }
            }
        }
        if ($self->{'stem'}) {
            my $stem_status = $run_stem_idx->(2);
            if ($stem_status != 0) {
                print $outhandle "\nStemming failed: mgpp_stem_idx exit status $stem_status\n" if ($self->{'verbosity'} >= 4);
                $self->{'stem'} = 0;
                $self->{'stemindexes'} &= ~2;
            }
            elsif ($accent_folding_enabled && $self->{'accentfold'}) {
                my $status = $run_stem_idx->(6);
                if ($status != 0) {
                    print $outhandle "\nAccent folding (with stemming) failed: mgpp_stem_idx exit status $status\n" if ($self->{'verbosity'} >= 4);
                    $self->{'accentfold'} = 0;
                    # stem worked, only accentfold failed, so clear 4, not 6
                    $self->{'stemindexes'} &= ~4;
                }
            }
        }
        if ($self->{'casefold'} && $self->{'stem'}) {
            my $case_and_stem_status = $run_stem_idx->(3);
            if ($case_and_stem_status != 0) {
                print $outhandle "\nCasefolding and stemming failed: mgpp_stem_idx exit status $case_and_stem_status\n" if ($self->{'verbosity'} >= 4);
                $self->{'stem'} = 0;
                $self->{'casefold'} = 0;
                $self->{'stemindexes'} &= ~3;
            }
            elsif ($accent_folding_enabled && $self->{'accentfold'}) {
                my $status = $run_stem_idx->(7);
                if ($status != 0) {
                    print $outhandle "\nAccent folding (with stemming and casefolding) failed: mgpp_stem_idx exit status $status\n" if ($self->{'verbosity'} >= 4);
                    $self->{'accentfold'} = 0;
                    # casefold and stem worked, only accentfold failed, so clear 4, not 7
                    $self->{'stemindexes'} &= ~4;
                }
            }
        }

        # Report unwanted files. NOTE: the removal itself is commented out,
        # so files are only listed (at verbosity > 2), never deleted.
        my $tmpdir = FileUtils::filenameConcatenate($self->{'build_dir'}, $indexdir);
        opendir (my $dirhandle, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&FileUtils::filenameConcatenate($tmpdir, $file));
            }
        }
        closedir ($dirhandle);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
688
689
# Adds default display names for index fields, levels, subcollections and
# languages to a collection metadata record.
#
# Parameters:
#   $collection_infodb - hash ref that is filled in; each value written is a
#                        one-element array ref.
# An entry is only written when collect_cfg->{'collectionmeta'} does not
# already define ".<name>" for the field/level/subcollection/language.
# If $self->{'build_cfg'} is not defined yet, read_final_field_list() is
# called first to populate it.
sub get_collection_meta_indexes
{
    my $self = shift(@_);
    my $collection_infodb = shift(@_);

    # define the indexed field mapping if not already done so
    # (i.e. if infodb called separately from build_index)
    if (!defined $self->{'build_cfg'}) {
        $self->read_final_field_list();
    }

    # first do the collection meta stuff - everything without a dot
    my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};

    # true when the collection config already supplies a name for $key
    my $already_named = sub {
        my ($key) = @_;
        return (defined $collectionmeta && defined $collectionmeta->{$key}) ? 1 : 0;
    };

    # add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined, uses the metadata name
    if (defined $self->{'build_cfg'}->{'extraindexfields'}) {
        foreach my $longfield (@{$self->{'build_cfg'}->{'extraindexfields'}}) {
            my $shortfield = $self->{'buildproc'}->{'fieldnamemap'}->{$longfield};
            next if $shortfield eq 1;

            # we need to check if some coll meta has been defined - don't output
            # any that have
            next if $already_named->(".$longfield");

            if ($longfield eq "allfields") {
                $collection_infodb->{$shortfield} = [ "_query:textallfields_" ];
            } elsif ($longfield eq "text") {
                $collection_infodb->{$shortfield} = [ "_query:texttextonly_" ];
            } else {
                $collection_infodb->{$shortfield} = [ $longfield ];
            }
        }
    }

    # now add the level names
    foreach my $configured_level (@{ $self->{'collect_cfg'}->{'levels'} || [] }) {
        my $collmeta = ".$configured_level"; # based on the original specification
        # lower-case a copy: the loop variable aliases the config entry, and
        # changing it would alter the name the next call looks up
        (my $level = $configured_level) =~ tr/A-Z/a-z/;
        my $levelid = $level_map{$level}; # find the actual value we used in the index
        if (!$already_named->($collmeta)) {
            # use the default macro
            $collection_infodb->{$levelid} = [ $level_map{$levelid} ];
        }
    }

    # now add subcoll meta
    foreach my $subcoll (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        my $shortname = $self->{'index_mapping'}->{$subcoll};
        if (!$already_named->(".$subcoll")) {
            $collection_infodb->{$shortname} = [ $subcoll ];
        }
    }

    # now add language meta
    foreach my $lang (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        my $shortname = $self->{'index_mapping'}->{$lang};
        if (!$already_named->(".$lang")) {
            $collection_infodb->{$shortname} = [ $lang ];
        }
    }
}
765
766
767# default is to output the metadata sets (prefixes) used in collection
# Builds the "collection" metadata record and writes it to the info database.
#
# Parameters:
#   $infodb_handle - handle passed through to dbutil::write_infodb_entry().
# The record is filled by get_collection_meta_sets() and then
# get_collection_meta_indexes(), and written under the key "collection"
# using the database type in $self->{'infodbtype'}.
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    $self->get_collection_meta_indexes($collection_infodb);

    dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
778
779
780# at the end of building, we have an indexfieldmap with all the mappings,
781# plus some extras, and indexmap with any indexes in it that weren't
782# specified in the index definition. We want to make an ordered list of
783# fields that are indexed, and a list of mappings that are used. This will
784# be used for the build.cfg file, and for collection meta definition we
785# store these in a build.cfg bit
# Works out the final, ordered list of indexed fields and their short names.
#
# Resets $self->{'build_cfg'} to an empty hash and then, when there is
# anything to record, sets:
#   build_cfg->{'indexfields'}   - array ref of field names, in order
#   build_cfg->{'indexfieldmap'} - array ref of "field->SHORT" strings
# Fields are taken from collect_cfg->{'indexes'} in the order first seen
# (ignoring anything after ':'; names are separated by ';'). Special names:
#   "text"      maps to TX
#   "allfields" maps to ZZ
#   "metadata"  expands to every buildproc->{'extraindexfields'} key that was
#               not named explicitly
# Any other field is included only if buildproc->{'allindexfields'} has it.
sub make_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    # store the indexfieldmap information
    my @indexfieldmap = ();
    my @indexfields = ();
    my %specifiedfields = ();
    my @specifiedfieldorder = ();

    # go through the index definition and add each thing to a map, so we
    # can easily check if it is already specified - when doing the
    # metadata, we print out all the individual fields, but some may
    # already be specified in the index definition, so we dont want to add
    # those again.
    foreach my $index_spec (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        (my $parts = $index_spec) =~ s/:.*$//;
        foreach my $f (split(';', $parts)) {
            next if (defined $specifiedfields{$f});
            $specifiedfields{$f} = 1;
            push (@specifiedfieldorder, "$f");
        }
    }

    # add all fields bit
    my $fnm = $self->{'buildproc'}->{'fieldnamemap'};

    foreach my $field (@specifiedfieldorder) {
        if ($field eq "metadata") {
            # sorted so the written field order is the same on every run
            # (hash key order is not stable)
            foreach my $newfield (sort keys %{$self->{'buildproc'}->{'extraindexfields'}}) {
                if (!defined $specifiedfields{$newfield}) {
                    push (@indexfieldmap, "$newfield\-\>$fnm->{$newfield}");
                    push (@indexfields, "$newfield");
                }
            }

        } elsif ($field eq 'text') {
            push (@indexfieldmap, "text\-\>TX");
            push (@indexfields, "text");
        } elsif ($field eq 'allfields') {
            push (@indexfieldmap, "allfields\-\>ZZ");
            push (@indexfields, "allfields");
        } else {
            # we only add in the ones that have been processed
            if (defined $self->{'buildproc'}->{'allindexfields'}->{$field}) {
                push (@indexfieldmap, "$field\-\>$fnm->{$field}");
                push (@indexfields, "$field");
            }
        }
    }

    if (scalar @indexfieldmap) {
        $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    }

    if (scalar @indexfields) {
        $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
    }
}
853
854
855# recreate the field list from the build.cfg file, look first in building,
856# then in index to find it. if there is no build.cfg, we can't do the field
857# list (there is unlikely to be any index anyway.)
# Rebuilds the field lists from a previously written build configuration.
#
# Resets $self->{'build_cfg'} to an empty hash, then asks
# $self->read_build_cfg() for the stored configuration. If none is returned
# nothing more is done. Otherwise 'indexfields', 'indexfieldmap' and
# 'indexmap' are copied into build_cfg (absent lists become empty arrays),
# and each "field->SHORT" entry of 'indexfieldmap' is also recorded in
# buildproc->{'indexfieldmap'}.
sub read_final_field_list {
    my $self = shift (@_);
    $self->{'build_cfg'} = {};

    # we read the stuff in from the build.cfg file - if its there
    my $buildcfg = $self->read_build_cfg();
    return unless defined $buildcfg;

    # copies one list out of the stored configuration, stringifying each entry
    my $copy_list = sub {
        my ($key) = @_;
        my @copy = ();
        if (defined $buildcfg->{$key}) {
            foreach my $entry (@{$buildcfg->{$key}}) {
                push (@copy, "$entry");
            }
        }
        return \@copy;
    };

    my $indexfields = $copy_list->('indexfields');
    my $indexfieldmap = $copy_list->('indexfieldmap');

    # NOTE(review): this writes buildproc->{'indexfieldmap'}, while other subs
    # in this file read buildproc->{'fieldnamemap'} - confirm which is intended.
    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $mapping (@{$buildcfg->{'indexfieldmap'}}) {
            my ($f, $v) = $mapping =~ /^(.*)\-\>(.*)$/;
            $self->{'buildproc'}->{'indexfieldmap'}->{$f} = $v;
        }
    }

    my $indexmap = $copy_list->('indexmap');

    $self->{'build_cfg'}->{'indexfieldmap'} = $indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = $indexfields;
    $self->{'build_cfg'}->{'indexmap'} = $indexmap;
}
894
895
# Adds the MGPP-specific entries to a build configuration.
#
# Parameters:
#   $build_cfg - hash ref that is filled in with:
#     'numsections' - the build processor's get_num_sections() result
#     'indexlevels' - short level tags, in $self->{'levelorder'} order
#     'levelmap'    - "level->Tag" strings, in the same order
#     'textlevel'   - the tag for 'section'
sub build_cfg_extra {
    my ($self, $build_cfg) = @_;

    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();

    # store the level info
    my (@indexlevels, @levelmap);
    foreach my $level_name (@{$self->{'levelorder'}}) {
        my $level_tag = $level_map{$level_name};
        push (@indexlevels, $level_tag);
        push (@levelmap, $level_name . '->' . $level_tag);
    }
    $build_cfg->{'indexlevels'} = \@indexlevels;
    $build_cfg->{'levelmap'} = \@levelmap;

    # text level (and database level) is always section
    $build_cfg->{'textlevel'} = $level_map{'section'};

}
916
9171;
918
919
Note: See TracBrowser for help on using the repository browser.