source: main/trunk/greenstone2/perllib/mgppbuilder.pm@ 27985

Last change on this file since 27985 was 27985, checked in by ak19, 11 years ago

Now prints out exit status and weights and passes commands that get run when verbosity is set high

  • Property svn:keywords set to Author Date Id Revision
File size: 28.9 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGPPBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use basebuilder;
29use colcfg;
30use plugin;
31use strict; no strict 'refs';
32use util;
33use FileUtils;
34
35
# Register basebuilder as the parent class at compile time, so inherited
# methods and SUPER:: calls resolve for mgppbuilder objects.
BEGIN {
    @mgppbuilder::ISA = qw(basebuilder);
}
39
40
41
# Two lookups share this table:
#   - configured level name -> short level tag handed to the mgpp tools
#     (used for the -J / -K arguments and for build.cfg level entries);
#   - short level tag -> default display macro (used for collection meta).
our %level_map = (
    'document'  => 'Doc',
    'section'   => 'Sec',
    'paragraph' => 'Para',
    'Doc'       => '_textdocument_',
    'Sec'       => '_textsection_',
    'Para'      => '_textparagraph_',
);

# File suffixes that build_index treats as wanted when it lists the
# contents of an index directory.
our %wanted_index_files = map { $_ => 1 } qw(
    td t tl ti idb
    ib1 ib2 ib3 ib4 ib5 ib6 ib7
    i il w wa
);

# Local copy of the maximum document size configured in basebuilder.
my $maxdocsize = $basebuilder::maxdocsize;
68
# Constructor.  All arguments are forwarded untouched to basebuilder's
# constructor; the resulting object is re-blessed into $class and then
# given the mgpp-specific level bookkeeping:
#   levels     - hash ref, lower-cased level name => 1
#   levelorder - array ref, lower-cased level names in configured order
#   buildtype  - always "mgpp"
# When collect_cfg has no 'levels' entry, the single level 'document' is used.
sub new {
    my $class = shift(@_);

    # Explicit class-method call; the indirect-object form
    # ("new basebuilder (...)") is ambiguous to the parser.
    my $self = basebuilder->new(@_);
    $self = bless $self, $class;

    #$self->{'indexfieldmap'} = \%static_indexfield_map;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    # Must be an array ref: assigning the empty list "()" to a scalar
    # stores undef, which leaves levelorder undefined when the configured
    # level list is empty.
    $self->{'levelorder'} = [];

    if (defined $self->{'collect_cfg'}->{'levels'}) {
        # $level aliases the collect_cfg element, so the lower-casing below
        # also rewrites the configured list in place (existing behaviour,
        # kept on purpose).
        foreach my $level (@{ $self->{'collect_cfg'}->{'levels'} }) {
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{ $self->{'levelorder'} }, $level);
        }
    }
    else {
        # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{ $self->{'levelorder'} }, 'document');
    }

    $self->{'buildtype'} = "mgpp";

    return $self;
}
95
# Collapses the configured index specifications into the single combined
# index that mgpp uses.
#
# Reads  : $self->{'collect_cfg'}->{'indexes'} (array ref of index specs)
# Writes : the same slot, replaced by a one-element array ref whose entry is
#          all specs joined with ';' and terminated by ';'
# Does nothing when no indexes are configured.
#
# A leading "ex." namespace is stripped from a metadata name only when it is
# the sole namespace in that name ("ex.Title" -> "Title"; "ex.dc.Title" and
# "flex.Image" are left alone).
sub generate_index_list {
    my $self = shift (@_);

    # indexes are specified with spaces, but we put them into one index
    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    return unless defined $indexes;

    my @cleaned = ();
    foreach my $spec (@$indexes) {
        my $entry = $spec;    # work on a copy; the configured strings stay untouched

        # Put a space after every separator so each metadata name is
        # preceded by either start-of-string or a space.
        $entry =~ s/([,;])/$1 /g;

        # Drop "ex." at the start of a name, but only if no further '.'
        # follows before the next separator.
        $entry =~ s/(^| )ex\.([^.,:]+)(,|;|$)/$1$2$3/g;

        # Take the helper spaces out again.  ';' is included so the spaces
        # inserted after it above are removed too; ':' is kept from the
        # earlier pattern so existing input is treated as before.
        $entry =~ s/([,;:]) /$1/g;

        push (@cleaned, $entry);
    }

    $self->{'collect_cfg'}->{'indexes'} = [ join(';', @cleaned) . ";" ];
    return;
}
116
# Works out which of casefold / stem / accentfold are enabled and records
# them on the object, together with the combined 'stemindexes' value
# (casefold = 1, stem = 2, accentfold = 4, summed).
#
# With no 'indexoptions' in collect_cfg all three are switched on.
# Otherwise each configured option enables at most one flag: the first of
# "stem", "casefold", "accentfold" (tested in that order) that occurs in
# the option text.
sub generate_index_options {
    my $self = shift (@_);

    $self->SUPER::generate_index_options();

    my @flag_names = qw(stem casefold accentfold);
    my %enabled = map { $_ => 0 } @flag_names;

    my $configured = $self->{'collect_cfg'}->{'indexoptions'};
    if (defined $configured) {
        foreach my $option (@$configured) {
            # first matching name wins for this option
            my ($hit) = grep { $option =~ /$_/ } @flag_names;
            $enabled{$hit} = 1 if defined $hit;
        }
    }
    else {
        # just use default options
        $enabled{$_} = 1 foreach @flag_names;
    }

    $self->{'casefold'}   = $enabled{'casefold'};
    $self->{'stem'}       = $enabled{'stem'};
    $self->{'accentfold'} = $enabled{'accentfold'};

    # now we record this for the build cfg
    my %weight_of = ('casefold' => 1, 'stem' => 2, 'accentfold' => 4);
    my $stemindexes = 0;
    foreach my $flag (qw(casefold stem accentfold)) {
        $stemindexes += $weight_of{$flag} if $self->{$flag};
    }
    $self->{'stemindexes'} = $stemindexes;
}
156
# Name of the build processor used when none is configured.
sub default_buildproc {
    my ($self) = @_;

    return 'mgppbuildproc';
}
162
# Produces the compressed text for the collection in <build_dir>/text.
#
# Argument: $textindex - index specification handed to the build processor.
#
# Steps (skipped entirely when $self->{'no_text'} is set):
#   1. pipe every document through "mgpp_passes -T1" to gather statistics;
#   2. run mgpp_compression_dict to create the compression dictionary;
#   3. pipe every document through "mgpp_passes -T2" to compress the text.
# In debug mode no external program is started: the documents are written
# to STDOUT for both passes and step 2 is skipped.
#
# Dies when a required executable is missing under
# $GSDLHOME/bin/$GSDLOS or when a pipe cannot be opened.  When
# $self->{'gli'} is set, progress and error tags are written to STDERR.
sub compress_text {
    my $self = shift (@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &FileUtils::filenameConcatenate($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &FileUtils::filenameConcatenate($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    my $maxnumeric = $self->{'maxnumeric'};

    &FileUtils::makeAllDirectories (&FileUtils::filenameConcatenate($self->{'build_dir'}, "text"));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $basefilename = &FileUtils::filenameConcatenate("text", $collect_tail);
    my $fulltextprefix = &FileUtils::filenameConcatenate($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them

    # always use Doc and Sec levels
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";

    # Both passes share everything except the -T<n> switch.  Note that the
    # existence test below uses the full path, while the command itself is
    # started by bare name.
    my $passes_pipe = "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\"";

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    print $outhandle "\n collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        if (!-e "$mgpp_passes_exe" ||
            !open($handle, "$passes_pipe -T1 $osextra")) {
            # self-closed like every other FatalError tag in this file
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    my $db_level = "section";

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # reopen the pipe for the second (compression) pass
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "$passes_pipe -T2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }
    else {
        # debug mode: $handle is still STDOUT from the first pass
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->reset();

    # compress the text
    print $outhandle "\n compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
283
284
# Hook run once the indexes have been built: works out the final list of
# indexed fields.
sub post_build_indexes {
    my ($self) = @_;

    # define the final field lists
    $self->make_final_field_list();
}
291
292# creates directory names for each of the index descriptions
# Builds the table that maps each index description
# ("fields:subcollection:languages") to a directory name.
#
# Argument: $indexes - array ref of index descriptions.
# Returns : hash ref.  Empty when $indexes is empty; otherwise it holds
#   <index description>             => directory name
#   'indexmap'/'indexmaporder'      => fields text -> short name, plus order
#   'subcollectionmap'/'...order'   => subcollection text -> short name, plus order
#   'languagemap'/'...order'        => languages text -> short name, plus order
#   and the fields / subcollection / languages texts themselves as keys
#   pointing at their short names.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();

    return \%mapping if !(scalar @$indexes);

    $mapping{$_} = [] foreach ('indexmaporder', 'subcollectionmaporder', 'languagemaporder');

    # directory names already taken, seeded with the mandatory ones;
    # used to detect collisions
    my %taken_dirs = ('text' => 'text', 'extra' => 'extra');

    # short name -> original text, per component; consulted by make_unique
    my %short_names = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # we only ever have one index, and its called 'idx'
        my $pindex = 'idx';

        # processed, lower-cased versions of the subcollection and the
        # language parts (if there are any)
        my $psub  = lc ($self->process_field ($subcollection));
        my $plang = lc ($self->process_field ($languages));

        # keep adjusting the components until the directory name is unused
        my $dirname = $pindex . $psub . $plang;
        while (defined ($taken_dirs{$dirname})) {
            $dirname = $self->make_unique (\%short_names, $index, \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps, and also file the
        # individual components under their own text - the full index name
        # (eg text:subcol:lang) is not used on the query page; these are
        # used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            $mapping{"$fields"} = $pindex unless defined $mapping{"$fields"};
        }

        foreach my $component (['subcollection', $subcollection, $psub],
                               ['language',      $languages,     $plang]) {
            my ($kind, $text, $short) = @$component;
            next unless $short =~ /\w/;
            next if defined ($mapping{$kind . 'map'}{$text});

            $mapping{$kind . 'map'}{$text} = $short;
            push (@{$mapping{$kind . 'maporder'}}, $text);
            $mapping{$text} = $short;
        }

        $taken_dirs{$dirname} = $index;
        $short_names{'index'}->{$pindex} = "$fields";
        $short_names{'subcollection'}->{$psub} = $subcollection;
        $short_names{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
363
# Collision helper for create_index_mapping.
#
# Arguments:
#   $namehash - { index => {...}, subcollection => {...}, languages => {...} },
#               each mapping an already used short name to its original text
#   $index    - index description "fields:subcollection:languages"
#   $indexref, $subref, $langref - references to the three short names
#
# The components are checked in the order index, subcollection, languages.
# The first one whose short name is recorded for a different original text
# is passed to get_next_version (through its reference) and checking stops.
# If none differs, nothing is changed.
#
# Returns the three (possibly updated) short names concatenated.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @components = (
        [ 'index',         $indexref, "$fields"      ],
        [ 'subcollection', $subref,   $subcollection ],
        [ 'languages',     $langref,  $languages     ],
    );

    foreach my $component (@components) {
        my ($kind, $nameref, $original) = @$component;
        if ($namehash->{$kind}->{$$nameref} ne $original) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return join('', $$indexref, $$subref, $$langref);
}
378
379
# Builds one mgpp index.
#
# Argument: $index - index description "fields[:subcollections[:languages]]";
#           its directory name is looked up in $self->{'index_mapping'}.
#
# Steps:
#   1. pipe all documents through "mgpp_passes -I1" (index dictionary);
#   2. stop with a warning, and flag the index in $self->{'notbuilt'}, if
#      the dictionary file "<prefix>.id" was not produced;
#   3. run mgpp_perf_hash_build, then pipe all documents through
#      "mgpp_passes -I2" (inversion);
#   4. run mgpp_weights_build, mgpp_invf_dict and the mgpp_stem_idx variants
#      selected by the casefold / stem / accentfold settings;
#   5. list files in the index directory whose suffix is not in
#      %wanted_index_files.
# In debug mode documents are written to STDOUT and no external program is
# run; since "<prefix>.id" is then only present if an earlier build left it
# there, step 2 normally ends the call early.
#
# Dies when a required executable is missing under $GSDLHOME/bin/$GSDLOS,
# when a pipe cannot be opened, or when the index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &FileUtils::makeAllDirectories (&FileUtils::filenameConcatenate($self->{'build_dir'}, $indexdir));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $fullindexprefix = &FileUtils::filenameConcatenate($self->{'build_dir'},
                                                          $indexdir,
                                                          $collect_tail);

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";

    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &FileUtils::filenameConcatenate($exedir, "mgpp_passes$exe");

    # define the section names for mgpasses
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";
    if ($self->{'levels'}->{'paragraph'}) {
        $mgpp_passes_sections .= "-K " . $level_map{'paragraph'}. " ";
    }

    my $mgpp_perf_hash_build_exe =
        &FileUtils::filenameConcatenate($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &FileUtils::filenameConcatenate($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &FileUtils::filenameConcatenate($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &FileUtils::filenameConcatenate($exedir, "mgpp_stem_idx$exe");

    my $maxnumeric = $self->{'maxnumeric'};

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # there may be subcollection info, and language info, after the fields
    my (undef, $subcollection, $language) = split (":", $index);

    # get the index expressions if this index belongs to subcollections
    my $indexexparr = [];
    if (defined $subcollection) {
        foreach my $subcoll (split /,/, $subcollection) {
            if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll})) {
                push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll});
            }
        }
    }

    # language restrictions for this index; each entry is passed on as
    # written, including a leading '!' for negated languages
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    my $langarr = [ defined $language ? split (/,/, $language) : () ];

    # Both passes share everything except the -I<n> switch.  The existence
    # tests use the full path while the commands are started by bare name.
    my $passes_pipe = "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\"";

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        if (!-e "$mgpp_passes_exe" ||
            !open($handle, "$passes_pipe -I1 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # db_level is always section
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);

    $self->{'buildproc'}->reset();

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fall over.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        my $hash_cmd = "mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra";
        print $outhandle "\ncmd: $hash_cmd\n" if ($self->{'verbosity'} >= 4);

        my $hash_status = system ($hash_cmd);
        print $outhandle "\nstatus from running hash_cmd: $hash_status\n" if ($self->{'verbosity'} >= 4);

        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "$passes_pipe -I2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->reset();

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);
        my $passes_exit_status = $?;
        print $outhandle "\nMGPP Passes exit status $passes_exit_status\n" if ($self->{'verbosity'} >= 4);

        # create the weights file
        print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        my $weights_cmd = "mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra";
        print $outhandle "\ncmd: $weights_cmd\n" if ($self->{'verbosity'} >= 4);
        system ($weights_cmd);

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }

        # Runs mgpp_stem_idx for one stem method number (the same 1/2/4
        # values that are summed into 'stemindexes') and hands back the
        # status from system().
        my $run_stem_idx = sub {
            my ($method) = @_;
            return system ("mgpp_stem_idx$exe -b 4096 -s$method -f \"$fullindexprefix\" $osextra");
        };

        my $accent_folding_enabled = 1;
        if ($self->{'accentfold'}) {
            # the first time we do this, we test for accent folding enabled
            if ($run_stem_idx->(4) == 2) {
                # accent folding has not been enabled in mgpp
                $accent_folding_enabled = 0;
                $self->{'stemindexes'} -= 4;
            }
        }
        my $with_accents = ($accent_folding_enabled && $self->{'accentfold'});

        if ($self->{'casefold'}) {
            $run_stem_idx->(1);
            $run_stem_idx->(5) if $with_accents;
        }
        if ($self->{'stem'}) {
            $run_stem_idx->(2);
            $run_stem_idx->(6) if $with_accents;
        }
        if ($self->{'casefold'} && $self->{'stem'}) {
            $run_stem_idx->(3);
            $run_stem_idx->(7) if $with_accents;
        }

        # remove unwanted files
        # NOTE: the actual removal is commented out below, so files are only
        # reported (at verbosity > 2), never deleted.
        my $tmpdir = &FileUtils::filenameConcatenate($self->{'build_dir'}, $indexdir);
        opendir (my $dirhandle, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&FileUtils::filenameConcatenate($tmpdir, $file));
            }
        }
        closedir ($dirhandle);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
621
622
# Adds default display names to $collection_infodb for every index field,
# level, subcollection and language that has no explicit collection
# metadata (a '.<name>' entry in collect_cfg's 'collectionmeta').
#
# Arguments:
#   $collection_infodb - hash ref that is filled in: short name => [ text ]
#
# If $self->{'build_cfg'} has not been set up yet it is first recreated
# through read_final_field_list.
sub get_collection_meta_indexes
{
    my $self = shift(@_);
    my $collection_infodb = shift(@_);

    # define the indexed field mapping if not already done so
    # (i.e. if infodb called separately from build_index)
    if (!defined $self->{'build_cfg'}) {
        $self->read_final_field_list();
    }

    # True when explicit collection metadata exists under the given key;
    # such entries are left alone.
    my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};
    my $has_collmeta = sub {
        my ($key) = @_;
        return (defined $collectionmeta && defined $collectionmeta->{$key}) ? 1 : 0;
    };

    # add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined, uses the metadata name
    if (defined $self->{'build_cfg'}->{'extraindexfields'}) {
        foreach my $longfield (@{$self->{'build_cfg'}->{'extraindexfields'}}) {
            my $shortfield = $self->{'buildproc'}->{'fieldnamemap'}->{$longfield};
            # a field without a short name would otherwise be written
            # under the empty key
            next if !defined $shortfield;
            next if $shortfield eq 1;

            next if $has_collmeta->(".$longfield");

            if ($longfield eq "allfields") {
                $collection_infodb->{$shortfield} = [ "_query:textallfields_" ];
            } elsif ($longfield eq "text") {
                $collection_infodb->{$shortfield} = [ "_query:texttextonly_" ];
            } else {
                $collection_infodb->{$shortfield} = [ $longfield ];
            }
        }
    }

    # now add the level names
    # Work on copies: neither the configured list nor its entries are
    # modified, and a missing list is not created as a side effect.
    my $configured_levels = $self->{'collect_cfg'}->{'levels'};
    foreach my $configured (defined $configured_levels ? @$configured_levels : ()) {
        my $collmeta = ".$configured";        # based on the specification as found in collect_cfg
        (my $level = $configured) =~ tr/A-Z/a-z/;   # make it lower case
        my $levelid = $level_map{$level};     # find the actual value we used in the index
        # an unknown level has no id and no default macro to offer
        next if !defined $levelid;

        if (!$has_collmeta->($collmeta)) {
            # use the default macro
            $collection_infodb->{$levelid} = [ $level_map{$levelid} ];
        }
    }

    # now add subcoll meta
    foreach my $subcoll (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        my $shortname = $self->{'index_mapping'}->{$subcoll};
        if (!$has_collmeta->(".$subcoll")) {
            $collection_infodb->{$shortname} = [ $subcoll ];
        }
    }

    # now add language meta
    foreach my $lang (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        my $shortname = $self->{'index_mapping'}->{$lang};
        if (!$has_collmeta->(".$lang")) {
            $collection_infodb->{$shortname} = [ $lang ];
        }
    }
}
698
699
700# default is to output the metadata sets (prefixes) used in collection
# Gathers the collection-level metadata (metadata sets first, then the
# index related names) into one record and writes it to the info database
# under the key "collection".
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    $self->get_collection_meta_indexes($collection_infodb);

    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
711
712
713# at the end of building, we have an indexfieldmap with all the mappings,
714# plus some extras, and indexmap with any indexes in it that weren't
715# specified in the index definition. We want to make an ordered list of
716# fields that are indexed, and a list of mappings that are used. This will
717# be used for the build.cfg file, and for collection meta definition we
718# store these in a build.cfg bit
# Works out the ordered list of indexed fields and the matching
# "field->shortname" mappings, and stores them in a fresh
# $self->{'build_cfg'} as 'indexfields' and 'indexfieldmap' (each only
# when non-empty).
#
# Field order follows the index definition in collect_cfg.  The pseudo
# field 'metadata' expands to every extra index field of the build
# processor that is not named in the definition itself; 'text' and
# 'allfields' have fixed short names; any other field is listed only if the
# build processor recorded it in 'allindexfields'.
sub make_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    # Every field named in the index definition, in first-seen order.
    # The lookup table lets the 'metadata' expansion skip fields that are
    # already named explicitly.
    my %named = ();
    my @named_order = ();
    foreach my $indexspec (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        (my $fieldpart = $indexspec) =~ s/:.*$//;
        foreach my $name (split(';', $fieldpart)) {
            next if defined $named{$name};
            $named{$name} = 1;
            push (@named_order, "$name");
        }
    }

    my $fnm = $self->{'buildproc'}->{'fieldnamemap'};
    my %fixed_shortname = ('text' => 'TX', 'allfields' => 'ZZ');

    my @indexfieldmap = ();
    my @indexfields = ();
    foreach my $name (@named_order) {
        my @to_add = ();    # [ long name, short name ] pairs contributed by $name

        if ($name eq "metadata") {
            foreach my $extra (keys %{$self->{'buildproc'}->{'extraindexfields'}}) {
                push (@to_add, [ $extra, $fnm->{$extra} ]) if !defined $named{$extra};
            }
        }
        elsif ($name eq 'text' || $name eq 'allfields') {
            push (@to_add, [ $name, $fixed_shortname{$name} ]);
        }
        elsif (defined $self->{'buildproc'}->{'allindexfields'}->{$name}) {
            # we only add in the ones that have been processed
            push (@to_add, [ $name, $fnm->{$name} ]);
        }

        foreach my $pair (@to_add) {
            my ($long, $short) = @$pair;
            push (@indexfieldmap, join('', $long, '->', defined $short ? $short : ''));
            push (@indexfields, "$long");
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap if scalar @indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields if scalar @indexfields;
}
786
787
788# recreate the field list from the build.cfg file, look first in building,
789# then in index to find it. if there is no build.cfg, we can't do the field
790# list (there is unlikely to be any index anyway.)
# Recreates the field lists from an existing build.cfg (obtained through
# read_build_cfg).
#
# $self->{'build_cfg'} is reset to an empty hash first.  When a build.cfg
# is available, its 'indexfields', 'indexfieldmap' and 'indexmap' lists are
# copied into $self->{'build_cfg'} (an absent list becomes an empty one),
# and every "field->shortname" entry of 'indexfieldmap' is also recorded in
# $self->{'buildproc'}->{'indexfieldmap'}.  Without a build.cfg nothing
# further is done.
sub read_final_field_list {
    my $self = shift (@_);
    $self->{'build_cfg'} = {};

    # we read the stuff in from the build.cfg file - if its there
    my $buildcfg = $self->read_build_cfg();
    return unless defined $buildcfg;

    # Plain copy of one list from build.cfg, or an empty list if absent.
    my $copy_list = sub {
        my ($key) = @_;
        return () unless defined $buildcfg->{$key};
        return map { "$_" } @{$buildcfg->{$key}};
    };

    my @indexfields   = $copy_list->('indexfields');
    my @indexfieldmap = $copy_list->('indexfieldmap');
    my @indexmap      = $copy_list->('indexmap');

    foreach my $entry (@indexfieldmap) {
        # An entry that is not of the form "field->shortname" is still kept
        # in the list above, but it is not recorded in the build processor
        # (it would end up as undef under the empty key).
        if ($entry =~ /^(.*)\-\>(.*)$/) {
            $self->{'buildproc'}->{'indexfieldmap'}->{$1} = $2;
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
    $self->{'build_cfg'}->{'indexmap'} = \@indexmap;
}
827
828
# Adds the mgpp specific entries to the build configuration:
#   numsections - section count reported by the build processor
#   indexlevels - short level tags, in $self->{'levelorder'} order
#   levelmap    - "levelname->tag" strings, in the same order
#   textlevel   - the short tag for 'section'
#
# Argument: $build_cfg - hash ref that is filled in.
sub build_cfg_extra {
    my ($self, $build_cfg) = @_;

    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();

    # store the level info
    my (@indexlevels, @levelmap);
    foreach my $levelname (@{$self->{'levelorder'}}) {
        my $leveltag = $level_map{$levelname};
        push (@indexlevels, $leveltag);
        push (@levelmap, join('', $levelname, '->', defined $leveltag ? $leveltag : ''));
    }
    $build_cfg->{'indexlevels'} = \@indexlevels;
    $build_cfg->{'levelmap'} = \@levelmap;

    # text level (and database level) is always section
    $build_cfg->{'textlevel'} = $level_map{'section'};
}
849
8501;
851
852
Note: See TracBrowser for help on using the repository browser.