source: main/trunk/greenstone2/perllib/mgppbuilder.pm@ 27997

Last change on this file since 27997 was 27997, checked in by kjdon, 11 years ago

need to check that perfect hash function was generated otherwise we can't continue and build the index

  • Property svn:keywords set to Author Date Id Revision
File size: 29.3 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use basebuilder;
29use colcfg;
30use plugin;
31use strict; no strict 'refs';
32use util;
33use FileUtils;
34
35
# Declare basebuilder as the parent class. Doing this in a BEGIN block means
# the inheritance is in place while the rest of the package is compiled.
BEGIN {
    @mgppbuilder::ISA = qw(basebuilder);
}
39
40
41
# Two lookups share this table:
#   collect.cfg level name (lower case) -> level tag passed to mgpp_passes
#   level tag                           -> macro used as the default
#                                          display name of that level
our %level_map = (
    document  => 'Doc',
    section   => 'Sec',
    paragraph => 'Para',
    Doc       => '_textdocument_',
    Sec       => '_textsection_',
    Para      => '_textparagraph_',
);

# File suffixes that build_index treats as wanted when it scans an index
# directory; files with any other suffix are reported there.
our %wanted_index_files =
    map { $_ => 1 }
    qw(td t tl ti idb ib1 ib2 ib3 ib4 ib5 ib6 ib7 i il w wa);

# Copied from basebuilder. NOTE(review): not referenced by any code visible
# in this file, only mentioned in a comment in compress_text.
my $maxdocsize = $basebuilder::maxdocsize;
68
# Constructor.
#
# All arguments after the class name are passed unchanged to the basebuilder
# constructor. The object it returns is re-blessed into $class and extended
# with:
#   levels     - hash ref used as a set of the lower-cased level names
#   levelorder - array ref holding the same names in configuration order
#   buildtype  - always "mgpp"
#
# When collect_cfg has no 'levels' entry the single level 'document' is used.
sub new {
    my $class = shift(@_);

    # direct method call instead of the indirect "new basebuilder (...)" form
    my $self = basebuilder->new(@_);
    $self = bless $self, $class;

    #$self->{'indexfieldmap'} = \%static_indexfield_map;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    # An explicit array ref: the previous "= ()" stored undef here and relied
    # on push autovivifying it later, which left the slot undefined whenever
    # the configured level list was empty.
    $self->{'levelorder'} = [];

    my $configured_levels = $self->{'collect_cfg'}->{'levels'};
    if (defined $configured_levels) {
        foreach my $level (@$configured_levels) {
            # $level aliases the configuration entry, so the name stored in
            # collect_cfg is lower-cased as well (unchanged behaviour)
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    }
    else {
        # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    $self->{'buildtype'} = "mgpp";

    return $self;
}
95
# Collapses the list of index specifications in collect_cfg into the single
# combined index that mgpp uses.
#
# Every entry of $self->{'collect_cfg'}->{'indexes'} has a leading "ex."
# namespace removed from each metadata name, but only where "ex." is the sole
# namespace of that name ("ex.Title" becomes "Title"; "ex.dc.Title" and
# "flex.Image" are left alone). The entries are then joined with ';', a
# trailing ';' is appended, and the result replaces the original list as its
# only element. Nothing happens when no indexes are configured.
sub generate_index_list {
    my $self = shift (@_);

    # sort out the indexes
    # indexes are specified with spaces, but we put them into one index
    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    if (defined $indexes) {
        $self->{'collect_cfg'}->{'indexes'} = [];

        my @cleaned = ();
        foreach my $spec (@$indexes) {
            my $entry = $spec;    # work on a copy, the substitutions modify it

            # Put a space after every separator so that each metadata name is
            # preceded by either the start of the string or a space. Without
            # it, adjacent names would compete for the same separator
            # character when matching with /g.
            $entry =~ s/([,;])/$1 /g;

            # A plain anchored pattern such as /(^|,|;)ex\.([^.]+)$/ is not
            # used because it would also rewrite names like flex.Image.
            $entry =~ s/(^| )ex\.([^.,:]+)(,|;|$)/$1$2$3/g;

            # Take the helper spaces out again. This has to match the same
            # separators that were padded above; the earlier pattern looked
            # for ',' or ':' and so left a space behind after every ';'.
            $entry =~ s/([,;]) /$1/g;

            push (@cleaned, $entry);
        }

        my $single_index = join(';', @cleaned) . ";";
        push (@{$self->{'collect_cfg'}->{'indexes'}}, $single_index);
    }
}
116
# Works out which stemming related options are active for this build.
#
# After the parent class has done its own option handling, the flags
# 'casefold', 'stem' and 'accentfold' are set on $self: all three when
# collect_cfg has no 'indexoptions' entry, otherwise only those named in it.
# 'stemindexes' is then set to the bit mask casefold=1 | stem=2 | accentfold=4.
sub generate_index_options {
    my $self = shift (@_);

    $self->SUPER::generate_index_options();

    my %active = ('casefold' => 0, 'stem' => 0, 'accentfold' => 0);

    my $requested = $self->{'collect_cfg'}->{'indexoptions'};
    if (defined $requested) {
        foreach my $option (@$requested) {
            # an option counts for the first name it matches, tried in this
            # order
            foreach my $name ('stem', 'casefold', 'accentfold') {
                if ($option =~ /$name/) {
                    $active{$name} = 1;
                    last;
                }
            }
        }
    }
    else {
        # just use default options
        $active{$_} = 1 foreach (keys %active);
    }

    $self->{'casefold'}   = $active{'casefold'};
    $self->{'stem'}       = $active{'stem'};
    $self->{'accentfold'} = $active{'accentfold'};

    # now we record this for the build cfg
    my $mask = 0;
    $mask += 1 if ($self->{'casefold'});
    $mask += 2 if ($self->{'stem'});
    $mask += 4 if ($self->{'accentfold'});
    $self->{'stemindexes'} = $mask;
}
156
# Returns the name of the build processor used when none is configured.
sub default_buildproc {
    my ($self) = @_;

    return 'mgppbuildproc';
}
162
# Builds the compressed text for the collection.
#
# Argument: $textindex - index specification handed to the build processor
#           through set_index().
#
# Returns immediately when $self->{'no_text'} is set. Otherwise:
#   1. every document is piped through "mgpp_passes -T1" (text statistics),
#   2. mgpp_compression_dict creates the compression dictionary,
#   3. every document is piped through "mgpp_passes -T2" (compression).
# In debug mode no external program is started: the build processor writes
# to STDOUT and step 2 is skipped.
#
# Dies when mgpp_passes or mgpp_compression_dict is missing from the
# Greenstone bin directory or the pipe cannot be opened. Progress goes to
# $self->{'outhandle'}; when $self->{'gli'} is set, XML progress markers are
# written to STDERR as well.
sub compress_text {
    my $self = shift (@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    # The full paths are only used to check that the programs exist; the
    # commands themselves are started by bare name.
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = util::get_os_exe();
    my $mgpp_passes_exe = FileUtils::filenameConcatenate($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = FileUtils::filenameConcatenate($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    my $maxnumeric = $self->{'maxnumeric'};

    FileUtils::makeAllDirectories(FileUtils::filenameConcatenate($self->{'build_dir'}, "text"));

    my $collect_tail = util::get_dirsep_tail($self->{'collection'});
    my $basefilename = FileUtils::filenameConcatenate("text", $collect_tail);
    my $fulltextprefix = FileUtils::filenameConcatenate($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them

    # always use Doc and Sec levels
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";

    # everything the two mgpp_passes invocations have in common
    my $passes_pipe = "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\"";

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    print $outhandle "\n    collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        if (!-e "$mgpp_passes_exe" ||
            !open($handle, "$passes_pipe -T1 $osextra")) {
            # The FatalError element is now self-closed like all the others;
            # it used to be emitted as an unterminated start tag.
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    my $db_level = "section";

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    #$buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->set_db_level ($db_level);
    $buildproc->reset();

    plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                  $self->{'buildproc'}, $self->{'maxdocs'});
    plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if ($self->{'debug'}) {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }
    else {
        print $outhandle "\n    creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # second pass: reopen the pipe, this time for compression
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "$passes_pipe -T2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->reset();

    # compress the text
    print $outhandle "\n    compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
283
284
# Hook run once the indexes have been built: works out the final lists of
# indexed fields by calling make_final_field_list.
sub post_build_indexes {
    my ($self) = @_;

    # define the final field lists
    $self->make_final_field_list();
}
291
# creates directory names for each of the index descriptions
#
# Argument: $indexes - array ref of index descriptions of the form
#           "fields[:subcollection[:languages]]".
# Returns a hash ref. For an empty list it is empty; otherwise it holds
#   <index description>   => directory name
#   indexmap / subcollectionmap / languagemap        => { original => short }
#   indexmaporder / subcollectionmaporder / languagemaporder
#                                                    => originals in order
# and, at the top level too, original => short for every field list,
# subcollection and language seen.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();

    return \%mapping unless (scalar @$indexes);

    foreach my $order_key ('indexmaporder', 'subcollectionmaporder', 'languagemaporder') {
        $mapping{$order_key} = [];
    }

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text' => 'text', 'extra' => 'extra');
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # we only ever have one index, and its called 'idx'
        my $pindex = 'idx';

        # next come processed, lower-cased versions of the subcollection and
        # of the language, if there are any
        my $psub  = lc ($self->process_field ($subcollection));
        my $plang = lc ($self->process_field ($languages));

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang)
            while (defined ($dirnames{$dirname}));

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        unless (defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            $mapping{"$fields"} = $pindex unless (defined $mapping{"$fields"});
        }

        my $new_subcollection = ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection}));
        if ($new_subcollection) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }

        my $new_language = ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages}));
        if ($new_language) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember what has been used so far, for the uniqueness checks
        $dirnames{$dirname} = $index;
        $pnames{'index'}->{$pindex} = "$fields";
        $pnames{'subcollection'}->{$psub} = $subcollection;
        $pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
363
# Resolves a directory name collision for one index description.
#
# Arguments:
#   $namehash - { index => {...}, subcollection => {...}, languages => {...} },
#               each mapping a short name already in use to the original
#               value it stands for
#   $index    - the index description "fields:subcollection:languages"
#   $indexref, $subref, $langref - references to the three short names
#
# The first short name (checked in the order index, subcollection, language)
# that is already taken by a different original is advanced with
# get_next_version; at most one of them changes per call. Returns the three
# short names concatenated.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @parts = (
        ['index',         $indexref, "$fields"],
        ['subcollection', $subref,   $subcollection],
        ['languages',     $langref,  $languages],
    );

    foreach my $part (@parts) {
        my ($kind, $nameref, $original) = @$part;
        if ($namehash->{$kind}->{$$nameref} ne $original) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return $$indexref . $$subref . $$langref;
}
378
379
# Builds one mgpp index.
#
# Argument: $index - index description "fields[:subcollection[:languages]]";
#           its directory is looked up in $self->{'index_mapping'}.
#
# Steps (each external program is started by bare name; the full path below
# the Greenstone bin directory is only used for an existence check):
#   1. pipe all documents through "mgpp_passes -I1" (index dictionary)
#   2. mgpp_perf_hash_build (perfect hash function)
#   3. pipe all documents through "mgpp_passes -I2" (inversion)
#   4. mgpp_weights_build, mgpp_invf_dict
#   5. mgpp_stem_idx once for every enabled combination of casefold (1),
#      stem (2) and accentfold (4)
#   6. list the files in the index directory whose suffix is not in
#      %wanted_index_files
# In debug mode the build processor writes to STDOUT and no program is run.
#
# If the index dictionary (.id) is missing after step 1, or step 2 fails,
# the index is recorded in $self->{'notbuilt'} and the sub returns early.
# Dies when a required program is missing or a pipe cannot be opened.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    FileUtils::makeAllDirectories(FileUtils::filenameConcatenate($self->{'build_dir'}, $indexdir));

    my $collect_tail = util::get_dirsep_tail($self->{'collection'});
    my $fullindexprefix = FileUtils::filenameConcatenate($self->{'build_dir'},
                                                         $indexdir,
                                                         $collect_tail);
    # (the text prefix that used to be computed here was never used)

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";

    my $exe = util::get_os_exe();
    my $mgpp_passes_exe = FileUtils::filenameConcatenate($exedir, "mgpp_passes$exe");

    # define the section names for mgpasses
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";
    if ($self->{'levels'}->{'paragraph'}) {
        $mgpp_passes_sections .= "-K " . $level_map{'paragraph'}. " ";
    }

    my $mgpp_perf_hash_build_exe =
        FileUtils::filenameConcatenate($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        FileUtils::filenameConcatenate($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        FileUtils::filenameConcatenate($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        FileUtils::filenameConcatenate($exedir, "mgpp_stem_idx$exe");

    my $maxnumeric = $self->{'maxnumeric'};

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # everything the two mgpp_passes invocations have in common
    my $passes_pipe = "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\"";

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcoll (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll};
        push (@$indexexparr, $expression) if (defined $expression);
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    # Each entry is passed on as written, including a leading '!'. The
    # earlier loop stripped the '!' and then put it straight back.
    my $langarr = [ map { "$_" } @languages ];

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        if (!-e "$mgpp_passes_exe" ||
            !open($handle, "$passes_pipe -I1 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # db_level is always section
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);

    $self->{'buildproc'}->reset();

    plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fall over.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        my $hash_cmd = "mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra";
        print $outhandle "\ncmd: $hash_cmd\n" if ($self->{'verbosity'} >= 4);

        my $hash_status = system ($hash_cmd);
        print $outhandle "\nstatus from running hash_cmd: $hash_status\n" if ($self->{'verbosity'} >= 4);
        # check that perf hash was generated - if not, don't carry on
        if ($hash_status != 0) {
            print $outhandle "mgppbuilder::build_index - Couldn't create index $index as there are too few words in the index.\n";
            print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
            $self->{'notbuilt'}->{$index}=1;
            return;
        }

        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "$passes_pipe -I2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n    inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->reset();

    plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);
        my $passes_exit_status = $?;
        print $outhandle "\nMGPP Passes exit status $passes_exit_status\n" if ($self->{'verbosity'} >= 4);

        # create the weights file
        print $outhandle "\n    create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        my $weights_cmd = "mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra";
        print $outhandle "\ncmd: $weights_cmd\n" if ($self->{'verbosity'} >= 4);
        system ($weights_cmd);

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }

        # runs mgpp_stem_idx for one stem method and returns system()'s result
        my $run_stem_idx = sub {
            my ($method) = @_;
            return system ("mgpp_stem_idx$exe -b 4096 -s$method -f \"$fullindexprefix\" $osextra");
        };

        my $accent_folding_enabled = 1;
        if ($self->{'accentfold'}) {
            # the first time we do this, we test for accent folding enabled
            my $status = $run_stem_idx->(4);
            # system() returns the wait status, in which the program's exit
            # code sits in the high byte, so an exit code of 2 arrives as
            # 512. The raw comparison with 2 is kept as well so that
            # anything that matched before still matches.
            if ($status == 2 || ($status >> 8) == 2) {
                # accent folding has not been enabled in mgpp
                $accent_folding_enabled = 0;
                $self->{'stemindexes'} -= 4;
            }
        }
        my $with_accentfold = ($accent_folding_enabled && $self->{'accentfold'});

        if ($self->{'casefold'}) {
            $run_stem_idx->(1);
            $run_stem_idx->(5) if ($with_accentfold);
        }
        if ($self->{'stem'}) {
            $run_stem_idx->(2);
            $run_stem_idx->(6) if ($with_accentfold);
        }
        if ($self->{'casefold'} && $self->{'stem'}) {
            $run_stem_idx->(3);
            $run_stem_idx->(7) if ($with_accentfold);
        }

        # remove unwanted files
        # NOTE: the actual removal is commented out, so at present this only
        # reports the files that would be removed.
        my $tmpdir = FileUtils::filenameConcatenate($self->{'build_dir'}, $indexdir);
        opendir (my $dirhandle, $tmpdir)
            or die "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&FileUtils::filenameConcatenate($tmpdir, $file));
            }
        }
        closedir ($dirhandle);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
629
630
# Adds default display names for index fields, levels, subcollections and
# languages to the collection's infodb record.
#
# Arguments:
#   $collection_infodb - hash ref that is filled in; each value stored is a
#                        one-element array ref
#
# A default is only added when collect_cfg has no collectionmeta entry for
# the item (key "." followed by the field, level, subcollection or language
# name). If $self->{'build_cfg'} is not defined yet, it is first loaded with
# read_final_field_list.
sub get_collection_meta_indexes
{
    my $self = shift(@_);
    my $collection_infodb = shift(@_);

    # define the indexed field mapping if not already done so
    # (i.e. if infodb called separately from build_index)
    if (!defined $self->{'build_cfg'}) {
        $self->read_final_field_list();
    }

    # first do the collection meta stuff - everything without a dot
    my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};
    my $collmetadefined = (defined $collectionmeta) ? 1 : 0;

    # true when the collection configuration already names this item
    my $has_collmeta = sub {
        my ($key) = @_;
        return ($collmetadefined && defined $collectionmeta->{$key});
    };

    #add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined, uses the metadata name
    if (defined $self->{'build_cfg'}->{'extraindexfields'}) {
        foreach my $longfield (@{$self->{'build_cfg'}->{'extraindexfields'}}) {
            my $shortfield = $self->{'buildproc'}->{'fieldnamemap'}->{$longfield};
            next if $shortfield eq 1;

            # we need to check if some coll meta has been defined - don't output
            # any that have
            next if $has_collmeta->(".$longfield");

            if ($longfield eq "allfields") {
                $collection_infodb->{$shortfield} = [ "_query:textallfields_" ];
            } elsif ($longfield eq "text") {
                $collection_infodb->{$shortfield} = [ "_query:texttextonly_" ];
            } else {
                $collection_infodb->{$shortfield} = [ $longfield ];
            }
        }
    }

    # now add the level names
    foreach my $configured_level (@{$self->{'collect_cfg'}->{'levels'}}) {
        my $collmeta = ".$configured_level"; # based on the original specification
        # Lower-case a copy. The loop variable aliases the configuration
        # entry, so changing it in place would rewrite collect_cfg.
        my $level = lc ($configured_level);
        my $levelid = $level_map{$level}; # find the actual value we used in the index
        if (!$has_collmeta->($collmeta)) {
            # use the default macro
            $collection_infodb->{$levelid} = [ $level_map{$levelid} ];
        }
    }

    # now add subcoll meta
    foreach my $subcoll (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        my $shortname = $self->{'index_mapping'}->{$subcoll};
        if (!$has_collmeta->(".$subcoll")) {
            $collection_infodb->{$shortname} = [ $subcoll ];
        }
    }

    # now add language meta
    foreach my $lang (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        my $shortname = $self->{'index_mapping'}->{$lang};
        if (!$has_collmeta->(".$lang")) {
            $collection_infodb->{$shortname} = [ $lang ];
        }
    }
}
706
707
# default is to output the metadata sets (prefixes) used in collection
#
# Collects the "collection" record (metadata sets first, then the index
# related names) and writes it to the info database through $infodb_handle.
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_record = {};
    $self->get_collection_meta_sets($collection_record);
    $self->get_collection_meta_indexes($collection_record);

    dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle,
                               "collection", $collection_record);
}
719
720
# at the end of building, we have an indexfieldmap with all the mappings,
# plus some extras, and indexmap with any indexes in it that weren't
# specified in the index definition. We want to make an ordered list of
# fields that are indexed, and a list of mappings that are used. This will
# be used for the build.cfg file, and for collection meta definition we
# store these in a build.cfg bit
#
# Resets $self->{'build_cfg'} and, when they are non-empty, stores in it
#   indexfields   - the indexed field names, in index definition order
#   indexfieldmap - "name->shortname" strings in the same order
# The pseudo field "metadata" expands to every extra field the build
# processor found that is not named in the index definition itself.
sub make_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    # store the indexfieldmap information
    my @indexfieldmap = ();
    my @indexfields = ();
    my $specifiedfields = {};
    my @specifiedfieldorder = ();

    # go through the index definition and add each thing to a map, so we
    # can easily check if it is already specified - when doing the
    # metadata, we print out all the individual fields, but some may
    # already be specified in the index definition, so we dont want to add
    # those again.
    foreach my $definition (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        my $parts = $definition;
        $parts =~ s/:.*$//;
        foreach my $f (split(';', $parts)) {
            next if (defined $specifiedfields->{$f});
            $specifiedfields->{$f} = 1;
            push (@specifiedfieldorder, "$f");
        }
    }

    #add all fields bit
    my $fnm = $self->{'buildproc'}->{'fieldnamemap'};

    foreach my $field (@specifiedfieldorder) {
        if ($field eq "metadata") {
            # Sorted, so that the lists come out in the same order on every
            # run; plain hash key order differs from one run to the next.
            foreach my $newfield (sort keys %{$self->{'buildproc'}->{'extraindexfields'}}) {
                next if (defined $specifiedfields->{$newfield});
                push (@indexfieldmap, $newfield . '->' . $fnm->{$newfield});
                push (@indexfields, "$newfield");
            }
        } elsif ($field eq 'text') {
            push (@indexfieldmap, 'text->TX');
            push (@indexfields, "text");
        } elsif ($field eq 'allfields') {
            push (@indexfieldmap, 'allfields->ZZ');
            push (@indexfields, "allfields");
        } else {
            # we only add in the ones that have been processed
            if (defined $self->{'buildproc'}->{'allindexfields'}->{$field}) {
                push (@indexfieldmap, $field . '->' . $fnm->{$field});
                push (@indexfields, "$field");
            }
        }
    }

    if (scalar @indexfieldmap) {
        $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    }

    if (scalar @indexfields) {
        $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
    }
}
794
795
# recreate the field list from the build.cfg file, look first in building,
# then in index to find it. if there is no build.cfg, we can't do the field
# list (there is unlikely to be any index anyway.)
#
# Resets $self->{'build_cfg'}. When read_build_cfg returns a configuration,
# its indexfields, indexfieldmap and indexmap lists are copied into
# $self->{'build_cfg'} (as empty lists where the configuration has none) and
# every "name->shortname" entry of indexfieldmap is also recorded in
# $self->{'buildproc'}->{'indexfieldmap'}.
sub read_final_field_list {
    my $self = shift (@_);
    $self->{'build_cfg'} = {};
    my @indexfieldmap = ();
    my @indexfields = ();
    my @indexmap = ();

    # we read the stuff in from the build.cfg file - if its there
    my $buildcfg = $self->read_build_cfg();
    return unless defined $buildcfg;

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            push (@indexfields, "$field");
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $entry (@{$buildcfg->{'indexfieldmap'}}) {
            push (@indexfieldmap, "$entry");
            my ($f, $v) = $entry =~ /^(.*)\-\>(.*)$/;
            # An entry without "->" is kept in the list but cannot be split;
            # storing it would record an undefined value under an undefined
            # key.
            next unless (defined $f);
            # NOTE(review): other subs in this file read
            # {'buildproc'}->{'fieldnamemap'}, not 'indexfieldmap' - confirm
            # which key the build processor expects.
            $self->{'buildproc'}->{'indexfieldmap'}->{$f} = $v;
        }
    }

    if (defined $buildcfg->{'indexmap'}) {
        foreach my $entry (@{$buildcfg->{'indexmap'}}) {
            push (@indexmap, "$entry");
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
    $self->{'build_cfg'}->{'indexmap'} = \@indexmap;
}
835
836
# Adds the mgpp specific entries to the build configuration.
#
# Argument: $build_cfg - hash ref that receives
#   numsections - section count reported by the build processor
#   indexlevels - index tag of every level in $self->{'levelorder'}
#   levelmap    - "level->tag" for the same levels
#   textlevel   - the tag of the section level
sub build_cfg_extra {
    my ($self, $build_cfg) = @_;

    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();

    # store the level info
    my (@indexlevels, @levelmap);
    foreach my $level_name (@{$self->{'levelorder'}}) {
        my $tag = $level_map{$level_name};
        push (@indexlevels, $tag);
        push (@levelmap, "$level_name\-\>$level_map{$level_name}");
    }
    $build_cfg->{'indexlevels'} = \@indexlevels;
    $build_cfg->{'levelmap'}    = \@levelmap;

    # text level (and database level) is always section
    $build_cfg->{'textlevel'} = $level_map{'section'};

}
857
8581;
859
860
Note: See TracBrowser for help on using the repository browser.