source: main/trunk/greenstone2/perllib/mgppbuilder.pm@ 22352

Last change on this file since 22352 was 22352, checked in by kjdon, 14 years ago

remove ex. when generating index lists. Don't want any ex. in build.cfg. This fixes the problem where index list had eg ex.Photographer and collectionmeta in config file had .Photographer and then they didn't match up.

  • Property svn:keywords set to Author Date Id Revision
File size: 27.7 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use basebuilder;
29use colcfg;
30use plugin;
31use strict; no strict 'refs';
32use util;
33
34
# Register basebuilder as the parent class at compile time, so that
# inherited methods and SUPER:: calls resolve for mgppbuilder objects.
BEGIN {
    @mgppbuilder::ISA = qw(basebuilder);
}
38
39
40
# Two lookups share this table:
#   * level name as written in the collection configuration -> the short
#     tag given to the mgpp tools with -J / -K and stored in build.cfg;
#   * short tag -> the macro used as the default display name of the level.
our %level_map = (
    'document'  => 'Doc',
    'section'   => 'Sec',
    'paragraph' => 'Para',
    'Doc'       => '_textdocument_',
    'Sec'       => '_textsection_',
    'Para'      => '_textparagraph_',
);

# File suffixes that build_index treats as wanted when it scans an index
# directory; anything else is reported as unwanted.
our %wanted_index_files = map { $_ => 1 } qw(
    td t tl ti
    idb ib1 ib2 ib3 ib4 ib5 ib6 ib7
    i il
    w wa
);

# Maximum document size as configured in basebuilder.
my $maxdocsize = $basebuilder::maxdocsize;
67
# Constructor.  Builds the object with basebuilder's constructor (all
# arguments are passed straight through), re-blesses it into $class and
# records the indexing levels.
#
# Sets on the object:
#   levels     - hash ref, lower-cased level name => 1
#   levelorder - array ref of the lower-cased level names in configured order
#   buildtype  - always "mgpp"
#
# When the collection configuration has no 'levels' entry the single level
# 'document' is used.
sub new {
    my $class = shift(@_);

    my $self = basebuilder->new(@_);
    $self = bless $self, $class;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    # must be an array ref: assigning an empty list here would store undef
    $self->{'levelorder'} = [];

    my $configured_levels = $self->{'collect_cfg'}->{'levels'};
    if (defined $configured_levels) {
        foreach my $level (@$configured_levels) {
            # $level aliases the configuration entry, so the configured
            # level names are lower-cased in place as well
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    }
    else {
        # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    $self->{'buildtype'} = "mgpp";

    return $self;
}
94
# Collapses the configured list of indexes into the single index that mgpp
# uses: the entries are joined with ';' (a trailing ';' is appended) and
# every "ex." prefix at the start of the string or directly after a ',' or
# ';' is removed.  The configuration's 'indexes' entry is replaced by a
# one-element list holding that string.  Nothing happens when no indexes
# are configured.
sub generate_index_list {
    my $self = shift (@_);

    my $configured = $self->{'collect_cfg'}->{'indexes'};
    if (defined $configured) {
        # indexes are specified separately, but we put them into one index
        my $combined = join(';', @$configured) . ";";

        # remove any ex. from the index spec
        $combined =~ s/(^|[,;])ex\./$1/g;

        my $cfg = $self->{'collect_cfg'};
        $cfg->{'indexes'} = [];
        push (@{$cfg->{'indexes'}}, $combined);
    }
}
110
# Works out which of the stem, casefold and accentfold options are on and
# records them on the object, after letting the parent class do its own
# option handling.
#
# With no 'indexoptions' in the collection configuration all three options
# are switched on.  Otherwise each configured option switches on the first
# of stem, casefold, accentfold (tested in that order) whose name it
# contains.
#
# The combined setting is stored in $self->{'stemindexes'} as the sum of
# 1 (casefold), 2 (stem) and 4 (accentfold).
sub generate_index_options {
    my $self = shift (@_);

    $self->SUPER::generate_index_options();

    # order matters: this is the order in which an option string is tested
    my @flag_names = ('stem', 'casefold', 'accentfold');
    my %flag_value = ('casefold' => 1, 'stem' => 2, 'accentfold' => 4);

    my $requested = $self->{'collect_cfg'}->{'indexoptions'};

    # nothing configured: just use the default options (everything on)
    my $initial = defined($requested) ? 0 : 1;
    foreach my $flag (@flag_names) {
        $self->{$flag} = $initial;
    }

    if (defined($requested)) {
      OPTION:
        foreach my $option (@$requested) {
            foreach my $flag (@flag_names) {
                if ($option =~ /$flag/) {
                    $self->{$flag} = 1;
                    next OPTION;
                }
            }
        }
    }

    # now we record this for the build cfg
    $self->{'stemindexes'} = 0;
    foreach my $flag ('casefold', 'stem', 'accentfold') {
        $self->{'stemindexes'} += $flag_value{$flag} if $self->{$flag};
    }
}
150
# Returns the name of the build processor used when none is specified.
sub default_buildproc {
    my ($self) = @_;

    my $buildproc_name = "mgppbuildproc";
    return $buildproc_name;
}
156
# Creates the compressed text of the collection.
#
# The documents are fed through the build processor twice: once into
# "mgpp_passes -T1" to collect text statistics and, after
# mgpp_compression_dict has built the compression dictionary, once into
# "mgpp_passes -T2" to compress the text.
#
# Arguments:
#   $textindex - index specification passed to the buildproc's set_index
#
# Returns immediately when $self->{'no_text'} is set.  In debug mode no
# external program is started and the buildproc output goes to STDOUT.
# Dies when a required executable is missing or a pipe cannot be opened.
# Progress tags for GLI are written to STDERR when $self->{'gli'} is set.
sub compress_text {
    my $self = shift (@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    my $maxnumeric = $self->{'maxnumeric'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $basefilename = &util::filename_cat("text",$collect_tail);
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them

    # always use Doc and Sec levels
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";

    # the part of the mgpp_passes pipe command shared by both passes
    my $passes_cmd = "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\"";

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    print $outhandle "\n    collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    elsif (!-e "$mgpp_passes_exe" ||
           !open($handle, "$passes_cmd -T1 $osextra")) {
        # self-closing tag, like every other FatalError report in this file
        print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
        die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
    }

    my $db_level = "section";

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n    creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # the same $handle variable is reopened on purpose: the buildproc
        # was handed this handle above and is not given a new one
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "$passes_cmd -T2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n    compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
275
276
# Extra step of index building for mgpp: work out the final lists of
# indexed fields (see make_final_field_list).
sub build_indexes_extra {
    my ($self) = @_;

    # define the final field lists
    return $self->make_final_field_list();
}
282
# Creates the directory name for each of the index descriptions.
#
# Arguments:
#   $indexes - array ref of index specifications of the form
#              "fields[:subcollection[:languages]]"
#
# Returns a hash ref holding:
#   <index spec>          => directory name for that index
#   indexmap              => { fields => 'idx' } with indexmaporder giving the order
#   subcollectionmap      => { subcollection => short name } with subcollectionmaporder
#   languagemap           => { languages => short name } with languagemaporder
#   <fields>, <subcollection>, <languages> => their short names
# For an empty list of indexes an empty hash ref is returned.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();

    return \%mapping if !(scalar @$indexes);

    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # we only ever have one index, and its called 'idx'
        my $pindex = 'idx';

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            my $candidate = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
            # make_unique changes nothing when this specification has the
            # same fields, subcollection and languages as the one that
            # already owns the name; stop instead of looping forever
            last if $candidate eq $dirname;
            $dirname = $candidate;
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            if (!defined $mapping{"$fields"}) {
                $mapping{"$fields"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }
        $dirnames{$dirname} = $index;
        $pnames{'index'}->{$pindex} = "$fields";
        $pnames{'subcollection'}->{$psub} = $subcollection;
        $pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
354
# Tries to resolve a directory name collision for an index.
#
# Arguments:
#   $namehash - hash ref with 'index', 'subcollection' and 'languages'
#               sub-hashes mapping each short name already in use to the
#               value it stands for
#   $index    - index specification "fields[:subcollection[:languages]]"
#   $indexref, $subref, $langref - references to the current short names
#
# The first short name (index, then subcollection, then languages) that is
# recorded for a different value than this index's own is advanced with
# get_next_version; at most one of them is changed per call.
#
# Returns the three short names concatenated.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @parts = (
        [ 'index',         $indexref, "$fields" ],
        [ 'subcollection', $subref,   $subcollection ],
        [ 'languages',     $langref,  $languages ],
    );

    foreach my $part (@parts) {
        my ($kind, $nameref, $wanted) = @$part;
        if ($namehash->{$kind}->{$$nameref} ne $wanted) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
369
370
# Builds one mgpp index in the directory recorded for it in
# $self->{'index_mapping'}.
#
# Arguments:
#   $index - index specification "fields[:subcollections[:languages]]";
#            subcollections and languages are comma separated lists
#
# Steps: index dictionary (mgpp_passes -I1), perfect hash function,
# inverted text (mgpp_passes -I2), weights file, on-disk stemmed dictionary
# and the stem indexes for the enabled casefold/stem/accentfold options.
#
# If the index dictionary (.id file) does not exist after the first pass
# the index is recorded in $self->{'notbuilt'} and the method returns
# early.  In debug mode no external program is run and the buildproc output
# goes to STDOUT.  Dies when a required executable is missing or a pipe
# cannot be opened.  Progress tags for GLI are written to STDERR when
# $self->{'gli'} is set.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
                                               $indexdir,
                                               $collect_tail);

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";

    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");

    # define the section names for mgpasses
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";
    if ($self->{'levels'}->{'paragraph'}) {
        $mgpp_passes_sections .= "-K " . $level_map{'paragraph'}. " ";
    }

    my $mgpp_perf_hash_build_exe =
        &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    my $maxnumeric = $self->{'maxnumeric'};

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # the part of the mgpp_passes pipe command shared by both passes
    my $passes_cmd = "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\"";

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    # get the index expressions if this index belongs to subcollections
    my $indexexparr = [];
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);
    foreach my $subcol (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcol};
        push (@$indexexparr, $expression) if defined $expression;
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index.  Each entry is passed on exactly as
    # written, including a leading '!' on negated languages.
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    my $langarr = [];
    @$langarr = split /,/, $language if (defined $language);

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    elsif (!-e "$mgpp_passes_exe" ||
           !open($handle, "$passes_cmd -I1 $osextra")) {
        print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
        die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
    }

    # db_level is always section
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fall over.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        # the same $handle variable is reopened on purpose: the buildproc
        # was handed this handle above and is not given a new one
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "$passes_cmd -I2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n    inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n    create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }

        # runs mgpp_stem_idx for one stem method number (the -s argument)
        # and returns the status reported by system()
        my $run_stem_idx = sub {
            my ($method) = @_;
            return system ("mgpp_stem_idx$exe -b 4096 -s$method -f \"$fullindexprefix\" $osextra");
        };

        my $accent_folding_enabled = 1;
        if ($self->{'accentfold'}) {
            # the first time we do this, we test for accent folding enabled
            # NOTE(review): system() returns the raw wait status, so an exit
            # code of 2 would show up here as 512, not 2 - confirm what
            # mgpp_stem_idx reports before relying on this test.
            if ($run_stem_idx->(4) == 2) {
                # accent folding has not been enabled in mgpp
                $accent_folding_enabled = 0;
                # only take the accentfold value (4) off once, however many
                # indexes are built with this object
                $self->{'stemindexes'} -= 4 if ($self->{'stemindexes'} & 4);
            }
        }
        my $with_accentfold = ($accent_folding_enabled && $self->{'accentfold'});
        if ($self->{'casefold'}) {
            $run_stem_idx->(1);
            $run_stem_idx->(5) if $with_accentfold;
        }
        if ($self->{'stem'}) {
            $run_stem_idx->(2);
            $run_stem_idx->(6) if $with_accentfold;
        }
        if ($self->{'casefold'} && $self->{'stem'}) {
            $run_stem_idx->(3);
            $run_stem_idx->(7) if $with_accentfold;
        }

        # remove unwanted files
        # (the removal itself is commented out, so the unwanted files are
        # only reported)
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (my $dirhandle, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir ($dirhandle);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
601
602
# Adds the default display names for index fields, levels, subcollections
# and languages to the collection's infodb entry.
#
# Arguments:
#   $collection_infodb - hash ref to fill in; every value added is an array
#                        ref holding one display name
#
# An entry is only added when the collection configuration does not define
# collectionmeta under the key ".<name>" for it.  If $self->{'build_cfg'}
# has not been set up yet it is read back with read_final_field_list.
sub get_collection_meta_indexes
{
    my $self = shift(@_);
    my $collection_infodb = shift(@_);

    # define the indexed field mapping if not already done so (ie if infodb
    # called separately from build_index)
    if (!defined $self->{'build_cfg'}) {
        $self->read_final_field_list();
    }

    # collection meta that the user has defined takes precedence over the
    # defaults generated here
    my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};
    my $has_collmeta = sub {
        my ($key) = @_;
        return (defined $collectionmeta && defined $collectionmeta->{$key});
    };

    # add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined, uses the
    # metadata name
    if (defined $self->{'build_cfg'}->{'indexfields'}) {
        foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}) {
            my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
            # a field without a short name would end up under an empty key
            next if !defined $shortfield;
            next if $shortfield eq 1;

            # we need to check if some coll meta has been defined - don't
            # output any that have
            next if $has_collmeta->(".$longfield");

            if ($longfield eq "allfields") {
                $collection_infodb->{$shortfield} = [ "_query:textallfields_" ];
            } elsif ($longfield eq "text") {
                $collection_infodb->{$shortfield} = [ "_query:texttextonly_" ];
            } else {
                $collection_infodb->{$shortfield} = [ $longfield ];
            }
        }
    }

    # now add the level names
    # ("|| []" keeps an unset 'levels' entry from being created as a side
    # effect of looping over it)
    foreach my $configured_level (@{ $self->{'collect_cfg'}->{'levels'} || [] }) {
        my $collmeta = ".$configured_level"; # based on the configured name
        my $level = lc ($configured_level);  # work on a lower-cased copy
        my $levelid = $level_map{$level};    # find the actual value we used in the index
        next if !defined $levelid;           # not one of the known levels
        if (!$has_collmeta->($collmeta)) {
            # use the default macro
            $collection_infodb->{$levelid} = [ $level_map{$levelid} ];
        }
    }

    # now add subcoll meta
    foreach my $subcoll (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        my $shortname = $self->{'index_mapping'}->{$subcoll};
        if (!$has_collmeta->(".$subcoll")) {
            $collection_infodb->{$shortname} = [ $subcoll ];
        }
    }

    # now add language meta
    foreach my $lang (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        my $shortname = $self->{'index_mapping'}->{$lang};
        if (!$has_collmeta->(".$lang")) {
            $collection_infodb->{$shortname} = [ $lang ];
        }
    }
}
677
678
# Writes the "collection" entry of the info database.  The entry is filled
# by get_collection_meta_sets and then get_collection_meta_indexes and
# written to $infodb_handle through dbutil, using the database type stored
# in $self->{'infodbtype'}.
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $entry = {};
    $self->get_collection_meta_sets($entry);
    $self->get_collection_meta_indexes($entry);

    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $entry);
}
690
691
# At the end of building, the buildproc holds an indexfieldmap with all the
# field mappings (plus some extras) and a set of the fields that were
# actually indexed.  From these and the configured index definition this
# makes an ordered list of the indexed fields and of the mappings used,
# for the build.cfg file and for the collection meta definition.
#
# Stores in $self->{'build_cfg'} (which is reset first):
#   indexfields   - array ref of field names, in index definition order
#   indexfieldmap - array ref of "field->shortname" strings, same order
# Neither key is set when its list would be empty.
sub make_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    # store the indexfieldmap information
    my @indexfieldmap = ();
    my @indexfields = ();
    my $specifiedfields = {};
    my @specifiedfieldorder = ();

    # go through the index definition and add each thing to a map, so we
    # can easily check if it is already specified - when doing the
    # metadata, we print out all the individual fields, but some may
    # already be specified in the index definition, so we dont want to add
    # those again.
    foreach my $indexspec (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        my $parts = $indexspec;
        $parts =~ s/:.*$//;
        foreach my $f (split(';', $parts)) {
            next if defined $specifiedfields->{$f};
            $specifiedfields->{$f} = 1;
            push (@specifiedfieldorder, "$f");
        }
    }

    # add all fields bit
    my $ifm = $self->{'buildproc'}->{'indexfieldmap'};

    foreach my $field (@specifiedfieldorder) {
        if ($field eq "metadata") {
            # "metadata" stands for every indexed field not named
            # explicitly; sorted so that build.cfg comes out the same on
            # every run
            foreach my $newfield (sort keys %{$self->{'buildproc'}->{'indexfields'}}) {
                next if defined $specifiedfields->{$newfield};
                push (@indexfieldmap, $newfield . "->" . $ifm->{$newfield});
                push (@indexfields, "$newfield");
            }
        } elsif ($field eq 'text') {
            push (@indexfieldmap, "text->TX");
            push (@indexfields, "text");
        } elsif ($field eq 'allfields') {
            push (@indexfieldmap, "allfields->ZZ");
            push (@indexfields, "allfields");
        } elsif (defined $ifm->{$field}) {
            # we only add in the ones that have been processed
            push (@indexfieldmap, $field . "->" . $ifm->{$field});
            push (@indexfields, "$field");
        }
    }

    if (scalar @indexfieldmap) {
        $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    }
    if (scalar @indexfields) {
        $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
    }
}
766
767
# Recreates the field lists from the build.cfg file, as returned by
# read_build_cfg.  If there is no build.cfg we can't do the field list
# (there is unlikely to be any index anyway), and $self->{'build_cfg'} is
# left as an empty hash.
#
# Otherwise stores in $self->{'build_cfg'}:
#   indexfields, indexfieldmap, indexmap - array refs copied from build.cfg
#                                          (empty when the entry is absent)
# and records every "field->shortname" entry of indexfieldmap in the
# buildproc's indexfieldmap.
sub read_final_field_list {
    my $self = shift (@_);
    $self->{'build_cfg'} = {};
    my @indexfieldmap = ();
    my @indexfields = ();
    my @indexmap = ();

    # we read the stuff in from the build.cfg file - if its there
    my $buildcfg = $self->read_build_cfg();
    return unless defined $buildcfg;

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            push (@indexfields, "$field");
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $entry (@{$buildcfg->{'indexfieldmap'}}) {
            push (@indexfieldmap, "$entry");
            my ($f, $v) = $entry =~ /^(.*)\-\>(.*)$/;
            # an entry without "->" has no mapping to record; storing it
            # would put an undefined value under the empty key
            next unless defined $f;
            $self->{'buildproc'}->{'indexfieldmap'}->{$f} = $v;
        }
    }

    if (defined $buildcfg->{'indexmap'}) {
        foreach my $entry (@{$buildcfg->{'indexmap'}}) {
            push (@indexmap, "$entry");
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
    $self->{'build_cfg'}->{'indexmap'} = \@indexmap;
}
807
808
# Adds the mgpp specific entries to the build configuration.
#
# Arguments:
#   $build_cfg - hash ref of the build configuration being assembled
#
# Sets numsections (from the buildproc), indexlevels (short tag of every
# level in $self->{'levelorder'}), levelmap ("level->tag" strings in the
# same order) and textlevel.
sub build_cfg_extra {
    my ($self, $build_cfg) = @_;

    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();

    # store the level info
    my (@level_tags, @level_pairs);
    foreach my $level_name (@{$self->{'levelorder'}}) {
        my $tag = $level_map{$level_name};
        push (@level_tags, $tag);
        push (@level_pairs, $level_name . "->" . $level_map{$level_name});
    }
    $build_cfg->{'indexlevels'} = \@level_tags;
    $build_cfg->{'levelmap'} = \@level_pairs;

    # text level (and database level) is always section
    $build_cfg->{'textlevel'} = $level_map{'section'};
}
829
8301;
831
832
Note: See TracBrowser for help on using the repository browser.