source: main/trunk/greenstone2/perllib/mgppbuilder.pm@ 27328

Last change on this file since 27328 was 27328, checked in by kjdon, 11 years ago

Changed the way we store the list of fields that have been indexed and the mapping between index and shortname. They are now stored separately, to avoid calculating a shortname for a field each time a new document is indexed. Previously, if a field had no value, its shortname was not remembered because the field wasn't indexed, so the shortname was calculated again for each new document. Also remove namespaces from metadata fields before calculating shortnames, to make them more sensible, e.g. dc.Title->TI instead of DC.

  • Property svn:keywords set to Author Date Id Revision
File size: 28.5 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use basebuilder;
29use colcfg;
30use plugin;
31use strict; no strict 'refs';
32use util;
33use FileUtils;
34
35
# Establish inheritance at compile time: mgppbuilder is a basebuilder.
BEGIN {
    @mgppbuilder::ISA = ('basebuilder');
}
39
40
41
# Two lookups share this table:
#   full level name -> short level tag      (e.g. document -> Doc)
#   short level tag -> default display macro (e.g. Doc -> _textdocument_)
our %level_map = (
    'document'  => 'Doc',
    'section'   => 'Sec',
    'paragraph' => 'Para',
    'Doc'       => '_textdocument_',
    'Sec'       => '_textsection_',
    'Para'      => '_textparagraph_',
);

# File suffixes that build_index treats as wanted when it scans an index
# directory for left-over files.
our %wanted_index_files = map { $_ => 1 }
    qw(td t tl ti idb ib1 ib2 ib3 ib4 ib5 ib6 ib7 i il w wa);


# Copy of the base builder's maximum document size setting.
my $maxdocsize = $basebuilder::maxdocsize;
68
# Constructor. Creates a basebuilder from the given arguments, re-blesses it
# into $class and records which indexing levels the collection uses.
#
# Fields set on the object:
#   levels     - hash ref: lower-cased level name => 1
#   levelorder - array ref of the same names, in configuration order
#   buildtype  - always "mgpp"
#
# If collect_cfg has no 'levels' entry, the single level 'document' is used.
sub new {
    my $class = shift(@_);

    # direct method call instead of the indirect-object form "new basebuilder"
    my $self = basebuilder->new (@_);
    $self = bless $self, $class;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'}     = {};
    $self->{'levelorder'} = [];    # "= ()" would only have stored undef here

    if (defined $self->{'collect_cfg'}->{'levels'}) {
        # $level aliases the configuration entry, so the configured level
        # names themselves are lower-cased in place by this loop
        foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    }
    else {
        # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    $self->{'buildtype'} = "mgpp";

    return $self;
}
95
# Collapses the configured list of indexes into a single combined index
# specification, because all fields go into one physical index.
#
# Reads  $self->{'collect_cfg'}->{'indexes'} (array ref of index specs).
# Writes $self->{'collect_cfg'}->{'indexes'} as a one-element array ref
# holding the specs joined with ';' and terminated by ';'.
# Does nothing when no indexes are configured.
#
# A leading "ex." namespace is dropped from a metadata name, but only if it
# is the sole namespace (ex.Title -> Title; ex.dc.Title and flex.Image are
# left alone).
sub generate_index_list {
    my $self = shift (@_);

    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    return unless defined $indexes;

    my @cleaned = ();
    foreach my $spec (@$indexes) {
        my $entry = $spec;    # work on a copy; the configured value is not edited

        # put a space after every separator so that each metadata name
        # starts either the string or follows a space
        $entry =~ s/(,|;)/$1 /g;

        # strip "ex." when it starts a name that has no further namespace
        $entry =~ s/(^| )ex\.([^.,:]+)(,|;|$)/$1$2$3/g;

        # take the helper spaces out again. This has to cover ';' as well
        # as ',' - otherwise names after a ';' keep a leading space
        $entry =~ s/([,;:]) /$1/g;

        push (@cleaned, $entry);
    }

    $self->{'collect_cfg'}->{'indexes'} = [ join(';', @cleaned) . ";" ];
    return;
}
116
# Works out which of the stem / casefold / accentfold options are active and
# stores them, plus a combined bit mask, on the object.
#
# With no 'indexoptions' in collect_cfg all three are switched on. Otherwise
# each configured option switches on the first of stem, casefold, accentfold
# whose name it contains.
#
# 'stemindexes' is the sum of: casefold = 1, stem = 2, accentfold = 4.
sub generate_index_options {
    my $self = shift (@_);

    $self->SUPER::generate_index_options();

    # checked in this order for every configured option; first hit wins
    my @option_names = ('stem', 'casefold', 'accentfold');
    my $configured = $self->{'collect_cfg'}->{'indexoptions'};

    if (!defined($configured)) {
        # just use default options
        $self->{$_} = 1 foreach @option_names;
    }
    else {
        $self->{$_} = 0 foreach @option_names;

        foreach my $option (@$configured) {
            foreach my $name (@option_names) {
                if ($option =~ /$name/) {
                    $self->{$name} = 1;
                    last;
                }
            }
        }
    }

    # now we record this for the build cfg
    my $mask = 0;
    $mask += 1 if $self->{'casefold'};
    $mask += 2 if $self->{'stem'};
    $mask += 4 if $self->{'accentfold'};
    $self->{'stemindexes'} = $mask;
}
156
# Name of the build processor package used when none is configured.
sub default_buildproc {
    my ($self) = @_;

    return 'mgppbuildproc';
}
162
# Creates the compressed text of the collection. The documents are fed
# through mgpp_passes twice (-T1 collects statistics, -T2 compresses), with
# mgpp_compression_dict run in between.
#
# Arguments:
#   $textindex - index specification passed to the build processor's
#                set_index()
#
# Returns immediately when $self->{'no_text'} is set. In debug mode no
# external program is started; the build processor writes to STDOUT.
# Dies when a required executable is missing or a pipe cannot be opened.
sub compress_text {

    my $self = shift (@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &FileUtils::filenameConcatenate($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &FileUtils::filenameConcatenate($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    my $maxnumeric = $self->{'maxnumeric'};

    &FileUtils::makeAllDirectories (&FileUtils::filenameConcatenate($self->{'build_dir'}, "text"));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $basefilename = &FileUtils::filenameConcatenate("text", $collect_tail);
    my $fulltextprefix = &FileUtils::filenameConcatenate($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them

    # always use Doc and Sec levels
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";

    # Opens a pipe to mgpp_passes for the given pass flag (-T1 or -T2) and
    # returns the handle. The existence check is made on the full path under
    # $exedir, while the command itself is started by bare name.
    my $open_passes = sub {
        my ($pass_flag) = @_;
        my $pipe;
        if (!-e "$mgpp_passes_exe" ||
            !open ($pipe, '|-', "mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\" $pass_flag $osextra")) {
            # self-closed like every other FatalError element in this file
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
        return $pipe;
    };

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    print $outhandle "\n    collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my $handle = $self->{'debug'} ? *STDOUT : $open_passes->('-T1');

    my $db_level = "section";

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n    creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # second pass goes to a fresh pipe
        $handle = $open_passes->('-T2');
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->reset();

    # compress the text
    print $outhandle "\n    compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
283
284
# Hook run after the indexes have been built: works out the final lists of
# indexed fields (see make_final_field_list).
sub post_build_indexes {
    my ($self) = @_;

    $self->make_final_field_list();
}
291
# Creates directory names for each of the index descriptions.
#
# Arguments:
#   $indexes - array ref of index specs of the form
#              "fields[:subcollection[:languages]]"
#
# Returns a hash ref holding
#   <index spec>                          => directory name
#   indexmap / subcollectionmap / languagemap
#                                         => { long name => processed name }
#   indexmaporder / subcollectionmaporder / languagemaporder
#                                         => long names in first-seen order
#   <fields>, <subcollection>, <languages> => processed name (when unset)
# An empty $indexes list gives an empty hash.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    my %mapping = ();

    return \%mapping unless scalar @$indexes;

    foreach my $order_key ('indexmaporder', 'subcollectionmaporder', 'languagemaporder') {
        $mapping{$order_key} = [];
    }

    # %taken_dirs is used to check for collisions. It starts off with the
    # mandatory directory names
    my %taken_dirs = ('text' => 'text', 'extra' => 'extra');
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # we only ever have one index, and its called 'idx'
        my $pindex = 'idx';

        # processed, lower-cased versions of the subcollection and the
        # language, if there are any
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        # check to be sure all index names are unique
        my $dirname = $pindex . $psub . $plang;
        $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang)
            while defined ($taken_dirs{$dirname});

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping
        # (the full index name (eg text:subcol:lang) is not used on
        # the query page) - these are used for collectionmeta later on
        unless (defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            $mapping{"$fields"} = $pindex unless defined $mapping{"$fields"};
        }

        # subcollection first, then language
        foreach my $part (['subcollection', $subcollection, $psub],
                          ['language',      $languages,     $plang]) {
            my ($label, $longname, $processed) = @$part;
            next unless $processed =~ /\w/ && !defined ($mapping{$label . 'map'}{$longname});

            $mapping{$label . 'map'}{$longname} = $processed;
            push (@{$mapping{$label . 'maporder'}}, $longname);
            $mapping{$longname} = $processed;
        }

        $taken_dirs{$dirname} = $index;
        $pnames{'index'}->{$pindex} = "$fields";
        $pnames{'subcollection'}->{$psub} = $subcollection;
        $pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
363
# Resolves a directory-name collision for an index.
#
# Arguments:
#   $namehash - { index => {...}, subcollection => {...}, languages => {...} }
#               mapping processed names to the long names that own them
#   $index    - index spec "fields[:subcollection[:languages]]"
#   $indexref, $subref, $langref
#             - refs to the processed index / subcollection / language names
#
# The first of the three processed names that is owned by a different long
# name is advanced with get_next_version (only that one). Returns the three
# processed names concatenated.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @parts = (
        [ 'index',         $indexref, "$fields"      ],
        [ 'subcollection', $subref,   $subcollection ],
        [ 'languages',     $langref,  $languages     ],
    );

    foreach my $part (@parts) {
        my ($kind, $nameref, $longname) = @$part;
        if ($namehash->{$kind}->{$$nameref} ne $longname) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return join ('', $$indexref, $$subref, $$langref);
}
378
379
# Builds one index.
#
# Arguments:
#   $index - index spec "fields[:subcollections[:languages]]"; it is also the
#            key into $self->{'index_mapping'} that gives the index directory
#
# Steps: mgpp_passes -I1 (index dictionary), mgpp_perf_hash_build,
# mgpp_passes -I2 (inversion), mgpp_weights_build, mgpp_invf_dict, then
# mgpp_stem_idx for each enabled stemming method.
#
# If the index dictionary (<prefix>.id) is missing after the first pass, the
# index is recorded in $self->{'notbuilt'} and the sub returns early.
# In debug mode no external program is run; output goes to STDOUT.
# Dies when a required executable is missing, a pipe cannot be opened or the
# index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &FileUtils::makeAllDirectories (&FileUtils::filenameConcatenate($self->{'build_dir'}, $indexdir));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $fullindexprefix = &FileUtils::filenameConcatenate($self->{'build_dir'},
                                                          $indexdir,
                                                          $collect_tail);

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";

    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &FileUtils::filenameConcatenate($exedir, "mgpp_passes$exe");

    # define the section names for mgpasses
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";
    if ($self->{'levels'}->{'paragraph'}) {
        $mgpp_passes_sections .= "-K " . $level_map{'paragraph'}. " ";
    }

    my $mgpp_perf_hash_build_exe =
        &FileUtils::filenameConcatenate($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &FileUtils::filenameConcatenate($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &FileUtils::filenameConcatenate($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &FileUtils::filenameConcatenate($exedir, "mgpp_stem_idx$exe");

    my $maxnumeric = $self->{'maxnumeric'};

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    # get the index expressions if this index belongs to subcollections
    my $indexexparr = [];
    if (defined $subcollection) {
        foreach my $subcoll (split /,/, $subcollection) {
            my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll};
            push (@$indexexparr, $expression) if defined $expression;
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection. Each entry (with its optional leading "!")
    # is passed on unchanged
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    my $langarr = [];
    @$langarr = split (/,/, $language) if (defined $language);

    # Reports a fatal error to GLI (when enabled) and dies.
    my $fatal = sub {
        my ($errname, $program) = @_;
        print STDERR "<FatalError name='$errname'/>\n</Stage>\n" if $self->{'gli'};
        die "mgppbuilder::build_index - couldn't run $program\n";
    };

    # Opens a pipe to mgpp_passes for the given pass flag (-I1 or -I2). The
    # existence check uses the full path, the command is run by bare name.
    my $open_passes = sub {
        my ($pass_flag) = @_;
        my $pipe;
        if (!-e "$mgpp_passes_exe" ||
            !open ($pipe, '|-', "mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\" $pass_flag $osextra")) {
            $fatal->('NoRunMGPasses', $mgpp_passes_exe);
        }
        return $pipe;
    };

    # Runs mgpp_stem_idx for one stemming method; returns system()'s result.
    my $stem_idx = sub {
        my ($method) = @_;
        return system ("mgpp_stem_idx$exe -b 4096 -s$method -f \"$fullindexprefix\" $osextra");
    };

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};

    my $handle = $self->{'debug'} ? *STDOUT : $open_passes->('-I1');

    # db_level is always section
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);

    $self->{'buildproc'}->reset();

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fall over.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index} = 1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        $fatal->('NoRunMGHash', $mgpp_perf_hash_build_exe) if (!-e "$mgpp_perf_hash_build_exe");
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        $handle = $open_passes->('-I2');
    }

    # invert the text
    print $outhandle "\n    inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->reset();

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n    create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        $fatal->('NoRunMGWeights', $mgpp_weights_build_exe) if (!-e "$mgpp_weights_build_exe");
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        $fatal->('NoRunMGInvf', $mgpp_invf_dict_exe) if (!-e "$mgpp_invf_dict_exe");
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        $fatal->('NoRunMGStem', $mgpp_stem_idx_exe) if (!-e "$mgpp_stem_idx_exe");

        my $accent_folding_enabled = 1;
        if ($self->{'accentfold'}) {
            # the first time we do this, we test for accent folding enabled
            # NOTE(review): system() returns the raw wait status, so "== 2"
            # is not the same as "exit code 2" - confirm what mgpp_stem_idx
            # reports when accent folding is unavailable
            if ($stem_idx->(4) == 2) {
                # accent folding has not been enabled in mgpp
                $accent_folding_enabled = 0;
                $self->{'stemindexes'} -= 4;
            }
        }
        my $with_accents = ($accent_folding_enabled && $self->{'accentfold'});

        # methods are bit sums: 1 casefold, 2 stem, 4 accentfold
        if ($self->{'casefold'}) {
            $stem_idx->(1);
            $stem_idx->(5) if $with_accents;
        }
        if ($self->{'stem'}) {
            $stem_idx->(2);
            $stem_idx->(6) if $with_accents;
        }
        if ($self->{'casefold'} && $self->{'stem'}) {
            $stem_idx->(3);
            $stem_idx->(7) if $with_accents;
        }

        # look for unwanted files. Nothing is actually removed: the delete
        # call is disabled, so this only reports the candidates
        my $tmpdir = &FileUtils::filenameConcatenate($self->{'build_dir'}, $indexdir);
        opendir (my $dirhandle, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir ($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&FileUtils::filenameConcatenate($tmpdir, $file));
            }
        }
        closedir ($dirhandle);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
613
614
# Adds default display names for index fields, levels, subcollections and
# languages to the collection's info database entry.
#
# Arguments:
#   $collection_infodb - hash ref to fill; each value is a one-element
#                        array ref
#
# A default is only added when the collection configuration has no
# collectionmeta entry named ".<long name>" for the item.
sub get_collection_meta_indexes
{
    my $self = shift(@_);
    my $collection_infodb = shift(@_);

    # define the indexed field mapping if not already done so
    # (i.e. if infodb called separately from build_index)
    if (!defined $self->{'build_cfg'}) {
        $self->read_final_field_list();
    }

    my $collmetadefined = 0;
    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
        $collmetadefined = 1;
    }

    # True when the user supplied collection metadata under the given name.
    # The hash is only looked at when collectionmeta exists at all.
    my $user_defined = sub {
        my ($metaname) = @_;
        return ($collmetadefined
                && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$metaname});
    };

    # add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined, uses the metadata name
    if (defined $self->{'build_cfg'}->{'extraindexfields'}) {
        foreach my $longfield (@{$self->{'build_cfg'}->{'extraindexfields'}}) {
            my $shortfield = $self->{'buildproc'}->{'fieldnamemap'}->{$longfield};

            # a field without a short name would end up under an empty key
            next if !defined $shortfield;
            next if $shortfield eq 1;

            # don't output any that have coll meta defined
            next if $user_defined->(".$longfield");

            if ($longfield eq "allfields") {
                $collection_infodb->{$shortfield} = [ "_query:textallfields_" ];
            }
            elsif ($longfield eq "text") {
                $collection_infodb->{$shortfield} = [ "_query:texttextonly_" ];
            }
            else {
                $collection_infodb->{$shortfield} = [ $longfield ];
            }
        }
    }

    # now add the level names
    foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
        my $collmeta = ".$level";   # name as it stands before lower-casing
        $level =~ tr/A-Z/a-z/;      # lower-cases the configured entry in place
        my $levelid = $level_map{$level}; # find the actual value we used in the index

        # an unknown level has no id to store anything under
        next if !defined $levelid;

        if (!$user_defined->($collmeta)) {
            # use the default macro
            $collection_infodb->{$levelid} = [ $level_map{$levelid} ];
        }
    }

    # now add subcoll meta, then language meta
    foreach my $order_key ('subcollectionmaporder', 'languagemaporder') {
        foreach my $longname (@{$self->{'index_mapping'}->{$order_key}}) {
            my $shortname = $self->{'index_mapping'}->{$longname};
            if (!$user_defined->(".$longname")) {
                $collection_infodb->{$shortname} = [ $longname ];
            }
        }
    }
}
690
691
# Writes the "collection" entry of the info database. The entry is filled by
# get_collection_meta_sets and then get_collection_meta_indexes.
#
# Arguments:
#   $infodb_handle - handle passed through to dbutil::write_infodb_entry
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    $self->get_collection_meta_indexes($collection_infodb);

    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle,
                                "collection", $collection_infodb);
}
703
704
# At the end of building, works out the ordered list of fields that were
# indexed and the long-name -> short-name mappings that are in use. The
# result is stored in $self->{'build_cfg'} for the build.cfg file and for
# the collection meta definition:
#   indexfields   - array ref of long field names
#   indexfieldmap - array ref of "longname->shortname" strings
# A key is only set when its list is non-empty.
#
# Fields are taken in the order they appear in the index definition. The
# pseudo field "metadata" expands to every extra indexed field that is not
# named in the definition itself, in sorted order so that the output is the
# same from one build to the next.
sub make_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    my @indexfieldmap = ();
    my @indexfields = ();
    my $specifiedfields = {};
    my @specifiedfieldorder = ();

    # go through the index definition and add each thing to a map, so we
    # can easily check if it is already specified - when doing the
    # metadata, we print out all the individual fields, but some may
    # already be specified in the index definition, so we dont want to add
    # those again.
    foreach my $spec (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        my $parts = $spec;
        $parts =~ s/:.*$//;

        foreach my $name (split (';', $parts)) {
            next if defined $specifiedfields->{$name};
            $specifiedfields->{$name} = 1;
            push (@specifiedfieldorder, "$name");
        }
    }

    # long name -> short name, as recorded by the build processor
    my $fnm = $self->{'buildproc'}->{'fieldnamemap'};

    foreach my $field (@specifiedfieldorder) {
        if ($field eq "metadata") {
            # sorted: plain hash order differs between runs
            foreach my $newfield (sort keys %{$self->{'buildproc'}->{'extraindexfields'}}) {
                next if defined $specifiedfields->{$newfield};
                push (@indexfieldmap, $newfield . '->' . $fnm->{$newfield});
                push (@indexfields, "$newfield");
            }
        }
        elsif ($field eq 'text') {
            push (@indexfieldmap, 'text->TX');
            push (@indexfields, "text");
        }
        elsif ($field eq 'allfields') {
            push (@indexfieldmap, 'allfields->ZZ');
            push (@indexfields, "allfields");
        }
        elsif (defined $self->{'buildproc'}->{'allindexfields'}->{$field}) {
            # we only add in the ones that have been processed
            push (@indexfieldmap, $field . '->' . $fnm->{$field});
            push (@indexfields, "$field");
        }
    }

    if (scalar @indexfieldmap) {
        $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    }

    if (scalar @indexfields) {
        $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
    }
}
778
779
# Recreates the field lists from the build.cfg file (as returned by
# read_build_cfg). If there is no build.cfg the lists can't be rebuilt and
# $self->{'build_cfg'} is left as an empty hash.
#
# On success $self->{'build_cfg'} gets array refs under 'indexfields',
# 'indexfieldmap' and 'indexmap' (each possibly empty).
#
# Every well-formed "longname->shortname" entry is also handed to the build
# processor, both under 'indexfieldmap' and under 'fieldnamemap', which is
# the map the rest of this file reads short names from. An existing
# 'fieldnamemap' entry is never overwritten.
sub read_final_field_list {
    my $self = shift (@_);
    $self->{'build_cfg'} = {};

    # we read the stuff in from the build.cfg file - if its there
    my $buildcfg = $self->read_build_cfg();
    return unless defined $buildcfg;

    my @indexfields = ();
    if (defined $buildcfg->{'indexfields'}) {
        @indexfields = map { "$_" } @{$buildcfg->{'indexfields'}};
    }

    my @indexfieldmap = ();
    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $entry (@{$buildcfg->{'indexfieldmap'}}) {
            push (@indexfieldmap, "$entry");

            my ($longname, $shortname) = $entry =~ /^(.*)\-\>(.*)$/;
            # an entry without "->" has no mapping to record
            next unless defined $longname;

            $self->{'buildproc'}->{'indexfieldmap'}->{$longname} = $shortname;
            $self->{'buildproc'}->{'fieldnamemap'}->{$longname} = $shortname
                unless defined $self->{'buildproc'}->{'fieldnamemap'}->{$longname};
        }
    }

    my @indexmap = ();
    if (defined $buildcfg->{'indexmap'}) {
        @indexmap = map { "$_" } @{$buildcfg->{'indexmap'}};
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
    $self->{'build_cfg'}->{'indexmap'} = \@indexmap;
}
819
820
# Adds the mgpp specific entries to the build configuration.
#
# Arguments:
#   $build_cfg - hash ref to fill in. Keys set:
#     numsections - value of the build processor's get_num_sections()
#     indexlevels - short tag of every level in $self->{'levelorder'}
#     levelmap    - "levelname->shorttag" for the same levels
#     textlevel   - short tag of the section level
sub build_cfg_extra {
    my ($self, $build_cfg) = @_;

    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();

    # store the level info
    my (@short_tags, @name_to_tag);
    foreach my $level_name (@{$self->{'levelorder'}}) {
        my $tag = $level_map{$level_name};
        push (@short_tags, $tag);
        push (@name_to_tag, $level_name . '->' . $tag);
    }
    $build_cfg->{'indexlevels'} = \@short_tags;
    $build_cfg->{'levelmap'} = \@name_to_tag;

    # text level (and database level) is always section
    $build_cfg->{'textlevel'} = $level_map{'section'};

}
841
8421;
843
844
Note: See TracBrowser for help on using the repository browser.