source: gsdl/trunk/perllib/mgppbuilder.pm@ 17110

Last change on this file since 17110 was 17110, checked in by kjdon, 16 years ago

Changed the way CJK separation is done: it is no longer done in the plugins, but is now an index option. cnseg is called from the filter_text method, and generate_index_options sets up the field in buildproc.

  • Property svn:keywords set to Author Date Id Revision
File size: 28.5 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use basebuilder;
29use colcfg;
30use plugin;
31use strict; no strict 'refs';
32use util;
33
34
# Set up inheritance at compile time: mgppbuilder derives from basebuilder.
BEGIN {
    @mgppbuilder::ISA = ('basebuilder');
}
38
39
40
# Two-way level table.
#   lower-case level name -> short level tag (the tags handed to mgpp_passes
#                            with -J/-K and written into build.cfg)
#   short level tag       -> macro used as the default display name for
#                            that level in the collection metadata
our %level_map;
@level_map{qw(document section paragraph)} = qw(Doc Sec Para);
@level_map{qw(Doc Sec Para)} = qw(_textdocument_ _textsection_ _textparagraph_);
47
# File suffixes that belong in a finished index directory. build_index
# compares every file in the index directory against this set and reports
# files with any other suffix.
our %wanted_index_files = map { ($_ => 1) }
    qw(td t tl ti idb ib1 ib2 ib3 ib4 ib5 ib6 ib7 i il w wa);
64
# Static index field map.
#   long metadata name -> short field code used in the index
#   short field code   -> 1 (marks the code as taken)
#   reserved word      -> 1 (AND, OR, NOT, NEAR and the level tags Doc, Sec,
#                            Para cannot be used as field names)
# TODO: let users supply their own mappings through a file or the
# collection configuration.
our %static_indexfield_map = do {
    my %short_code_for = (
        'Title'        => 'TI',
        'Subject'      => 'SU',
        'Creator'      => 'CR',
        'Organization' => 'ORG',
        'Source'       => 'SO',
        'Howto'        => 'HT',
        'ItemTitle'    => 'IT',
        'ProgNumber'   => 'PN',
        'People'       => 'PE',
        'Coverage'     => 'CO',
        'allfields'    => 'ZZ',
        'text'         => 'TX',
    );
    my @reserved = (values(%short_code_for),
                    qw(AND OR NOT NEAR Doc Sec Para));
    (%short_code_for, map { ($_ => 1) } @reserved);
};
99
# Copy of basebuilder's maximum document size setting.
# NOTE(review): no code in the visible part of this file reads this
# variable - confirm whether it is still needed before removing it.
my $maxdocsize = $basebuilder::maxdocsize;
101
# Constructor. Builds a basebuilder object from the given arguments,
# re-blesses it into $class and adds the mgpp-specific state:
#   indexfieldmap - reference to the shared %static_indexfield_map
#   levels        - set (hash) of lower-cased level names to index
#   levelorder    - the same level names, in configuration order
#   buildtype     - always "mgpp"
# Returns the new object.
sub new {
    my $class = shift(@_);

    # explicit method call rather than indirect-object syntax
    my $self = basebuilder->new(@_);
    $self = bless $self, $class;

    $self->{'indexfieldmap'} = \%static_indexfield_map;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    # always an array reference, even when the configured level list is
    # empty (assigning an empty list here used to leave the slot undefined)
    $self->{'levelorder'} = [];
    if (defined $self->{'collect_cfg'}->{'levels'}) {
        foreach my $level ( @{$self->{'collect_cfg'}->{'levels'}} ) {
            # foreach aliases the array elements, so this lower-cases the
            # level names stored in collect_cfg itself as well
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    } else { # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    $self->{'buildtype'} = "mgpp";

    return $self;
}
128
# Collapses the configured index list into a single combined index.
# The indexes are specified separately in the collection configuration,
# but here they all go into one index: the entries are joined with ';'
# (plus a trailing ';') and collect_cfg's 'indexes' list is replaced by a
# one-element list holding that string.
sub generate_index_list {
    my ($self) = @_;

    my $configured = $self->{'collect_cfg'}->{'indexes'};
    my $combined = [];
    $self->{'collect_cfg'}->{'indexes'} = $combined;
    push (@$combined, join(';', @$configured) . ";");
}
138
# Works out which stemming options are switched on and records them.
# After the parent class has done its part, the 'stem', 'casefold' and
# 'accentfold' flags are set on the object: all three are on when the
# collection configuration has no 'indexoptions' entry, otherwise only
# those named by the configured options. The flags are then combined
# into the 'stemindexes' bit mask (casefold=1, stem=2, accentfold=4)
# which is recorded for the build cfg.
sub generate_index_options {
    my ($self) = @_;

    $self->SUPER::generate_index_options();

    # checked in this order; an option switches on the first name it matches
    my @flag_names = ('stem', 'casefold', 'accentfold');

    if (!defined($self->{'collect_cfg'}->{'indexoptions'})) {
        # just use default options
        foreach my $flag (@flag_names) {
            $self->{$flag} = 1;
        }
    } else {
        foreach my $flag (@flag_names) {
            $self->{$flag} = 0;
        }
        foreach my $option (@{$self->{'collect_cfg'}->{'indexoptions'}}) {
            foreach my $flag (@flag_names) {
                if ($option =~ /\Q$flag\E/) {
                    $self->{$flag} = 1;
                    last;
                }
            }
        }
    }

    # now we record this for the build cfg
    my $mask = 0;
    $mask += 1 if $self->{'casefold'};
    $mask += 2 if $self->{'stem'};
    $mask += 4 if $self->{'accentfold'};
    $self->{'stemindexes'} = $mask;
}
178
# Returns the name of the build processor class used when none is given.
sub default_buildproc {
    my ($self) = @_;
    my $buildproc_name = "mgppbuildproc";
    return $buildproc_name;
}
184
# Creates the compressed text for the collection.
#
# The documents are run through the build processor twice, each time piped
# into mgpp_passes: once with -T1 to collect text statistics and once with
# -T2 to compress the text. Between the passes mgpp_compression_dict builds
# the compression dictionary. In debug mode no external program is run and
# the build processor writes to STDOUT instead.
#
# Parameters:
#   $textindex - index specification handed to the build processor
# Does nothing when the object's 'no_text' flag is set.
# Dies when a required executable is missing or mgpp_passes cannot be
# started; in GLI mode a <FatalError .../> element is printed first.
sub compress_text {
    my $self = shift (@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    my $maxnumeric = $self->{'maxnumeric'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $basefilename = &util::filename_cat("text", $collect_tail);
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them

    # always use Doc and Sec levels
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";

    # Common part of both mgpp_passes invocations. The executable is checked
    # for existence at its full path but started by bare name.
    my $mgpp_passes_cmd = "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\"";

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    print $outhandle "\n    collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        if (!-e "$mgpp_passes_exe" ||
            !open($handle, "$mgpp_passes_cmd -T1 $osextra")) {
            # self-closing element, like every other FatalError report
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    # the database level is always section
    my $db_level = "section";

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n    creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # start the second pass (we already know we are not in debug mode)
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "$mgpp_passes_cmd -T2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n    compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
303
304
# Extra step of index building: defines the final field lists.
sub build_indexes_extra {
    my ($self) = @_;
    return $self->make_final_field_list();
}
310
# Creates a directory name for each of the index descriptions.
#
# Parameters:
#   $indexes - reference to a list of index descriptions of the form
#              "fields[:subcollection[:languages]]"
# Returns a reference to a hash holding
#   - each full index description        -> its directory name
#   - 'indexmap' / 'indexmaporder'       -> fields to short index name, plus order
#   - 'subcollectionmap' / '...order'    -> subcollection to short name, plus order
#   - 'languagemap' / '...order'         -> languages to short name, plus order
#   - the fields, subcollection and languages strings themselves -> short names
#     (used for the collection metadata later on)
# NOTE(review): the collision loop only ends when make_unique yields a new
# name; it looks as if the same index description given twice would never
# change the name - confirm.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    my %mapping = (
        'indexmaporder'         => [],
        'subcollectionmaporder' => [],
        'languagemaporder'      => [],
    );

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text' => 'text', 'extra' => 'extra');
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # we only ever have one index, and it is called 'idx'
        my $pindex = 'idx';

        # processed, lower-cased versions of the subcollection and the
        # language, if there are any
        my $psub = lc ($self->process_field ($subcollection));
        my $plang = lc ($self->process_field ($languages));

        # make sure the directory name is unique
        my $dirname = $pindex . $psub . $plang;
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps. The index,
        # subcollection and language parts are also stored on their own
        # (the full index name, eg text:subcol:lang, is not used on the
        # query page) - these are used for collectionmeta later on
        unless (defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            $mapping{"$fields"} = $pindex unless defined $mapping{"$fields"};
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember what has been used, for later collision checks
        $dirnames{$dirname} = $index;
        $pnames{'index'}->{$pindex} = "$fields";
        $pnames{'subcollection'}->{$psub} = $subcollection;
        $pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
380
# Resolves a directory name collision for one index description.
#
# Parameters:
#   $namehash - hash of the short names already handed out, keyed by kind
#               ('index', 'subcollection', 'languages') and then short name,
#               giving the full name each short name stands for
#   $index    - the index description "fields:subcollection:languages"
#   $indexref, $subref, $langref - references to the current short names
# The first short name (checked in the order index, subcollection,
# languages) that is already recorded for a different full name is passed
# to get_next_version; at most one is changed per call.
# Returns the concatenation of the three short names.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @checks = (
        ['index',         $indexref, "$fields"],
        ['subcollection', $subref,   $subcollection],
        ['languages',     $langref,  $languages],
    );
    foreach my $check (@checks) {
        my ($kind, $shortref, $fullname) = @$check;
        if ($namehash->{$kind}->{$$shortref} ne $fullname) {
            $self->get_next_version ($shortref);
            last;
        }
    }

    return join('', $$indexref, $$subref, $$langref);
}
395
396
# Builds one mgpp index.
#
# The documents are run through the build processor twice, each time piped
# into mgpp_passes: -I1 creates the index dictionary and -I2 inverts the
# text. Around those passes mgpp_perf_hash_build, mgpp_weights_build,
# mgpp_invf_dict and mgpp_stem_idx are run. In debug mode no external
# program is run and the build processor writes to STDOUT.
#
# Parameters:
#   $index - index description "fields[:subcollections[:languages]]";
#            subcollections and languages are comma separated
# When the index dictionary (.id file) has not been produced the index is
# recorded in 'notbuilt' and the sub returns early.
# Dies when a required executable is missing or cannot be started; in GLI
# mode a <FatalError .../> element is printed first.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
                                               $indexdir,
                                               $collect_tail);

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";

    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");

    # define the section names for mgpasses
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";
    if ($self->{'levels'}->{'paragraph'}) {
        $mgpp_passes_sections .= "-K " . $level_map{'paragraph'}. " ";
    }

    my $mgpp_perf_hash_build_exe =
        &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    my $maxnumeric = $self->{'maxnumeric'};

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # Common part of both mgpp_passes invocations. The executable is checked
    # for existence at its full path but started by bare name.
    my $mgpp_passes_cmd = "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\"";

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    # get the index expressions if this index belongs to subcollections
    my $indexexparr = [];
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);
    foreach my $subcoll_name (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $language_metadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }
    # every language entry is passed on as written, including a leading
    # '!' (negation) if it has one
    my $langarr = [];
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    push (@$langarr, @languages);

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        if (!-e "$mgpp_passes_exe" ||
            !open($handle, "$mgpp_passes_cmd -I1 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # db_level is always section
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fail.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "$mgpp_passes_cmd -I2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n    inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n    create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }

        # runs mgpp_stem_idx for one stem method number, returns system()'s result
        my $run_stem_idx = sub {
            my ($method) = @_;
            return system ("mgpp_stem_idx$exe -b 4096 -s$method -f \"$fullindexprefix\" $osextra");
        };

        my $accent_folding_enabled = 1;
        if ($self->{'accentfold'}) {
            # the first time we do this, we test for accent folding enabled
            # NOTE(review): system() returns the raw wait status, in which an
            # exit code of 2 would show up as 512 - confirm how
            # mgpp_stem_idx signals "not enabled" before changing this test.
            if ($run_stem_idx->(4) == 2) {
                # accent folding has not been enabled in mgpp
                $accent_folding_enabled = 0;
                # clear the accent fold bit once only: this sub runs once per
                # index, and subtracting 4 on every call corrupted the mask
                # as soon as a collection had more than one index
                if (($self->{'stemindexes'} || 0) & 4) {
                    $self->{'stemindexes'} -= 4;
                }
            }
        }
        my $with_accentfold = ($accent_folding_enabled && $self->{'accentfold'});
        if ($self->{'casefold'}) {
            $run_stem_idx->(1);
            $run_stem_idx->(5) if $with_accentfold;
        }
        if ($self->{'stem'}) {
            $run_stem_idx->(2);
            $run_stem_idx->(6) if $with_accentfold;
        }
        if ($self->{'casefold'} && $self->{'stem'}) {
            $run_stem_idx->(3);
            $run_stem_idx->(7) if $with_accentfold;
        }

        # remove unwanted files
        # NOTE: the actual removal is commented out, so files with an
        # unwanted suffix are only reported (at verbosity > 2), not deleted
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (my $dirhandle, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir ($dirhandle);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
627
628
# Adds default display names for index fields, levels, subcollections and
# languages to the collection's info database entry.
#
# Parameters:
#   $collection_infodb - hash reference that receives the entries; each
#                        value is a reference to a one-element list
# An entry is only added when the collection configuration has no
# collectionmeta value for the corresponding ".name" key.
sub get_collection_meta_indexes
{
    my $self = shift(@_);
    my $collection_infodb = shift(@_);

    # define the indexed field mapping if not already done so (ie if infodb
    # called separately from build_index)
    if (!defined $self->{'build_cfg'}) {
        $self->read_final_field_list();
    }

    # true when the collection configuration itself names the given key
    my $configured_meta = $self->{'collect_cfg'}->{'collectionmeta'};
    my $has_collmeta = sub {
        my ($key) = @_;
        return (defined $configured_meta && defined $configured_meta->{$key});
    };

    # add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined, uses the
    # metadata name
    foreach my $longfield (@{ $self->{'build_cfg'}->{'indexfields'} || [] }) {
        my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
        # skip fields without a mapping (they would end up under an empty
        # key) and entries that only mark a short code as taken
        next if !defined $shortfield || $shortfield eq '1';

        # we need to check if some coll meta has been defined - don't output
        # any that have
        next if $has_collmeta->(".$longfield");

        if ($longfield eq "allfields") {
            $collection_infodb->{$shortfield} = [ "_query:textallfields_" ];
        } elsif ($longfield eq "text") {
            $collection_infodb->{$shortfield} = [ "_query:texttextonly_" ];
        } else {
            $collection_infodb->{$shortfield} = [ $longfield ];
        }
    }

    # now add the level names
    foreach my $level (@{ $self->{'collect_cfg'}->{'levels'} || [] }) {
        # the collectionmeta key uses the level name as found in the config
        my $collmeta = ".$level";
        # lower-case a copy, so the configuration itself is left untouched
        (my $lc_level = $level) =~ tr/A-Z/a-z/;
        # find the actual value we used in the index
        my $levelid = $level_map{$lc_level};
        next unless defined $levelid;       # unknown level name
        if (!$has_collmeta->($collmeta)) {
            # use the default macro
            $collection_infodb->{$levelid} = [ $level_map{$levelid} ];
        }
    }

    # now add subcoll meta
    foreach my $subcoll (@{ $self->{'index_mapping'}->{'subcollectionmaporder'} || [] }) {
        my $shortname = $self->{'index_mapping'}->{$subcoll};
        if (!$has_collmeta->(".$subcoll")) {
            $collection_infodb->{$shortname} = [ $subcoll ];
        }
    }

    # now add language meta
    foreach my $lang (@{ $self->{'index_mapping'}->{'languagemaporder'} || [] }) {
        my $shortname = $self->{'index_mapping'}->{$lang};
        if (!$has_collmeta->(".$lang")) {
            $collection_infodb->{$shortname} = [ $lang ];
        }
    }
}
701
702
# Writes the "collection" entry of the info database: the metadata set
# information from get_collection_meta_sets plus the index, level,
# subcollection and language names from get_collection_meta_indexes.
#
# Parameters:
#   $infodb_handle - handle passed on to dbutil::write_infodb_entry
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    $self->get_collection_meta_indexes($collection_infodb);

    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
714
715
# Works out the final list of indexed fields once building has finished.
#
# At that point the build processor holds an indexfieldmap with all the
# mappings (plus some extras) and a set of the fields it actually indexed.
# From those and the index definition in the collection configuration this
# produces, in definition order,
#   build_cfg->{'indexfields'}   - the names of the indexed fields
#   build_cfg->{'indexfieldmap'} - "name->SHORTCODE" strings for them
# which are used for the build.cfg file and the collection metadata.
sub make_final_field_list {
    my ($self) = @_;

    $self->{'build_cfg'} = {};

    my @map_entries = ();
    my @field_names = ();

    # Collect every field named in the index definition, once, in order.
    # Besides giving the output order this lets the 'metadata' case below
    # skip fields that are already specified explicitly.
    my %is_specified = ();
    my @specified_order = ();
    foreach my $indexdef (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        (my $fieldpart = $indexdef) =~ s/:.*$//;
        foreach my $name (split(';', $fieldpart)) {
            next if defined $is_specified{$name};
            $is_specified{$name} = 1;
            push (@specified_order, "$name");
        }
    }

    # fields whose short code is fixed
    my %fixed_code_for = ('text' => 'TX', 'allfields' => 'ZZ');

    foreach my $name (@specified_order) {
        if ($name eq "metadata") {
            # add all fields bit: every indexed field not named explicitly
            foreach my $extra (keys %{$self->{'buildproc'}->{'indexfields'}}) {
                next if defined $is_specified{$extra};
                push (@map_entries, $extra . "->" . $self->{'buildproc'}->{'indexfieldmap'}->{$extra});
                push (@field_names, "$extra");
            }
        }
        elsif (defined $fixed_code_for{$name}) {
            push (@map_entries, $name . "->" . $fixed_code_for{$name});
            push (@field_names, $name);
        }
        else {
            # ordinary field: only listed when the build processor mapped it
            my $ifm = $self->{'buildproc'}->{'indexfieldmap'};
            if (defined $ifm->{$name}) {
                push (@map_entries, $name . "->" . $ifm->{$name});
                push (@field_names, "$name");
            }
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@map_entries;
    $self->{'build_cfg'}->{'indexfields'} = \@field_names;
}
787
788
# Recreates the field lists from a build.cfg file.
#
# The file is looked for first in the build directory and then in the
# "index" directory below $ENV{'GSDLCOLLECTDIR'}. If neither exists the
# field lists cannot be restored (there is unlikely to be any index
# anyway) and build_cfg is left as an empty hash.
#
# Otherwise build_cfg's 'indexfields', 'indexfieldmap' and 'indexmap' lists
# are filled from the file, and every well-formed "name->CODE" entry of
# indexfieldmap is also stored in the build processor's indexfieldmap.
sub read_final_field_list {
    my $self = shift (@_);
    $self->{'build_cfg'} = {};
    my @indexfieldmap = ();
    my @indexfields = ();
    my @indexmap = ();

    if (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) {
        # set the default mapping
        $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    }

    # we read the stuff in from the build.cfg file - if it is there
    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, "build.cfg");

    if (!-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg");
        if (!-e $buildconfigfile) {
            # we can't find a config file - just ignore the field list
            return;
        }
    }

    my $buildcfg = &colcfg::read_build_cfg( $buildconfigfile);

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            push (@indexfields, "$field");
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $field (@{$buildcfg->{'indexfieldmap'}}) {
            push (@indexfieldmap, "$field");
            my ($f, $v) = $field =~ /^(.*)\-\>(.*)$/;
            # an entry without "->" gives no name/code pair; don't store a
            # mapping under an undefined key for it
            next unless defined $f;
            $self->{'buildproc'}->{'indexfieldmap'}->{$f} = $v;
        }
    }

    if (defined $buildcfg->{'indexmap'}) {
        foreach my $field (@{$buildcfg->{'indexmap'}}) {
            push (@indexmap, "$field");
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
    $self->{'build_cfg'}->{'indexmap'} = \@indexmap;
}
841
842
# Adds the mgpp-specific entries to the build configuration.
#
# Parameters:
#   $build_cfg - hash reference that receives
#                'numsections' - section count reported by the build processor
#                'indexlevels' - short tags of the indexed levels, in order
#                'levelmap'    - "levelname->Tag" strings for those levels
#                'textlevel'   - short tag of the section level
sub build_cfg_extra {
    my ($self, $build_cfg) = @_;

    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();

    # store the level info
    my @level_tags = ();
    my @level_pairs = ();
    foreach my $level_name (@{$self->{'levelorder'}}) {
        my $tag = $level_map{$level_name};
        push (@level_tags, $tag);
        push (@level_pairs, $level_name . "->" . $tag);
    }
    $build_cfg->{'indexlevels'} = \@level_tags;
    $build_cfg->{'levelmap'} = \@level_pairs;

    # text level (and database level) is always section
    $build_cfg->{'textlevel'} = $level_map{'section'};
}
863
8641;
865
866
Note: See TracBrowser for help on using the repository browser.