source: trunk/gsdl/perllib/mgppbuilder.pm@ 13590

Last change on this file since 13590 was 13590, checked in by kjdon, 17 years ago

mgpp and lucene: made them always use Doc and Sec levels for the text, regardless of the index level specification. mgpp will always index at Doc and Sec level, but these options may not be presented to the user. This ensures that if we have sectioned documents, we don't need to turn on section indexing in order for the document display to use sections.

  • Property svn:keywords set to Author Date Id Revision
File size: 28.2 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use basebuilder;
29use classify;
30use cfgread;
31use colcfg;
32use plugin;
33use util;
34use FileHandle;
35
# Set up inheritance at compile time: mgppbuilder derives from basebuilder.
BEGIN {
    @mgppbuilder::ISA = qw(basebuilder);
}
39
40
41
# One table, two lookups:
#   lower-case level name (document/section/paragraph) -> level tag
#       handed to mgpp_passes and written to build.cfg
#   level tag (Doc/Sec/Para) -> default display macro used when no
#       collectionmeta entry exists for that level
our %level_map = (
    document  => 'Doc',
    section   => 'Sec',
    paragraph => 'Para',
    Doc       => '_textdocument_',
    Sec       => '_textsection_',
    Para      => '_textparagraph_',
);
48
# File suffixes that belong to a finished index. build_index checks every
# file in the index directory against this set when looking for leftovers.
our %wanted_index_files = map { $_ => 1 } qw(
    td t tl ti
    idb ib1 ib2 ib3 ib4 ib5 ib6 ib7
    i il
    w wa
);
65
# Default mapping from metadata/field names to the short codes used for them
# in the index.
# TODO: change this so a user can add their own ones in via a file or cfg.
#
# Entries whose value is 1 are names that cannot be used as field names:
# the query operators AND, OR, NOT and NEAR, the level names (Doc, Sec,
# Para), and the short codes themselves (presumably so that a code already
# in use is never handed out again -- confirm against the build processor).
our %static_indexfield_map = (
    # long name       short code
    'Title'        => 'TI',
    'Subject'      => 'SU',
    'Creator'      => 'CR',
    'Organization' => 'ORG',
    'Source'       => 'SO',
    'Howto'        => 'HT',
    'ItemTitle'    => 'IT',
    'ProgNumber'   => 'PN',
    'People'       => 'PE',
    'Coverage'     => 'CO',
    'allfields'    => 'ZZ',
    'text'         => 'TX',

    # reserved names
    (map { $_ => 1 } qw(TI SU CR ORG SO HT IT PN PE CO ZZ TX)),
    (map { $_ => 1 } qw(AND OR NOT NEAR)),
    (map { $_ => 1 } qw(Doc Sec Para)),
);

# File-local copy of basebuilder's maximum document size setting.
my $maxdocsize = $basebuilder::maxdocsize;
102
# Constructor. Creates a basebuilder from the supplied arguments, re-blesses
# it into $class, and records:
#   indexfieldmap - reference to the shared %static_indexfield_map
#   levels        - hash of lower-cased level names that were requested
#   levelorder    - the same names, in the order they were specified
#   buildtype     - always "mgpp"
# When collect_cfg has no 'levels' entry, the single level 'document' is used.
sub new {
    my $class = shift(@_);

    my $self = basebuilder->new(@_);
    $self = bless $self, $class;

    $self->{'indexfieldmap'} = \%static_indexfield_map;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    # must be an array reference: assigning () would store undef and leave
    # levelorder undefined whenever the configured level list is empty
    $self->{'levelorder'} = [];
    if (defined $self->{'collect_cfg'}->{'levels'}) {
        foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
            # $level aliases the collect_cfg entry, so this also lower-cases
            # the configured list itself (kept as-is for compatibility)
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push(@{$self->{'levelorder'}}, $level);
        }
    } else { # default to document
        $self->{'levels'}->{'document'} = 1;
        push(@{$self->{'levelorder'}}, 'document');
    }

    $self->{'buildtype'} = "mgpp";

    return $self;
}
129
# Collapses the configured index list into a single entry.
# The individual index names are joined with ';' (with one more ';' appended)
# and collect_cfg->{'indexes'} is replaced by a one-element list holding that
# combined string.
sub generate_index_list {
    my ($self) = @_;

    my $cfg = $self->{'collect_cfg'};

    # indexes are specified separately, but we put them into one index
    my $combined = join(';', @{$cfg->{'indexes'}}) . ";";
    $cfg->{'indexes'} = [];
    push(@{$cfg->{'indexes'}}, $combined);
}
139
# Works out which term-matching variants are wanted and stores them on $self.
#   casefold / stem / accentfold - 0 or 1 each
#   stemindexes                  - bitmask recorded for the build cfg:
#                                  1 = casefold, 2 = stem, 4 = accentfold
# With no 'indexoptions' entry in collect_cfg all three are switched on.
# Otherwise each configured option switches on the first of stem, casefold,
# accentfold (tested in that order) whose name it contains.
sub generate_index_options {
    my ($self) = @_;

    my @flags = qw(casefold stem accentfold);

    if (defined($self->{'collect_cfg'}->{'indexoptions'})) {
        @{$self}{@flags} = (0, 0, 0);
        foreach my $option (@{$self->{'collect_cfg'}->{'indexoptions'}}) {
            my $selected = $option =~ /stem/       ? 'stem'
                         : $option =~ /casefold/   ? 'casefold'
                         : $option =~ /accentfold/ ? 'accentfold'
                         :                           undef;
            $self->{$selected} = 1 if defined $selected;
        }
    } else {
        # just use default options
        @{$self}{@flags} = (1, 1, 1);
    }

    # now we record this for the build cfg
    my %bit_for = ('casefold' => 1, 'stem' => 2, 'accentfold' => 4);
    $self->{'stemindexes'} = 0;
    foreach my $flag (@flags) {
        $self->{'stemindexes'} += $bit_for{$flag} if $self->{$flag};
    }
}
177
# Returns the name of the build processor used for mgpp collections.
sub default_buildproc {
    my ($self) = @_;

    return 'mgppbuildproc';
}
183
# Creates the compressed text for the collection under <build_dir>/text.
#
# The documents are pushed through the build processor twice:
#   pass 1 pipes them to "mgpp_passes -T1" to collect text statistics,
#   then mgpp_compression_dict builds the compression dictionary,
#   pass 2 pipes them to "mgpp_passes -T2" to do the compression.
#
# Arguments:
#   $textindex - passed to the build processor's set_index()
#
# Does nothing when $self->{'no_text'} is set. In debug mode the processor
# output goes to STDOUT and no external program is started.
# Dies when a required executable is missing or a pipe cannot be opened;
# for GLI a <FatalError/> element is written to STDERR first.
sub compress_text {
    my $self = shift(@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    my $outhandle = $self->{'outhandle'};

    # The full paths are only used to check that the tools are installed;
    # the commands below are started by bare name.
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = util::get_os_exe();
    my $mgpp_passes_exe = util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = util::filename_cat($exedir, "mgpp_compression_dict$exe");

    my $maxnumeric = $self->{'maxnumeric'};

    util::mk_all_dir(util::filename_cat($self->{'build_dir'}, "text"));

    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = util::filename_cat($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
    }

    # define the section names and the doc name for mgpp_passes.
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them. always use Doc and Sec levels
    my $mgpp_passes_sections = "-J " . $level_map{"document"} . " -K " . $level_map{"section"} . " ";

    # everything the two mgpp_passes invocations have in common
    my $passes_cmd = "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\"";

    # report a tool that cannot be run (to GLI as well, if wanted) and give up
    my $fatal = sub {
        my ($errname, $program) = @_;
        print STDERR "<FatalError name='$errname'/>\n</Stage>\n" if $self->{'gli'};
        die "mgppbuilder::compress_text - couldn't run $program\n";
    };

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    print $outhandle "\n    collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    # The build processor is given the handle by name, so it stays valid
    # when the pipe is re-opened for the second pass.
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e $mgpp_passes_exe || !open(PIPEOUT, "$passes_cmd -T1 $osextra")) {
            $fatal->('NoRunMGPasses', $mgpp_passes_exe);
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    my $gdbm_level = "section";

    $self->{'buildproc'}->set_output_handle($handle);
    $self->{'buildproc'}->set_mode('text');
    $self->{'buildproc'}->set_index($textindex);
    $self->{'buildproc'}->set_indexing_text(0);
    $self->{'buildproc'}->set_indexfieldmap($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels($self->{'levels'});
    $self->{'buildproc'}->set_gdbm_level($gdbm_level);
    $self->{'buildproc'}->reset();
    plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                  $self->{'buildproc'}, $self->{'maxdocs'});
    plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    plugin::end($self->{'pluginfo'});

    # close the pipe exactly once; STDOUT is left open in debug mode
    close($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n    creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e $mgpp_compression_dict_exe) {
            $fatal->('NoRunMGCompress', $mgpp_compression_dict_exe);
        }
        system("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # re-open the pipe for the compression pass
        if (!-e $mgpp_passes_exe || !open($handle, "$passes_cmd -T2 $osextra")) {
            $fatal->('NoRunMGPasses', $mgpp_passes_exe);
        }
    } else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();

    # compress the text
    print $outhandle "\n    compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
302
303
# Extra step for index building: defines the final field lists.
sub build_indexes_extra {
    my ($self) = @_;

    $self->make_final_field_list();
}
309
# Creates a directory name for each of the index descriptions.
#
# Arguments:
#   $indexes - reference to a list of index specifications of the form
#              "fields[:subcollection[:languages]]"
#
# Returns a reference to a hash holding
#   <index spec>           => directory name
#   indexmap / indexmaporder, subcollectionmap / subcollectionmaporder,
#   languagemap / languagemaporder
#                          => name-to-short-name maps and the order in which
#                             the names were first seen
#   <fields>, <subcollection>, <languages>
#                          => short name (used for collectionmeta later on)
sub create_index_mapping {
    my ($self, $indexes) = @_;

    my %mapping = (
        'indexmaporder'         => [],
        'subcollectionmaporder' => [],
        'languagemaporder'      => [],
    );

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text' => 'text', 'extra' => 'extra');
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split(":", $index);
        my $fieldspec = "$fields";

        # we only ever have one index, and it is called 'idx'
        my $pindex = 'idx';

        # processed, lower-cased versions of the subcollection and of the
        # language, if there are any
        my $psub = lc($self->process_field($subcollection));
        my $plang = lc($self->process_field($languages));

        # keep changing the name until it collides with no earlier one
        my $dirname = $pindex . $psub . $plang;
        while (defined($dirnames{$dirname})) {
            $dirname = $self->make_unique(\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps. the index,
        # subcollection and language parts are stored on their own too,
        # because the full index name (eg text:subcol:lang) is not used on
        # the query page
        unless (defined $mapping{'indexmap'}{$fieldspec}) {
            $mapping{'indexmap'}{$fieldspec} = $pindex;
            push(@{$mapping{'indexmaporder'}}, $fieldspec);
            $mapping{$fieldspec} = $pindex unless defined $mapping{$fieldspec};
        }
        if ($psub =~ /\w/ && !defined($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push(@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push(@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember what each name now stands for
        $dirnames{$dirname} = $index;
        $pnames{'index'}->{$pindex} = $fieldspec;
        $pnames{'subcollection'}->{$psub} = $subcollection;
        $pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
379
# Alters one component of a colliding directory name and returns the new
# candidate name.
#
# Arguments:
#   $namehash  - { index => {...}, subcollection => {...}, languages => {...} }
#                mapping each processed name already in use to the original
#                value it stands for
#   $index     - index specification "fields[:subcollection[:languages]]"
#   $indexref, $subref, $langref
#              - references to the processed index, subcollection and
#                language names; one of them is changed in place through
#                get_next_version()
#
# The first component whose processed name is registered for a different
# original value is the one that gets a new version. Returns the three
# (possibly updated) names concatenated.
sub make_unique {
    my $self = shift(@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split(":", $index);

    if ($namehash->{'index'}->{$$indexref} ne "$fields") {
        $self->get_next_version($indexref);
    } elsif ($namehash->{'subcollection'}->{$$subref} ne $subcollection) {
        $self->get_next_version($subref);
    } elsif ($namehash->{'languages'}->{$$langref} ne $languages) {
        $self->get_next_version($langref);
    } else {
        # All three names are already registered for exactly these values
        # (the same specification seen twice). Something still has to
        # change, otherwise the same colliding name is returned and a caller
        # that retries until the name is free would never finish.
        $self->get_next_version($indexref);
    }
    return "$$indexref$$subref$$langref";
}
394
395
# Builds one mgpp index.
#
# Arguments:
#   $index - index specification "fields[:subcollection[:language]]";
#            subcollection and language may each be a comma separated list
#
# Steps: "mgpp_passes -I1" (index dictionary), mgpp_perf_hash_build,
# "mgpp_passes -I2" (inversion), mgpp_weights_build, mgpp_invf_dict and
# mgpp_stem_idx for every wanted stemming method.
#
# In debug mode the processor output goes to STDOUT and no external program
# is started. If the index dictionary (<prefix>.id) does not exist after the
# first pass, the index is recorded in $self->{'notbuilt'} and the method
# returns early. Dies when a required executable is missing or a pipe cannot
# be opened; for GLI a <FatalError/> element is written to STDERR first.
sub build_index {
    my $self = shift(@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    util::mk_all_dir(util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = util::filename_cat($self->{'build_dir'},
                                             $indexdir,
                                             $self->{'collection'});

    # The full paths are only used to check that the tools are installed;
    # the commands below are started by bare name.
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = util::get_os_exe();
    my $mgpp_passes_exe          = util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_perf_hash_build_exe = util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe   = util::filename_cat($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe       = util::filename_cat($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe        = util::filename_cat($exedir, "mgpp_stem_idx$exe");

    # define the section names for mgpp_passes
    my $mgpp_passes_sections = "-J " . $level_map{"document"} . " -K " . $level_map{"section"} . " ";
    if ($self->{'levels'}->{'paragraph'}) {
        $mgpp_passes_sections .= "-K " . $level_map{'paragraph'} . " ";
    }

    my $maxnumeric = $self->{'maxnumeric'};

    # get any os specific stuff
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # everything the two mgpp_passes invocations have in common
    my $passes_cmd = "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\"";

    # report a tool that cannot be run (to GLI as well, if wanted) and give up
    my $fatal = sub {
        my ($errname, $program) = @_;
        print STDERR "<FatalError name='$errname'/>\n</Stage>\n" if $self->{'gli'};
        die "mgppbuilder::build_index - couldn't run $program\n";
    };

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split(":", $index);

    # get the index expressions if this index belongs to subcollections
    my $indexexparr = [];
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);
    foreach my $subcoll (@subcollections) {
        if (defined($self->{'collect_cfg'}->{'subcollection'}->{$subcoll})) {
            push(@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll});
        }
    }

    # add expressions for languages if this index belongs to a language
    # subcollection. each entry is passed on unchanged, including a leading
    # '!' that marks a negated language
    my $language_metadata = "Language";
    if (defined($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }
    my $langarr = [];
    @$langarr = split /,/, $language if (defined $language);

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};

    # The build processor is given the handle by name, so it stays valid
    # when the pipe is re-opened for the second pass.
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e $mgpp_passes_exe || !open(PIPEOUT, "$passes_cmd -I1 $osextra")) {
            $fatal->('NoRunMGPasses', $mgpp_passes_exe);
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    # gdbm_level is always section
    my $gdbm_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle($handle);
    $self->{'buildproc'}->set_mode('text');
    $self->{'buildproc'}->set_index($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages($language_metadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text(1);
    $self->{'buildproc'}->set_indexfieldmap($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels($self->{'levels'});
    $self->{'buildproc'}->set_gdbm_level($gdbm_level);

    $self->{'buildproc'}->reset();
    plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fall over.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index} = 1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e $mgpp_perf_hash_build_exe) {
            $fatal->('NoRunMGHash', $mgpp_perf_hash_build_exe);
        }
        system("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        # re-open the pipe for the inversion pass
        if (!-e $mgpp_passes_exe || !open($handle, "$passes_cmd -I2 $osextra")) {
            $fatal->('NoRunMGPasses', $mgpp_passes_exe);
        }
    }

    # invert the text
    print $outhandle "\n    inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};
    $self->{'buildproc'}->reset();
    plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    $self->print_stats();

    if (!$self->{'debug'}) {

        close($handle);

        # create the weights file
        print $outhandle "\n    create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e $mgpp_weights_build_exe) {
            $fatal->('NoRunMGWeights', $mgpp_weights_build_exe);
        }
        system("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e $mgpp_invf_dict_exe) {
            $fatal->('NoRunMGInvf', $mgpp_invf_dict_exe);
        }
        system("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra");

        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e $mgpp_stem_idx_exe) {
            $fatal->('NoRunMGStem', $mgpp_stem_idx_exe);
        }

        # runs mgpp_stem_idx for one stemming method (a bitmask: 1 = case
        # folding, 2 = stemming, 4 = accent folding) and returns its status
        my $run_stem_idx = sub {
            my ($method) = @_;
            return system("mgpp_stem_idx$exe -b 4096 -s$method -f \"$fullindexprefix\" $osextra");
        };

        my $accent_folding_enabled = 1;
        if ($self->{'accentfold'}) {
            # this run doubles as the test for accent folding being enabled
            if ($run_stem_idx->(4) != 0) {
                # accent folding has not been enabled in mgpp.
                # Clear the bit instead of subtracting 4: this method runs
                # once per index, and a repeated subtraction would corrupt
                # the mask as soon as there is more than one index.
                $accent_folding_enabled = 0;
                $self->{'stemindexes'} &= ~4;
            }
        }
        my $with_accents = ($accent_folding_enabled && $self->{'accentfold'});
        if ($self->{'casefold'}) {
            $run_stem_idx->(1);
            $run_stem_idx->(5) if $with_accents;
        }
        if ($self->{'stem'}) {
            $run_stem_idx->(2);
            $run_stem_idx->(6) if $with_accents;
        }
        if ($self->{'casefold'} && $self->{'stem'}) {
            $run_stem_idx->(3);
            $run_stem_idx->(7) if $with_accents;
        }

        # look for unwanted files
        my $tmpdir = util::filename_cat($self->{'build_dir'}, $indexdir);
        opendir(my $dirhandle, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # NOTE: the removal itself is disabled (commented out below),
                # so at present the file is only reported, not deleted
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #util::rm (util::filename_cat ($tmpdir, $file));
            }
        }
        closedir($dirhandle);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
624
# Writes the [collection] entry to $handle.
# Now only outputs stuff that cannot be generated from collectionmeta - e.g.
# if someone has specified 'metadata' as an index: a default line is written
# for every index field, level, subcollection and language that has no
# ".<name>" entry in collect_cfg's collectionmeta.
#
# Arguments:
#   $handle - file handle the entry is printed to
sub output_collection_meta {
    my ($self, $handle) = @_;

    # define the indexed field mapping if not already done so (ie if infodb
    # called separately from build_index)
    $self->read_final_field_list() unless defined $self->{'build_cfg'};

    # do the collection info
    print $handle "[collection]\n";

    my $collmetadefined = defined $self->{'collect_cfg'}->{'collectionmeta'} ? 1 : 0;

    # true when the default text has to be written for this collectionmeta key
    my $wants_default = sub {
        my ($metakey) = @_;
        return 1 if !$collmetadefined;
        return !defined $self->{'collect_cfg'}->{'collectionmeta'}->{$metakey};
    };

    # add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined, the
    # metadata name is used
    my $field_entry = "";
    foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}) {
        my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
        next if $shortfield eq 1;

        # don't output any field that has some coll meta defined
        next unless $wants_default->(".$longfield");

        my $label = $longfield eq "allfields" ? "_query:textallfields_"
                  : $longfield eq "text"      ? "_query:texttextonly_"
                  :                             $longfield;
        $field_entry .= "<$shortfield>$label\n";
    }
    print $handle $field_entry;

    # now add the level names
    my $level_entry = "";
    foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
        my $metakey = ".$level";   # taken before the lower-casing below
        $level =~ tr/A-Z/a-z/;     # in place: $level aliases the cfg entry
        my $levelid = $level_map{$level}; # the actual value we used in the index
        next unless $wants_default->($metakey);

        # use the default macro
        $level_entry .= "<$levelid>" . $level_map{$levelid} . "\n";
    }
    print $handle $level_entry;

    # now add subcoll meta
    my $subcoll_entry = "";
    foreach my $subcoll (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        my $shortname = $self->{'index_mapping'}->{$subcoll};
        $subcoll_entry .= "<$shortname>$subcoll\n" if $wants_default->(".$subcoll");
    }
    print $handle $subcoll_entry;

    # now add language meta
    my $lang_entry = "";
    foreach my $lang (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        my $shortname = $self->{'index_mapping'}->{$lang};
        $lang_entry .= "<$shortname>$lang\n" if $wants_default->(".$lang");
    }
    print $handle $lang_entry;

    # end the collection entry
    print $handle "\n" . ('-' x 70) . "\n";
}
709
# At the end of building, we have an indexfieldmap with all the mappings,
# plus some extras, and the build processor's indexfields with any fields
# that weren't specified in the index definition. This makes an ordered list
# of the fields that are indexed and a list of the mappings that are used,
# and stores both in $self->{'build_cfg'}:
#   indexfields   - field names, in index-definition order
#   indexfieldmap - matching "name->shortcode" strings
# They are used for the build.cfg file and for the collection meta
# definition.
#
# The pseudo field 'metadata' is expanded to every field the build processor
# indexed that is not named explicitly in the index definition. Those fields
# are added in sorted order so that the result is the same on every run.
sub make_final_field_list {
    my $self = shift(@_);

    $self->{'build_cfg'} = {};

    my @indexfieldmap = ();
    my @indexfields = ();

    # go through the index definition and note each field, so we can easily
    # check whether it is already specified - when expanding 'metadata' we
    # don't want to add those again
    my $specifiedfields = {};
    my @specifiedfieldorder = ();
    foreach my $indexspec (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        my $parts = $indexspec;
        $parts =~ s/:.*$//;
        foreach my $f (split(';', $parts)) {
            next if defined $specifiedfields->{$f};
            $specifiedfields->{$f} = 1;
            push(@specifiedfieldorder, "$f");
        }
    }

    my $ifm = $self->{'buildproc'}->{'indexfieldmap'};

    foreach my $field (@specifiedfieldorder) {
        if ($field eq "metadata") {
            # sorted: the order of keys() on a hash is not stable
            foreach my $newfield (sort keys %{$self->{'buildproc'}->{'indexfields'}}) {
                next if defined $specifiedfields->{$newfield};
                push(@indexfieldmap, $newfield . "->" . $self->{'buildproc'}->{'indexfieldmap'}->{$newfield});
                push(@indexfields, "$newfield");
            }
        } elsif ($field eq 'text') {
            push(@indexfieldmap, "text->TX");
            push(@indexfields, "text");
        } elsif ($field eq 'allfields') {
            push(@indexfieldmap, "allfields->ZZ");
            push(@indexfields, "allfields");
        } elsif (defined $ifm->{$field}) {
            # fields without a mapping are left out
            push(@indexfieldmap, $field . "->" . $ifm->{$field});
            push(@indexfields, "$field");
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
781
782
# Recreates the field lists from a build.cfg file and stores them in
# $self->{'build_cfg'} ('indexfields' and 'indexfieldmap').
# The file is looked for in the build directory first and then in
# $GSDLCOLLECTDIR/index. If there is no build.cfg we can't do the field
# list (there is unlikely to be any index anyway): build_cfg is then left
# as an empty hash.
# Every "name->code" entry that is read is also written into the build
# processor's indexfieldmap.
sub read_final_field_list {
    my ($self) = @_;

    $self->{'build_cfg'} = {};

    # set the default mapping if the build processor has none yet
    if (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) {
        $self->{'buildproc'}->set_indexfieldmap($self->{'indexfieldmap'});
    }

    # we read the stuff in from the build.cfg file - if it's there
    my $buildconfigfile = util::filename_cat($self->{'build_dir'}, "build.cfg");
    unless (-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg");

        # we can't find a config file - just ignore the field list
        return unless -e $buildconfigfile;
    }

    my $buildcfg = colcfg::read_build_cfg($buildconfigfile);

    my @indexfields = ();
    if (defined $buildcfg->{'indexfields'}) {
        @indexfields = map { "$_" } @{$buildcfg->{'indexfields'}};
    }

    my @indexfieldmap = ();
    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $entry (@{$buildcfg->{'indexfieldmap'}}) {
            push(@indexfieldmap, "$entry");
            my ($name, $code) = $entry =~ /^(.*)\-\>(.*)$/;
            $self->{'buildproc'}->{'indexfieldmap'}->{$name} = $code;
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
827
828
# Adds the mgpp specific entries to the build configuration.
#
# Arguments:
#   $build_cfg - hash reference that is filled in:
#       numsections - as reported by the build processor
#       indexlevels - level tags for the levels in $self->{'levelorder'}
#       levelmap    - matching "levelname->tag" strings
#       textlevel   - the tag for 'section'
sub build_cfg_extra {
    my ($self, $build_cfg) = @_;

    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();

    # store the level info
    my (@indexlevels, @levelmap);
    foreach my $levelname (@{$self->{'levelorder'}}) {
        my $tag = $level_map{$levelname};
        push(@indexlevels, $tag);
        push(@levelmap, $levelname . "->" . $tag);
    }
    $build_cfg->{'indexlevels'} = \@indexlevels;
    $build_cfg->{'levelmap'} = \@levelmap;

    # text level (and gdbm level) is always section
    $build_cfg->{'textlevel'} = $level_map{'section'};
}
849
8501;
851
852
Note: See TracBrowser for help on using the repository browser.