source: gsdl/trunk/perllib/mgppbuilder.pm@ 17565

Last change on this file since 17565 was 17565, checked in by kjdon, 16 years ago

removed some debug statements, and no longer load in the default indexfieldmap to buildproc

  • Property svn:keywords set to Author Date Id Revision
File size: 27.8 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use basebuilder;
29use colcfg;
30use plugin;
31use strict; no strict 'refs';
32use util;
33
34
# Set up inheritance at compile time: mgppbuilder derives from basebuilder.
BEGIN {
    @mgppbuilder::ISA = qw(basebuilder);
}
38
39
40
# Two lookups share this table:
#   configured level name (lower case) => short level tag used in the index
#   short level tag                    => macro used as its default name
our %level_map = (
    'document'  => 'Doc',
    'section'   => 'Sec',
    'paragraph' => 'Para',
    'Doc'       => '_textdocument_',
    'Sec'       => '_textsection_',
    'Para'      => '_textparagraph_',
);

# File suffixes that are regarded as wanted when build_index scans an
# index directory for unwanted files.
our %wanted_index_files = map { $_ => 1 } qw(
    td t tl ti idb
    ib1 ib2 ib3 ib4 ib5 ib6 ib7
    i il w wa
);


my $maxdocsize = $basebuilder::maxdocsize;
67
# Constructor. Creates the object through basebuilder's constructor,
# re-blesses it into the requested class and records the indexing levels.
#
# Arguments: the class name, followed by the arguments for basebuilder's
# constructor (passed through unchanged).
#
# Returns the blessed builder, with these entries set here:
#   'levels'     - hash ref: lower-cased level name => 1
#   'levelorder' - array ref of lower-cased level names, in config order
#   'buildtype'  - the string "mgpp"
sub new {
    my $class = shift(@_);

    my $self = basebuilder->new(@_);
    $self = bless $self, $class;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    # this has to be an array reference; assigning the empty list () to a
    # scalar slot stores undef, which only worked when a later push
    # happened to autovivify it
    $self->{'levelorder'} = [];
    if (defined $self->{'collect_cfg'}->{'levels'}) {
        # $level aliases the configuration entry, so the configured level
        # names are lower-cased in place as well
        foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    } else { # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    $self->{'buildtype'} = "mgpp";

    return $self;
}
94
# Collapses the configured index specifications into a single entry:
# the specifications are joined with ';' (plus a trailing ';') and the
# resulting string becomes the only element of collect_cfg's 'indexes'.
sub generate_index_list {
    my ($self) = @_;

    # remember what was specified before the list is replaced
    my $specified = $self->{'collect_cfg'}->{'indexes'};

    my @combined = ();
    $self->{'collect_cfg'}->{'indexes'} = \@combined;

    # mgpp puts every specified index into one physical index
    push (@combined, join(';', @$specified) . ";");
}
104
# Works out which of casefold / stem / accentfold are switched on and
# records the combination in 'stemindexes' for the build cfg.
#
# After the parent class's generate_index_options has run:
#   - with no 'indexoptions' in collect_cfg, all three flags are set to 1;
#   - otherwise all three start at 0 and each configured option switches
#     on the first of stem, casefold, accentfold whose name it contains.
# 'stemindexes' is the sum of casefold (1), stem (2) and accentfold (4).
sub generate_index_options {
    my ($self) = @_;

    $self->SUPER::generate_index_options();

    my @flags = ('casefold', 'stem', 'accentfold');
    my $requested = $self->{'collect_cfg'}->{'indexoptions'};

    if (defined($requested)) {
        # only what the configuration asks for
        @$self{@flags} = (0, 0, 0);
        foreach my $option (@$requested) {
            if    ($option =~ /stem/)       { $self->{'stem'} = 1; }
            elsif ($option =~ /casefold/)   { $self->{'casefold'} = 1; }
            elsif ($option =~ /accentfold/) { $self->{'accentfold'} = 1; }
        }
    } else {
        # just use default options
        @$self{@flags} = (1, 1, 1);
    }

    # now we record this for the build cfg
    $self->{'stemindexes'} = 0;
    $self->{'stemindexes'} += 1 if $self->{'casefold'};
    $self->{'stemindexes'} += 2 if $self->{'stem'};
    $self->{'stemindexes'} += 4 if $self->{'accentfold'};
}
144
# Returns the name of the build processor used by default for this
# builder.
sub default_buildproc {
    my ($self) = @_;

    return 'mgppbuildproc';
}
150
# Produces the compressed text for the collection.
#
# The documents are run through the build processor twice, each time
# piped into mgpp_passes: first with -T1 to collect text statistics, then
# (after mgpp_compression_dict has built the compression dictionary) with
# -T2 to compress the text.
#
# Arguments:
#   $textindex - handed to the build processor's set_index()
#
# Does nothing when 'no_text' is set. When 'debug' is set the build
# processor writes to STDOUT instead and no external program is run.
# Progress goes to 'outhandle' (subject to 'verbosity'); when 'gli' is set
# Stage/Phase/FatalError elements are printed to STDERR.
#
# Dies if a required executable is not present in
# $GSDLHOME/bin/$GSDLOS or the pipe to mgpp_passes cannot be opened.
sub compress_text {

    my $self = shift (@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    # the full paths are only used to check that the executables exist;
    # the commands themselves are run by name
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    my $maxnumeric = $self->{'maxnumeric'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $basefilename = &util::filename_cat("text",$collect_tail);
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }


    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them

    # always use Doc and Sec levels
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    print $outhandle "\n    collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        if (!-e "$mgpp_passes_exe" ||
            !open($handle, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) {
            # self-closing element, like every other FatalError this
            # module prints
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    my $db_level = "section";

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n    creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # reopen the pipe for the second (compression) pass
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n    compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
269
270
# Extra step of index building for this builder: work out the final
# lists of indexed fields.
sub build_indexes_extra {
    my ($self) = @_;

    $self->make_final_field_list();
}
276
# creates directory names for each of the index descriptions
#
# Arguments:
#   $indexes - array ref of index descriptions, each of the form
#              "fields[:subcollection[:languages]]"
#
# Returns a hash ref holding
#   <index description>               => directory name
#   'indexmap' / 'indexmaporder'      => fields -> short name, and order
#   'subcollectionmap' / '...order'   => subcollection -> short name, order
#   'languagemap' / '...order'        => languages -> short name, and order
# plus direct fields / subcollection / languages => short name entries.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();

    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # we only ever have one index, and its called 'idx'
        my $pindex = 'idx';

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            my $candidate = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
            # make_unique leaves the name alone when every part already
            # belongs to this very description (an exact duplicate index);
            # looping again could never change that, so reuse the name
            last if $candidate eq $dirname;
            $dirname = $candidate;
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            if (!defined $mapping{"$fields"}) {
                $mapping{"$fields"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember which description owns each generated name
        $dirnames{$dirname} = $index;
        $pnames{'index'}->{$pindex} = "$fields";
        $pnames{'subcollection'}->{$psub} = $subcollection;
        $pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
346
# Resolves a directory-name collision for an index description.
#
# Arguments:
#   $namehash - hash ref with 'index', 'subcollection' and 'languages'
#               sub-hashes mapping a short name to what it was made from
#   $index    - the index description "fields:subcollection:languages"
#   $indexref, $subref, $langref - references to the three short names
#
# The short names are checked in the order index, subcollection,
# languages; the first one that is registered in $namehash for something
# other than this description's own part is handed to get_next_version
# (which receives the reference). At most one name is handled per call.
# Returns the three short names concatenated.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    # [ sub-hash to consult, reference to the short name, expected owner ]
    my @checks = (
        [ 'index',         $indexref, "$fields"      ],
        [ 'subcollection', $subref,   $subcollection ],
        [ 'languages',     $langref,  $languages     ],
    );

    foreach my $check (@checks) {
        my ($kind, $nameref, $owner) = @$check;
        if ($namehash->{$kind}->{$$nameref} ne $owner) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
361
362
# Builds one index.
#
# Arguments:
#   $index - index description "fields[:subcollections[:languages]]";
#            its directory is looked up in 'index_mapping'
#
# Steps: pipe the documents into mgpp_passes -I1 (index dictionary), run
# mgpp_perf_hash_build, pipe the documents into mgpp_passes -I2 (invert
# the text), then run mgpp_weights_build, mgpp_invf_dict and
# mgpp_stem_idx for the enabled casefold / stem / accentfold combinations.
#
# If the .id dictionary file does not exist after the first pass the
# index is recorded in 'notbuilt' and the sub returns early. When 'debug'
# is set the build processor writes to STDOUT and no external program is
# run. Dies if a required executable is missing from
# $GSDLHOME/bin/$GSDLOS, a pipe cannot be opened, or the index directory
# cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
                                               $indexdir,
                                               $collect_tail);

    # get any os specific stuff
    # (the full paths are only used to check that the executables exist;
    # the commands themselves are run by name)
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";

    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");

    # define the section names for mgpasses
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";
    if ($self->{'levels'}->{'paragraph'}) {
        $mgpp_passes_sections .= "-K " . $level_map{'paragraph'}. " ";
    }

    my $mgpp_perf_hash_build_exe =
        &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    my $maxnumeric = $self->{'maxnumeric'};

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my $langarr = [];
    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcoll_name (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $language_metadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }
    # each entry is passed on exactly as written, including any leading
    # '!' marking a negated language
    @$langarr = split /,/, $language if (defined $language);

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        if (!-e "$mgpp_passes_exe" ||
            !open($handle, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\" -I1 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # db_level is always section
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fall over.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\" -I2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n    inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n    create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );


        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }

        # runs mgpp_stem_idx for one stem method number (the -s option)
        # and returns what system() returned
        my $run_stem_idx = sub {
            my ($method) = @_;
            return system ("mgpp_stem_idx$exe -b 4096 -s$method -f \"$fullindexprefix\" $osextra");
        };

        my $accent_folding_enabled = 1;
        if ($self->{'accentfold'}) {
            # the first time we do this, we test for accent folding enabled
            my $status = $run_stem_idx->(4);
            # system() returns the wait status, so an exit code of 2 shows
            # up as 2 << 8; the raw comparison with 2 is kept as well so
            # nothing that used to be recognised is lost.
            # NOTE(review): assumes mgpp_stem_idx exits with 2 when accent
            # folding is not compiled in - confirm against mgpp.
            if ($status == 2 || ($status >> 8) == 2) {
                # accent folding has not been enabled in mgpp
                $accent_folding_enabled = 0;
                # clear the accentfold bit; unlike subtracting 4 this is
                # safe to repeat when several indexes are built
                $self->{'stemindexes'} &= ~4;
            }
        }
        my $with_accents = $accent_folding_enabled && $self->{'accentfold'};
        if ($self->{'casefold'}) {
            $run_stem_idx->(1);
            $run_stem_idx->(5) if $with_accents;
        }
        if ($self->{'stem'}) {
            $run_stem_idx->(2);
            $run_stem_idx->(6) if $with_accents;
        }
        if ($self->{'casefold'} && $self->{'stem'}) {
            $run_stem_idx->(3);
            $run_stem_idx->(7) if $with_accents;
        }

        # remove unwanted files
        # NOTE: the actual removal is commented out, so files are only
        # reported (at verbosity > 2), not deleted
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (my $dirhandle, $tmpdir)
            or die "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir ($dirhandle);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
593
594
# Adds default display names for index fields, levels, subcollections and
# languages to the collection info database entry.
#
# Arguments:
#   $collection_infodb - hash ref to fill in: short name => [ display name ]
#
# An entry is only added when collect_cfg has no collectionmeta for
# ".<name>" (the configuration's own value is used in that case).
# If 'build_cfg' has not been set up yet it is read via
# read_final_field_list first.
sub get_collection_meta_indexes
{
    my $self = shift(@_);
    my $collection_infodb = shift(@_);

    # define the indexed field mapping if not already done so (ie if infodb called separately from build_index)
    if (!defined $self->{'build_cfg'}) {
        $self->read_final_field_list();
    }

    # first do the collection meta stuff - everything without a dot
    my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};

    # true when the configuration already supplies collectionmeta for
    # the given ".name" key
    my $has_collmeta = sub {
        my ($key) = @_;
        return defined $collectionmeta && defined $collectionmeta->{$key};
    };

    #add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined, uses the metadata name
    foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}){
        my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
        # a field without a mapping would otherwise be stored under an
        # empty key
        next if !defined $shortfield;
        next if $shortfield eq 1;

        # we need to check if some coll meta has been defined - don't output
        # any that have
        next if $has_collmeta->(".$longfield");

        if ($longfield eq "allfields") {
            $collection_infodb->{$shortfield} = [ "_query:textallfields_" ];
        } elsif ($longfield eq "text") {
            $collection_infodb->{$shortfield} = [ "_query:texttextonly_" ];
        } else {
            $collection_infodb->{$shortfield} = [ $longfield ];
        }
    }

    # now add the level names
    # (iterate over a fallback empty list rather than autovivifying
    # 'levels' in the configuration when none were specified)
    foreach my $specified_level (@{ $self->{'collect_cfg'}->{'levels'} || [] }) {
        my $collmeta = ".$specified_level"; # based on the original specification
        my $level = lc ($specified_level);   # lower-cased copy; the config entry is left alone
        my $levelid = $level_map{$level};    # find the actual value we used in the index
        # a level we have no tag for cannot be given a name
        next if !defined $levelid;
        if (!$has_collmeta->($collmeta)) {
            # use the default macro
            $collection_infodb->{$levelid} = [ $level_map{$levelid} ];
        }
    }

    # now add subcoll meta
    foreach my $subcoll (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        my $shortname = $self->{'index_mapping'}->{$subcoll};
        if (!$has_collmeta->(".$subcoll")) {
            $collection_infodb->{$shortname} = [ $subcoll ];
        }
    }

    # now add language meta
    foreach my $lang (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        my $shortname = $self->{'index_mapping'}->{$lang};
        if (!$has_collmeta->(".$lang")) {
            $collection_infodb->{$shortname} = [ $lang ];
        }
    }
}
667
668
# Writes the "collection" entry of the info database: the metadata sets
# (prefixes) used in the collection together with the index related
# names gathered by get_collection_meta_indexes.
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    # both helpers fill in the same hash
    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    $self->get_collection_meta_indexes($collection_infodb);

    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
680
681
# at the end of building, we have an indexfieldmap with all the mappings,
# plus some extras, and indexmap with any indexes in it that weren't
# specified in the index definition. we want to make an ordered list of
# fields that are indexed, and a list of mappings that are used. this will
# be used for the build.cfg file, and for collection meta definition we
# store these in a build.cfg bit
#
# Reads:  collect_cfg 'indexes', and the build processor's 'indexfields'
#         and 'indexfieldmap' hashes.
# Writes: a fresh 'build_cfg' hash with
#   'indexfields'   - array ref of field names, in index-definition order
#   'indexfieldmap' - array ref of "field->shortname" strings, same order
sub make_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    # store the indexfieldmap information
    my @indexfieldmap = ();
    my @indexfields = ();
    my $specifiedfields = {};
    my @specifiedfieldorder = ();

    # go through the index definition and add each thing to a map, so we
    # can easily check if it is already specified - when doing the
    # metadata, we print out all the individual fields, but some may
    # already be specified in the index definition, so we dont want to add
    # those again.
    foreach my $indexspec (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        my $parts = $indexspec;
        $parts =~ s/:.*$//;
        foreach my $f (split(';', $parts)) {
            if (!defined $specifiedfields->{$f}) {
                $specifiedfields->{$f}=1;
                push (@specifiedfieldorder, "$f");
            }
        }
    }

    my $ifm = $self->{'buildproc'}->{'indexfieldmap'};

    #add all fields bit
    foreach my $field (@specifiedfieldorder) {
        if ($field eq "metadata") {
            # "metadata" stands for every indexed field that was not named
            # explicitly. Sorted, because the order of a hash's keys is
            # not stable and would otherwise change build.cfg from one
            # build to the next.
            foreach my $newfield (sort keys %{$self->{'buildproc'}->{'indexfields'}}) {
                if (!defined $specifiedfields->{$newfield}) {
                    push (@indexfieldmap, "$newfield\-\>$ifm->{$newfield}");
                    push (@indexfields, "$newfield");
                }
            }
        } elsif ($field eq 'text') {
            push (@indexfieldmap, "text\-\>TX");
            push (@indexfields, "text");
        } elsif ($field eq 'allfields') {
            push (@indexfieldmap, "allfields\-\>ZZ");
            push (@indexfields, "allfields");
        } else {
            # an explicitly named field is only listed if it was mapped
            if (defined $ifm->{$field}) {
                push (@indexfieldmap, "$field\-\>$ifm->{$field}");
                push (@indexfields, "$field");
            }
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;

}
753
754
# Rebuilds the field lists from an existing build.cfg. The file is looked
# for in the build directory first and then in "index" under
# $GSDLCOLLECTDIR. Without a build.cfg the field lists cannot be
# recreated: 'build_cfg' is left as an empty hash and the sub returns.
#
# Otherwise 'build_cfg' receives copies of the file's 'indexfields',
# 'indexfieldmap' and 'indexmap' lists (empty lists for missing ones),
# and every "field->shortname" entry of 'indexfieldmap' is also stored in
# the build processor's 'indexfieldmap' hash.
sub read_final_field_list {
    my ($self) = @_;

    $self->{'build_cfg'} = {};

    # we read the stuff in from the build.cfg file - if its there
    my $cfgpath = &util::filename_cat($self->{'build_dir'}, "build.cfg");
    unless (-e $cfgpath) {
        # try the index dir - but do we know where it is?? try here
        $cfgpath = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg");
        # we cant find a config file - just ignore the field list
        return unless -e $cfgpath;
    }

    my $buildcfg = &colcfg::read_build_cfg( $cfgpath);

    # copy each list out of the config, as strings
    my %copied = ();
    foreach my $listname ('indexfields', 'indexfieldmap', 'indexmap') {
        $copied{$listname} = [];
        next unless defined $buildcfg->{$listname};
        @{$copied{$listname}} = map { "$_" } @{$buildcfg->{$listname}};
    }

    # feed the long name -> short name pairs back to the build processor
    foreach my $pair (@{$copied{'indexfieldmap'}}) {
        my ($f, $v) = $pair =~ /^(.*)\-\>(.*)$/;
        $self->{'buildproc'}->{'indexfieldmap'}->{$f} = $v;
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = $copied{'indexfieldmap'};
    $self->{'build_cfg'}->{'indexfields'} = $copied{'indexfields'};
    $self->{'build_cfg'}->{'indexmap'} = $copied{'indexmap'};
}
807
808
# Adds this builder's own entries to the build configuration.
#
# Arguments:
#   $build_cfg - hash ref that receives
#     'numsections' - the build processor's section count
#     'indexlevels' - array ref of short level tags, in 'levelorder' order
#     'levelmap'    - array ref of "level->tag" strings, same order
#     'textlevel'   - the tag of the section level
sub build_cfg_extra {
    my ($self, $build_cfg) = @_;

    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();

    # store the level info
    my (@tags, @pairs);
    foreach my $level (@{$self->{'levelorder'}}) {
        my $tag = $level_map{$level};
        push (@tags, $tag);
        push (@pairs, $level . '->' . $tag);
    }
    $build_cfg->{'indexlevels'} = \@tags;
    $build_cfg->{'levelmap'} = \@pairs;

    # text level (and database level) is always section
    $build_cfg->{'textlevel'} = $level_map{'section'};
}
829
8301;
831
832
Note: See TracBrowser for help on using the repository browser.