source: gsdl/trunk/perllib/mgppbuilder.pm@ 18342

Last change on this file since 18342 was 17574, checked in by kjdon, 15 years ago

now calls read_build_cfg() instead of having the code here

  • Property svn:keywords set to Author Date Id Revision
File size: 27.3 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use basebuilder;
29use colcfg;
30use plugin;
31use strict; no strict 'refs';
32use util;
33
34
# Set up inheritance at compile time: mgppbuilder derives from basebuilder.
BEGIN {
    @mgppbuilder::ISA = qw(basebuilder);
}
38
39
40
# Level name translation table.
#   * lower-case level names (as lower-cased from the collection
#     configuration) map to the short tags handed to mgpp_passes (-J/-K)
#   * those short tags in turn map to the default display macro that
#     get_collection_meta_indexes() writes for the level
our %level_map = (
    document  => 'Doc',
    section   => 'Sec',
    paragraph => 'Para',
    Doc       => '_textdocument_',
    Sec       => '_textsection_',
    Para      => '_textparagraph_',
);

# File suffixes that build_index() treats as wanted when it scans the
# index directory after building; anything else is reported as unwanted.
our %wanted_index_files = map { $_ => 1 }
    qw(td t tl ti idb ib1 ib2 ib3 ib4 ib5 ib6 ib7 i il w wa);


# Local copy of the maximum document size configured in basebuilder.
my $maxdocsize = $basebuilder::maxdocsize;
67
# Constructor.
#
# Creates the object through basebuilder (all remaining arguments are
# passed straight on), re-blesses it into $class, and records the indexing
# levels requested by the collection configuration:
#   $self->{'levels'}     - hash ref, lower-cased level name => 1
#   $self->{'levelorder'} - array ref, lower-cased level names in the order
#                           they were specified
# If no levels are configured, the single level 'document' is used.
# Returns the new object.
sub new {
    my $class = shift(@_);

    # Direct method call rather than indirect-object syntax
    # ("new basebuilder (...)"), which the parser has to guess at.
    my $self = basebuilder->new(@_);
    $self = bless $self, $class;

    #$self->{'indexfieldmap'} = \%static_indexfield_map;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    # An array reference, so the key holds a usable (possibly empty) list
    # even when the configured level list is empty.
    $self->{'levelorder'} = [];
    if (defined $self->{'collect_cfg'}->{'levels'}) {
        # NOTE: $level aliases the element of the configured list, so the
        # lower-casing below is written back into collect_cfg as well.
        foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push(@{$self->{'levelorder'}}, $level);
        }
    } else { # default to document
        $self->{'levels'}->{'document'} = 1;
        push(@{$self->{'levelorder'}}, 'document');
    }

    $self->{'buildtype'} = "mgpp";

    return $self;
}
94
# Collapses the configured index list into a single index definition.
#
# The entries of $self->{'collect_cfg'}->{'indexes'} are joined with ';'
# (plus a trailing ';') and the list is replaced by a one-element list
# holding that combined string.
sub generate_index_list {
    my ($self) = @_;

    # indexes are specified separately, but we put them into one index
    my $specified = $self->{'collect_cfg'}->{'indexes'};
    my $combined  = join(';', @$specified) . ";";

    my @single_index = ();
    $self->{'collect_cfg'}->{'indexes'} = \@single_index;
    push(@single_index, $combined);
}
104
# Works out which of the casefold / stem / accentfold options are in force.
#
# After delegating to the parent class, the three flags are stored in
# $self->{'casefold'}, $self->{'stem'} and $self->{'accentfold'} (0 or 1).
# With no 'indexoptions' in the collection configuration all three are
# switched on; otherwise each configured option switches on the first flag
# whose name it contains (checked in the order stem, casefold, accentfold).
# $self->{'stemindexes'} records the result for the build cfg as a sum:
# casefold = 1, stem = 2, accentfold = 4.
sub generate_index_options {
    my ($self) = @_;

    $self->SUPER::generate_index_options();

    my ($casefold, $stem, $accentfold) = (0, 0, 0);

    my $requested = $self->{'collect_cfg'}->{'indexoptions'};
    if (defined($requested)) {
        foreach my $option (@$requested) {
            if ($option =~ /stem/) {
                $stem = 1;
            }
            elsif ($option =~ /casefold/) {
                $casefold = 1;
            }
            elsif ($option =~ /accentfold/) {
                $accentfold = 1;
            }
        }
    }
    else {
        # just use default options
        ($casefold, $stem, $accentfold) = (1, 1, 1);
    }

    $self->{'casefold'}   = $casefold;
    $self->{'stem'}       = $stem;
    $self->{'accentfold'} = $accentfold;

    # now we record this for the build cfg
    my $stemindexes = 0;
    $stemindexes += 1 if ($casefold);
    $stemindexes += 2 if ($stem);
    $stemindexes += 4 if ($accentfold);
    $self->{'stemindexes'} = $stemindexes;
}
144
# Returns the name of the build processor used for mgpp collections.
sub default_buildproc {
    my ($self) = @_;

    return 'mgppbuildproc';
}
150
# Creates the compressed text for the collection.
#
# The documents are pushed through the plugins twice, each time with the
# build processor writing to a pipe into mgpp_passes: first with -T1 to
# collect text statistics, then (after mgpp_compression_dict has built the
# compression dictionary) with -T2 to compress the text.
#
# Arguments:
#   $textindex - handed to the build processor via set_index()
#
# Does nothing if $self->{'no_text'} is set. When $self->{'debug'} is set
# no external program is run; the build processor writes to STDOUT instead.
# When $self->{'gli'} is set, progress and error tags are printed to STDERR.
# Dies if a required executable is missing or a pipe cannot be opened.
sub compress_text {
    my $self = shift (@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    my $maxnumeric = $self->{'maxnumeric'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $basefilename = &util::filename_cat("text",$collect_tail);
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them

    # always use Doc and Sec levels
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    # (no -b option is passed; -M carries $self->{'maxnumeric'})
    print $outhandle "\n    collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        # The existence test uses the full path; the command itself is
        # invoked by name.
        if (!-e "$mgpp_passes_exe" ||
            !open($handle, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) {
            # self-closing tag, consistent with every other FatalError
            # report in this module
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    my $db_level = "section";

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n    creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # reopen the pipe for the second (compression) pass
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }
    else {
        # in debug mode $handle is still STDOUT
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n    compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
269
270
# Extra step after the indexes are built: works out the final lists of
# indexed fields by calling make_final_field_list().
sub build_indexes_extra {
    my ($self) = @_;

    # define the final field lists
    $self->make_final_field_list();
}
276
# Creates a directory name for each of the index descriptions.
#
# Arguments:
#   $indexes - array ref of index descriptions of the form
#              "fields[:subcollection[:languages]]"
#
# Returns a hash ref holding, for every description, its directory name,
# plus the 'indexmap', 'subcollectionmap' and 'languagemap' sub-hashes,
# their '...maporder' lists, and top-level entries for the individual
# field / subcollection / language parts (used for collectionmeta later on;
# the full index name, eg text:subcol:lang, is not used on the query page).
sub create_index_mapping {
    my ($self, $indexes) = @_;

    my %mapping = (
        'indexmaporder'         => [],
        'subcollectionmaporder' => [],
        'languagemaporder'      => [],
    );

    # %taken_dirs is used to check for collisions. Start this off
    # with the mandatory directory names
    my %taken_dirs = ('text' => 'text', 'extra' => 'extra');
    my %processed  = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $description (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $description);

        # we only ever have one index, and its called 'idx'
        my $pindex = 'idx';

        # next come lower-cased processed versions of the subcollection
        # and of the language, if there are any
        my $psub  = lc ($self->process_field ($subcollection));
        my $plang = lc ($self->process_field ($languages));

        # check to be sure all index names are unique
        my $dirname = $pindex . $psub . $plang;
        until (!defined ($taken_dirs{$dirname})) {
            $dirname = $self->make_unique (\%processed, $description, \$pindex, \$psub, \$plang);
        }

        $mapping{$description} = $dirname;

        # store the mapping orders as well as the maps
        unless (defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            $mapping{"$fields"} = $pindex unless (defined $mapping{"$fields"});
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember what has been used so far
        $taken_dirs{$dirname} = $description;
        $processed{'index'}->{$pindex}        = "$fields";
        $processed{'subcollection'}->{$psub}  = $subcollection;
        $processed{'languages'}->{$plang}     = $languages;
    }

    return \%mapping;
}
346
# Resolves a directory name collision for one index description.
#
# Arguments:
#   $namehash - hash ref with 'index', 'subcollection' and 'languages'
#               sub-hashes mapping processed names to the original value
#               they were generated from
#   $index    - the index description, "fields:subcollection:languages"
#   $indexref, $subref, $langref - references to the three processed name
#               parts; at most one of them is changed
#
# The first part (checked in the order index, subcollection, languages)
# whose processed name is recorded against a different original value is
# advanced with get_next_version(). Returns the three parts concatenated.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @parts = (
        [ 'index',         $indexref, "$fields"      ],
        [ 'subcollection', $subref,   $subcollection ],
        [ 'languages',     $langref,  $languages     ],
    );

    foreach my $part (@parts) {
        my ($kind, $nameref, $original) = @$part;
        if ($namehash->{$kind}->{$$nameref} ne $original) {
            # only the first clashing part gets a new version
            $self->get_next_version ($nameref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
361
362
# Builds one index.
#
# Arguments:
#   $index - index description "fields[:subcollections[:languages]]";
#            its directory name is looked up in $self->{'index_mapping'}
#
# The documents are pushed through the plugins twice, with the build
# processor writing into mgpp_passes (-I1 to create the index dictionary,
# then -I2 to invert the text). Between and after the passes the helper
# programs mgpp_perf_hash_build, mgpp_weights_build, mgpp_invf_dict and
# mgpp_stem_idx are run.
#
# If the index dictionary (.id file) is not produced after the first pass
# the index is recorded in $self->{'notbuilt'} and the sub returns early.
# When $self->{'debug'} is set no external program is run and the build
# processor writes to STDOUT. When $self->{'gli'} is set, progress and
# error tags are printed to STDERR. Dies if a required executable is
# missing, a pipe cannot be opened, or the index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
                                               $indexdir,
                                               $collect_tail);

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";

    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");

    # define the section names for mgpasses
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";
    if ($self->{'levels'}->{'paragraph'}) {
        $mgpp_passes_sections .= "-K " . $level_map{'paragraph'}. " ";
    }

    # full paths, used only to check that the executables exist; the
    # commands themselves are invoked by name
    my $mgpp_perf_hash_build_exe =
        &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    my $maxnumeric = $self->{'maxnumeric'};

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my $langarr = [];
    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcoll_name (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my @languages = ();
    my $language_metadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }
    @languages = split /,/, $language if (defined $language);
    # Each language expression (including any leading '!' negation) is
    # passed on exactly as specified.
    push (@$langarr, @languages);

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        if (!-e "$mgpp_passes_exe" ||
            !open($handle, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\" -I1 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # db_level is always section
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if
    # not we quit building this index so the whole process doesn't fail.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\" -I2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n    inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n    create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }
        my $accent_folding_enabled = 1;
        if ($self->{'accentfold'}) {
            # the first time we do this, we test for accent folding enabled
            # NOTE(review): system() returns the raw wait status, so a plain
            # exit code of 2 would normally show up as 512 - confirm what
            # mgpp_stem_idx actually reports here.
            if (system ("mgpp_stem_idx$exe -b 4096 -s4 -f \"$fullindexprefix\" $osextra") == 2) {
                # accent folding has not been enabled in mgpp
                $accent_folding_enabled = 0;
                # Drop the accentfold value from the recorded total, but
                # only once: this sub runs for every index, and subtracting
                # on each call would corrupt the value.
                $self->{'stemindexes'} -= 4 if ($self->{'stemindexes'} & 4);
            }
        }
        if ($self->{'casefold'}) {
            system ("mgpp_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
            if ($accent_folding_enabled && $self->{'accentfold'}) {
                system ("mgpp_stem_idx$exe -b 4096 -s5 -f \"$fullindexprefix\" $osextra");
            }
        }
        if ($self->{'stem'}) {
            system ("mgpp_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
            if ($accent_folding_enabled && $self->{'accentfold'}) {
                system ("mgpp_stem_idx$exe -b 4096 -s6 -f \"$fullindexprefix\" $osextra");
            }
        }
        if ($self->{'casefold'} && $self->{'stem'}) {
            system ("mgpp_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");
            if ($accent_folding_enabled && $self->{'accentfold'}) {
                system ("mgpp_stem_idx$exe -b 4096 -s7 -f \"$fullindexprefix\" $osextra");
            }
        }

        # remove unwanted files
        # (the actual removal is currently commented out, so files with an
        # unwanted suffix are only reported)
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (my $dirhandle, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir ($dirhandle);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
593
594
# Adds default display names for index fields, levels, subcollections and
# languages to the collection metadata.
#
# Arguments:
#   $collection_infodb - hash ref that is filled in; each value written is
#                        a one-element array ref
#
# For every item an entry is written under its short name unless the
# collection configuration already defines collectionmeta for ".<name>".
# Index fields 'allfields' and 'text' get a macro as their default, other
# fields their own name; levels get the macro from %level_map;
# subcollections and languages get their own name.
#
# If $self->{'build_cfg'} is not yet defined (ie infodb is being built
# separately from the indexes) the field lists are first re-read with
# read_final_field_list().
sub get_collection_meta_indexes
{
    my $self = shift(@_);
    my $collection_infodb = shift(@_);

    # define the indexed field mapping if not already done so (ie if infodb called separately from build_index)
    if (!defined $self->{'build_cfg'}) {
        $self->read_final_field_list();
    }

    # first do the collection meta stuff - everything without a dot
    my $collmetadefined = 0;
    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
        $collmetadefined = 1;
    }

    #add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined, uses the metadata name
    my $collmeta = "";
    foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}){
        my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
        # a field without a short name cannot be given an entry; skipping
        # it avoids writing one under the empty key
        next unless defined $shortfield;
        next if $shortfield eq 1;

        # we need to check if some coll meta has been defined - don't output
        # any that have
        $collmeta = ".$longfield";
        if (!$collmetadefined || !defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
            if ($longfield eq "allfields") {
                $collection_infodb->{$shortfield} = [ "_query:textallfields_" ];
            } elsif ($longfield eq "text") {
                $collection_infodb->{$shortfield} = [ "_query:texttextonly_" ];
            } else {
                $collection_infodb->{$shortfield} = [ $longfield ];
            }
        }
    }

    # now add the level names
    # NOTE: $level aliases the configured list element, so the lower-casing
    # below is written back into collect_cfg.
    foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
        $collmeta = ".$level"; # based on the original specification
        $level =~ tr/A-Z/a-z/; # make it lower case
        my $levelid = $level_map{$level}; # find the actual value we used in the index
        # a level name that is not in %level_map has no id to write under
        next unless defined $levelid;
        if (!$collmetadefined || !defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
            # use the default macro
            $collection_infodb->{$levelid} = [ $level_map{$levelid} ];
        }
    }

    # now add subcoll meta
    my $shortname = "";
    foreach my $subcoll (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        $shortname = $self->{'index_mapping'}->{$subcoll};
        if (!$collmetadefined || !defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$subcoll"}) {
            $collection_infodb->{$shortname} = [ $subcoll ];
        }
    }

    # now add language meta
    foreach my $lang (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        $shortname = $self->{'index_mapping'}->{$lang};
        if (!$collmetadefined || !defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$lang"}) {
            $collection_infodb->{$shortname} = [ $lang ];
        }
    }
}
667
668
# Writes the "collection" entry of the info database.
#
# Arguments:
#   $infodb_handle - handle passed on to dbutil::write_infodb_entry
#
# The entry is assembled by get_collection_meta_sets() (the metadata sets
# used in the collection) followed by get_collection_meta_indexes().
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    $self->get_collection_meta_indexes($collection_infodb);

    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
680
681
# At the end of building, the build processor holds an indexfieldmap with
# all the field mappings (plus some extras). From that and from the index
# definition this works out an ordered list of the fields that are indexed
# and a list of the "field->code" mappings that are used. Both are stored
# in a fresh $self->{'build_cfg'} (keys 'indexfields' and 'indexfieldmap')
# for the build.cfg file and for the collection meta definition.
sub make_final_field_list {
    my ($self) = @_;

    $self->{'build_cfg'} = {};

    # Go through the index definition and note each field once, in order.
    # When expanding "metadata" below, all the individual fields are
    # listed, but some may already be named in the index definition, so we
    # don't want to add those again.
    my $already_specified = {};
    my @specified_in_order = ();
    foreach my $definition (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        (my $fieldlist = $definition) =~ s/:.*$//;
        foreach my $name (split(';', $fieldlist)) {
            next if (defined $already_specified->{$name});
            $already_specified->{$name} = 1;
            push (@specified_in_order, "$name");
        }
    }

    my @indexfieldmap = ();
    my @indexfields = ();
    my %fixed_code = ('text' => 'TX', 'allfields' => 'ZZ');
    my $ifm = $self->{'buildproc'}->{'indexfieldmap'};

    foreach my $name (@specified_in_order) {
        if ($name eq "metadata") {
            # add every processed field not named in the index definition
            foreach my $extra (keys %{$self->{'buildproc'}->{'indexfields'}}) {
                next if (defined $already_specified->{$extra});
                push (@indexfieldmap, $extra . '->' . $self->{'buildproc'}->{'indexfieldmap'}->{$extra});
                push (@indexfields, "$extra");
            }
        }
        elsif ($name eq 'text' || $name eq 'allfields') {
            push (@indexfieldmap, $name . '->' . $fixed_code{$name});
            push (@indexfields, "$name");
        }
        elsif (defined $ifm->{$name}) {
            # we only add in the ones that have been processed
            push (@indexfieldmap, $name . '->' . $ifm->{$name});
            push (@indexfields, "$name");
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
753
754
# Recreates the field lists from the build.cfg file, as returned by
# read_build_cfg(). $self->{'build_cfg'} is reset to an empty hash first;
# if there is no build.cfg it stays empty (we can't do the field list, and
# there is unlikely to be any index anyway). Otherwise the 'indexfields',
# 'indexfieldmap' and 'indexmap' lists are copied across, and every
# "field->code" entry of the indexfieldmap is also stored in the build
# processor's indexfieldmap.
sub read_final_field_list {
    my ($self) = @_;

    $self->{'build_cfg'} = {};

    # we read the stuff in from the build.cfg file - if its there
    my $buildcfg = $self->read_build_cfg();
    return unless defined $buildcfg;

    my @indexfields = ();
    if (defined $buildcfg->{'indexfields'}) {
        @indexfields = map { "$_" } @{$buildcfg->{'indexfields'}};
    }

    my @indexfieldmap = ();
    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $mapping (@{$buildcfg->{'indexfieldmap'}}) {
            push (@indexfieldmap, "$mapping");
            # split "field->code" and remember it for the build processor
            my ($fieldname, $code) = $mapping =~ /^(.*)\-\>(.*)$/;
            $self->{'buildproc'}->{'indexfieldmap'}->{$fieldname} = $code;
        }
    }

    my @indexmap = ();
    if (defined $buildcfg->{'indexmap'}) {
        @indexmap = map { "$_" } @{$buildcfg->{'indexmap'}};
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
    $self->{'build_cfg'}->{'indexmap'} = \@indexmap;
}
794
795
# Adds the mgpp-specific entries to the build configuration.
#
# Arguments:
#   $build_cfg - hash ref that is filled in with:
#     'numsections' - as reported by the build processor
#     'indexlevels' - the %level_map tag of each level in 'levelorder'
#     'levelmap'    - "level->tag" strings for the same levels
#     'textlevel'   - always the tag for 'section'
sub build_cfg_extra {
    my ($self, $build_cfg) = @_;

    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();

    # store the level info
    my (@indexlevels, @levelmap);
    foreach my $levelname (@{$self->{'levelorder'}}) {
        my $tag = $level_map{$levelname};
        push (@indexlevels, $tag);
        push (@levelmap, $levelname . '->' . $tag);
    }
    $build_cfg->{'indexlevels'} = \@indexlevels;
    $build_cfg->{'levelmap'} = \@levelmap;

    # text level (and database level) is always section
    $build_cfg->{'textlevel'} = $level_map{'section'};
}
816
8171;
818
819
Note: See TracBrowser for help on using the repository browser.