source: gsdl/trunk/perllib/mgppbuilder.pm@ 18342

Last change on this file since 18342 was 17574, checked in by kjdon, 16 years ago

now calls read_build_cfg() instead of having the code here

  • Property svn:keywords set to Author Date Id Revision
File size: 27.3 KB
RevLine 
[932]1###########################################################################
2#
[1852]3# mgppbuilder.pm -- MGBuilder object
[932]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
[10468]28use basebuilder;
[932]29use colcfg;
30use plugin;
[15715]31use strict; no strict 'refs';
[932]32use util;
33
[15715]34
# Compile-time setup: declare basebuilder as the parent class of
# mgppbuilder so that inherited methods and SUPER:: calls resolve.
BEGIN {
    @mgppbuilder::ISA = qw(basebuilder);
}
38
39
40
# Two-step lookup table for index levels.  A lower-cased level name from
# the collection configuration maps to the short tag used in the index
# ('document' => 'Doc'); that tag in turn maps to the default display
# macro ('Doc' => '_textdocument_').
our %level_map = (
    'document'  => 'Doc',
    'section'   => 'Sec',
    'paragraph' => 'Para',
    'Doc'       => '_textdocument_',
    'Sec'       => '_textsection_',
    'Para'      => '_textparagraph_',
);

# File suffixes that build_index treats as wanted when it scans the index
# directory for leftovers; any other suffix is reported as unwanted.
our %wanted_index_files = map { $_ => 1 } qw(
    td t tl ti
    idb ib1 ib2 ib3 ib4 ib5 ib6 ib7
    i il
    w wa
);

# Local copy of the base builder's maximum document size.
my $maxdocsize = $basebuilder::maxdocsize;
67
# Constructor.  Builds a basebuilder from the supplied arguments, reblesses
# it into $class, and records the indexing levels for this collection.
#
# Sets on the object:
#   levels     - hash ref, lower-cased level name => 1
#   levelorder - array ref, the same level names in configured order
#   buildtype  - always "mgpp"
#
# When the collection configuration has no 'levels' entry, the single
# level 'document' is used.
sub new {
    my $class = shift(@_);

    my $self = basebuilder->new(@_);
    $self = bless $self, $class;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    # must be an array ref: assigning the empty list () would store undef
    $self->{'levelorder'} = [];

    my $configured_levels = $self->{'collect_cfg'}->{'levels'};
    if (defined $configured_levels) {
        foreach my $level (@$configured_levels) {
            # $level aliases the element of the configured list, so the
            # configuration itself is lower-cased here as well; this is
            # kept because other code reads the same list afterwards.
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    } else {
        # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    $self->{'buildtype'} = "mgpp";

    return $self;
}
94
# Collapse the configured index list into a single combined index.
# The entries of collect_cfg->{'indexes'} are joined with ';' (with a
# trailing ';') and the list is replaced by that one string.
sub generate_index_list {
    my ($self) = @_;

    # indexes are specified separately, but we put them into one index
    my $configured = $self->{'collect_cfg'}->{'indexes'};
    my $combined = join(';', @$configured) . ";";

    $self->{'collect_cfg'}->{'indexes'} = [];
    return push (@{$self->{'collect_cfg'}->{'indexes'}}, $combined);
}
104
# Work out which of casefold / stem / accentfold are enabled and record
# the result both as individual flags on the object and as the
# 'stemindexes' bitmask (casefold = 1, stem = 2, accentfold = 4) that is
# later written to the build configuration.
#
# With no 'indexoptions' in the collection configuration all three are
# switched on; otherwise only the options named there are enabled.
sub generate_index_options {
    my ($self) = @_;

    $self->SUPER::generate_index_options();

    my $options = $self->{'collect_cfg'}->{'indexoptions'};
    my $use_defaults = !defined($options);

    # defaults: everything on when nothing was specified, otherwise off
    foreach my $flag ('casefold', 'stem', 'accentfold') {
        $self->{$flag} = $use_defaults ? 1 : 0;
    }

    if (!$use_defaults) {
        foreach my $option (@$options) {
            # each option enables at most one flag; 'stem' is tested first
            foreach my $flag ('stem', 'casefold', 'accentfold') {
                if ($option =~ /$flag/) {
                    $self->{$flag} = 1;
                    last;
                }
            }
        }
    }

    # now we record this for the build cfg
    $self->{'stemindexes'} = 0;
    $self->{'stemindexes'} += 1 if $self->{'casefold'};
    $self->{'stemindexes'} += 2 if $self->{'stem'};
    $self->{'stemindexes'} += 4 if $self->{'accentfold'};
}
144
# Returns the string "mgppbuildproc".
# NOTE(review): presumably the name of the build-processor class used when
# none is configured - confirm against the base builder.
sub default_buildproc {
    my ($self) = @_;

    my $buildproc_name = "mgppbuildproc";
    return $buildproc_name;
}
150
# Create the compressed text for the collection.
#
# Parameter:
#   $textindex - the index specification handed to the build processor
#
# Does nothing when $self->{'no_text'} is set.  Otherwise the documents
# are run through the plugins twice:
#   1. piped into "mgpp_passes -T1" to collect text statistics, after
#      which mgpp_compression_dict builds the compression dictionary;
#   2. piped into "mgpp_passes -T2" to compress the text.
# In debug mode the build processor writes to STDOUT instead and no
# external program is run.
#
# Progress goes to $self->{'outhandle'} (according to verbosity) and, when
# $self->{'gli'} is set, as XML-style tags to STDERR.  Dies if a required
# executable is missing or a pipe cannot be opened.
sub compress_text {
    my $self = shift (@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    my $maxnumeric = $self->{'maxnumeric'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $basefilename = &util::filename_cat("text",$collect_tail);
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them

    # always use Doc and Sec levels
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";

    # everything up to the pass selector is shared by the -T1 and -T2 runs
    my $passes_command = "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\"";

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    print $outhandle "\n collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        if (!-e "$mgpp_passes_exe" ||
            !open($handle, "$passes_command -T1 $osextra")) {
            # self-closing tag, matching every other FatalError report
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    my $db_level = "section";

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # reopen the same handle for the second pass; the build processor
        # was given this handle above and keeps writing to it
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "$passes_command -T2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
269
270
# Extra step for index building: define the final field lists by
# delegating to make_final_field_list.
sub build_indexes_extra {
    my ($self) = @_;

    return $self->make_final_field_list();
}
[5617]276
# Creates directory names for each of the index descriptions.
#
# Parameter:
#   $indexes - array ref of index specifications "fields:subcollection:languages"
#
# Returns a hash ref holding, for every index specification, the directory
# name chosen for it, plus:
#   indexmap / subcollectionmap / languagemap - part => processed short name
#   indexmaporder / subcollectionmaporder / languagemaporder - the parts in
#       the order they were first seen
# and a direct entry for each fields / subcollection / languages part
# (used for collectionmeta later on).
sub create_index_mapping {
    my ($self, $indexes) = @_;

    my %mapping = (
        'indexmaporder'         => [],
        'subcollectionmaporder' => [],
        'languagemaporder'      => [],
    );

    # directory names already taken, seeded with the mandatory ones so
    # that no index can collide with them
    my %taken_dirs = ('text' => 'text', 'extra' => 'extra');
    # short name => original part, handed to make_unique on a collision
    my %short_names = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # we only ever have one index, and its called 'idx'
        my $index_part = 'idx';
        # then processed, lower-cased versions of the subcollection and
        # the language, if there are any
        my $sub_part = lc ($self->process_field ($subcollection));
        my $lang_part = lc ($self->process_field ($languages));

        my $dirname = $index_part . $sub_part . $lang_part;

        # check to be sure all index names are unique
        $dirname = $self->make_unique (\%short_names, $index, \$index_part, \$sub_part, \$lang_part)
            while defined ($taken_dirs{$dirname});

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps, and the individual
        # parts too - the full index name (eg text:subcol:lang) is not
        # used on the query page
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $index_part;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            $mapping{"$fields"} = $index_part unless defined $mapping{"$fields"};
        }
        if ($sub_part =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $sub_part;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $sub_part;
        }
        if ($lang_part =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $lang_part;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $lang_part;
        }

        $taken_dirs{$dirname} = $index;
        $short_names{'index'}->{$index_part} = "$fields";
        $short_names{'subcollection'}->{$sub_part} = $subcollection;
        $short_names{'languages'}->{$lang_part} = $languages;
    }

    return \%mapping;
}
346
# Resolve a directory-name collision for an index specification.
#
# Parameters:
#   $namehash - { index => {...}, subcollection => {...}, languages => {...} },
#               each mapping a short name to the part it was created for
#   $index    - the "fields:subcollection:languages" specification
#   $indexref, $subref, $langref - references to the three short names
#
# The first short name (index, then subcollection, then languages) that is
# recorded for a different part is advanced with get_next_version; at most
# one is changed per call.  Returns the three short names concatenated.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @candidates = (
        [ 'index',         $indexref, "$fields" ],
        [ 'subcollection', $subref,   $subcollection ],
        [ 'languages',     $langref,  $languages ],
    );

    foreach my $candidate (@candidates) {
        my ($kind, $nameref, $wanted) = @$candidate;
        if ($namehash->{$kind}->{$$nameref} ne $wanted) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
361
362
# Build one mgpp index.
#
# Parameter:
#   $index - index specification "fields:subcollection:languages"; its
#            directory name is looked up in $self->{'index_mapping'}
#
# Steps (external programs are skipped in debug mode, where the build
# processor writes to STDOUT instead):
#   1. pipe the documents into "mgpp_passes -I1" (index dictionary);
#   2. if "<prefix>.id" was not produced, mark the index in
#      $self->{'notbuilt'} and return;
#   3. mgpp_perf_hash_build, then pipe the documents into "mgpp_passes -I2";
#   4. mgpp_weights_build, mgpp_invf_dict, and mgpp_stem_idx for each
#      enabled combination of casefold / stem / accentfold;
#   5. scan the index directory for files with unwanted suffixes.
#
# Progress goes to $self->{'outhandle'} (according to verbosity) and, when
# $self->{'gli'} is set, as XML-style tags to STDERR.  Dies if a required
# executable is missing, a pipe cannot be opened, or the index directory
# cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
                                               $indexdir,
                                               $collect_tail);

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";

    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");

    # define the section names for mgpasses
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";
    if ($self->{'levels'}->{'paragraph'}) {
        $mgpp_passes_sections .= "-K " . $level_map{'paragraph'}. " ";
    }

    my $mgpp_perf_hash_build_exe =
        &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    my $maxnumeric = $self->{'maxnumeric'};

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcoll_name (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $language_metadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    # each entry is passed through unchanged, including any leading "!"
    # negation marker
    my $langarr = [ @languages ];

    # report a missing helper program to the GLI (if active) and die
    my $require_exe = sub {
        my ($exe_path, $error_name) = @_;
        return if -e "$exe_path";
        print STDERR "<FatalError name='$error_name'/>\n</Stage>\n" if $self->{'gli'};
        die "mgppbuilder::build_index - couldn't run $exe_path\n";
    };

    # (re)open $handle as a pipe into mgpp_passes for the given pass flag
    my ($handle);
    my $open_passes = sub {
        my ($pass_flag) = @_;
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\" $pass_flag $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    };

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        $open_passes->("-I1");
    }

    # db_level is always section
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if
    # not we stop building this index so the whole process doesn't fail.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        $require_exe->($mgpp_perf_hash_build_exe, 'NoRunMGHash');
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        $open_passes->("-I2");
    }

    # invert the text
    print $outhandle "\n inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        $require_exe->($mgpp_weights_build_exe, 'NoRunMGWeights');
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        $require_exe->($mgpp_invf_dict_exe, 'NoRunMGInvf');
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        $require_exe->($mgpp_stem_idx_exe, 'NoRunMGStem');

        # run mgpp_stem_idx for one stem method number; returns system()'s value
        my $run_stem_idx = sub {
            my ($method) = @_;
            return system ("mgpp_stem_idx$exe -b 4096 -s$method -f \"$fullindexprefix\" $osextra");
        };

        my $accent_folding_enabled = 1;
        if ($self->{'accentfold'}) {
            # the first time we do this, we test for accent folding enabled
            # NOTE(review): system() returns the raw wait status, so this
            # matches a status word of 2, not an exit code of 2 (which
            # would be 512) - confirm what mgpp_stem_idx reports here.
            if ($run_stem_idx->(4) == 2) {
                # accent folding has not been enabled in mgpp
                $accent_folding_enabled = 0;
                $self->{'stemindexes'} -= 4;
            }
        }
        my $with_accents = $accent_folding_enabled && $self->{'accentfold'};

        if ($self->{'casefold'}) {
            $run_stem_idx->(1);
            $run_stem_idx->(5) if $with_accents;
        }
        if ($self->{'stem'}) {
            $run_stem_idx->(2);
            $run_stem_idx->(6) if $with_accents;
        }
        if ($self->{'casefold'} && $self->{'stem'}) {
            $run_stem_idx->(3);
            $run_stem_idx->(7) if $with_accents;
        }

        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        # lexical directory handle, so no package-global handle is clobbered
        opendir (my $dirhandle, $tmpdir) or die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it! (the removal itself is currently disabled, so
                # the file is only reported)
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir ($dirhandle);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
593
[15709]594
# Add default display names for index fields, levels, subcollections and
# languages to the collection info database entry.
#
# Parameters:
#   $collection_infodb - hash ref to fill in; each value is a one-element
#                        array ref
#
# A default is only added for items that have no ".<name>" entry in the
# collectionmeta section of the collection configuration.
sub get_collection_meta_indexes
{
    my ($self, $collection_infodb) = @_;

    # define the indexed field mapping if not already done so (ie if
    # infodb called separately from build_index)
    $self->read_final_field_list() if !defined $self->{'build_cfg'};

    # first do the collection meta stuff - everything without a dot
    my $collmeta_cfg = $self->{'collect_cfg'}->{'collectionmeta'};
    my $collmeta_defined = defined $collmeta_cfg ? 1 : 0;

    # true when the collection configuration already names this item
    my $has_collmeta = sub {
        my ($key) = @_;
        return $collmeta_defined && defined $collmeta_cfg->{$key};
    };

    # add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined, uses
    # the metadata name
    my %query_macro_for = (
        'allfields' => "_query:textallfields_",
        'text'      => "_query:texttextonly_",
    );
    foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}) {
        my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
        next if $shortfield eq 1;

        # don't output any that have coll meta defined
        next if $has_collmeta->(".$longfield");

        my $display = $longfield;
        $display = $query_macro_for{$longfield} if exists $query_macro_for{$longfield};
        $collection_infodb->{$shortfield} = [ $display ];
    }

    # now add the level names
    foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
        my $collmeta_key = ".$level"; # based on the name as it stands before lower-casing
        $level =~ tr/A-Z/a-z/;        # make it lower case (in place)
        my $levelid = $level_map{$level}; # find the actual value we used in the index
        if (!$has_collmeta->($collmeta_key)) {
            # use the default macro
            $collection_infodb->{$levelid} = [ $level_map{$levelid} ];
        }
    }

    # now add subcoll meta
    foreach my $subcoll (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        my $shortname = $self->{'index_mapping'}->{$subcoll};
        if (!$has_collmeta->(".$subcoll")) {
            $collection_infodb->{$shortname} = [ $subcoll ];
        }
    }

    # now add language meta
    foreach my $lang (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        my $shortname = $self->{'index_mapping'}->{$lang};
        if (!$has_collmeta->(".$lang")) {
            $collection_infodb->{$shortname} = [ $lang ];
        }
    }
}
[932]667
[15709]668
# Write the "collection" entry of the info database: the entry is filled
# by get_collection_meta_sets and get_collection_meta_indexes and then
# written with dbutil::write_infodb_entry.
#
# Parameters:
#   $infodb_handle - handle of the info database being written
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    $self->get_collection_meta_indexes($collection_infodb);

    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
[7150]680
[15709]681
# At the end of building, the build processor holds an indexfieldmap with
# all the mappings (plus some extras).  From it and the index definition
# this makes an ordered list of the fields that are indexed and a list of
# the "field->code" mappings in use, and stores both in
# $self->{'build_cfg'} ('indexfields' and 'indexfieldmap') for the
# build.cfg file and the collection meta definition.
sub make_final_field_list {
    my ($self) = @_;

    $self->{'build_cfg'} = {};

    # go through the index definition and note each field, in order, so we
    # can easily check if it is already specified - when expanding
    # 'metadata' below we list all the individual fields, and must not add
    # again those already named in the index definition.
    my %is_specified = ();
    my @specified_order = ();
    foreach my $indexspec (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        (my $fieldpart = $indexspec) =~ s/:.*$//;
        foreach my $name (split(';', $fieldpart)) {
            next if defined $is_specified{$name};
            $is_specified{$name} = 1;
            push (@specified_order, "$name");
        }
    }

    my $ifm = $self->{'buildproc'}->{'indexfieldmap'};

    # fields whose code is fixed rather than looked up
    my %fixed_code_for = ('text' => 'TX', 'allfields' => 'ZZ');

    my @indexfieldmap = ();
    my @indexfields = ();
    foreach my $name (@specified_order) {
        if ($name eq "metadata") {
            # every processed field that was not named explicitly
            foreach my $newfield (keys %{$self->{'buildproc'}->{'indexfields'}}) {
                next if defined $is_specified{$newfield};
                push (@indexfieldmap, $newfield . "->" . $self->{'buildproc'}->{'indexfieldmap'}->{$newfield});
                push (@indexfields, "$newfield");
            }
        } elsif (exists $fixed_code_for{$name}) {
            push (@indexfieldmap, $name . "->" . $fixed_code_for{$name});
            push (@indexfields, $name);
        } elsif (defined $ifm->{$name}) {
            # we only add in the ones that have been processed
            push (@indexfieldmap, $name . "->" . $ifm->{$name});
            push (@indexfields, "$name");
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
753
754
# Recreate the field list from a previously written build configuration,
# as returned by read_build_cfg.  If there is none, $self->{'build_cfg'}
# is left as an empty hash (we can't do the field list, and there is
# unlikely to be any index anyway).
#
# Otherwise 'indexfields', 'indexfieldmap' and 'indexmap' are copied into
# $self->{'build_cfg'}, and every "field->code" entry of indexfieldmap is
# also recorded in the build processor's indexfieldmap.
sub read_final_field_list {
    my ($self) = @_;

    $self->{'build_cfg'} = {};

    # we read the stuff in from the build.cfg file - if its there
    my $buildcfg = $self->read_build_cfg();
    return unless defined $buildcfg;

    # copy one list-valued setting, or give an empty list if it is absent
    my $copy_list = sub {
        my ($key) = @_;
        my @copy = ();
        if (defined $buildcfg->{$key}) {
            @copy = map { "$_" } @{$buildcfg->{$key}};
        }
        return \@copy;
    };

    my $indexfields   = $copy_list->('indexfields');
    my $indexfieldmap = $copy_list->('indexfieldmap');
    my $indexmap      = $copy_list->('indexmap');

    # make the field -> code mappings available to the build processor
    foreach my $entry (@$indexfieldmap) {
        my ($long, $short) = $entry =~ /^(.*)\-\>(.*)$/;
        $self->{'buildproc'}->{'indexfieldmap'}->{$long} = $short;
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = $indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = $indexfields;
    $self->{'build_cfg'}->{'indexmap'} = $indexmap;
}
[10158]794
[10468]795
# Add the mgpp-specific settings to the build configuration.
#
# Parameter:
#   $build_cfg - hash ref to extend; receives
#       numsections - section count reported by the build processor
#       indexlevels - index tag of each level, in level order
#       levelmap    - "level->tag" strings, in level order
#       textlevel   - always the section tag
sub build_cfg_extra {
    my ($self, $build_cfg) = @_;

    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();

    # store the level info
    my (@indexlevels, @levelmap);
    foreach my $level_name (@{$self->{'levelorder'}}) {
        my $tag = $level_map{$level_name};
        push (@indexlevels, $tag);
        push (@levelmap, $level_name . "->" . $tag);
    }
    $build_cfg->{'indexlevels'} = \@indexlevels;
    $build_cfg->{'levelmap'} = \@levelmap;

    # text level (and database level) is always section
    $build_cfg->{'textlevel'} = $level_map{'section'};
}
816
8171;
818
819
Note: See TracBrowser for help on using the repository browser.