source: main/trunk/greenstone2/perllib/mgppbuilder.pm@ 28215

Last change on this file since 28215 was 28215, checked in by kjdon, 11 years ago

Assume that the absence of an indexoptions line in the config file really means no index options. Otherwise, when we unset them all in GLI, they all magically remain in effect.

  • Property svn:keywords set to Author Date Id Revision
File size: 32.1 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use basebuilder;
29use colcfg;
30use plugin;
31use strict; no strict 'refs';
32use util;
33use FileUtils;
34
35
# Set up inheritance at compile time: mgppbuilder derives from basebuilder.
BEGIN {
    @mgppbuilder::ISA = qw(basebuilder);
}
39
40
41
# Two lookups share this one table:
#   - level name as written in the collection config -> short tag that is
#     handed to mgpp_passes and stored in the build config
#   - short tag -> macro used as the default display name for that level
our %level_map = (
    document  => 'Doc',
    section   => 'Sec',
    paragraph => 'Para',
    Doc       => '_textdocument_',
    Sec       => '_textsection_',
    Para      => '_textparagraph_',
);
48
# File suffixes that make up a finished mgpp index.  build_index checks the
# files in an index directory against this set.
our %wanted_index_files = map { $_ => 1 } qw(
    td t tl ti
    idb ib1 ib2 ib3 ib4 ib5 ib6 ib7
    i il
    w wa
);


# Maximum document size, as configured in basebuilder.
my $maxdocsize = $basebuilder::maxdocsize;
68
# Constructor.
#
# Arguments: everything after the class name is passed unchanged to the
#            basebuilder constructor.
# Returns:   the builder object, re-blessed into $class, with these fields
#            filled in:
#              levels     - hash ref, lower-cased level name => 1
#              levelorder - array ref, level names in configuration order
#              buildtype  - "mgpp"
sub new {
    my $class = shift(@_);

    # Explicit class-method call; the indirect-object form
    # ("new basebuilder (...)") is ambiguous to the parser.
    my $self = basebuilder->new(@_);
    $self = bless $self, $class;

    #$self->{'indexfieldmap'} = \%static_indexfield_map;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    # Must be an array ref: assigning an empty list to a scalar slot
    # stores undef, not an empty array.
    $self->{'levelorder'} = [];

    if (defined $self->{'collect_cfg'}->{'levels'}) {
        # $level aliases the configuration entry, so the level names held
        # in collect_cfg are lower-cased in place too.
        foreach my $level (@{ $self->{'collect_cfg'}->{'levels'} }) {
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push(@{ $self->{'levelorder'} }, $level);
        }
    }
    else {
        # default to document
        $self->{'levels'}->{'document'} = 1;
        push(@{ $self->{'levelorder'} }, 'document');
    }

    $self->{'buildtype'} = "mgpp";

    return $self;
}
95
# Collapses the list of configured indexes into the single combined index
# that mgpp uses.
#
# Reads  $self->{'collect_cfg'}->{'indexes'} (array ref of index specs).
# Writes the same slot back as a one-element array ref holding all specs
# joined with ';' and terminated by ';'.  Does nothing when no indexes are
# configured.
#
# While joining, a leading "ex." namespace is dropped from a metadata name,
# but only when it is the sole namespace (ex.Title -> Title; ex.dc.Title
# and flex.Image are left alone).
sub generate_index_list {
    my $self = shift(@_);

    # indexes are specified with spaces, but we put them into one index
    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    return unless defined $indexes;

    my @cleaned = ();
    foreach my $spec (@$indexes) {
        my $index = $spec;    # copy, so the caller's array is not edited

        # Put a space after every separator so that each metadata name in
        # a list starts either at the string start or after a space; the
        # /g match below consumes the separator that follows a name.
        $index =~ s/([,;])/$1 /g;

        # Strip "ex." where it is the only namespace of the name.
        $index =~ s/(^| )ex\.([^.,:]+)(,|;|$)/$1$2$3/g;

        # Take the helper spaces out again.  This used to test for ',' and
        # ':' only, which left the space inserted after ';' in place; ':'
        # is still accepted so earlier results do not change.
        $index =~ s/([,;:]) /$1/g;

        push(@cleaned, $index);
    }

    $self->{'collect_cfg'}->{'indexes'} = [ join(';', @cleaned) . ";" ];
}
116
# Works out which of the stem / casefold / accentfold options are switched
# on for this collection, after letting the parent class do its part.
#
# Sets $self->{'stem'}, {'casefold'} and {'accentfold'} to 0 or 1, and
# $self->{'stemindexes'} to the sum casefold=1, stem=2, accentfold=4.
sub generate_index_options {
    my ($self) = @_;

    $self->SUPER::generate_index_options();

    # everything is off unless the configuration says otherwise
    foreach my $flag (qw(casefold stem accentfold)) {
        $self->{$flag} = 0;
    }

    if (defined($self->{'collect_cfg'}->{'indexoptions'})) {
      OPTION:
        foreach my $option (@{ $self->{'collect_cfg'}->{'indexoptions'} }) {
            # an option switches on the first flag it matches, tried in
            # this order, and nothing else
            foreach my $flag (qw(stem casefold accentfold)) {
                if ($option =~ /$flag/) {
                    $self->{$flag} = 1;
                    next OPTION;
                }
            }
        }
    }

    # now we record this for the build cfg
    $self->{'stemindexes'} = ($self->{'casefold'} ? 1 : 0)
                           + ($self->{'stem'}     ? 2 : 0);
    if ($self->{'accentfold'}) {
        $self->{'stemindexes'} += 4;
    }
}
151
# Name of the build processor used when building with mgpp.
sub default_buildproc {
    my ($self) = @_;
    return 'mgppbuildproc';
}
157
# Creates the compressed text for the collection.
#
# The documents are fed through the build processor twice: once into
# "mgpp_passes -T1" to collect text statistics and, after
# mgpp_compression_dict has been run, once into "mgpp_passes -T2" to
# compress the text.
#
# Arguments:
#   $textindex - index specification handed to the build processor
#
# Behaviour:
#   - returns immediately when $self->{'no_text'} is set;
#   - in debug mode no external program is run and the build processor
#     writes to STDOUT instead;
#   - if mgpp_compression_dict returns a non-zero status,
#     $self->{'notbuilt'}->{'compressedtext'} is set and the sub returns
#     early;
#   - dies when mgpp_passes or mgpp_compression_dict is missing or cannot
#     be started.
sub compress_text {
    my $self = shift(@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe();
    my $mgpp_passes_exe = &FileUtils::filenameConcatenate($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &FileUtils::filenameConcatenate($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    my $maxnumeric = $self->{'maxnumeric'};

    &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($self->{'build_dir'}, "text"));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $basefilename = &FileUtils::filenameConcatenate("text", $collect_tail);
    my $fulltextprefix = &FileUtils::filenameConcatenate($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them

    # always use Doc and Sec levels
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";

    # Opens a pipe to mgpp_passes for the given pass flag ("-T1" or "-T2")
    # and returns the handle.  Reports a fatal error and dies when the
    # executable is missing or the pipe cannot be opened.
    my $open_mgpp_passes = sub {
        my ($pass) = @_;
        my $pipe;
        if (!-e "$mgpp_passes_exe" ||
            !open($pipe, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\" $pass $osextra")) {
            # The tag is self-closing, like every other FatalError tag
            # emitted by this module.
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
        return $pipe;
    };

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    print $outhandle "\n collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        $handle = $open_mgpp_passes->("-T1");
    }

    my $db_level = "section";

    $self->{'buildproc'}->set_output_handle($handle);
    $self->{'buildproc'}->set_mode('text');
    $self->{'buildproc'}->set_index($textindex);
    $self->{'buildproc'}->set_indexing_text(0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels($self->{'levels'});
    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        my $comp_dict_status = system("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");
        if ($comp_dict_status != 0) {
            print $outhandle "\nmgppbuilder::compress_text - Warning: there's no compressed text\n";
            $self->{'notbuilt'}->{'compressedtext'} = 1;
            print STDERR "<Warning name='NoCompressedText'/>\n</Stage>\n" if $self->{'gli'};
            return;
        }

        # we are already on the non-debug path here, so the pipe for the
        # second pass is always needed
        $handle = $open_mgpp_passes->("-T2");
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->set_output_handle($handle);
    $self->{'buildproc'}->reset();

    # compress the text
    print $outhandle "\n compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
284
285
# Hook run once the indexes have been built: works out the final lists of
# indexed fields.
sub post_build_indexes {
    my ($self) = @_;

    # define the final field lists
    return $self->make_final_field_list();
}
292
293# creates directory names for each of the index descriptions
# Creates a directory name for each index description.
#
# Arguments:
#   $indexes - array ref of index specs of the form
#              "fields[:subcollection[:languages]]"
#
# Returns a hash ref.  It is empty when $indexes is empty; otherwise it
# holds
#   <index spec>                         => directory name
#   indexmap / subcollectionmap / languagemap
#                                        => original text => processed name
#   indexmaporder / subcollectionmaporder / languagemaporder
#                                        => array refs, first-seen order
#   <fields>, <subcollection>, <languages>
#                                        => processed name (top-level keys)
sub create_index_mapping {
    my ($self, $indexes) = @_;

    my %mapping = ();

    if (!(scalar @$indexes)) {
        return \%mapping;
    }

    foreach my $order (qw(indexmaporder subcollectionmaporder languagemaporder)) {
        $mapping{$order} = [];
    }

    # dirnames is used to check for collisions; it starts off holding the
    # mandatory directory names
    my %dirnames = (text => 'text', extra => 'extra');
    my %pnames   = (index => {}, subcollection => {}, languages => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split(":", $index);

        # we only ever have one index, and it is called 'idx'
        my $pindex = 'idx';

        # processed, lower-cased subcollection and language parts
        my $psub  = lc($self->process_field($subcollection));
        my $plang = lc($self->process_field($languages));

        my $dirname = $pindex . $psub . $plang;

        # keep adjusting the name until it collides with nothing so far
        # NOTE(review): this relies on make_unique changing one of the three
        # parts on every call - confirm for repeated identical index specs.
        while (defined($dirnames{$dirname})) {
            $dirname = $self->make_unique(\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # Store the maps and their orders.  The individual index,
        # subcollection and language parts are recorded as well as the full
        # spec, because the full name (e.g. text:subcol:lang) is not what
        # the query page uses; these feed the collection metadata later.
        my $fieldkey = "$fields";
        if (!defined $mapping{'indexmap'}{$fieldkey}) {
            $mapping{'indexmap'}{$fieldkey} = $pindex;
            push(@{ $mapping{'indexmaporder'} }, $fieldkey);
            $mapping{$fieldkey} = $pindex unless defined $mapping{$fieldkey};
        }
        if ($psub =~ /\w/ && !defined($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push(@{ $mapping{'subcollectionmaporder'} }, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push(@{ $mapping{'languagemaporder'} }, $languages);
            $mapping{$languages} = $plang;
        }

        # remember what has been used, for later collision checks
        $dirnames{$dirname}                 = $index;
        $pnames{'index'}->{$pindex}         = $fieldkey;
        $pnames{'subcollection'}->{$psub}   = $subcollection;
        $pnames{'languages'}->{$plang}      = $languages;
    }

    return \%mapping;
}
364
# Resolves a directory-name collision for one index spec.
#
# Arguments:
#   $namehash  - hash ref with 'index', 'subcollection' and 'languages'
#                sub-hashes mapping a processed name to the original text
#                it was made from
#   $index     - the index spec, "fields:subcollection:languages"
#   $indexref, $subref, $langref
#              - scalar refs to the three processed name parts
#
# The first part (index, then subcollection, then languages) whose
# processed name is already registered for different original text is
# moved on to its next version through get_next_version.  At most one part
# is changed per call.
#
# Returns the three parts concatenated.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split(":", $index);

    my @parts = (
        [ 'index',         $indexref, "$fields"      ],
        [ 'subcollection', $subref,   $subcollection ],
        [ 'languages',     $langref,  $languages     ],
    );

    foreach my $part (@parts) {
        my ($kind, $nameref, $original) = @$part;
        if ($namehash->{$kind}->{$$nameref} ne $original) {
            $self->get_next_version($nameref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
379
380
# Builds one mgpp index.
#
# Arguments:
#   $index - index spec "fields[:subcollection[:language]]"; it must have
#            an entry in $self->{'index_mapping'} giving its directory
#
# Steps: "mgpp_passes -I1" (index dictionary), mgpp_perf_hash_build,
# "mgpp_passes -I2" (inversion), mgpp_weights_build, mgpp_invf_dict and
# then mgpp_stem_idx for each enabled stemming method.
#
# Behaviour:
#   - in debug mode no external program is run; the build processor writes
#     to STDOUT;
#   - when a step shows that the index cannot be built,
#     $self->{'notbuilt'}->{$index} is set and the sub returns early;
#   - when a stemming method fails, the matching flag ('casefold', 'stem',
#     'accentfold') is cleared and $self->{'stemindexes'} is reduced;
#   - dies when a required executable is missing, a pipe cannot be opened,
#     or the index directory cannot be read.
sub build_index {
    my $self = shift(@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($self->{'build_dir'}, $indexdir));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $fullindexprefix = &FileUtils::filenameConcatenate($self->{'build_dir'},
                                                          $indexdir,
                                                          $collect_tail);

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";

    my $exe = &util::get_os_exe();
    my $mgpp_passes_exe = &FileUtils::filenameConcatenate($exedir, "mgpp_passes$exe");

    # define the section names for mgpasses
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";
    if ($self->{'levels'}->{'paragraph'}) {
        $mgpp_passes_sections .= "-K " . $level_map{'paragraph'}. " ";
    }

    my $mgpp_perf_hash_build_exe =
        &FileUtils::filenameConcatenate($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &FileUtils::filenameConcatenate($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &FileUtils::filenameConcatenate($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &FileUtils::filenameConcatenate($exedir, "mgpp_stem_idx$exe");

    my $maxnumeric = $self->{'maxnumeric'};

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # Opens a pipe to mgpp_passes for the given pass flag ("-I1" or "-I2")
    # and returns the handle; reports a fatal error and dies on failure.
    my $open_mgpp_passes = sub {
        my ($pass) = @_;
        my $pipe;
        if (!-e "$mgpp_passes_exe" ||
            !open($pipe, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\" $pass $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
        return $pipe;
    };

    # Runs mgpp_stem_idx for one stem method number (passed as -s<N>) and
    # returns the status that system() gives back.
    my $run_stem_idx = sub {
        my ($method) = @_;
        return system("mgpp_stem_idx$exe -b 4096 -s$method -f \"$fullindexprefix\" $osextra");
    };

    # Records that this index could not be built.
    my $give_up = sub {
        my ($message) = @_;
        print $outhandle $message;
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index} = 1;
        return;
    };

    # there may be subcollection info, and language info.
    my (undef, $subcollection, $language) = split(":", $index);

    # get the index expressions if this index belongs to subcollections
    my $indexexparr = [];
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);
    foreach my $subcoll (@subcollections) {
        if (defined($self->{'collect_cfg'}->{'subcollection'}->{$subcoll})) {
            push(@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $languagemetadata = "Language";
    if (defined($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    # Each entry is passed on as written, including any leading "!".
    my $langarr = [ @languages ];

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        $handle = $open_mgpp_passes->("-I1");
    }

    # db_level is always section
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle($handle);
    $self->{'buildproc'}->set_mode('text');
    $self->{'buildproc'}->set_index($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text(1);
    $self->{'buildproc'}->set_levels($self->{'levels'});
    $self->{'buildproc'}->set_db_level($db_level);

    $self->{'buildproc'}->reset();

    &plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close($handle) unless $self->{'debug'};

    $self->print_stats();

    # Check that the required files have been produced - if not, stop
    # building this index so the whole process doesn't fail.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        return $give_up->("mgppbuilder::build_index - Couldn't create index $index\n");
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        my $hash_cmd = "mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra";
        print $outhandle "\ncmd: $hash_cmd\n" if ($self->{'verbosity'} >= 4);

        my $hash_status = system($hash_cmd);
        print $outhandle "\nstatus from running hash_cmd: $hash_status\n" if ($self->{'verbosity'} >= 4);
        # check that perf hash was generated - if not, don't carry on
        if ($hash_status != 0) {
            return $give_up->("mgppbuilder::build_index - Couldn't create index $index as there are too few words in the index.\n");
        }

        $handle = $open_mgpp_passes->("-I2");
    }

    # invert the text
    print $outhandle "\n inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};

    $self->{'buildproc'}->set_output_handle($handle);
    $self->{'buildproc'}->reset();

    &plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    $self->print_stats();

    if (!$self->{'debug'}) {

        close($handle);
        my $passes_exit_status = $?;
        print $outhandle "\nMGPP Passes exit status $passes_exit_status\n" if ($self->{'verbosity'} >= 4);

        # create the weights file
        print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        my $weights_cmd = "mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra";
        print $outhandle "\ncmd: $weights_cmd\n" if ($self->{'verbosity'} >= 4);
        my $weights_status = system($weights_cmd);
        # check that it worked - if not, don't carry on
        if ($weights_status != 0) {
            return $give_up->("mgppbuilder::build_index - No Index: couldn't create weights file, error calling mgpp_weights_build.\n");
        }

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        my $invdict_status = system("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra");
        # check that it worked - if not, don't carry on
        if ($invdict_status != 0) {
            return $give_up->("mgppbuilder::build_index - No Index: couldn't create on-disk stemmed dictionary, error calling mgpp_invf_dict.\n");
        }

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }

        my $accent_folding_enabled = 1;
        if ($self->{'accentfold'}) {
            # the first time we do this, we test for accent folding enabled
            my $accent_status = $run_stem_idx->(4);
            # system() hands back the raw wait status, so the program's own
            # exit code is in the high byte.  The plain "== 2" comparison
            # is kept so that whatever matched before still matches.
            if (($accent_status >> 8) == 2 || $accent_status == 2) {
                # accent folding has not been enabled in mgpp
                $accent_folding_enabled = 0;
                $self->{'stemindexes'} -= 4;
            }
            elsif ($accent_status != 0) {
                print $outhandle "\nAccent folding failed: mgpp_stem_idx exit status $accent_status\n" if ($self->{'verbosity'} >= 4);
                $self->{'accentfold'} = 0;
                #$accent_folding_enabled = 0;
                $self->{'stemindexes'} -= 4;
            }
        }
        if ($self->{'casefold'}) {
            my $casefold_status = $run_stem_idx->(1);
            if ($casefold_status != 0) {
                print $outhandle "\nCase folding failed: mgpp_stem_idx exit status $casefold_status\n" if ($self->{'verbosity'} >= 4);
                $self->{'casefold'} = 0;
                $self->{'stemindexes'} -= 1;
            }
            elsif ($accent_folding_enabled && $self->{'accentfold'}) {
                my $status = $run_stem_idx->(5);
                if ($status != 0) {
                    print $outhandle "\nAccent folding (with casefolding) failed: mgpp_stem_idx exit status $status\n" if ($self->{'verbosity'} >= 4);
                    $self->{'accentfold'} = 0;
                    $self->{'stemindexes'} -= 4; # casefold worked, only accentfold failed, so -= 4, not -= 5
                }
            }
        }
        if ($self->{'stem'}) {
            my $stem_status = $run_stem_idx->(2);
            if ($stem_status != 0) {
                print $outhandle "\nStemming failed: mgpp_stem_idx exit status $stem_status\n" if ($self->{'verbosity'} >= 4);
                $self->{'stem'} = 0;
                $self->{'stemindexes'} -= 2;
            }
            elsif ($accent_folding_enabled && $self->{'accentfold'}) {
                my $status = $run_stem_idx->(6);
                if ($status != 0) {
                    print $outhandle "\nAccent folding (with stemming) failed: mgpp_stem_idx exit status $status\n" if ($self->{'verbosity'} >= 4);
                    $self->{'accentfold'} = 0;
                    $self->{'stemindexes'} -= 4; # stem worked, only accentfold failed, so -= 4, not -= 6
                }
            }
        }
        if ($self->{'casefold'} && $self->{'stem'}) {
            my $case_and_stem_status = $run_stem_idx->(3);
            if ($case_and_stem_status != 0) {
                print $outhandle "\nCasefolding and stemming failed: mgpp_stem_idx exit status $case_and_stem_status\n" if ($self->{'verbosity'} >= 4);
                $self->{'stem'} = 0;
                $self->{'casefold'} = 0;
                $self->{'stemindexes'} -= 3;
            }
            elsif ($accent_folding_enabled && $self->{'accentfold'}) {
                my $status = $run_stem_idx->(7);
                if ($status != 0) {
                    print $outhandle "\nAccent folding (with stemming and casefolding) failed: mgpp_stem_idx exit status $status\n" if ($self->{'verbosity'} >= 4);
                    $self->{'accentfold'} = 0;
                    $self->{'stemindexes'} -= 4; # casefold and stem worked, only accentfold failed, so -= 4, not -= 7
                }
            }
        }

        # remove unwanted files
        # NOTE: the actual removal is commented out below, so at present
        # this only reports the files that are not in %wanted_index_files.
        my $tmpdir = &FileUtils::filenameConcatenate($self->{'build_dir'}, $indexdir);
        opendir(my $dirhandle, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&FileUtils::filenameConcatenate($tmpdir, $file));
            }
        }
        closedir($dirhandle);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
683
684
# Adds default display names for index fields, levels, subcollections and
# languages to the collection's infodb entry.
#
# Arguments:
#   $collection_infodb - hash ref to fill in; each value is a one-element
#                        array ref
#
# A default is written only for items that have no ".<name>" entry in the
# collection configuration's collectionmeta.
sub get_collection_meta_indexes
{
    my ($self, $collection_infodb) = @_;

    # define the indexed field mapping if not already done so
    # (i.e. if infodb called separately from build_index)
    $self->read_final_field_list() if !defined $self->{'build_cfg'};

    my $collmetadefined = (defined $self->{'collect_cfg'}->{'collectionmeta'}) ? 1 : 0;

    # True when the collection configuration supplies its own text for the
    # given collectionmeta key, in which case no default is written.
    my $has_collmeta = sub {
        my ($key) = @_;
        return $collmetadefined
            && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$key};
    };

    # add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined, uses the
    # metadata name
    if (defined $self->{'build_cfg'}->{'extraindexfields'}) {
        foreach my $longfield (@{ $self->{'build_cfg'}->{'extraindexfields'} }) {
            my $shortfield = $self->{'buildproc'}->{'fieldnamemap'}->{$longfield};
            next if $shortfield eq 1;

            next if $has_collmeta->(".$longfield");

            my $text = $longfield eq "allfields" ? "_query:textallfields_"
                     : $longfield eq "text"      ? "_query:texttextonly_"
                     :                             $longfield;
            $collection_infodb->{$shortfield} = [ $text ];
        }
    }

    # now add the level names
    foreach my $level (@{ $self->{'collect_cfg'}->{'levels'} }) {
        my $metakey = ".$level";    # based on the original specification
        $level =~ tr/A-Z/a-z/;      # lower case (edits the config entry in place)
        my $levelid = $level_map{$level};   # the value we used in the index
        if (!$has_collmeta->($metakey)) {
            # use the default macro
            $collection_infodb->{$levelid} = [ $level_map{$levelid} ];
        }
    }

    # now add subcoll meta
    foreach my $subcoll (@{ $self->{'index_mapping'}->{'subcollectionmaporder'} }) {
        my $shortname = $self->{'index_mapping'}->{$subcoll};
        if (!$has_collmeta->(".$subcoll")) {
            $collection_infodb->{$shortname} = [ $subcoll ];
        }
    }

    # now add language meta
    foreach my $lang (@{ $self->{'index_mapping'}->{'languagemaporder'} }) {
        my $shortname = $self->{'index_mapping'}->{$lang};
        if (!$has_collmeta->(".$lang")) {
            $collection_infodb->{$shortname} = [ $lang ];
        }
    }
}
760
761
762# default is to output the metadata sets (prefixes) used in collection
# Writes the "collection" entry to the info database.
#
# Arguments:
#   $infodb_handle - handle passed on to dbutil::write_infodb_entry
#
# The entry is filled by get_collection_meta_sets and then by
# get_collection_meta_indexes.
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    $self->get_collection_meta_indexes($collection_infodb);
    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
773
774
775# at the end of building, we have an indexfieldmap with all the mappings,
776# plus some extras, and indexmap with any indexes in it that weren't
777# specified in the index definition. We want to make an ordered list of
778# fields that are indexed, and a list of mappings that are used. This will
779# be used for the build.cfg file, and for collection meta definition we
780# store these in a build.cfg bit
# Works out the ordered list of indexed fields and their short codes.
#
# Resets $self->{'build_cfg'} to an empty hash and then, when there is
# anything to record, sets
#   build_cfg->{'indexfieldmap'} - array ref of "field->CODE" strings
#   build_cfg->{'indexfields'}   - array ref of field names
#
# Fields are taken in the order they appear in the configured indexes:
#   "metadata"  expands to every extra index field seen by the build
#               processor that is not named in the index definition itself
#   "text"      is always coded TX
#   "allfields" is always coded ZZ
#   any other   field is included only if the build processor processed it
sub make_final_field_list {
    my ($self) = @_;

    $self->{'build_cfg'} = {};

    my @indexfieldmap = ();
    my @indexfields   = ();

    # Collect every field named in the index definition, in order, so that
    # expanding "metadata" below does not add one of them a second time.
    my %specified = ();
    my @specifiedfieldorder = ();
    foreach my $spec (@{ $self->{'collect_cfg'}->{'indexes'} }) {
        # remove subcoll stuff
        (my $parts = $spec) =~ s/:.*$//;
        foreach my $name (split(';', $parts)) {
            next if defined $specified{$name};
            $specified{$name} = 1;
            push(@specifiedfieldorder, "$name");
        }
    }

    my $fnm = $self->{'buildproc'}->{'fieldnamemap'};
    my %fixed_code = (text => 'TX', allfields => 'ZZ');

    foreach my $field (@specifiedfieldorder) {
        if ($field eq "metadata") {
            foreach my $newfield (keys %{ $self->{'buildproc'}->{'extraindexfields'} }) {
                next if defined $specified{$newfield};
                push(@indexfieldmap, $newfield . '->' . $fnm->{$newfield});
                push(@indexfields, "$newfield");
            }
        }
        elsif (exists $fixed_code{$field}) {
            push(@indexfieldmap, $field . '->' . $fixed_code{$field});
            push(@indexfields, "$field");
        }
        elsif (defined $self->{'buildproc'}->{'allindexfields'}->{$field}) {
            # we only add in the ones that have been processed
            push(@indexfieldmap, $field . '->' . $fnm->{$field});
            push(@indexfields, "$field");
        }
    }

    if (scalar @indexfieldmap) {
        $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    }

    if (scalar @indexfields) {
        $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
    }
}
848
849
850# recreate the field list from the build.cfg file, look first in building,
851# then in index to find it. if there is no build.cfg, we can't do the field
852# list (there is unlikely to be any index anyway.)
# Recreates the field lists from an existing build.cfg, as returned by
# read_build_cfg.
#
# $self->{'build_cfg'} is reset to an empty hash first.  If no build.cfg
# can be read it stays empty; otherwise its 'indexfieldmap', 'indexfields'
# and 'indexmap' entries are set to copies of the lists in the file (empty
# lists for any that are missing).
#
# Each "field->CODE" entry of the field map is also stored in
# $self->{'buildproc'}->{'indexfieldmap'}.
# NOTE(review): other subs in this file read buildproc->{'fieldnamemap'},
# not 'indexfieldmap' - confirm which key the build processor expects.
sub read_final_field_list {
    my ($self) = @_;

    $self->{'build_cfg'} = {};
    my @indexfieldmap = ();
    my @indexfields   = ();
    my @indexmap      = ();

    # we read the stuff in from the build.cfg file - if it's there
    my $buildcfg = $self->read_build_cfg();
    return unless defined $buildcfg;

    if (defined $buildcfg->{'indexfields'}) {
        @indexfields = map { "$_" } @{ $buildcfg->{'indexfields'} };
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $entry (@{ $buildcfg->{'indexfieldmap'} }) {
            push(@indexfieldmap, "$entry");
            # split "field->CODE" into its two halves
            my ($name, $code) = $entry =~ /^(.*)\-\>(.*)$/;
            $self->{'buildproc'}->{'indexfieldmap'}->{$name} = $code;
        }
    }

    if (defined $buildcfg->{'indexmap'}) {
        @indexmap = map { "$_" } @{ $buildcfg->{'indexmap'} };
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'}   = \@indexfields;
    $self->{'build_cfg'}->{'indexmap'}      = \@indexmap;
}
889
890
# Adds the mgpp-specific entries to the build configuration.
#
# Arguments:
#   $build_cfg - hash ref to add to
#
# Sets
#   numsections - section count reported by the build processor
#   indexlevels - array ref of short level tags, in level order
#   levelmap    - array ref of "level->tag" strings, in level order
#   textlevel   - the tag for the section level
sub build_cfg_extra {
    my ($self, $build_cfg) = @_;

    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();

    # store the level info
    my (@indexlevels, @levelmap);
    foreach my $level (@{ $self->{'levelorder'} }) {
        my $tag = $level_map{$level};
        push(@indexlevels, $tag);
        push(@levelmap, $level . '->' . $tag);
    }
    $build_cfg->{'indexlevels'} = \@indexlevels;
    $build_cfg->{'levelmap'}    = \@levelmap;

    # text level (and database level) is always section
    $build_cfg->{'textlevel'} = $level_map{'section'};
}
911
9121;
913
914
Note: See TracBrowser for help on using the repository browser.